# Data Analysis

In [None]:
import os

import pandas as pd

# Location of the Twitter US Airline Sentiment CSV. The default is the path the
# notebook was originally written against; set the TWEETS_CSV environment
# variable to load a copy stored elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "TWEETS_CSV",
    "C:/Users/ashva/Projects/sentiment_analysis/Twitter US Airline Sentiment/Tweets.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded tweets.
df.head()

In [None]:
# List the available column names.
df.columns

In [None]:
# Keep only the tweet text and its sentiment label; nothing else is used below.
KEPT_COLUMNS = ['text', 'airline_sentiment']
review_df = df[KEPT_COLUMNS]

# Report the (rows, columns) shape, then show a sample of the reduced frame.
print(review_df.shape)
review_df.head()

In [None]:
# Count how many tweets fall into each airline_sentiment class.
review_df["airline_sentiment"].value_counts()

# Data Preprocessing

In [None]:
# Turn the categorical sentiment strings into integer codes with factorize().
# The result is a (codes, uniques) pair: element 0 holds one integer per tweet,
# element 1 holds the distinct sentiment labels those integers refer to.
sentiment_label = review_df["airline_sentiment"].factorize()
sentiment_label

In [None]:
# retrieve all the text data from the dataset (the raw values of the text column)
tweet = review_df.text.values

# break down all the words/sentences of a text into small parts called tokens
# NOTE(review): num_words=7436 is a hard-coded vocabulary cap — confirm how this
# value was chosen for the dataset.
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=7436)

# create an association between the words and the assigned numbers using fit_on_texts
tokenizer.fit_on_texts(tweet)

# the associations are stored as a dictionary in the tokenizer.word_index attribute;
# vocab_size is the number of entries plus one
# (presumably so index 0 stays free for padding — TODO confirm)
vocab_size = len(tokenizer.word_index) + 1

# replace the words with their assigned numbers using the texts_to_sequences() method
encoded_docs = tokenizer.texts_to_sequences(tweet)

# pad the sentences to have equal length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every encoded tweet exactly once. The previous version re-padded the whole
# corpus inside features_extractor(), so extracting features for N tweets did
# N full passes over all N sequences.
padded_sequences = pad_sequences(encoded_docs, maxlen=100)


def features_extractor(i_num):
    """Return the padded token-id sequence for the tweet at position ``i_num``.

    Parameters
    ----------
    i_num : int
        Zero-based position of the tweet in ``encoded_docs``.

    Returns
    -------
    The row of ``padded_sequences`` for that tweet (length 100).
    """
    return padded_sequences[i_num]

In [None]:
# we need to extract the features from all the tweets, so we use tqdm to show progress
from tqdm import tqdm

# Walk the sentiment labels by *position* and pair each one with the padded
# sequence at the same position. enumerate() guarantees a 0..n-1 counter even if
# review_df's index is not a default RangeIndex, and iterating the column
# (rather than iterrows()) lets tqdm know the total so it can show a real
# progress bar.
extracted_features = []
for position, final_class_labels in enumerate(tqdm(review_df["airline_sentiment"])):
    data = features_extractor(position)
    extracted_features.append([data, final_class_labels])

In [None]:
# Wrap the [feature, label] pairs in a DataFrame with named columns.
extracted_features_df = pd.DataFrame(
    data=extracted_features,
    columns=['feature', 'class'],
)
extracted_features_df.head()

# Split Processed Dataset

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Independent variables: one padded token sequence per tweet.
X = np.array(list(extracted_features_df['feature']))
# Dependent variable: the sentiment string for each tweet.
y = np.array(list(extracted_features_df['class']))

# Encode the sentiment strings as integers, then one-hot encode them for the
# categorical cross-entropy loss.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y)
y = to_categorical(encoded_labels)

# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Building Text Classifier

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Activation, SpatialDropout1D
from tensorflow.keras.layers import Embedding

# Number of output classes = number of distinct sentiment labels.
num_labels = sentiment_label[1].shape[0]
print(num_labels)

# Size of the dense vector learned for every token id.
embedding_vector_length = 32

model = Sequential()

# Token ids (sequences of length 100) -> learned embeddings, with dropout
# applied to whole embedding channels.
model.add(Embedding(vocab_size, embedding_vector_length, input_length=100))
model.add(SpatialDropout1D(0.25))

# Single LSTM layer with dropout on both inputs and recurrent state.
model.add(LSTM(50, dropout=0.25, recurrent_dropout=0.25))
model.add(Dropout(0.25))

# One output unit per sentiment class, turned into probabilities by softmax.
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy', 'Precision', 'Recall'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

In [None]:
# Training my model
import os
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 5
num_batch_size = 32

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Write a checkpoint during training; save_best_only=True skips epochs that do
# not improve on the best result seen so far.
cp_callback = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)

start = datetime.now()

# NOTE(review): the test split doubles as validation data here, so the
# checkpoint selection has seen the data later used for evaluation.
history = model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[cp_callback], verbose=1)

# Stop the clock immediately after fit() so the reported duration covers
# training only, not the directory listing or the model export below.
duration = datetime.now() - start

# Show what the checkpoint callback wrote. A bare os.listdir(...) in the middle
# of a cell is evaluated and discarded, so print it explicitly.
print(os.listdir(checkpoint_dir))

# Save the entire model to a HDF5 file.
# The '.h5' extension indicates that the model should be saved to HDF5.
model.save('my_sen_ana_model.h5')

print("Training completed in time: ", duration)

In [None]:
# Score the model on the held-out split. evaluate() returns the loss followed by
# the metrics passed to compile(), so position 1 is the accuracy.
test_accuracy = model.evaluate(X_test, y_test, verbose=1)
accuracy_percent = test_accuracy[1] * 100
print("Accuracy: {:.2f}%".format(accuracy_percent))

In [None]:
import numpy as np

# Per-class probabilities for every test sample.
predict_x = model.predict(X_test)
# The predicted class is the column holding the largest probability in each row.
classes_x = predict_x.argmax(axis=1)
print(classes_x)

# Visualizing the metrics

In [None]:
import os

import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch.
plt.subplot(211)

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])

plt.title('Model Accuracy')

plt.ylabel('Accuracy')
plt.xlabel('Epoch')

plt.legend(['Training', 'Validation'], loc='lower right')

plt.tight_layout()

# Save BEFORE show(): once show() has rendered the figure it is released, and a
# later savefig() writes an empty canvas. Create the target folder first so the
# save does not fail on a fresh checkout.
os.makedirs("model/images", exist_ok=True)
plt.savefig("model/images/Accuracy plot.jpg")

plt.show()

In [None]:
import os

# Training vs. validation loss per epoch.
plt.subplot(212)

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('Model Loss')

plt.ylabel('Loss')
plt.xlabel('Epoch')

plt.legend(['Training', 'Validation'], loc='upper right')

plt.tight_layout()

# Save BEFORE show(): once show() has rendered the figure it is released, and a
# later savefig() writes an empty canvas. Create the target folder first so the
# save does not fail on a fresh checkout.
os.makedirs("model/images", exist_ok=True)
plt.savefig("model/images/Loss plt.jpg")

plt.show()

In [None]:
import scikitplot as skplt

# Collapse the one-hot test labels back to a single class index per sample so
# they can be compared with the predicted class indices.
y_test_arg = y_test.argmax(axis=1)

# Confusion matrix with raw counts.
skplt.metrics.plot_confusion_matrix(
    y_test_arg,
    classes_x,
    normalize=False,
    title='Confusion Matrix for CAC w/o norm',
)

plt.savefig("model/images/Confusion Matrix for CAC w/o norm.jpg")

In [None]:
# Confusion matrix of true vs. predicted class indices, with normalization enabled.
skplt.metrics.plot_confusion_matrix(
    y_test_arg,
    classes_x,
    normalize=True,
    title='Confusion Matrix for CAC with norm',
)

plt.savefig("model/images/Confusion Matrix for CAC with norm.jpg")

In [None]:
import os

import seaborn as sns

# Normalized confusion matrix rendered as an annotated seaborn heatmap.
plt.figure(figsize = (18,8))
sns.heatmap(skplt.metrics.confusion_matrix(y_test_arg, classes_x, normalize='true'), annot = True, cmap = 'plasma')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')

# Save BEFORE show(): once show() has rendered the figure it is released, and a
# later savefig() writes an empty canvas. Create the target folder first so the
# save does not fail on a fresh checkout.
os.makedirs("model/images", exist_ok=True)
plt.savefig("model/images/Confusion Matrix for CAC with norm(sns).jpg")

plt.show()

# Model Execution

In [None]:
def predict_sentiment(text):
    """Print the predicted sentiment label for a single piece of text.

    Parameters
    ----------
    text : str
        The raw statement to classify.

    The text is tokenized with the fitted ``tokenizer``, padded to the length
    the model was trained on, and passed through ``model``. The class with the
    highest softmax probability is mapped back to its name and printed.
    """
    tw = tokenizer.texts_to_sequences([text])
    # Pad to 100 tokens: the training sequences and the Embedding layer's
    # input_length both use 100, so inference must match.
    tw = pad_sequences(tw, maxlen=100)
    # The model emits one probability per class; take the index of the largest.
    # (round().item() only works for a single sigmoid output and raises on a
    # multi-class softmax row.)
    probabilities = model.predict(tw)
    prediction = int(probabilities.argmax(axis=1)[0])
    # The training targets were encoded with label_encoder, so its classes_
    # array is the mapping from output index to label name. The factorize()
    # result orders labels differently and would report the wrong sentiment.
    print("Predicted label: ", label_encoder.classes_[prediction])

# Interactive loop: classify statements until the user answers "n".
# The previous `while 'y':` tested a constant (always-true) string rather than
# the user's answer, so this is written as an explicit infinite loop with a
# single exit. The answer is normalized so "N" or " n " also stop the loop.
while True:
    test_sentence1 = input("Enter statement: ")
    predict_sentiment(test_sentence1)
    choice = input("Continue (y/n) ?")
    if choice.strip().lower() == 'n':
        break