# Data Preprocessing

In [None]:
import pandas as pd

# Path of the Tweets.csv file read into the raw DataFrame.
tweets_csv = "C:/Users/ashva/Projects/sentiment_analysis/Twitter US Airline Sentiment/Tweets.csv"
df = pd.read_csv(tweets_csv)

# Preview the first rows of the raw data.
df.head()

In [None]:
# Keep only the two columns the model needs: the tweet text and its sentiment.
wanted_columns = ['text', 'airline_sentiment']
review_df = df[wanted_columns]

print(review_df.shape)
review_df.head()

In [None]:
# List every column name available in the raw DataFrame.
df.columns

In [None]:
# Remove the rows labelled 'neutral'; they are not used in this model.
not_neutral = review_df['airline_sentiment'] != 'neutral'
review_df = review_df[not_neutral]

print(review_df.shape)
review_df.head()

In [None]:
# Count how many rows carry each airline_sentiment value.
review_df.airline_sentiment.value_counts()

In [None]:
# Encode the sentiment strings as integers with factorize(). The result is a
# (codes, uniques) pair: codes[i] is the integer for row i, and uniques[code]
# maps an integer back to its original string.
sentiment_column = review_df['airline_sentiment']
sentiment_label = sentiment_column.factorize()
sentiment_label

In [None]:
# Split each tweet into word tokens and map every token to an integer id.
from tensorflow.keras.preprocessing.text import Tokenizer

# Retrieve all the text data from the dataset.
tweet = review_df.text.values

# Only the most frequent words (ids below num_words) are kept when encoding.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)

# word_index lists every word seen, but texts_to_sequences() only emits ids
# below num_words, so the embedding table never needs more rows than that.
# The +1 accounts for id 0, which is never assigned to a word.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

# Replace the words with their assigned ids using texts_to_sequences().
encoded_docs = tokenizer.texts_to_sequences(tweet)

In [None]:
# Bring every encoded tweet to the same length (200 ids).
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 'pre' is the library default for both options, spelled out here for clarity:
# short tweets get zeros in front, long ones lose their leading ids.
padded_sequence = pad_sequences(
    encoded_docs,
    maxlen=200,
    padding='pre',
    truncating='pre',
)

In [None]:
# Show the full word -> integer id mapping learned by the tokenizer.
print(tokenizer.word_index)

In [None]:
# Compare the first raw tweet with its integer-encoded form, one per line.
print(tweet[0], encoded_docs[0], sep="\n")

In [None]:
# Show the first encoded tweet after padding.
print(padded_sequence[0])

# Building Text Classifier

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding


# Size of the dense vector learned for each word id.
embedding_vector_length = 32

# Binary sentiment classifier: embedding -> LSTM -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
# 'Precision' (not 'Precise') is the metric name Keras recognises.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy', 'Precision', 'Recall'])

# summary() prints the table itself and returns None, so it is not wrapped
# in print(), which would add a stray "None" line.
model.summary()

In [None]:
# Train for 5 epochs in batches of 32. sentiment_label[0] holds the integer
# codes from factorize(); 20% of the samples are held out for validation.
history = model.fit(
    padded_sequence,
    sentiment_label[0],
    validation_split=0.2,
    epochs=5,
    batch_size=32,
)

# Visualizing the metrics

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.legend()

# Save before show(): once the figure has been shown it is released, and a
# later savefig() would write an empty image.
plt.savefig("Accuracy plot.jpg")
plt.show()

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')

plt.legend()

# Save before show(): once the figure has been shown it is released, and a
# later savefig() would write an empty image.
plt.savefig("Loss plt.jpg")
plt.show()

# Model Execution

In [None]:
def predict_sentiment(text):
    """Print the sentiment label the trained model predicts for *text*.

    The text is encoded with the module-level ``tokenizer``, padded to 200
    ids, and passed to the module-level ``model``. The single output value
    is rounded to an integer and used as an index into ``sentiment_label[1]``
    to look up the label that is printed. Nothing is returned.
    """
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequences, maxlen=200)
    probability = model.predict(padded)
    # .item() requires exactly one value, i.e. one prediction for one text.
    label_index = int(probability.round().item())
    print("Predicted label: ", sentiment_label[1][label_index])


# Try the classifier on two hand-written example sentences.
test_sentence1 = "I enjoyed my journey on this flight."
test_sentence2 = "This is the worst flight experience of my life!"

for sentence in (test_sentence1, test_sentence2):
    predict_sentiment(sentence)