In [37]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
import pickle as pkl
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# import tensorflow
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split 
import wordcloud
import matplotlib.pyplot as plt
import numpy as np
import re

In [39]:
# Mount Google Drive; the dataset and model paths used in this notebook
# live under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
# Load the airline sentiment dataset from the mounted Drive folder.
dataset_csv = r"/content/drive/MyDrive/Airline_Sentiment_analysis/dataset/Usecase3_Customer_Sentiment_Dataset.csv"
df = pd.read_csv(dataset_csv)

In [41]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,airline_sentiment,airline,text
0,neutral,Virgin America,@VirginAmerica What @dhepburn said.
1,positive,Virgin America,@VirginAmerica plus you've added commercials t...
2,neutral,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,negative,Virgin America,@VirginAmerica it's really aggressive to blast...
4,negative,Virgin America,@VirginAmerica and it's a really big bad thing...


In [42]:
# Number of rows per sentiment class.
class_counts = df['airline_sentiment'].value_counts()

In [43]:
# Size of the rarest sentiment class; used as the per-class sample size
# when the dataset is balanced.
min_class_count = class_counts.to_numpy().min()
min_class_count


2363

In [44]:
# Down-sample every sentiment class to the size of the rarest one so the
# classes are balanced. A fixed random_state makes the draw reproducible
# across kernel restarts (the subsequent shuffle is seeded the same way);
# without it the training set, and every reported metric, changes per run.
balanced_data = pd.concat(
    [
        df[df['airline_sentiment'] == class_].sample(n=min_class_count, random_state=42)
        for class_ in class_counts.index
    ]
)


In [45]:
# Shuffle the resulting DataFrame: the concat above stacks the rows class by
# class, so draw all rows (frac=1) in a seeded random order to interleave them.
# The original index labels are kept (no reset_index).
balanced_data = balanced_data.sample(frac=1, random_state=42)

In [46]:
# From here on `df` refers to the balanced, shuffled subset; the full frame
# read from CSV is no longer reachable under this name. This is an alias,
# not a copy, so later column assignments on `df` also show up on
# `balanced_data`.
df = balanced_data

In [47]:
def tweet_preprocessor(tweet):
    """Mask user mentions and links in a tweet.

    The tweet is split on single spaces. Any token that starts with '@' and
    is longer than one character becomes '@user'; any other token that starts
    with 'http' becomes 'http'. All remaining tokens are kept unchanged and
    the tokens are re-joined with single spaces.

    Args:
        tweet: The raw tweet text (a str).

    Returns:
        The tweet with mentions and URLs replaced by placeholders.
    """
    def _mask(token):
        # The mention check comes first, so '@http...' is treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return "http"
        return token

    return " ".join(_mask(token) for token in tweet.split(' '))

In [48]:
# Replace mentions and links in every tweet with placeholders.
df['text'] = df['text'].apply(tweet_preprocessor)

In [49]:
# Encode the sentiment strings as integer class ids in a new 'Label' column.
# The fitted encoder is kept in `lb` so ids can be mapped back to names.
lb = LabelEncoder()
lb.fit(df['airline_sentiment'])
df['Label'] = lb.transform(df['airline_sentiment'])

In [50]:
# Turn each tweet into a sequence of integer word ids (the input to the
# Embedding layer). The tokenizer is fitted on the tweets with num_words=500,
# then the sequences are padded with pad_sequences' default settings into a
# single 2-D array X.
tokenizer = Tokenizer(num_words=500, split=' ')
tweet_texts = df['text'].values
tokenizer.fit_on_texts(tweet_texts)
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

In [51]:
# Model building: embedding -> spatial dropout -> LSTM -> dense layer ->
# 3-unit softmax output (one unit per sentiment class).
model = Sequential()
# The vocabulary size (500) must stay in sync with Tokenizer(num_words=500).
model.add(Embedding(500, 120, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352, activation='LeakyReLU'))
model.add(Dense(3, activation='softmax'))
# categorical_crossentropy is used with one-hot encoded targets.
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" after the table.
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 120)           60000     
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 30, 120)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 704)               2323200   
                                                                 
 dense_2 (Dense)             (None, 352)               248160    
                                                                 
 dense_3 (Dense)             (None, 3)                 1059      
                                                                 
Total params: 2,632,419
Trainable params: 2,632,419
Non-trainable params: 0
____________________________________________

In [52]:
# One-hot encode the integer labels, then hold out 20% of the rows for
# testing (seeded so the split is repeatable).
y = pd.get_dummies(df['Label'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Train on the 80% split for 15 epochs. The call is left as the cell's last
# expression, so the returned History object is displayed.
batch_size = 32
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=15,
    verbose='auto',
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f53487c8c70>

In [61]:
# Model Testing: score the trained network on the held-out 20% split.
# The displayed list is [loss, accuracy], matching the loss and the single
# 'accuracy' metric the model was compiled with.
print(r"NN test score on 20% split: ")
model.evaluate(X_test,y_test)

NN test score on 20% split: 


[0.8909470438957214, 0.6939350962638855]

In [55]:
def predict_sentiment(tweet):
    """Predict the sentiment label of a single tweet with the trained model.

    The tweet goes through the same pipeline as the training data: mention
    and URL masking via ``tweet_preprocessor``, conversion to word ids with
    the fitted ``tokenizer``, and padding to the training sequence length.

    Args:
        tweet: The raw tweet text (a str).

    Returns:
        The predicted sentiment label as a str, taken from the classes the
        LabelEncoder ``lb`` was fitted on.
    """
    # Mask mentions and links exactly as was done for the training text.
    cleaned_tweet = tweet_preprocessor(tweet)
    tweet_seq = tokenizer.texts_to_sequences([cleaned_tweet])

    # The training matrix X was built with pad_sequences' default padding, so
    # the default is used here too. Padding on the other side ('post') would
    # feed the LSTM inputs laid out differently from those it was trained on.
    tweet_padded = pad_sequences(tweet_seq, maxlen=X.shape[1])

    # Class probabilities for the single tweet in the batch.
    sentiment_probs = model.predict(tweet_padded)[0]

    # lb.classes_ is ordered by encoded id, which is also the column order of
    # the one-hot targets the model was trained on.
    return str(lb.classes_[np.argmax(sentiment_probs)])

In [56]:
# Smoke test: a clearly negative sentence.
tweet = "This airline is the worst. I will never fly with them again."
predicted_sentiment = predict_sentiment(tweet)
print(predicted_sentiment)

negative


In [57]:
# Smoke test: a short sentence without strongly polarized words
# (the recorded run printed "neutral").
tweet = "I can fly with them again"
predicted_sentiment = predict_sentiment(tweet)
print(predicted_sentiment)

neutral


In [63]:
# Persist the trained model to Drive.
# NOTE(review): ".pth" is conventionally a PyTorch extension. Keras chooses
# its save format from the path/extension, and some versions only accept
# ".keras" or ".h5" — confirm which format this path produces and that the
# code that loads the model expects it.
model.save(r"/content/drive/MyDrive/Airline_Sentiment_analysis/model/LSTM_Sampled.pth")

