https://www.kaggle.com/code/prashant268/sentiment-analysis-lstm \\
https://www.kaggle.com/code/caiyutiansg/twitter-sentiment-analysis-with-word2vec-lstm

In [40]:
# Imports
import csv
import re
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
import gensim as gs
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import torch
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, auc, roc_curve



import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Mount the drive
from google.colab import drive
drive.mount('/content/drive/')

%cd /content/drive/Shareddrives/'Curriculum Project'

Mounted at /content/drive/
/content/drive/Shareddrives/Curriculum Project


In [3]:
# use GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Parsing Data:

In [4]:
# Use processed data
df = pd.read_csv("processed/processedtweets.csv", encoding="latin-1", header=None)

In [5]:
# Add headers
headers = ['target', 'id', 'date', 'flag', 'username', 'tweet']
df.columns = headers

In [6]:
# Drop unused columns
df = df.drop('id', axis=1).drop('date', axis=1).drop('flag', axis=1).drop('username', axis=1)
df = df[df['tweet'].notnull()]

Splitting into Testing and Training Data

In [7]:
traindf, testdf = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
traindf.head()

Unnamed: 0,target,tweet
382524,0,sad sun run away
1574589,4,finished filming quest stay tuned find im talking
910361,4,stocked tonight
423343,0,needs bestie wheres pep talk
1330471,4,spend night big brother winner rachel Â£500


In [8]:
# User Word2Vec to encode words

# Split into list of lists
tweetLists = []
for tweet in traindf.tweet:
  tweetLists.append(tweet.split())


Load in the Word2Vec Model:

In [9]:
# Load the model
w2v = gs.models.word2vec.Word2Vec.load("w2v_model")

In [None]:
w2v.wv.most_similar("awesome")

[('amazing', 0.7199200391769409),
 ('great', 0.6352713108062744),
 ('fantastic', 0.5598714351654053),
 ('cool', 0.502548098564148),
 ('incredible', 0.4775034189224243),
 ('awsome', 0.4523509740829468),
 ('hilarious', 0.43644797801971436),
 ('love', 0.4201076626777649),
 ('epic', 0.4195583760738373),
 ('blast', 0.41918861865997314)]

In [10]:
# Tokenize (unique token to each word)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(traindf.tweet)

In [11]:
# Number of total words
totalWords = len(tokenizer.word_index) + 1
# print("Total Words: " + str(totalWords))

In [56]:
# Use pad_sequences() to create train and test data
Xtrain = pad_sequences(tokenizer.texts_to_sequences(traindf.tweet), maxlen=256)
Xtest = pad_sequences(tokenizer.texts_to_sequences(testdf.tweet), maxlen=256)

In [None]:
Xtrain.shape

(1279673, 256)

In [57]:
# Now use targets (0 = negative, 4 = positive)
Ytrain = traindf.target
Ytest = testdf.target

In [58]:
# Use encoder for categorical target to be between 0 and 1

labelencoder = LabelEncoder()
Ytrain = labelencoder.fit_transform(Ytrain)
Ytest = labelencoder.fit_transform(Ytest)

Create LSTM Model


In [15]:
# Create an embedding matrix (word to embedding of the word)

emb_mat = np.zeros((totalWords, 256))

for word, i in tokenizer.word_index.items():
  if word in w2v.wv:
    emb_mat[i] = w2v.wv[word]


# Embedding layer
emb_layer = Embedding(totalWords, 256, weights=[emb_mat], input_length=256, trainable=False)

In [None]:
model = Sequential()
model.add(emb_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 256, 256)          168379648 
                                                                 
 dropout (Dropout)           (None, 256, 256)          0         
                                                                 
 lstm (LSTM)                 (None, 100)               142800    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 168,522,549
Trainable params: 142,901
Non-trainable params: 168,379,648
_________________________________________________________________


In [None]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

In [None]:
# Save the model
model.save("LSTMmodel")

Load in the model

In [61]:
# Load in model
model = keras.models.load_model('LSTMmodel')



In [None]:
# Fit the model (regular method, without k-fold cross validate)
model_history=model.fit(Xtrain, Ytrain, batch_size=2048,epochs=15,validation_split=0.1,verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Save model history (and save model itself)
model.save("LSTMmodel")
accuracy = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy']
loss = model_history.history['loss']
val_loss = model_history.history['val_loss']

Process Results

In [17]:
# Function to predict sentiment score of a tweet
def preprocess(text):
    # Remove special characters
    special = ['`', '!', '@', '#', '$', '%', '^', '&' ,'*' ,'(',')','_','+','\\','-','=','[',']','{','}',';',':','"','|',',','.','<','>','/','?','~', '\'']
    for s in special:
      while s in text:
        text = text.replace(s, '')
    

    # Remove stop words
    stop = stopwords.words('english')
    ignore = ['http', 'amp', 'quot', 'twitpic', 'tinyurl']
    old = text.split()
    new = []
    for w in old:
      if w.lower() not in stop:
        for i in ignore:
          while i in w.lower():
            w = w.lower().replace(i, '')
        
        new.append(w.lower())
      
    text = ' '.join(new)
    print(text)
    text=pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=256)
    return text

# Function to predict sentiment score of a tweet (returns readable tweet)
def preprocess2(text):
    # Remove special characters
    special = ['`', '!', '@', '#', '$', '%', '^', '&' ,'*' ,'(',')','_','+','\\','-','=','[',']','{','}',';',':','"','|',',','.','<','>','/','?','~', '\'']
    for s in special:
      while s in text:
        text = text.replace(s, '')
    

    # Remove stop words
    stop = stopwords.words('english')
    ignore = ['http', 'amp', 'quot', 'twitpic', 'tinyurl']
    old = text.split()
    new = []
    for w in old:
      if w.lower() not in stop:
        for i in ignore:
          while i in w.lower():
            w = w.lower().replace(i, '')
        
        new.append(w.lower())
      
    text = ' '.join(new)
    return text


def prediction(tweet):
    tweet = preprocess(tweet)
    score = model.predict(tweet)
    score = score[0]
    if score < 0.5:
      print("Negative: " + str(score))
    else:
      print("Positive: " + str(score))


In [18]:
tweet = "Hello there! My name is Aditya Kumar and this is my project: http://somelink.com"
print("Before: " + tweet)
print("After: " + preprocess2(tweet))

Before: Hello there! My name is Aditya Kumar and this is my project: http://somelink.com
After: hello name aditya kumar project somelinkcom


Testing with Real Tweets!


In [62]:
# https://twitter.com/DeadlineDayLive/status/1595323484451635200
tweet = " 𝗢𝗙𝗙𝗜𝗖𝗜𝗔𝗟: France have confirmed that Lucas \
Hernandez torn his ACL in the game against Australia. He will miss the rest \
of the World Cup and the rest of the season. Gutted for him"
prediction(tweet)

𝗢𝗙𝗙𝗜𝗖𝗜𝗔𝗟 france confirmed lucas hernandez torn acl game australia miss rest world cup rest season gutted
Negative: [0.01338398]


In [63]:
# https://twitter.com/elonmusk/status/1595207476936413187
tweet = "Wasn’t Twitter supposed to die by now or something … ?"
prediction(tweet)

wasn’t twitter supposed die something …
Negative: [0.21841903]


In [None]:
# https://twitter.com/doritosce/status/1595124507811147776
tweet = "I need a serotonin boost so I’m gonna watch Chris Evans play with puppies 🫶💞"
prediction(tweet)

need serotonin boost i’m gonna watch chris evans play puppies 🫶💞
Positive: [0.7241439]


In [None]:
# https://twitter.com/TheNBACentral/status/1594815723951779840
tweet = "“I’ll never get to see Kobe again, in real life, forever. And I just should have called. \
He should have called. We both should have called…Call your mom. Call your brother. Call the homeboy \
you used to party with in college. Forever is a long time. - Shaq 😢"
prediction(tweet)

“i’ll never get see kobe real life forever called called called…call mom call brother call homeboy used party college forever long time shaq 😢
Negative: [0.2041248]


In [None]:
# https://twitter.com/jordan_demi1/status/1595275265336385536
tweet = "People are really losing their minds. A manager walked into the break room at the Walmart on Battlefield \
(Chesapeake, VA) and started shooting employees then killed himself. This world is such an evil place; & right before \
the holidays too. We're really not safe anywhere.💔😢"
prediction(tweet)

people really losing minds manager walked break room walmart battlefield chesapeake va started shooting employees killed world evil place right holidays really safe anywhere💔😢
Negative: [0.24312954]


In [None]:
# https://twitter.com/AdamParkhomenko/status/1595140958110511104
tweet = "Breaking: Supreme Court gives greenlight to US House to get Trump tax returns. They better hurry up."
prediction(tweet)

breaking supreme court gives greenlight us house get trump tax returns better hurry
Negative: [0.49760363]


In [None]:
tweet = ""
prediction(tweet)


Positive: [0.53580225]


Measuring Accuracy

In [23]:
# Testing
scores = model.predict(Xtest, verbose=1, batch_size=2048)



In [24]:
Ypred = np.where(scores>0.5,1,0)

In [35]:
scores[6:10]

array([[0.8920888 ],
       [0.45655647],
       [0.61639124],
       [0.96186274]], dtype=float32)

In [34]:
Ytest[6:10]

array([0, 0, 1, 1])

In [29]:
cm=confusion_matrix(Ypred, Ytest)
print(cm)
print("True Positives: " + str(cm[0][0]))
print("False Negatives: " + str(cm[0][1]))
print("False Positives: " + str(cm[1][0]))
print("True Negatives: " + str(cm[1][1]))

[[130321  34664]
 [ 29251 125683]]
True Positives: 130321
False Negatives: 34664
False Positives: 29251
True Negatives: 125683


In [41]:
print(classification_report(Ytest, Ypred))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80    159572
           1       0.81      0.78      0.80    160347

    accuracy                           0.80    319919
   macro avg       0.80      0.80      0.80    319919
weighted avg       0.80      0.80      0.80    319919



In [43]:
fpr, tpr, thresholds = roc_curve(Ytest, Ypred, pos_label = 1)
auc(fpr, tpr)

0.8002548705185263

K-Fold Cross Validation


In [64]:
# https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-use-k-fold-cross-validation-with-keras.md

kfold = KFold(n_splits=3, shuffle=True)

fold = 1

inputs = np.concatenate((Xtrain, Xtest), axis=0)
targets = np.concatenate((Ytrain, Ytest), axis=0)

acc_per_fold = []
loss_per_fold = []
for train, test in kfold.split(inputs, targets):

  # Define the model architecture
  model = Sequential()
  model.add(emb_layer)
  model.add(Dropout(0.5))
  model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
  model.add(Dense(1, activation='sigmoid'))

  # Compile model
  model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold} ...')

  # Fit data to model
  model_history=model.fit(inputs[train], targets[train], batch_size=2048,epochs=7,validation_split=0.1,verbose=1)

  # Generate generalization metrics
  #scores = model.evaluate(inputs[test], targets[test], verbose=0)
  #print(scores)
  #print(f'Score for fold {fold}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  #acc_per_fold.append(scores[1] * 100)
  #loss_per_fold.append(scores[0])


  scores = model.predict(inputs[test], verbose=1, batch_size=2048)
  Ypred2 = np.where(scores>0.5,1,0)
  Ytest2 = targets[test]

  cm=confusion_matrix(Ypred2, Ytest2)
  print(cm)
  print("True Positives: " + str(cm[0][0]))
  print("False Negatives: " + str(cm[0][1]))
  print("False Positives: " + str(cm[1][0]))
  print("True Negatives: " + str(cm[1][1]))
  print()

  print(classification_report(Ytest2, Ypred2))
  print()

  fpr, tpr, thresholds = roc_curve(Ytest2, Ypred2, pos_label = 1)
  print("AUC: " + str(auc(fpr, tpr)))
  print()
  print()


  # Increase fold number
  fold = fold + 1



------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
[[212502  55615]
 [ 54245 210836]]
True Positives: 212502
False Negatives: 55615
False Positives: 54245
True Negatives: 210836





              precision    recall  f1-score   support

           0       0.79      0.80      0.79    266747
           1       0.80      0.79      0.79    266451

    accuracy                           0.79    533198
   macro avg       0.79      0.79      0.79    533198
weighted avg       0.79      0.79      0.79    533198


AUC: 0.7939587275270364


------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
[[208598  51400]
 [ 57634 215565]]
True Positives: 208598
False Negatives: 51400
False Positives: 57634
True Negatives: 215565





              precision    recall  f1-score   support

           0       0.80      0.78      0.79    266232
           1       0.79      0.81      0.80    266965

    accuracy                           0.80    533197
   macro avg       0.80      0.80      0.80    533197
weighted avg       0.80      0.80      0.80    533197


AUC: 0.7954925175471075


------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
[[217988  61844]
 [ 48825 204540]]
True Positives: 217988
False Negatives: 61844
False Positives: 48825
True Negatives: 204540

              precision    recall  f1-score   support

           0       0.78      0.82      0.80    266813
           1       0.81      0.77      0.79    266384

    accuracy                           0.79    533197
   macro avg       0.79      0.79      0.79    533197
weighted avg       0.79      0.79      0.79    533197


AUC: 0.7924227884174788