# **Sentiment Analysis on COVID-19 related Tweets (Deep Learning: LSTM with Variable Length Input Sequence)**


*   Dataset: SentimentAnalysisCOVID-19MasterFinalDataset.csv
*   Runtime Type: GPU




# **Mount Google Drive**

In [None]:
# Mount Google Drive at /content/gdrive so the dataset CSV and the model
# checkpoint path used later in this notebook are reachable.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


# **Import Packages**

In [None]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import nltk
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

# **Read COVID-19 Dataset**


In [None]:
#Read CSV from Pandas for Data Analysis
import pandas as pd

# Hand the path straight to pandas so it opens AND closes the file itself.
# The previous open(..., 'rU') never closed the handle, and the 'U' mode flag
# was deprecated for years and removed in Python 3.11.
df = pd.read_csv('gdrive/My Drive/Colab Notebooks/SentimentAnalysisCOVID-19MasterFinalDataset.csv')

  after removing the cwd from sys.path.


In [None]:
# Keep only the tweet text and its sentiment label.
# .copy() makes this an independent frame, so adding columns later does not
# write into a slice of the originally loaded data (SettingWithCopyWarning).
df = df[['Translated','Sentiment']].copy()
df.head()

Unnamed: 0,Translated,Sentiment
0,me to covid,Neutral
1,so many realizations during ecq because of cov...,Positive
2,it's like a covid covid kesa back to cejay hah...,Positive
3,while we are all fighting against covid meanwh...,Neutral
4,person house because of covid,Neutral


In [None]:
# (rows, columns) of the frame after narrowing it to the two columns above.
print(df.shape)

(44709, 2)


In [None]:
def convert_label(polarity):
  """Map a sentiment label to its integer class id.

  Args:
    polarity: Sentiment label: "Negative", "Neutral" or "Positive"
      (matched case-insensitively).

  Returns:
    0 for negative, 1 for neutral, 2 for positive, or None when the value is
    not a string or is not one of the three known labels.
  """
  # Missing labels read from CSV arrive as float NaN; report them as
  # unlabelled instead of crashing on .lower().
  if not isinstance(polarity, str):
    return None
  label_ids = {"negative": 0, "neutral": 1, "positive": 2}
  # The lowered value is actually used for the lookup (the old code called
  # .lower() but discarded the result), and there is no per-row loop over df.
  return label_ids.get(polarity.lower())

# Store the integer class id returned by convert_label for every row's sentiment label.
df["Analysis"] = df["Sentiment"].apply(convert_label)          

In [None]:
#df.dropna(inplace=True)
#df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first rows to check the new Analysis column against Sentiment.
df.head(10)

Unnamed: 0,Translated,Sentiment,Analysis
0,me to covid,Neutral,1
1,so many realizations during ecq because of cov...,Positive,2
2,it's like a covid covid kesa back to cejay hah...,Positive,2
3,while we are all fighting against covid meanwh...,Neutral,1
4,person house because of covid,Neutral,1
5,I will die in stress not in covid,Neutral,1
6,I Want To Join Case There is still a covid,Neutral,1
7,So confiscate the Akuang Alcohol ML then the w...,Neutral,1
8,i choose you to be a positive from covid char...,Positive,2
9,jgh then ligo grabe covid stop na 🤷,Neutral,1


# **Stop-words**

In [None]:
import nltk

# Download the NLTK stop-word corpus into the runtime's nltk_data directory.
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop words as a set; cleanTxt reads this notebook-level name.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def cleanTxt(text, stop_words=None):
  """Normalise a raw tweet into lowercase, letters-only text without stop words.

  Args:
    text: Raw tweet text. Non-string values (e.g. NaN) yield an empty string.
    stop_words: Optional collection of lowercase words to drop. Defaults to
      the notebook-level ``stopword`` set.

  Returns:
    The cleaned tweet as a single space-separated string.
  """
  if not isinstance(text, str):
    return ''
  if stop_words is None:
    stop_words = stopword
  text = re.sub(r'@\w+', '', text) #remove @mentions (handles may contain underscores)
  text = re.sub(r'#', '', text) #removing the '#' symbol
  text = re.sub(r'\bRT\s+', '', text) #Removing RT; \b keeps words ending in "RT" intact
  text = re.sub(r'https?://\S+', '', text) #remove the hyper link
  text = re.sub(r'[^A-Za-z ]', '', text) #removes everything and keeps only letters and spaces
  text = text.lower() #transforms everything to lowercase
  # Test each individual word against the stop-word set. The previous code
  # tested the whole tweet string, so stop words were never actually removed.
  kept = [word for word in text.split() if word not in stop_words]
  return ' '.join(kept)

# Replace every tweet with its cleaned form (the Translated column is overwritten).
df['Translated'] = df['Translated'].apply(cleanTxt)
df.head()

Unnamed: 0,Translated,Sentiment,Analysis
0,me to covid,Neutral,1
1,so many realizations during ecq because of cov...,Positive,2
2,its like a covid covid kesa back to cejay haha...,Positive,2
3,while we are all fighting against covid meanwh...,Neutral,1
4,person house because of covid,Neutral,1


# **Calculating sentence lengths**

In [None]:
# Number of characters in each cleaned tweet; used below to drop near-empty rows.
df['length'] = df['Translated'].str.len()
# .copy() detaches the narrowed frame from its parent, so the row filtering in
# the next cell no longer raises SettingWithCopyWarning.
df = df[['Translated', 'Analysis','length']].copy()

df.head(20)



Unnamed: 0,Translated,Analysis,length
0,me to covid,1,11
1,so many realizations during ecq because of cov...,2,153
2,its like a covid covid kesa back to cejay haha...,2,50
3,while we are all fighting against covid meanwh...,1,106
4,person house because of covid,1,29
5,i will die in stress not in covid,1,33
6,i want to join case there is still a covid,1,42
7,so confiscate the akuang alcohol ml then the w...,1,62
8,i choose you to be a positive from covid charrr,2,47
9,jgh then ligo grabe covid stop na,1,33


In [None]:
# Keep only tweets with at least 10 characters. Building a new filtered frame
# (instead of drop(..., inplace=True) on a slice) avoids the
# SettingWithCopyWarning and makes the cell safe to re-run.
df = df[df['length'] >= 10].reset_index(drop=True)
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Translated,Analysis,length
0,me to covid,1,11
1,so many realizations during ecq because of cov...,2,153
2,its like a covid covid kesa back to cejay haha...,2,50
3,while we are all fighting against covid meanwh...,1,106
4,person house because of covid,1,29
5,i will die in stress not in covid,1,33
6,i want to join case there is still a covid,1,42
7,so confiscate the akuang alcohol ml then the w...,1,62
8,i choose you to be a positive from covid charrr,2,47
9,jgh then ligo grabe covid stop na,1,33


# **Count number of occurrences of each word**

In [None]:
#count number of occurences of each word
counts = Counter()
for index, row in df.iterrows():
    counts.update(row['Translated'])

# **Creating Vocabulary**

In [None]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

# **Encoding Per Tweet**

In [None]:
def encode_sentence(text, vocab2index, N=42):
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in text])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [None]:
# Each cell stores the pair (padded id array, true length). The pair is ragged,
# so dtype=object is required: without it NumPy emitted a
# VisibleDeprecationWarning and, from NumPy 1.24 on, raises ValueError.
df['encoded'] = df['Translated'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))
df.head(15)

  """Entry point for launching an IPython kernel.


Unnamed: 0,Translated,Analysis,length,encoded
0,me to covid,1,11,"[[2, 3, 4, 5, 6, 4, 7, 6, 8, 9, 10, 0, 0, 0, 0..."
1,so many realizations during ecq because of cov...,2,153,"[[11, 6, 4, 2, 12, 13, 14, 4, 15, 3, 12, 16, 9..."
2,its like a covid covid kesa back to cejay haha...,2,50,"[[9, 5, 11, 4, 16, 9, 23, 3, 4, 12, 4, 7, 6, 8..."
3,while we are all fighting against covid meanwh...,1,106,"[[24, 25, 9, 16, 3, 4, 24, 3, 4, 12, 15, 3, 4,..."
4,person house because of covid,1,29,"[[26, 3, 15, 11, 6, 13, 4, 25, 6, 18, 11, 3, 4..."
5,i will die in stress not in covid,1,33,"[[9, 4, 24, 9, 16, 16, 4, 10, 9, 3, 4, 9, 13, ..."
6,i want to join case there is still a covid,1,42,"[[9, 4, 24, 12, 13, 5, 4, 5, 6, 4, 27, 6, 9, 1..."
7,so confiscate the akuang alcohol ml then the w...,1,62,"[[11, 6, 4, 7, 6, 13, 22, 9, 11, 7, 12, 5, 3, ..."
8,i choose you to be a positive from covid charrr,2,47,"[[9, 4, 7, 25, 6, 6, 11, 3, 4, 14, 6, 18, 4, 5..."
9,jgh then ligo grabe covid stop na,1,33,"[[27, 19, 25, 4, 5, 25, 3, 13, 4, 16, 9, 19, 6..."


# **Data Set Counter**

*   Negative = 0
*   Neutral = 1
*   Positive = 2







In [None]:
#check how balanced the dataset is
# Counts rows per class id in the Analysis column (0 = Negative, 1 = Neutral, 2 = Positive).
Counter(df['Analysis'])

Counter({0: 7705, 1: 17485, 2: 19482})

# **Modeling**

In [None]:
X = list(df['encoded'])
y = list(df['Analysis'])
from sklearn.model_selection import train_test_split
# random_state fixes the split so results are reproducible across runs;
# stratify=y keeps the (imbalanced) class proportions equal in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# **Numpy to Pytorch**

In [None]:
class ReviewsDataset(Dataset):
    """Dataset pairing encoded tweets with their labels.

    Every element of ``X`` is indexed as ``[0]`` (the id array) and ``[1]``
    (the sequence length); ``Y`` holds the label at the same position.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(ids tensor on `device`, label, sequence length)`` for ``idx``."""
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32)).to(device)
        label = self.y[idx]
        seq_len = sample[1]
        return token_ids, label, seq_len

# **Declaring Train and Valid Variables for Modeling**


In [None]:
# Wrap the two splits so a DataLoader can batch them.
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

In [None]:
# Notebook-level histories: train_model appends one entry per epoch to the
# first three; validation_metrics appends one prediction tensor per batch to
# `prediction`.
train_loss, validation_loss, accuracy_val, prediction = [], [], [], []

# **Train Model and Validation Metrics Function**

In [None]:
def train_model(model, epochs, lr,saving_path):
    """Train ``model`` with SGD and checkpoint the epoch with the lowest validation loss.

    Batches are read from the notebook-level ``train_dl``; after every epoch the
    model is scored on ``val_dl`` through ``validation_metrics``. Per-epoch
    train loss, validation loss and validation accuracy are appended to the
    notebook-level ``train_loss``, ``validation_loss`` and ``accuracy_val``.

    Args:
        model: Module called as ``model(x, lengths)`` that returns class logits.
        epochs: Maximum number of epochs to run.
        lr: Learning rate for SGD.
        saving_path: File the best checkpoint is written to (overwritten each
            time the validation loss improves).

    Returns:
        Zero-based index of the epoch with the lowest validation loss. Training
        stops early as soon as |train loss - validation loss| >= 0.2.
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    #Utilization of Stochastic Gradient Descent for optimization
    #with corresponding Learning Rate
    optimizer = torch.optim.SGD(parameters, lr=lr)
    best_valid_loss = float('inf')
    best_epoch=0
    for i in range(epochs):
        # Make sure dropout is active for the training pass, whatever mode the
        # model was left in by earlier code.
        model.train()
        sum_loss = 0.0
        total = 0
        correct = 0
        for x, y, l in train_dl:
            # x is the data (padded token ids)
            # y is the target variable (true label)
            # l is the true (unpadded) sequence length of each sample
            x = x.long().to(device)
            y = y.long().to(device)

            #resetting gradients for each iteration
            optimizer.zero_grad()
            #prediction of the model using the given data
            y_pred = model(x, l)
            #using Cross Entropy for Loss calculation
            loss = F.cross_entropy(y_pred, y)
            #backward propagation
            loss.backward()
            #gradient step
            optimizer.step()

            pred = torch.max(y_pred, 1)[1]
            #calculation of total loss value
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
            # .item() keeps the running count a plain Python number, so the
            # accuracy stored in the checkpoint is not a device tensor.
            correct += (pred == y).sum().item()

        epoch_loss = sum_loss/total
        train_acc = correct/total
        #return values from the validation metrics
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)

        #gathering the data for train loss, validation loss, accuracy
        #for each epoch
        train_loss.append(epoch_loss)
        validation_loss.append(val_loss)
        accuracy_val.append(val_acc)

        print("Epoch "+str(i+1)+": train loss %.3f, val loss %.3f, train accuracy %.3f, val accuracy %.3f, and val rmse %.3f" % 
              (epoch_loss, val_loss, train_acc, val_acc, val_rmse))

        if val_loss < best_valid_loss:
            best_valid_loss = val_loss
            best_epoch=i
            torch.save({'epoch': i,
                #'pretrained_weights':pretrained_weights,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'train_loss': epoch_loss,
                'train_acc': train_acc,
                'val_acc':val_acc}
                , saving_path)

        # Early stop once train and validation loss drift apart by 0.2 or more.
        if abs(epoch_loss-val_loss) >= 0.2:
            return best_epoch
    return best_epoch

def validation_metrics (model, valid_dl):
    """Score ``model`` on ``valid_dl`` without updating it.

    The model is switched to evaluation mode (dropout off) and autograd is
    disabled for the pass; the mode the model was in on entry is restored
    afterwards. Every batch's predicted class ids are also appended to the
    notebook-level ``prediction`` list for later reporting.

    Args:
        model: Module called as ``model(x, lengths)`` that returns class logits.
        valid_dl: Iterable of ``(x, y, lengths)`` batches.

    Returns:
        Tuple ``(mean cross-entropy loss, accuracy, mean RMSE)``. The loss and
        RMSE are sample-weighted averages over all batches; the RMSE is taken
        between predicted and true class ids.
    """
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y, l in valid_dl:
                # x is the data (padded token ids)
                # y is the target variable (true label)
                # l is the true (unpadded) sequence length of each sample
                x = x.long().to(device)
                y = y.long().to(device)

                #prediction of the model using the given data
                y_hat = model(x, l)

                #using Cross Entropy for Loss calculation
                loss = F.cross_entropy(y_hat, y)
                pred = torch.max(y_hat, 1)[1]

                #collecting the prediction data for y_pred use in classification report
                prediction.append(pred)

                #calculation of total loss value and RMSE
                correct += (pred == y).float().sum()
                total += y.shape[0]
                sum_loss += loss.item()*y.shape[0]
                batch_mse = mean_squared_error(y.cpu().numpy(), pred.cpu().numpy())
                sum_rmse += np.sqrt(batch_mse)*y.shape[0]
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)
    return sum_loss/total, correct/total, sum_rmse/total

In [None]:
batch_size = 128
vocab_size = len(words)
# Reshuffle the training samples every epoch so SGD does not see the exact
# same batches in the exact same order each time.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Validation order stays fixed: later cells line up the stored predictions with
# the labels read from val_dl purely by position.
val_dl = DataLoader(valid_ds, batch_size=batch_size)

# **LSTM Function**

In [None]:
class LSTM_variable_input(torch.nn.Module) :
    """Bidirectional LSTM classifier for padded, variable-length id sequences.

    Args:
        vocab_size: Number of rows in the embedding table (id 0 is padding).
        embedding_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_classes: Number of output logits (defaults to the 3 sentiment classes).

    NOTE: the classifier now reads both LSTM directions, so ``linear`` has
    ``2 * hidden_dim`` inputs; checkpoints saved with the earlier
    ``hidden_dim``-input layer cannot be loaded into this class.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=3) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.1)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional = True)
        # Two directions -> the sequence summary has 2 * hidden_dim features.
        self.linear = nn.Linear(2 * hidden_dim, num_classes)
        
    def forward(self, x, s):
        """Return class logits for a batch.

        Args:
            x: Integer tensor of token ids, one padded sequence per row.
            s: True (unpadded) length of each sequence in ``x``.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised scores.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        # pack_padded_sequence needs the lengths on the CPU.
        if torch.is_tensor(s):
            s = s.cpu()
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        # For a single-layer bidirectional LSTM, ht[-2] is the final forward
        # state and ht[-1] the final backward state. Using ht[-1] alone threw
        # the forward direction away.
        summary = torch.cat((ht[-2], ht[-1]), dim=1)
        out = self.linear(summary)
        return out

# **Dimensions**

In [None]:

# Constructor arguments are (vocab_size, embedding_dim, hidden_dim).
# The commented-out lines are alternative size combinations kept for reference;
# only the uncommented line is executed.
#model = LSTM_variable_input(vocab_size, 32, 64)
#model = LSTM_variable_input(vocab_size, 64, 64)
#model = LSTM_variable_input(vocab_size, 128, 64)
model = LSTM_variable_input(vocab_size, 128, 64)
#model = LSTM_variable_input(vocab_size, 256, 128)
#model = LSTM_variable_input(vocab_size, 256, 256)

In [None]:
# Move the model's parameters to the selected device; as the last expression
# of the cell it also displays the module structure.
model.to(device)

LSTM_variable_input(
  (dropout): Dropout(p=0.1, inplace=False)
  (embeddings): Embedding(29, 128, padding_idx=0)
  (lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=64, out_features=3, bias=True)
)

# **Train Model**


In [None]:
# Checkpoint file on the mounted Drive; train_model overwrites it every time
# the validation loss improves.
final_path='/content/gdrive/MyDrive/Best_Model_LSTM_Thesis_nonEmbedded.pt'


# stop_epoch is the zero-based index of the epoch with the lowest validation
# loss, as returned by train_model. epochs=2000 is only an upper bound because
# train_model stops early when train and validation loss differ by >= 0.2.
stop_epoch = train_model(model, epochs=2000, lr=0.2, saving_path=final_path)

In [None]:
#Getting the y values (label)
targets=list()
for x,y,l in val_dl:
  for z in y:
    targets.append(z)
print(targets)

In [None]:
# Number of validation batches produced per epoch. Asking the loader directly
# is exact; the previous search for the first batch shorter than a hard-coded
# 128 gave a wrong count whenever the validation set divides evenly into batches.
batches_per_epoch = len(val_dl)
print('prediction model indexing for validation', batches_per_epoch)

In [None]:
# validation_metrics appends one prediction tensor per validation batch, so the
# batches of epoch e sit at prediction[e*n : (e+1)*n] with n = len(val_dl)
# (previously hard-coded as 70).
# NOTE(review): this indexing is only valid while `prediction` holds the
# batches of a single train_model run; the list is never cleared between runs.
batches_per_epoch = len(val_dl)
first_batch = stop_epoch * batches_per_epoch
preds = []
for batch_pred in prediction[first_batch:first_batch + batches_per_epoch]:
  # Plain Python ints, matching the label list fed to the report.
  preds.extend(batch_pred.cpu().tolist())


# **Evaluation Metrics**

In [None]:
#Confusion Matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# targets: true validation labels; preds: predictions selected for stop_epoch.
# Class ids in the report: 0 = Negative, 1 = Neutral, 2 = Positive.
y_true = targets
y_pred = preds
#print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

# **Experiments**

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.2, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.3, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.4, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.5, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.6, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.7, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.8, saving_path=final_path)

In [None]:
#stop_epoch = train_model(model, epochs=2000, lr=0.9, saving_path=final_path)