In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score
import time
import os
from sklearn.model_selection import train_test_split
import spacy
import re
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence
from torchtext.functional import to_tensor
from torchtext.models import RobertaClassificationHead

In [38]:
#Import Dataset
#For preprocessing purposes: Clean Text of special characters/URLs/HTML tags/punctuation and put in all lowercase,
#Split text into words or "tokens", remove "stopwords" such as "and", "is", or "the,"
#Reduce words to root or stem form through "lemmatization" or "stemming" to reduce complexity,
#Apply "vectorization" to convert word or token data into a set of numerical data, and
#Pad the data to normalize all the variables so they can be more easily inputted to a ML algorithm
TrainSentimentData = pd.read_csv('data/FinalTrainSentimentData.csv')
TestSentimentData = pd.read_csv('data/FinalTestSentimentData.csv')
TrainSentimentData

Unnamed: 0,Sentiment,Text
0,0,username akita try kill dog saturday apparentl...
1,0,push daisy tonight yay finally sad go go soon ...
2,4,username wuddup sayin hiiii
3,4,username diggin gameplay acit look epic
4,4,username twitter obsessed new favorite
...,...,...
1278235,4,username englishman win matter
1278236,0,tired million thing need plus sorta hungry wan...
1278237,4,say good night friend finish clean morning
1278238,0,sit bed bored


In [39]:
device = 'cuda:0' if torch.cuda.is_available else 'cpu'
device

'cuda:0'

In [40]:
#Vectorize string data into numerical data more easily accessible to the model
tokenizer = get_tokenizer('basic_english')
vocabTr = build_vocab_from_iterator(map(tokenizer, TrainSentimentData['Text']), specials=["<unk>"])

vocabTe = build_vocab_from_iterator(map(tokenizer, TestSentimentData['Text']), specials=["<unk>"])

In [42]:
def train_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTr[token]) for token in tokens]
    return indices

def test_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTe[token]) for token in tokens]
    return indices

In [43]:
#This gave me a lot of trouble since I didn't realize I needed to convert from DataFrame to a Pytorch Dataset
VectorTrainSentimentDataSet = TrainSentimentData.copy()
VectorTrainSentimentDataSet['Text'] = TrainSentimentData['Text'].apply(train_vectorize_twts)
print(VectorTrainSentimentDataSet.head())

VectorTestSentimentDataSet = TestSentimentData.copy()
VectorTestSentimentDataSet['Text'] = TestSentimentData['Text'].apply(test_vectorize_twts)
print(VectorTestSentimentDataSet.head())

   Sentiment                                               Text
0          0  [1, 33423, 45, 282, 250, 254, 537, 2370, 82872...
1          0     [983, 2235, 49, 109, 130, 41, 2, 2, 75, 62605]
2          4                             [1, 27578, 2352, 8865]
3          4                   [1, 6280, 10218, 81257, 28, 951]
4          4                             [1, 31, 4589, 25, 344]
   Sentiment                                Text
0          0  [1, 34, 1341, 1490, 40, 394, 1187]
1          4    [77, 2429, 3637, 946, 701, 2658]
2          4                      [3, 31, 86, 8]
3          0                   [1079, 203, 2212]
4          0             [1, 16, 256, 19959, 14]


In [44]:
#Turn pandas dataset into Numpy
VectorTrainSentimentDataSet = VectorTrainSentimentDataSet.to_numpy()
VectorTestSentimentDataSet = VectorTestSentimentDataSet.to_numpy()
VectorTrainSentimentDataSet

array([[0,
        list([1, 33423, 45, 282, 250, 254, 537, 2370, 82872, 3191, 1059])],
       [0, list([983, 2235, 49, 109, 130, 41, 2, 2, 75, 62605])],
       [4, list([1, 27578, 2352, 8865])],
       ...,
       [4, list([73, 3, 22, 50, 127, 264, 32])],
       [0, list([164, 58, 226])],
       [4, list([1, 77, 3267, 2684, 118, 42])]], dtype=object)

In [45]:
#Split up the Data
Xtrain_sentiment = VectorTrainSentimentDataSet[:,1:]
Ytrain_sentiment = VectorTrainSentimentDataSet[:,0]
Xtest_sentiment = VectorTestSentimentDataSet[:,1:]
Ytest_sentiment = VectorTestSentimentDataSet[:,0]
print(f'{Xtrain_sentiment.shape}, {Ytrain_sentiment.shape}, {Xtest_sentiment.shape}, {Ytest_sentiment.shape}')

(1278240, 1), (1278240,), (319561, 1), (319561,)


In [501]:
"""
There was a problem since I originally wanted to pad my data at the batch level, but I couldn't figure out if
that was even possible, then I tried using pack_sequence and pad_packed_sequence to pad my data to the largest
packed data point, but that isn't how the function worked, so I tried pad_sequence with pack_padded_sequence, which
also didn't end up working because that wasn't how the function worked. Now this just pads to the biggest datapoint
"""

In [46]:
def prepSentimentData(X, y):
    temp_X = []
    for row in X:
        temp_row = []
        for elem in row[0]:
            temp_row.append(float(elem))
        temp_X.append(torch.tensor(temp_row))
    Final_X = pad_sequence(temp_X, batch_first=True) #pad the data    Final_X = temp_X
    for pos, label in enumerate(y):
        if label == 4:
            y[pos] = 1
    return Final_X, y

In [47]:
FXtest_sentiment, FYtest_sentiment = prepSentimentData(Xtest_sentiment, Ytest_sentiment)
print(f'Test Data:\nX: {FXtest_sentiment.shape}, Y: {FYtest_sentiment.shape}')
FXtrain_sentiment, FYtrain_sentiment = prepSentimentData(Xtrain_sentiment, Ytrain_sentiment)
print(f'Train Data:\nX: {FXtrain_sentiment.shape}, Y: {FYtrain_sentiment.shape}')
print('-----------------Final Preprocessing Complete-----------------')

Test Data:
X: torch.Size([319561, 30]), Y: (319561,)
Train Data:
X: torch.Size([1278240, 33]), Y: (1278240,)
-----------------Final Preprocessing Complete-----------------


In [48]:
class SentimentDataset(Dataset):
    def __init__(self, X, y, adjust=False):
        #If dimensions are incorrect, manually fix them
        if adjust:
            shaped = torch.zeros(len(X), 33)
            shaped[:,:30] = X
            X = shaped   
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

test_sentiment_Dset = SentimentDataset(FXtest_sentiment, FYtest_sentiment, True)
print(f'Test: {test_sentiment_Dset[0]}')
train_sentiment_Dset = SentimentDataset(FXtrain_sentiment, FYtrain_sentiment)
print(f'Train: {train_sentiment_Dset[0]}')

Test: (tensor([1.0000e+00, 3.4000e+01, 1.3410e+03, 1.4900e+03, 4.0000e+01, 3.9400e+02,
        1.1870e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00]), 0)
Train: (tensor([1.0000e+00, 3.3423e+04, 4.5000e+01, 2.8200e+02, 2.5000e+02, 2.5400e+02,
        5.3700e+02, 2.3700e+03, 8.2872e+04, 3.1910e+03, 1.0590e+03, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00]), 0)


In [49]:
print(test_sentiment_Dset[0:5])

(tensor([[1.0000e+00, 3.4000e+01, 1.3410e+03, 1.4900e+03, 4.0000e+01, 3.9400e+02,
         1.1870e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.7000e+01, 2.4290e+03, 3.6370e+03, 9.4600e+02, 7.0100e+02, 2.6580e+03,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [3.0000e+00, 3.1000e+01, 8.6000e+01, 8.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e

In [50]:
#Turn data into DataLoaders
batch_size = 150
n_iters = 42608
num_epochs = int(n_iters / (len(train_sentiment_Dset) / batch_size))
train_loader = DataLoader(dataset=train_sentiment_Dset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_sentiment_Dset, batch_size=batch_size, shuffle=False)

In [51]:
#Checking DataLoader object
for _, batch in enumerate(train_loader):
    #This will also be a test to see if device is working
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)
    print(x_batch.shape, y_batch.shape)
    break

torch.Size([150, 33]) torch.Size([150])


In [52]:
#(num_layers, batch_size, hidden_size)
class SentimentAnalysisLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SentimentAnalysisLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = x[:,:,None]
        x = torch.permute(x, (0, 2, 1))
        batch_len = x.size(0)
        hidden = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        cell = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        
        out, _ = self.lstm(x, (hidden, cell))
        out = self.fc(out[:,-1,:])
        out = nn.functional.sigmoid(out)
        return out

In [53]:
#Input dimension is equal to the length of my vocab list
#Output dimension is 2 as they are either positive or negative
#Need number of hidden dimensions (start with 1 or 2 and adjust according to results)
#Need dimension of the hidden dimensions (Try 200)
input_dim = len(train_sentiment_Dset[0][0])
print(input_dim)
hidden_dim = 150
layer_dim = 2
output_dim = 2

LSTMmodel = SentimentAnalysisLSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
LSTMmodel.to(device)
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.05
optimizer = optim.SGD(LSTMmodel.parameters(), lr = learning_rate)

33


In [54]:
print(len(list(LSTMmodel.parameters())))
for i in range(len(list(LSTMmodel.parameters()))):
    print(list(LSTMmodel.parameters())[i].size())

10
torch.Size([600, 33])
torch.Size([600, 150])
torch.Size([600])
torch.Size([600])
torch.Size([600, 150])
torch.Size([600, 150])
torch.Size([600])
torch.Size([600])
torch.Size([2, 150])
torch.Size([2])


In [55]:
def LSTM_train_epoch():
    LSTMmodel.train(True)
    print(f'Epoch: {epoch+1}')
    running_loss = 0.0
    
    for idx, batch in enumerate(train_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        outputs = LSTMmodel(text)
        loss = loss_func(outputs, labels)
        running_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx % 1000) == 0:
            avg_loss_bat = running_loss / 100
            print(f'Batch {idx+1}, Loss: {avg_loss_bat}')
            running_loss = 0.0
    print()

In [56]:
def LSTM_validate_epoch():
    LSTMmodel.train(False)
    running_loss = 0.0
    for idx, batch in enumerate(test_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        with torch.no_grad():
            correct = 0
            total = 0
            outputs = LSTMmodel(text)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            loss = loss_func(outputs, labels)
            running_loss += loss.item()
        
    accuracy = 100 * correct / total
    avg_loss_bat = running_loss / len(test_loader)
    print(f'Val Loss: {avg_loss_bat}, Accuracy: {accuracy}')
    print('-------------------------\n')

In [57]:
#Ideas to solve accuracy issues:
#   Implement techniques to reduce overfitting (in theory it can't be this)
#   Make the transfer learning model, then come back
#   Change the way I've vectorized my words to a more complex method
#   Check other Evaluation metrics (not the issue)
for epoch in range(num_epochs):
    LSTM_train_epoch()
    LSTM_validate_epoch()

Epoch: 1
Batch 1, Loss: 0.006929090023040771
Batch 1001, Loss: 6.927044813632965
Batch 2001, Loss: 6.919760698080063
Batch 3001, Loss: 6.913816301226616
Batch 4001, Loss: 6.90883704483509
Batch 5001, Loss: 6.907722533941269
Batch 6001, Loss: 6.905277451276779
Batch 7001, Loss: 6.9036200362443925
Batch 8001, Loss: 6.901959850788116

Val Loss: 0.6898499336224775, Accuracy: 57.377044677734375
-------------------------

Epoch: 2
Batch 1, Loss: 0.006843953728675843
Batch 1001, Loss: 6.899394483566284
Batch 2001, Loss: 6.8991652172803875
Batch 3001, Loss: 6.89995025575161
Batch 4001, Loss: 6.894144133329392
Batch 5001, Loss: 6.895817286372185
Batch 6001, Loss: 6.897953376173973
Batch 7001, Loss: 6.897286902070046
Batch 8001, Loss: 6.89511649608612

Val Loss: 0.6894464674884307, Accuracy: 52.459014892578125
-------------------------

Epoch: 3
Batch 1, Loss: 0.006939285397529602
Batch 1001, Loss: 6.895645796060562
Batch 2001, Loss: 6.897057763934136
Batch 3001, Loss: 6.894914665818215
Batch 40

In [58]:
#Getting variables to calculate a few metrics that will help determine model effectiveness
LSTMmodel.train(False)
total_pred = []
total_labels = []
for idx, batch in enumerate(test_loader):
    text, labels = batch[0].to(device), batch[1].to(device)
    total_labels += labels
    with torch.no_grad():
        correct = 0
        total = 0
        outputs = LSTMmodel(text)
        _, predicted = torch.max(outputs.data, 1)
        total_pred += predicted
        total += labels.size(0)
        correct += (predicted == labels).sum()
#Switching tensors back to cpu for analysis
total_labels_cpu = [label.cpu() for label in total_labels]
total_pred_cpu = [pred.cpu() for pred in total_pred]
print('Done')

Done


In [59]:
#Calculating Scores
accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}')
#F1-Score is the harmonic mean of Precision and Recall
f1 = f1_score(total_labels_cpu, total_pred_cpu, average='weighted')
print(f'F1-Score: {f1}')
#Precision is True positive over total predicted positive
precision = precision_score(total_labels_cpu, total_pred_cpu, average='weighted')
print(f'Precision: {precision}')
#Recall is True positive over total actual positive
recall = recall_score(total_labels_cpu, total_pred_cpu, average='weighted')
print(f'Recall: {recall}')

Accuracy: 50.81966781616211
F1-Score: 0.5196040159573411
Precision: 0.5395759671718091
Recall: 0.5342016078307428


In [14]:
#Run on primary computer
"""torch.save(LSTMmodel, 'TwitterSentimentModel.pt')"""

#Run on secondary computer
MainModel = torch.load('TwitterSentimentModel.pt', map_location=torch.device('cpu'))

In [21]:
#Transfer learning version of my sentiment analysis model using XLM-Roberta
#What I need to do is use the transforms on the data, then make the functions to run the model using the data
xlmr = torchtext.models.XLMR_BASE_ENCODER
class_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim = 768)
TLearn_model = xlmr.get_model(head = class_head)
transform = xlmr.transform()

In [22]:
preTrans_train_dset, preTrans_test_dset = TrainSentimentData, TestSentimentData
#Turn pandas dataset into Numpy
preTrans_train_dset = preTrans_train_dset.to_numpy()
preTrans_test_dset = preTrans_test_dset.to_numpy()
preTrans_train_dset

array([[1, 'username m follow post'],
       [0, 'username username jealous yous liverpool'],
       [0, 'username haha funny lonely school'],
       ...,
       [0, 'omg hot thunderstorm suppose cool'],
       [1, 'username oh boyo'],
       [1, 'username banner hang right remember ummm easy']], dtype=object)

In [23]:
#Split Data
XpreTrans_train_dset, YLearn_train_dset = preTrans_train_dset[:,1], preTrans_train_dset[:,0]
XpreTrans_test_dset, YLearn_test_dset = preTrans_test_dset[:,1], preTrans_test_dset[:,0]
#Run Transforms on data
XLearn_train_dset = [to_tensor(transform(row), padding_value=0) for row in XpreTrans_train_dset]
XLearn_test_dset = [to_tensor(transform(row), padding_value=0) for row in XpreTrans_test_dset]
print(XLearn_train_dset[0:3])

[tensor([    0, 38937, 11627,   347, 28960,  1305,     2]), tensor([     0,  38937,  11627,  38937,  11627,     55,   7779,    223,    398,
             7,      6, 134148,  20740,      2]), tensor([    0, 38937, 11627, 22010, 99864,   459,    86,   538, 10696,     2])]


In [24]:
#make the program to run the transfer model
class TransferSentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

learn_train_dataset = TransferSentimentDataset(XLearn_train_dset, YLearn_train_dset)
learn_test_dataset = TransferSentimentDataset(XLearn_test_dset, YLearn_test_dset)
print('Finished')

Finished


In [26]:
#Turn data into DataLoaders
batch_size = 100
n_iters = 38400
num_epochs = int(n_iters / (len(learn_train_dataset) / batch_size))
learn_train_loader = DataLoader(dataset=learn_train_dataset, batch_size=batch_size, shuffle=True)
learn_test_loader = DataLoader(dataset=learn_test_dataset, batch_size=batch_size, shuffle=False)

#Checking DataLoader object, looks like I'll need to pad my data before pushing it through
for _, batch in enumerate(learn_train_loader):
    #This will also be a test to see if device is working
    #Re-add '.to(device)' later to batch[0] and [1]
    x_batch, y_batch = batch[0], batch[1]
    print(x_batch[0])
    print(x_batch.shape, y_batch.shape)
    break

RuntimeError: stack expects each tensor to be equal size, but got [11] at entry 0 and [9] at entry 1