In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score
import time
import os
from sklearn.model_selection import train_test_split
import spacy
import re
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence
from torchtext.functional import to_tensor
from torchtext.models import RobertaClassificationHead

In [5]:
#Import Dataset
#For preprocessing purposes: Clean Text of special characters/URLs/HTML tags/punctuation and put in all lowercase,
#Split text into words or "tokens", remove "stopwords" such as "and", "is", or "the,"
#Reduce words to root or stem form through "lemmatization" or "stemming" to reduce complexity,
#Apply "vectorization" to convert word or token data into a set of numerical data, and
#Pad the data to normalize all the variables so they can be more easily inputted to a ML algorithm
TrainSentimentData = pd.read_csv('data/FinalTrainSentimentData.csv')
TestSentimentData = pd.read_csv('data/FinalTestSentimentData.csv')
TrainSentimentData

Unnamed: 0,Sentiment,Text
0,1,username m follow post
1,0,username username jealous yous liverpool
2,0,username haha funny lonely school
3,1,watch jay leno episode late night get sooooooo...
4,0,username find niqqa like
...,...,...
1278235,1,username nice pic
1278236,1,add upcoming gig ongoing drunken itinerary mon...
1278237,0,omg hot thunderstorm suppose cool
1278238,1,username oh boyo


In [7]:
device = 'cuda:0' if torch.cuda.is_available else 'cpu'
device

'cuda:0'

In [9]:
#Vectorize string data into numerical data more easily accessible to the model
tokenizer = get_tokenizer('basic_english')
vocabTr = build_vocab_from_iterator(map(tokenizer, TrainSentimentData['Text']), specials=["<unk>"])

vocabTe = build_vocab_from_iterator(map(tokenizer, TestSentimentData['Text']), specials=["<unk>"])

In [11]:
def train_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTr[token]) for token in tokens]
    return indices

def test_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTe[token]) for token in tokens]
    return indices

In [13]:
#This gave me a lot of trouble since I didn't realize I needed to convert from DataFrame to a Pytorch Dataset
VectorTrainSentimentDataSet = TrainSentimentData.copy()
VectorTrainSentimentDataSet['Text'] = TrainSentimentData['Text'].apply(train_vectorize_twts)
print(VectorTrainSentimentDataSet.head())

VectorTestSentimentDataSet = TestSentimentData.copy()
VectorTestSentimentDataSet['Text'] = TestSentimentData['Text'].apply(test_vectorize_twts)
print(VectorTestSentimentDataSet.head())

   Sentiment                                               Text
0          1                                   [1, 19, 70, 168]
1          0                            [1, 1, 378, 4681, 3124]
2          0                              [1, 40, 211, 662, 62]
3          1  [23, 1242, 2283, 465, 153, 22, 5, 12277, 179, ...
4          0                                  [1, 53, 32081, 7]
   Sentiment                                               Text
0          0              [6, 9, 151, 699, 229, 4650, 78948, 6]
1          0        [24, 2429, 5, 41, 3535, 15, 14992, 5, 1439]
2          1                   [1, 679, 78, 15, 21, 2109, 1322]
3          1  [1, 56, 24, 87, 328, 776, 87, 5, 38, 1436, 161...
4          1                                 [44, 3718, 67, 63]


In [15]:
#Turn pandas dataset into Numpy
VectorTrainSentimentDataSet = VectorTrainSentimentDataSet.to_numpy()
VectorTestSentimentDataSet = VectorTestSentimentDataSet.to_numpy()
VectorTrainSentimentDataSet

array([[1, list([1, 19, 70, 168])],
       [0, list([1, 1, 378, 4681, 3124])],
       [0, list([1, 40, 211, 662, 62])],
       ...,
       [0, list([126, 136, 1646, 341, 97])],
       [1, list([1, 28, 45842])],
       [1, list([1, 4141, 244, 44, 266, 2146, 402])]], dtype=object)

In [17]:
#Split up the Data
Xtrain_sentiment = VectorTrainSentimentDataSet[:,1:]
Ytrain_sentiment = VectorTrainSentimentDataSet[:,0]
Xtest_sentiment = VectorTestSentimentDataSet[:,1:]
Ytest_sentiment = VectorTestSentimentDataSet[:,0]
print(f'{Xtrain_sentiment.shape}, {Ytrain_sentiment.shape}, {Xtest_sentiment.shape}, {Ytest_sentiment.shape}')

(1278240, 1), (1278240,), (319561, 1), (319561,)


In [501]:
"""
There was a problem since I originally wanted to pad my data at the batch level, but I couldn't figure out if
that was even possible, then I tried using pack_sequence and pad_packed_sequence to pad my data to the largest
packed data point, but that isn't how the function worked, so I tried pad_sequence with pack_padded_sequence, which
also didn't end up working because that wasn't how the function worked. Now this just pads to the biggest datapoint
"""

In [19]:
def prepSentimentData(X, y):
    temp_X = []
    for row in X:
        temp_row = []
        for elem in row[0]:
            temp_row.append(float(elem))
        temp_X.append(torch.tensor(temp_row))
    Final_X = pad_sequence(temp_X, batch_first=True) #pad the data    Final_X = temp_X
    for pos, label in enumerate(y):
        if label == 4:
            y[pos] = 1
    return Final_X, y

In [21]:
FXtest_sentiment, FYtest_sentiment = prepSentimentData(Xtest_sentiment, Ytest_sentiment)
print(f'Test Data:\nX: {FXtest_sentiment.shape}, Y: {FYtest_sentiment.shape}')
FXtrain_sentiment, FYtrain_sentiment = prepSentimentData(Xtrain_sentiment, Ytrain_sentiment)
print(f'Train Data:\nX: {FXtrain_sentiment.shape}, Y: {FYtrain_sentiment.shape}')
print('-----------------Final Preprocessing Complete-----------------')

Test Data:
X: torch.Size([319561, 30]), Y: (319561,)
Train Data:
X: torch.Size([1278240, 33]), Y: (1278240,)
-----------------Final Preprocessing Complete-----------------


In [23]:
class SentimentDataset(Dataset):
    def __init__(self, X, y, adjust=False):
        #If dimensions are incorrect, manually fix them
        if adjust:
            shaped = torch.zeros(len(X), 33)
            shaped[:,:30] = X
            X = shaped   
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

test_sentiment_Dset = SentimentDataset(FXtest_sentiment, FYtest_sentiment, True)
print(f'Test: {test_sentiment_Dset[:][0].shape}')
train_sentiment_Dset = SentimentDataset(FXtrain_sentiment, FYtrain_sentiment)
print(f'Train: {train_sentiment_Dset[:][0].shape}')

Test: torch.Size([319561, 33])
Train: torch.Size([1278240, 33])


In [12]:
print(test_sentiment_Dset[0:5])

(tensor([[6.0000e+00, 9.0000e+00, 1.5100e+02, 6.9900e+02, 2.2900e+02, 4.6500e+03,
         7.8948e+04, 6.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.4000e+01, 2.4290e+03, 5.0000e+00, 4.1000e+01, 3.5350e+03, 1.5000e+01,
         1.4992e+04, 5.0000e+00, 1.4390e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0000e+00, 6.7900e+02, 7.8000e+01, 1.5000e+01, 2.1000e+01, 2.1090e+03,
         1.3220e

In [25]:
#Turn data into DataLoaders
batch_size = 150
n_iters = 42608
num_epochs = int(n_iters / (len(train_sentiment_Dset) / batch_size))
train_loader = DataLoader(dataset=train_sentiment_Dset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_sentiment_Dset, batch_size=batch_size, shuffle=False)

In [27]:
#Checking DataLoader object
for _, batch in enumerate(train_loader):
    #This will also be a test to see if device is working
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)
    print(x_batch.shape, y_batch.shape)
    break

AssertionError: Torch not compiled with CUDA enabled

In [8]:
#(num_layers, batch_size, hidden_size)
class SentimentAnalysisLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SentimentAnalysisLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = x[:,:,None]
        x = torch.permute(x, (0, 2, 1))
        batch_len = x.size(0)
        hidden = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        cell = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        
        out, _ = self.lstm(x, (hidden, cell))
        out = self.fc(out[:,-1,:])
        out = nn.functional.sigmoid(out)
        return out

In [17]:
#Input dimension is equal to the length of my vocab list
#Output dimension is 2 as they are either positive or negative
#Need number of hidden dimensions (start with 1 or 2 and adjust according to results)
#Need dimension of the hidden dimensions (Try 200)
input_dim = len(train_sentiment_Dset[0][0])
print(input_dim)
hidden_dim = 150
layer_dim = 2
output_dim = 2

LSTMmodel = SentimentAnalysisLSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
LSTMmodel.to(device)
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.05
optimizer = optim.SGD(LSTMmodel.parameters(), lr = learning_rate)

33


In [18]:
print(len(list(LSTMmodel.parameters())))
for i in range(len(list(LSTMmodel.parameters()))):
    print(list(LSTMmodel.parameters())[i].size())

10
torch.Size([800, 33])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([800, 200])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([2, 200])
torch.Size([2])


In [19]:
def LSTM_train_epoch():
    LSTMmodel.train(True)
    print(f'Epoch: {epoch+1}')
    running_loss = 0.0
    
    for idx, batch in enumerate(train_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        outputs = LSTMmodel(text)
        loss = loss_func(outputs, labels)
        running_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx % 1000) == 0:
            avg_loss_bat = running_loss / 100
            print(f'Batch {idx+1}, Loss: {avg_loss_bat}')
            running_loss = 0.0
    print()

In [20]:
def LSTM_validate_epoch():
    LSTMmodel.train(False)
    running_loss = 0.0
    for idx, batch in enumerate(test_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        with torch.no_grad():
            correct = 0
            total = 0
            outputs = LSTMmodel(text)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            loss = loss_func(outputs, labels)
            running_loss += loss.item()
        
    accuracy = 100 * correct / total
    avg_loss_bat = running_loss / len(test_loader)
    print(f'Val Loss: {avg_loss_bat}, Accuracy: {accuracy}')
    print('-------------------------\n')

In [193]:
#Ideas to solve accuracy issues:
#   Implement techniques to reduce overfitting (I'm pretty sure it's this considering the low loss)
#   Make the transfer learning model, then come back
#   Change the way I've vectorized my words to a more complex method
#   Check other Evaluation metrics (not the issue)
for epoch in range(num_epochs):
    LSTM_train_epoch()
    LSTM_validate_epoch()

Epoch: 1
Batch 1, Loss: 0.006824893951416016


KeyboardInterrupt: 

In [None]:
#Getting variables to calculate a few metrics that will help determine model effectiveness
LSTMmodel.train(False)
total_pred = []
total_labels = []
for idx, batch in enumerate(test_loader):
    text, labels = batch[0].to(device), batch[1].to(device)
    total_labels += labels
    with torch.no_grad():
        correct = 0
        total = 0
        outputs = LSTMmodel(text)
        _, predicted = torch.max(outputs.data, 1)
        total_pred += predicted
        total += labels.size(0)
        correct += (predicted == labels).sum()
#Switching tensors back to cpu for analysis
total_labels_cpu = [label.cpu() for label in total_labels]
total_pred_cpu = [pred.cpu() for pred in total_pred]
print('Done')

In [None]:
#Calculating Scores
accuracy = 100 * correct / total
print(f'Accuracy: {accuracy}')
#F1-Score is the harmonic mean of Precision and Recall
f1 = f1_score(total_labels_cpu, total_pred_cpu, average='weighted')
print(f'F1-Score: {f1}')
#Precision is True positive over total predicted positive
precision = precision_score(total_labels_cpu, total_pred_cpu, average='weighted')
print(f'Precision: {precision}')
#Recall is True positive over total actual positive
recall = recall_score(total_labels_cpu, total_pred_cpu, average='weighted')
print(f'Recall: {recall}')

In [14]:
#Run on primary computer
"""torch.save(LSTMmodel, 'TwitterSentimentModel.pt')"""

#Run on secondary computer
MainModel = torch.load('TwitterSentimentModel.pt', map_location=torch.device('cpu'))

In [29]:
#Transfer learning version of my sentiment analysis model using XLM-Roberta
#What I need to do is use the transforms on the data, then make the functions to run the model using the data
xlmr = torchtext.models.XLMR_BASE_ENCODER
class_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim = 768)
TLearn_model = xlmr.get_model(head = class_head)
transform = xlmr.transform()

In [41]:
preTrans_train_dset, preTrans_test_dset = TrainSentimentData, TestSentimentData
#Turn pandas dataset into Numpy
preTrans_train_dset = preTrans_train_dset.to_numpy()
preTrans_test_dset = preTrans_test_dset.to_numpy()
preTrans_train_dset

array([[1, 'username m follow post'],
       [0, 'username username jealous yous liverpool'],
       [0, 'username haha funny lonely school'],
       ...,
       [0, 'omg hot thunderstorm suppose cool'],
       [1, 'username oh boyo'],
       [1, 'username banner hang right remember ummm easy']], dtype=object)

In [66]:
#Split Data
def apply_transform(data):
    return transform(data[1]), data[0]

longest = 0
#Run Transforms on data
TransXTrainDset, TransYTrainDset = [], []
TransXTestDset, TransYTestDset = [], []
for row in preTrans_train_dset:
    tensor_row = torch.tensor(transform(row[1]))
    tensor_row = [(int(word)+1) for word in tensor_row]
    if len(tensor_row) > longest:
        longest = len(tensor_row)
    TransXTrainDset.append(tensor_row)
    TransYTrainDset.append(row[0])
print('Train Done')
print(TransXTrainDset[0])
for row in preTrans_test_dset:
    tensor_row = torch.tensor(transform(row[1]))
    tensor_row = [(int(word)+1) for word in tensor_row]
    if len(tensor_row) > longest:
        longest = len(tensor_row)
    TransXTestDset.append(tensor_row)
    TransYTestDset.append(row[0])
print('Test Done')
print(TransXTestDset[0])

Train Done
[1, 38938, 11628, 348, 28961, 1306, 3]
Test Done
[1, 4489, 18926, 92950, 206987, 50783, 118483, 54, 56291, 72, 4489, 3]


In [118]:
#Still need to understand exactly what format my data needs to be in for "Roberta"
#Training is very similar to normal
empty_tensor_test = torch.zeros(len(TransXTestDset), longest).long()
empty_tensor_train = torch.zeros(len(TransXTrainDset), longest).long()

for i, row in enumerate(TransXTestDset):
    empty_tensor_test[i, :len(row)] = torch.tensor(row)
TransXTestDset = empty_tensor_test
print(TransXTestDset[0:3])

for i, row in enumerate(TransXTrainDset):
    empty_tensor_train[i, :len(row)] = torch.tensor(row)
TransXTrainDset = empty_tensor_train    

print(TransXTrainDset[0:3])

  empty_tensor_test[i, :len(row)] = torch.tensor(row)


tensor([[     1,   4489,  18926,  92950, 206987,  50783, 118483,     54,  56291,
             72,   4489,      3,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [     1,  39545,  14800,  25501,   2047,  17111,  31462,   3715,  22474,
           3379,   2047,  12349,      3,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,   

  empty_tensor_train[i, :len(row)] = torch.tensor(row)


tensor([[     1,  38938,  11628,    348,  28961,   1306,      3,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [     1,  38938,  11628,  38938,  11628,     56,   7780,    224,    399,
              8,      7, 134149,  20741,      3,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,   

In [120]:
#Convert to DataSet object
class TransferSentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
learn_train_dataset = TransferSentimentDataset(TransXTrainDset, TransYTrainDset)
learn_test_dataset = TransferSentimentDataset(TransXTestDset, TransYTestDset)
print('Finished')

Finished


In [122]:
#Do whatever model prep I need, seems like I need to research how to use the model briefly to start
#Or I could just dive in and work backward once it starts throwing errors
#Possible solution to my model's problem is processing the dataset differently
#   Specifically keep the emoticons and some punctuation
#   and maybe divide the dataset in a more complex way than just word to number
print(learn_train_dataset[0])

(tensor([    1, 38938, 11628,   348, 28961,  1306,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]), 1)


In [124]:
#Turn data into DataLoaders
batch_size = 50
num_epochs = 4
n_iters = int((num_epochs * batch_size) / len(learn_train_dataset))
learn_train_loader = DataLoader(dataset=learn_train_dataset, batch_size=batch_size, shuffle=True)
learn_test_loader = DataLoader(dataset=learn_test_dataset, batch_size=batch_size, shuffle=False)
TT_dataloaders = {'train': learn_train_loader, 'test': learn_test_loader}
dataset_sizes = {'train': TransXTrainDset.size(0), 'test': TransXTestDset.size(0)}

#Checking DataLoader object, looks like I'll need to pad my data before pushing it through
for _, batch in enumerate(learn_train_loader):
    #This will also be a test to see if device is working
    #Re-add '.to(device)' later to batch[0] and [1]
    x_batch, y_batch = batch[0], batch[1]
    print(x_batch[0])
    print(x_batch.shape, y_batch.shape)
    break

tensor([     1,   2047, 103402,    187,     14,      3,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0])
torch.Size([50, 61]) torch.Size([50])


In [126]:
def train_model(model, criterion, optimizer, num_epochs=25):
    best_acc = 0.0

    #normal loop
    for epoch in range(num_epochs):
        #just keeps track of what num epoch we are on out of the total
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            #Resets loss and number correct for the current epoch and train/val state
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in TT_dataloaders[phase]:
                #Add .to(device) to these
                inputs = inputs
                labels = labels
                
                optimizer.zero_grad()
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
        print()
    print(f'Best val Acc: {best_acc:4f}') #The 4f just sets a minimum width for the number
    return model

In [128]:
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.01
optimizer = optim.SGD(TLearn_model.parameters(), lr = learning_rate)
best_model = train_model(TLearn_model, loss_func, optimizer, num_epochs=num_epochs)

Epoch 0/3
----------


KeyboardInterrupt: 