In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import matplotlib.pyplot as plt
import time
import os
from sklearn.model_selection import train_test_split
import spacy
import re
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence
from torchtext.functional import to_tensor
from torchtext.models import RobertaClassificationHead

In [18]:
#Import Dataset
#For preprocessing purposes: Clean Text of special characters/URLs/HTML tags/punctuation and put in all lowercase,
#Split text into words or "tokens", remove "stopwords" such as "and", "is", or "the,"
#Reduce words to root or stem form through "lemmatization" or "stemming" to reduce complexity,
#Apply "vectorization" to convert word or token data into a set of numerical data, and
#Pad the data to normalize all the variables so they can be more easily inputted to a ML algorithm
TrainSentimentData = pd.read_csv('data/FinalTrainSentimentData.csv')
TestSentimentData = pd.read_csv('data/FinalTestSentimentData.csv')
TrainSentimentData

Unnamed: 0,Sentiment,Text
0,1,username m follow post
1,0,username username jealous yous liverpool
2,0,username haha funny lonely school
3,1,watch jay leno episode late night get sooooooo...
4,0,username find niqqa like
...,...,...
1278235,1,username nice pic
1278236,1,add upcoming gig ongoing drunken itinerary mon...
1278237,0,omg hot thunderstorm suppose cool
1278238,1,username oh boyo


In [31]:
device = 'cuda:0' if torch.cuda.is_available else 'cpu'
device

'cpu'

In [19]:
#Vectorize string data into numerical data more easily accessible to the model
tokenizer = get_tokenizer('basic_english')
vocabTr = build_vocab_from_iterator(map(tokenizer, TrainSentimentData['Text']))

vocabTe = build_vocab_from_iterator(map(tokenizer, TestSentimentData['Text']))

In [20]:
def train_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTr[token]) for token in tokens]
    return indices

def test_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTe[token]) for token in tokens]
    return indices

In [21]:
#This gave me a lot of trouble since I didn't realize I needed to convert from DataFrame to a Pytorch Dataset
VectorTrainSentimentDataSet = TrainSentimentData.copy()
VectorTrainSentimentDataSet['Text'] = TrainSentimentData['Text'].apply(train_vectorize_twts)
print(VectorTrainSentimentDataSet.head())

VectorTestSentimentDataSet = TestSentimentData.copy()
VectorTestSentimentDataSet['Text'] = TestSentimentData['Text'].apply(test_vectorize_twts)
print(VectorTestSentimentDataSet.head())

   Sentiment                                               Text
0          1                                   [0, 18, 69, 167]
1          0                            [0, 0, 377, 4680, 3123]
2          0                              [0, 39, 210, 661, 61]
3          1  [22, 1241, 2282, 464, 152, 21, 4, 12276, 178, ...
4          0                                  [0, 52, 32080, 6]
   Sentiment                                               Text
0          0              [5, 8, 150, 698, 228, 4649, 78947, 5]
1          0        [23, 2428, 4, 40, 3534, 14, 14991, 4, 1438]
2          1                   [0, 678, 77, 14, 20, 2108, 1321]
3          1  [0, 55, 23, 86, 327, 775, 86, 4, 37, 1435, 160...
4          1                                 [43, 3717, 66, 62]


In [22]:
#Turn pandas dataset into Numpy
VectorTrainSentimentDataSet = VectorTrainSentimentDataSet.to_numpy()
VectorTestSentimentDataSet = VectorTestSentimentDataSet.to_numpy()
VectorTrainSentimentDataSet

array([[1, list([0, 18, 69, 167])],
       [0, list([0, 0, 377, 4680, 3123])],
       [0, list([0, 39, 210, 661, 61])],
       ...,
       [0, list([125, 135, 1645, 340, 96])],
       [1, list([0, 27, 45841])],
       [1, list([0, 4140, 243, 43, 265, 2145, 401])]], dtype=object)

In [23]:
#Split up the Data
Xtrain_sentiment = VectorTrainSentimentDataSet[:,1:]
Ytrain_sentiment = VectorTrainSentimentDataSet[:,0]
Xtest_sentiment = VectorTestSentimentDataSet[:,1:]
Ytest_sentiment = VectorTestSentimentDataSet[:,0]
print(f'{Xtrain_sentiment.shape}, {Ytrain_sentiment.shape}, {Xtest_sentiment.shape}, {Ytest_sentiment.shape}')

(1278240, 1), (1278240,), (319561, 1), (319561,)


In [501]:
"""
There was a problem since I originally wanted to pad my data at the batch level, but I couldn't figure out if
that was even possible, then I tried using pack_sequence and pad_packed_sequence to pad my data to the largest
packed data point, but that isn't how the function worked, so I tried pad_sequence with pack_padded_sequence, which
also didn't end up working because that wasn't how the function worked. Now this just pads to the biggest datapoint
"""

In [24]:
def prepSentimentData(X, y):
    temp_X = []
    for row in X:
        for vector_text in row:
            temp_X.append(torch.tensor(vector_text).float()) #Turn data into float tensors
    temp_X = pad_sequence(temp_X, batch_first=True) #pad the data
    Final_X = temp_X
    for pos, label in enumerate(y):
        if label == 4:
            y[pos] = 1
    return temp_X, y

In [25]:
FXtrain_sentiment, FYtrain_sentiment = prepSentimentData(Xtrain_sentiment, Ytrain_sentiment)
FXtest_sentiment, FYtest_sentiment = prepSentimentData(Xtest_sentiment, Ytest_sentiment)
print(f'Train Data: X: {FXtrain_sentiment.shape}, Y: {FYtrain_sentiment.shape}')
print(f'Test Data: X: {FXtest_sentiment.shape}, Y: {FYtest_sentiment.shape}')
print('-----------------Final Preprocessing Complete-----------------')

Train Data: X: torch.Size([1278240, 33]), Y: (1278240,)
Test Data: X: torch.Size([319561, 30]), Y: (319561,)
-----------------Final Preprocessing Complete-----------------


In [26]:
class SentimentDataset(Dataset):
    def __init__(self, X, y, adjust=False):
        #If dimensions are incorrect, manually fix them
        if adjust:
            shaped = torch.zeros(len(X), 33)
            shaped[:,:30] = X
            X = shaped   
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

test_sentiment_Dset = SentimentDataset(FXtest_sentiment, FYtest_sentiment, True)
print(f'Test: {test_sentiment_Dset[:][0].shape}')
train_sentiment_Dset = SentimentDataset(FXtrain_sentiment, FYtrain_sentiment)
print(f'Train: {train_sentiment_Dset[:][0].shape}')

Test: torch.Size([319561, 33])
Train: torch.Size([1278240, 33])


In [12]:
print(test_sentiment_Dset[0:5])

(tensor([[6.0000e+00, 9.0000e+00, 1.5100e+02, 6.9900e+02, 2.2900e+02, 4.6500e+03,
         7.8948e+04, 6.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.4000e+01, 2.4290e+03, 5.0000e+00, 4.1000e+01, 3.5350e+03, 1.5000e+01,
         1.4992e+04, 5.0000e+00, 1.4390e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0000e+00, 6.7900e+02, 7.8000e+01, 1.5000e+01, 2.1000e+01, 2.1090e+03,
         1.3220e

In [27]:
#Turn data into DataLoaders
batch_size = 100
n_iters = 38400
num_epochs = int(n_iters / (len(train_sentiment_Dset) / batch_size))
train_loader = DataLoader(dataset=train_sentiment_Dset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_sentiment_Dset, batch_size=batch_size, shuffle=False)

In [28]:
#Checking DataLoader object
for _, batch in enumerate(train_loader):
    #This will also be a test to see if device is working
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)
    print(x_batch[0])
    print(x_batch.shape, y_batch.shape)
    break

NameError: name 'device' is not defined

In [8]:
#(num_layers, batch_size, hidden_size)
class SentimentAnalysisLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SentimentAnalysisLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = x[:,:,None]
        x = torch.permute(x, (0, 2, 1))
        batch_len = x.size(0)
        hidden = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        cell = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        
        out, _ = self.lstm(x, (hidden, cell))
        out = self.fc(out[:,-1,:])
        return out

In [17]:
#Input dimension is equal to the length of my vocab list
#Output dimension is 2 as they are either positive or negative
#Need number of hidden dimensions (start with 1 or 2 and adjust according to results)
#Need dimension of the hidden dimensions (Try 200)
input_dim = len(train_sentiment_Dset[0][0])
print(input_dim)
hidden_dim = 200
layer_dim = 2
output_dim = 2

LSTMmodel = SentimentAnalysisLSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
LSTMmodel.to(device)
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.1
optimizer = optim.SGD(LSTMmodel.parameters(), lr = learning_rate)

33


In [18]:
print(len(list(LSTMmodel.parameters())))
for i in range(len(list(LSTMmodel.parameters()))):
    print(list(LSTMmodel.parameters())[i].size())

10
torch.Size([800, 33])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([800, 200])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([2, 200])
torch.Size([2])


In [19]:
def LSTM_train_epoch():
    LSTMmodel.train(True)
    print(f'Epoch: {epoch+1}')
    running_loss = 0.0
    
    for idx, batch in enumerate(train_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        outputs = LSTMmodel(text)
        loss = loss_func(outputs, labels)
        running_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx % 1000) == 0:
            avg_loss_bat = running_loss / 100
            print(f'Batch {idx+1}, Loss: {avg_loss_bat}')
            running_loss = 0.0
    print()

In [20]:
def LSTM_validate_epoch():
    LSTMmodel.train(False)
    running_loss = 0.0
    
    for idx, batch in enumerate(test_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        with torch.no_grad():
            outputs = LSTMmodel(text)
            loss = loss_func(outputs, labels)
            running_loss += loss.item()
        
    avg_loss_bat = running_loss / len(test_loader)
    print(f'Val Loss {avg_loss_bat}')
    print('-------------------------\n')

In [193]:
for epoch in range(num_epochs):
    LSTM_train_epoch()
    LSTM_validate_epoch()

Epoch: 1
Batch 1, Loss: 0.006824893951416016


KeyboardInterrupt: 

In [14]:
#Run on primary computer
"""torch.save(LSTMmodel, 'TwitterSentimentModel.pt')"""

#Run on secondary computer
MainModel = torch.load('TwitterSentimentModel.pt', map_location=torch.device('cpu'))

In [35]:
def LSTM_accTest_epoch():
    MainModel.train(False)
    running_loss = 0.0
    correct = 0
    total = 0
    for idx, batch in enumerate(test_loader):
        text, labels = batch[0], batch[1]
        with torch.no_grad():
            correct = 0
            total = 0
            outputs = MainModel(text)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
    accuracy = 100 * correct / total
    print('Iteration: {}. Accuracy: {}'.format(iter, accuracy))

In [36]:
for epoch in range(num_epochs):
    LSTM_accTest_epoch()

Iteration: <built-in function iter>. Accuracy: 52.459014892578125
Iteration: <built-in function iter>. Accuracy: 52.459014892578125
Iteration: <built-in function iter>. Accuracy: 52.459014892578125


In [21]:
#Transfer learning version of my sentiment analysis model using XLM-Roberta
#What I need to do is use the transforms on the data, then make the functions to run the model using the data
xlmr = torchtext.models.XLMR_BASE_ENCODER
class_head = torchtext.models.RobertaClassificationHead(num_classes=2, input_dim = 768)
TLearn_model = xlmr.get_model(head = class_head)
transform = xlmr.transform()

In [22]:
preTrans_train_dset, preTrans_test_dset = TrainSentimentData, TestSentimentData
#Turn pandas dataset into Numpy
preTrans_train_dset = preTrans_train_dset.to_numpy()
preTrans_test_dset = preTrans_test_dset.to_numpy()
preTrans_train_dset

array([[1, 'username m follow post'],
       [0, 'username username jealous yous liverpool'],
       [0, 'username haha funny lonely school'],
       ...,
       [0, 'omg hot thunderstorm suppose cool'],
       [1, 'username oh boyo'],
       [1, 'username banner hang right remember ummm easy']], dtype=object)

In [23]:
#Split Data
XpreTrans_train_dset, YLearn_train_dset = preTrans_train_dset[:,1], preTrans_train_dset[:,0]
XpreTrans_test_dset, YLearn_test_dset = preTrans_test_dset[:,1], preTrans_test_dset[:,0]
#Run Transforms on data
XLearn_train_dset = [to_tensor(transform(row), padding_value=0) for row in XpreTrans_train_dset]
XLearn_test_dset = [to_tensor(transform(row), padding_value=0) for row in XpreTrans_test_dset]
print(XLearn_train_dset[0:3])

[tensor([    0, 38937, 11627,   347, 28960,  1305,     2]), tensor([     0,  38937,  11627,  38937,  11627,     55,   7779,    223,    398,
             7,      6, 134148,  20740,      2]), tensor([    0, 38937, 11627, 22010, 99864,   459,    86,   538, 10696,     2])]


In [24]:
#make the program to run the transfer model
class TransferSentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

learn_train_dataset = TransferSentimentDataset(XLearn_train_dset, YLearn_train_dset)
learn_test_dataset = TransferSentimentDataset(XLearn_test_dset, YLearn_test_dset)
print('Finished')

Finished


In [26]:
#Turn data into DataLoaders
batch_size = 100
n_iters = 38400
num_epochs = int(n_iters / (len(learn_train_dataset) / batch_size))
learn_train_loader = DataLoader(dataset=learn_train_dataset, batch_size=batch_size, shuffle=True)
learn_test_loader = DataLoader(dataset=learn_test_dataset, batch_size=batch_size, shuffle=False)

#Checking DataLoader object, looks like I'll need to pad my data before pushing it through
for _, batch in enumerate(learn_train_loader):
    #This will also be a test to see if device is working
    #Re-add '.to(device)' later to batch[0] and [1]
    x_batch, y_batch = batch[0], batch[1]
    print(x_batch[0])
    print(x_batch.shape, y_batch.shape)
    break

RuntimeError: stack expects each tensor to be equal size, but got [11] at entry 0 and [9] at entry 1