In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import matplotlib.pyplot as plt
import time
import os
import spacy
import re
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence

In [2]:
#Import Dataset
#For preprocessing purposes: Clean Text of special characters/URLs/HTML tags/punctuation and put in all lowercase,
#Split text into words or "tokens", remove "stopwords" such as "and", "is", or "the,"
#Reduce words to root or stem form through "lemmatization" or "stemming" to reduce complexity,
#Apply "vectorization" to convert word or token data into a set of numerical data, and
#Pad the data to normalize all the variables so they can be more easily inputted to a ML algorithm
TrainSentimentData = pd.read_csv('data/FinalTrainSentimentData.csv')
TestSentimentData = pd.read_csv('data/FinalTestSentimentData.csv')

In [3]:
TrainSentimentData

Unnamed: 0,Sentiment,Text
0,0,username akita try kill dog saturday apparentl...
1,0,push daisy tonight yay finally sad go go soon ...
2,4,username wuddup sayin hiiii
3,4,username diggin gameplay acit look epic
4,4,username twitter obsessed new favorite
...,...,...
1278235,4,username englishman win matter
1278236,0,tired million thing need plus sorta hungry wan...
1278237,4,say good night friend finish clean morning
1278238,0,sit bed bored


In [4]:
#Update this on actual computer
device = 'cuda:0' if torch.cuda.is_available else 'cpu'
device

'cuda:0'

In [5]:
#Vectorize string data into numerical data more easily accessible to the model
tokenizer = get_tokenizer('basic_english')
vocabTr = build_vocab_from_iterator(map(tokenizer, TrainSentimentData['Text']))

vocabTe = build_vocab_from_iterator(map(tokenizer, TestSentimentData['Text']))

In [6]:
def train_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTr[token]) for token in tokens]
    return indices

def test_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTe[token]) for token in tokens]
    return indices

In [7]:
#This gave me a lot of trouble since I didn't realize I needed to convert from DataFrame to a Pytorch Dataset
VectorTrainSentimentDataSet = TrainSentimentData.copy()
VectorTrainSentimentDataSet['Text'] = TrainSentimentData['Text'].apply(train_vectorize_twts)
print(VectorTrainSentimentDataSet.head())

VectorTestSentimentDataSet = TestSentimentData.copy()
VectorTestSentimentDataSet['Text'] = TestSentimentData['Text'].apply(test_vectorize_twts)
print(VectorTestSentimentDataSet.head())

   Sentiment                                               Text
0          0  [0, 33422, 44, 281, 249, 253, 536, 2369, 82871...
1          0     [982, 2234, 48, 108, 129, 40, 1, 1, 74, 62604]
2          4                             [0, 27577, 2351, 8864]
3          4                   [0, 6279, 10217, 81256, 27, 950]
4          4                             [0, 30, 4588, 24, 343]
   Sentiment                                Text
0          0  [0, 33, 1340, 1489, 39, 393, 1186]
1          4    [76, 2428, 3636, 945, 700, 2657]
2          4                      [2, 30, 85, 7]
3          0                   [1078, 202, 2211]
4          0             [0, 15, 255, 19958, 13]


In [8]:
#Turn pandas dataset into Numpy
VectorTrainSentimentDataSet = VectorTrainSentimentDataSet.to_numpy()
VectorTestSentimentDataSet = VectorTestSentimentDataSet.to_numpy()
VectorTrainSentimentDataSet

array([[0,
        list([0, 33422, 44, 281, 249, 253, 536, 2369, 82871, 3190, 1058])],
       [0, list([982, 2234, 48, 108, 129, 40, 1, 1, 74, 62604])],
       [4, list([0, 27577, 2351, 8864])],
       ...,
       [4, list([72, 2, 21, 49, 126, 263, 31])],
       [0, list([163, 57, 225])],
       [4, list([0, 76, 3266, 2683, 117, 41])]], dtype=object)

In [9]:
#Split up the Data
Xtrain_sentiment = VectorTrainSentimentDataSet[:,1:]
Ytrain_sentiment = VectorTrainSentimentDataSet[:,0]
Xtest_sentiment = VectorTestSentimentDataSet[:,1:]
Ytest_sentiment = VectorTestSentimentDataSet[:,0]
print(f'{Xtrain_sentiment.shape}, {Ytrain_sentiment.shape}, {Xtest_sentiment.shape}, {Ytest_sentiment.shape}')

(1278240, 1), (1278240,), (319561, 1), (319561,)


In [501]:
"""
This was a problem place since I originally wanted to pad my data at the batch level, but I couldn't figure out if
that was even possible, then I tried using pack_sequence and pad_packed_sequence to pad my data to the largest
packed data point, but that isn't how the function worked, so I tried pad_sequence with pack_padded_sequence, which
also didn't end up working because that wasn't how the function worked. Now this just pads to the biggest datapoint
"""

In [10]:
def prepSentimentData(X, y):
    temp_X = []
    for row in X:
        for vector_text in row:
            temp_X.append(torch.tensor(vector_text).float()) #Turn data into float tensors
    temp_X = pad_sequence(temp_X, batch_first=True) #pad the data
    Final_X = temp_X
    for pos, label in enumerate(y):
        if label == 4:
            y[pos] = 1
    return temp_X, y

In [11]:
FXtrain_sentiment, FYtrain_sentiment = prepSentimentData(Xtrain_sentiment, Ytrain_sentiment)
FXtest_sentiment, FYtest_sentiment = prepSentimentData(Xtest_sentiment, Ytest_sentiment)
print(f'Train Data: X: {FXtrain_sentiment.shape}, Y: {FYtrain_sentiment.shape}')
print(f'Test Data: X: {FXtest_sentiment.shape}, Y: {FYtest_sentiment.shape}')
print('-----------------Final Preprocessing Complete-----------------')

Train Data: X: torch.Size([1278240, 33]), Y: (1278240,)
Test Data: X: torch.Size([319561, 30]), Y: (319561,)
-----------------Final Preprocessing Complete-----------------


In [12]:
#Need to pad my data to a specified amount(may have to manually do this with torch.zeros if no function for it)
#Also need to inspect all my dimensions (final dims for input should be: [len(data), 33, 1])
class SentimentDataset(Dataset):
    def __init__(self, X, y, adjust=False):
        #If dimensions are incorrect, manually fix them
        if adjust:
            shaped = torch.zeros(len(X), 33)
            shaped[:,:30] = X
            X = shaped   
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

test_sentiment_Dset = SentimentDataset(FXtest_sentiment, FYtest_sentiment, True)
print(f'Test: {test_sentiment_Dset[:][0].shape}')
train_sentiment_Dset = SentimentDataset(FXtrain_sentiment, FYtrain_sentiment)
print(f'Train: {train_sentiment_Dset[:][0].shape}')

Test: torch.Size([319561, 33])
Train: torch.Size([1278240, 33])


In [13]:
print(test_sentiment_Dset[0:5])

(tensor([[0.0000e+00, 3.3000e+01, 1.3400e+03, 1.4890e+03, 3.9000e+01, 3.9300e+02,
         1.1860e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.6000e+01, 2.4280e+03, 3.6360e+03, 9.4500e+02, 7.0000e+02, 2.6570e+03,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.0000e+00, 3.0000e+01, 8.5000e+01, 7.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e

In [14]:
#Turn data into DataLoaders
batch_size = 100
n_iters = 38400
num_epochs = int(n_iters / (len(train_sentiment_Dset) / batch_size))
train_loader = DataLoader(dataset=train_sentiment_Dset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_sentiment_Dset, batch_size=batch_size, shuffle=False)

In [15]:
#Checking DataLoader object
for _, batch in enumerate(train_loader):
    #This will also be a test to see if device is working
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)
    print(x_batch[0])
    print(x_batch.shape, y_batch.shape)
    break

tensor([2.7000e+01, 6.0000e+00, 2.0200e+02, 9.6500e+03, 3.2300e+02, 9.3000e+01,
        6.5000e+01, 9.0000e+00, 4.8800e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00], device='cuda:0')
torch.Size([100, 33]) torch.Size([100])


In [16]:
#(num_layers, batch_size, hidden_size)
class SentimentAnalysisLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SentimentAnalysisLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = x[:,:,None]
        x = torch.permute(x, (0, 2, 1))
        batch_len = x.size(0)
        hidden = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        cell = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        
        out, _ = self.lstm(x, (hidden, cell))
        out = self.fc(out[:,-1,:])
        return out

In [17]:
#Input dimension is equal to the length of my vocab list
#Output dimension is 2 as they are either positive or negative
#Need number of hidden dimensions (start with 1 or 2 and adjust according to results)
#Need dimension of the hidden dimensions (Try 200)
input_dim = len(train_sentiment_Dset[0][0])
print(input_dim)
hidden_dim = 200
layer_dim = 2
output_dim = 2

LSTMmodel = SentimentAnalysisLSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
LSTMmodel.to(device)
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.1
optimizer = optim.SGD(LSTMmodel.parameters(), lr = learning_rate)

33


In [18]:
print(len(list(LSTMmodel.parameters())))
for i in range(len(list(LSTMmodel.parameters()))):
    print(list(LSTMmodel.parameters())[i].size())

10
torch.Size([800, 33])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([800, 200])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([2, 200])
torch.Size([2])


In [19]:
def LSTM_train_epoch():
    LSTMmodel.train(True)
    print(f'Epoch: {epoch+1}')
    running_loss = 0.0
    
    for idx, batch in enumerate(train_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        outputs = LSTMmodel(text)
        loss = loss_func(outputs, labels)
        running_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx % 1000) == 0:
            avg_loss_bat = running_loss / 100
            print(f'Batch {idx+1}, Loss: {avg_loss_bat}')
            running_loss = 0.0
    print()

In [20]:
def LSTM_validate_epoch():
    LSTMmodel.train(False)
    running_loss = 0.0
    
    for idx, batch in enumerate(test_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        with torch.no_grad():
            outputs = LSTMmodel(text)
            loss = loss_func(outputs, labels)
            running_loss += loss.item()
        
    avg_loss_bat = running_loss / len(test_loader)
    print(f'Val Loss {avg_loss_bat}')
    print('-------------------------\n')

In [21]:
for epoch in range(num_epochs):
    LSTM_train_epoch()
    LSTM_validate_epoch()

Epoch: 1
Batch 1, Loss: 0.006935247778892517
Batch 1001, Loss: 6.910366452336311
Batch 2001, Loss: 6.907598538398743
Batch 3001, Loss: 6.904312596321106
Batch 4001, Loss: 6.900823189020157
Batch 5001, Loss: 6.9005317431688304
Batch 6001, Loss: 6.899130181074143
Batch 7001, Loss: 6.898121938705445
Batch 8001, Loss: 6.8995128262043
Batch 9001, Loss: 6.90336029291153
Batch 10001, Loss: 6.900833694934845
Batch 11001, Loss: 6.900955268144608
Batch 12001, Loss: 6.895984420180321

Val Loss 0.6894310474843346
-------------------------

Epoch: 2
Batch 1, Loss: 0.0069932138919830324
Batch 1001, Loss: 6.901795073747635
Batch 2001, Loss: 6.899813757538795
Batch 3001, Loss: 6.897905724048615
Batch 4001, Loss: 6.895096092224121
Batch 5001, Loss: 6.890479831695557
Batch 6001, Loss: 6.894054544568061
Batch 7001, Loss: 6.902797916531563
Batch 8001, Loss: 6.8936617159843445
Batch 9001, Loss: 6.893164383769036
Batch 10001, Loss: 6.899158970117569
Batch 11001, Loss: 6.893444098234177
Batch 12001, Loss: 6.

In [23]:
torch.save(LSTMmodel, 'TwitterSentimentModel.pt')