In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import matplotlib.pyplot as plt
import time
import os
import spacy
import re
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence

In [2]:
#Import Dataset
#For preprocessing purposes: Clean Text of special characters/URLs/HTML tags/punctuation and put in all lowercase,
#Split text into words or "tokens", remove "stopwords" such as "and", "is", or "the,"
#Reduce words to root or stem form through "lemmatization" or "stemming" to reduce complexity,
#Apply "vectorization" to convert word or token data into a set of numerical data, and
#Pad the data to normalize all the variables so they can be more easily inputted to a ML algorithm
TrainSentimentData = pd.read_csv('data/FinalTrainSentimentData.csv')
TestSentimentData = pd.read_csv('data/FinalTestSentimentData.csv')

In [3]:
TrainSentimentData

Unnamed: 0,Sentiment,Text
0,0,username akita try kill dog saturday apparentl...
1,0,push daisy tonight yay finally sad go go soon ...
2,4,username wuddup sayin hiiii
3,4,username diggin gameplay acit look epic
4,4,username twitter obsessed new favorite
...,...,...
1278235,4,username englishman win matter
1278236,0,tired million thing need plus sorta hungry wan...
1278237,4,say good night friend finish clean morning
1278238,0,sit bed bored


In [4]:
#Update this on actual computer
device = 'cuda:0' if torch.cuda.is_available else 'cpu'
device

'cuda:0'

In [5]:
#Vectorize string data into numerical data more easily accessible to the model
tokenizer = get_tokenizer('basic_english')
vocabTr = build_vocab_from_iterator(map(tokenizer, TrainSentimentData['Text']), specials=["<unk>"])
vocabTr.set_default_index(vocabTr["<unk>"])

vocabTe = build_vocab_from_iterator(map(tokenizer, TestSentimentData['Text']), specials=["<unk>"])
vocabTe.set_default_index(vocabTe["<unk>"])

In [6]:
def train_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTr[token]) for token in tokens]
    return indices

def test_vectorize_twts(text):
    tokens = tokenizer(text)
    indices = [int(vocabTe[token]) for token in tokens]
    return indices

In [7]:
#This gave me a lot of trouble since I didn't realize I needed to convert from DataFrame to a Pytorch Dataset
VectorTrainSentimentDataSet = TrainSentimentData.copy()
VectorTrainSentimentDataSet['Text'] = TrainSentimentData['Text'].apply(train_vectorize_twts)
print(VectorTrainSentimentDataSet.head())

VectorTestSentimentDataSet = TestSentimentData.copy()
VectorTestSentimentDataSet['Text'] = TestSentimentData['Text'].apply(test_vectorize_twts)
print(VectorTestSentimentDataSet.head())

   Sentiment                                               Text
0          0  [1, 33423, 45, 282, 250, 254, 537, 2370, 82872...
1          0     [983, 2235, 49, 109, 130, 41, 2, 2, 75, 62605]
2          4                             [1, 27578, 2352, 8865]
3          4                   [1, 6280, 10218, 81257, 28, 951]
4          4                             [1, 31, 4589, 25, 344]
   Sentiment                                Text
0          0  [1, 34, 1341, 1490, 40, 394, 1187]
1          4    [77, 2429, 3637, 946, 701, 2658]
2          4                      [3, 31, 86, 8]
3          0                   [1079, 203, 2212]
4          0             [1, 16, 256, 19959, 14]


In [8]:
#Turn pandas dataset into Numpy
VectorTrainSentimentDataSet = VectorTrainSentimentDataSet.to_numpy()
VectorTestSentimentDataSet = VectorTestSentimentDataSet.to_numpy()
VectorTrainSentimentDataSet

array([[0,
        list([1, 33423, 45, 282, 250, 254, 537, 2370, 82872, 3191, 1059])],
       [0, list([983, 2235, 49, 109, 130, 41, 2, 2, 75, 62605])],
       [4, list([1, 27578, 2352, 8865])],
       ...,
       [4, list([73, 3, 22, 50, 127, 264, 32])],
       [0, list([164, 58, 226])],
       [4, list([1, 77, 3267, 2684, 118, 42])]], dtype=object)

In [101]:
#Split up the Data
Xtrain_sentiment = VectorTrainSentimentDataSet[:,1:]
Ytrain_sentiment = VectorTrainSentimentDataSet[:,0]
Xtest_sentiment = VectorTestSentimentDataSet[:,1:]
Ytest_sentiment = VectorTestSentimentDataSet[:,0]
print(f'{Xtrain_sentiment.shape}, {Ytrain_sentiment.shape}, {Xtest_sentiment.shape}, {Ytest_sentiment.shape}')

(1278240, 1), (1278240,), (319561, 1), (319561,)


In [501]:
"""
This was a problem place since I originally wanted to pad my data at the batch level, but I couldn't figure out if
that was even possible, then I tried using pack_sequence and pad_packed_sequence to pad my data to the largest
packed data point, but that isn't how the function worked, so I tried pad_sequence with pack_padded_sequence, which
also didn't end up working because that wasn't how the function worked. Now this just pads to the biggest datapoint
def pad_tensors(dataset):    
    batch_size = 50
    num_batches = len(dataset) // batch_size
    longest = 0
    #Gets length of longest row in dataset
    for row in dataset:
        if len(row) > longest:
            longest = len(row)
            
    #Split data into batches and loop through 
    for idx in range(num_batches):
        s_idx = idx * batch_size
        e_idx = min((idx + 1) * batch_size, len(dataset))
        batch = dataset[s_idx:e_idx]
        
    #Pad Sequences
    padded_dset = pad_sequence(dataset, batch_first=True)
    
    return padded_dset
"""

In [176]:
def prepSentimentData(X, y):
    temp_X = []
    for row in X:
        for vector_text in row:
            temp_X.append(torch.tensor(vector_text).float()) #Turn data into float tensors
    temp_X = pad_sequence(temp_X, batch_first=True) #pad the data
    Final_X = temp_X
    for pos, label in enumerate(y):
        if label == 4:
            y[pos] = 1
    return temp_X, y

In [177]:
FXtrain_sentiment, FYtrain_sentiment = prepSentimentData(Xtrain_sentiment, Ytrain_sentiment)
FXtest_sentiment, FYtest_sentiment = prepSentimentData(Xtest_sentiment, Ytest_sentiment)
print(f'Train Data: X: {FXtrain_sentiment.shape}, Y: {FYtrain_sentiment.shape}')
print(f'Test Data: X: {FXtest_sentiment.shape}, Y: {FYtest_sentiment.shape}')
print('-----------------Final Preprocessing Complete-----------------')

Train Data: X: torch.Size([1278240, 33]), Y: (1278240,)
Test Data: X: torch.Size([319561, 30]), Y: (319561,)
-----------------Final Preprocessing Complete-----------------


In [179]:
#Need to pad my data to a specified amount(may have to manually do this with torch.zeros if no function for it)
#Also need to inspect all my dimensions (final dims for input should be: [len(data), 33, 1])
class SentimentDataset(Dataset):
    def __init__(self, X, y, adjust=False):
        #If dimensions are incorrect, manually fix them
        if adjust:
            shaped = torch.zeros(len(X), 33)
            shaped[:,:30] = X
            X = shaped   
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

test_sentiment_Dset = SentimentDataset(FXtest_sentiment, FYtest_sentiment, True)
print(f'Test: {test_sentiment_Dset[:][0].shape}')
train_sentiment_Dset = SentimentDataset(FXtrain_sentiment, FYtrain_sentiment)
print(f'Train: {train_sentiment_Dset[:][0].shape}')

Test: torch.Size([319561, 33])
Train: torch.Size([1278240, 33])


In [180]:
print(test_sentiment_Dset[0:5])

(tensor([[1.0000e+00, 3.4000e+01, 1.3410e+03, 1.4900e+03, 4.0000e+01, 3.9400e+02,
         1.1870e+03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [7.7000e+01, 2.4290e+03, 3.6370e+03, 9.4600e+02, 7.0100e+02, 2.6580e+03,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [3.0000e+00, 3.1000e+01, 8.6000e+01, 8.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e

In [181]:
#Turn data into DataLoaders
batch_size = 100
n_iters = 38400
num_epochs = int(n_iters / (len(train_sentiment_Dset) / batch_size))
train_loader = DataLoader(dataset=train_sentiment_Dset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_sentiment_Dset, batch_size=batch_size, shuffle=False)

In [197]:
#Checking DataLoader object
for _, batch in enumerate(train_loader):
    #This will also be a test to see if device is working
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)
    print(x_batch[0])
    print(x_batch.shape, y_batch.shape)
    break

AssertionError: Torch not compiled with CUDA enabled

In [198]:
#(num_layers, batch_size, hidden_size)
#Need to relearn how all of this works, especially for LSTM so I can know how to modify this
class SentimentAnalysisLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SentimentAnalysisLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = x[:,:,None]
        x = torch.permute(x, (0, 2, 1))
        batch_len = x.size(0)
        #Add .to(device) to both of these
        hidden = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        cell = torch.zeros(self.layer_dim, batch_len, self.hidden_dim).to(device)
        
        out, _ = self.lstm(x, (hidden, cell))
        out = self.fc(out[:,-1,:])
        return out

In [189]:
#Input dimension is equal to the length of my vocab list
#Output dimension is 2 as they are either positive or negative
#Need number of hidden dimensions (start with 1 or 2 and adjust according to results)
#Need dimension of the hidden dimensions (Try 200)
input_dim = len(train_sentiment_Dset[0][0])
print(input_dim)
hidden_dim = 200
layer_dim = 2
output_dim = 2

LSTMmodel = SentimentAnalysisLSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
LSTMmodel.to(device)
loss_func = nn.CrossEntropyLoss()
learning_rate = 0.1
optimizer = optim.SGD(LSTMmodel.parameters(), lr = learning_rate)

33


In [59]:
print(len(list(LSTMmodel.parameters())))
for i in range(len(list(LSTMmodel.parameters()))):
    print(list(LSTMmodel.parameters())[i].size())

10
torch.Size([800, 205830])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([800, 200])
torch.Size([800, 200])
torch.Size([800])
torch.Size([800])
torch.Size([2, 200])
torch.Size([2])


In [199]:
def train_epoch():
    LSTMmodel.train(True)
    print(f'Epoch: {epoch+1}')
    running_loss = 0.0
    
    for idx, batch in enumerate(train_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        outputs = LSTMmodel(text)
        loss = loss_func(outputs, labels)
        running_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (idx % 1000) == 0:
            avg_loss_bat = running_loss / 100
            print(f'Batch {idx+1}, Loss: {avg_loss_bat}')
            running_loss = 0.0
    print()

In [200]:
def validate_epoch():
    LSTMmodel.train(False)
    running_loss = 0.0
    
    for idx, batch in enumerate(test_loader):
        text, labels = batch[0].to(device), batch[1].to(device)
        
        with torch.no_grad():
            outputs = LSTMmodel(text)
            loss = loss_func(outputs, labels)
            running_loss += loss.item()
        
    avg_loss_bat = running_loss / len(test_loader)
    print(f'Val Loss {avg_loss_bat}')
    print('-------------------------\n')

In [193]:
for epoch in range(num_epochs):
    train_epoch()
    validate_epoch()

Epoch: 1
Batch 1, Loss: 0.006824893951416016


KeyboardInterrupt: 

In [404]:
#Time to write the code to train the model baby!!
iter = 0
for epoch in range(num_epochs):
    for i, (text, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = LSTMmodel(text)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        iter +=1
        if iter % 1000 == 0:
            correct = 0
            total = 0
            for labels, text in test_loader:
                outputs = model(text)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()
            accuracy = 100 * correct / total
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.item(), accuracy))

RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors