In [None]:
from google.colab import drive
#specify project directory in drive eg /content/drive/NLUProject
# drive.flush_and_unmount()
drive.mount('/content/drive')

#define necessary imports
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader

import gensim

Mounted at /content/drive


In [None]:
# define the dataset 
class IMBDDataset(Dataset):
    """IMDB review dataset backed by a pandas DataFrame.

    The frame must provide two columns:

    * ``review_tokens_reduced`` - one list of integer token ids per review.
    * ``user_id_transformed_to_label`` - one integer author label per review.

    On construction every token list is right-padded to the length of the
    longest review and converted to a ``torch.int`` tensor. The frame that is
    passed in is modified in place (no copy is taken).

    NOTE(review): the class name (including its spelling) is kept unchanged
    because previously pickled datasets presumably reference it by name.
    """

    # initialize the data in the dataset, pad the sequences to the max length
    def __init__(self, df, paddingId=None):
        """Pad and tensorize the token column of ``df``.

        Args:
            df: DataFrame with the two columns described on the class.
            paddingId: token id used for padding. When omitted, the
                module-level ``paddingID`` is used (the original behaviour),
                so that global must already be defined in that case.
        """
        self.df = df
        self.maxLenForDF = self.getMaximumLengthSequence()
        self.padReturningItems(self.maxLenForDF, paddingId)
        self.sendListToTensors()

    # pad all of the sequences up to a given length
    def padReturningItems(self, lengthToPadTo, paddingId=None):
        """Right-pad every token list shorter than ``lengthToPadTo``.

        Sequences that are already at least ``lengthToPadTo`` long are left
        untouched (they are not truncated).
        """
        if paddingId is None:
            # Backward-compatible fallback to the notebook-level global.
            paddingId = paddingID
        column = 'review_tokens_reduced'
        # Iterate over a snapshot so that writing cells cannot disturb iteration.
        for index, tokens in zip(self.df.index, self.df[column].tolist()):
            paddingNeeded = lengthToPadTo - len(tokens)
            if paddingNeeded > 0:
                self.df.at[index, column] = tokens + [paddingId] * paddingNeeded

    # convert data that will go to the model into tensors
    def sendListToTensors(self):
        """Replace each token list with a ``torch.int`` tensor.

        Labels are deliberately left as plain values in the frame: writing a
        tensor into a numeric column depends on how the installed pandas
        version coerces it. ``__getitem__`` builds the label tensor instead.
        """
        column = 'review_tokens_reduced'
        for index, tokens in zip(self.df.index, self.df[column].tolist()):
            self.df.at[index, column] = torch.as_tensor(tokens, dtype=torch.int)

    def __len__(self):
        """Return the number of reviews (rows) in the dataset."""
        return len(self.df)

    def getMaximumLengthSequence(self):
        """Return the length of the longest token sequence (0 for an empty frame)."""
        return max((len(tokens) for tokens in self.df['review_tokens_reduced']), default=0)

    # on getitem, return the row
    def __getitem__(self, idx):
        """Return ``{'input': token tensor, 'label': int64 label tensor}`` for row ``idx``.

        The label is returned as ``torch.long`` because ``nn.NLLLoss`` requires
        int64 class indices.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self.df.iloc[idx]
        label = row['user_id_transformed_to_label']
        if isinstance(label, pd.Series):
            # idx was a list of positions: convert the selected labels together.
            label = label.tolist()
        return {
            'input': row['review_tokens_reduced'],
            'label': torch.as_tensor(label, dtype=torch.long),
        }

    # get the number of unique authors
    def getAuthorCount(self):
        """Return the number of distinct author labels.

        Values are compared as Python ints so the count is also correct when
        the column holds 0-dim tensors, which hash by identity rather than by
        value.
        """
        return len({int(label) for label in self.df["user_id_transformed_to_label"]})


In [None]:
# load the pre-built train / validate / test dataset objects from Drive
# NOTE(review): the loaded objects are used as dataset instances (e.g.
# getAuthorCount() is called on train_dataset), so these files appear to be
# pickled Python objects - only load files from a trusted source.
train_dataset, validate_dataset, test_dataset = (
    torch.load(datasetPath)
    for datasetPath in (
        'drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBtrain.pt',
        'drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBvalidate.pt',
        'drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBtest.pt',
    )
)

In [None]:
# wrap the datasets in DataLoaders; only the training loader shuffles its samples
batchSize = 64
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batchSize,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batchSize,
)

In [None]:
#define the model
class LSTMModel(torch.nn.Module):
  """Author classifier: pretrained embeddings -> LSTM -> mean pooling -> linear -> log-softmax.

  Args:
    pretrainedEmbeddingWeights: 2-D float tensor of embedding vectors; its
      second dimension is used as the LSTM input size.
    lstmHiddenDimensions: hidden size of the LSTM.
    lstmLayerNumber: number of stacked LSTM layers.
    outputClassNumber: number of classes produced by the final linear layer.
    paddingId: embedding row index used for padding tokens.

  The model returns log-probabilities, so it is meant to be paired with
  ``nn.NLLLoss``.
  """

  def __init__(self, pretrainedEmbeddingWeights, lstmHiddenDimensions, lstmLayerNumber, outputClassNumber, paddingId):
    super().__init__()
    # embedding layer converts token ids to the saved embedding weights
    # (from_pretrained keeps the weights frozen unless freeze=False is passed)
    self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrainedEmbeddingWeights, padding_idx=paddingId)
    # get the embedding dimensions
    self.EmbeddingDimensions = pretrainedEmbeddingWeights.size(1)
    # define the lstm; num_layers was previously never forwarded, so the
    # requested depth was silently ignored and a single layer was built
    self.LSTMLayer = nn.LSTM(
        self.EmbeddingDimensions,
        lstmHiddenDimensions,
        num_layers=lstmLayerNumber,
        batch_first=True,
    )
    # linear layer maps the pooled hidden representation to the class scores
    self.LinearLayer = nn.Linear(lstmHiddenDimensions, outputClassNumber)
    # log-softmax over the class dimension (explicit dim avoids the deprecated implicit choice)
    self.nnSoftMax = nn.LogSoftmax(dim=1)

  def forward(self, input):
    """Return per-class log-probabilities for a batch of token id sequences.

    Args:
      input: integer tensor of token ids, batch first (batch, sequence).

    Returns:
      Tensor of shape (batch, outputClassNumber) holding log-probabilities.
    """
    embeddedInput = self.EmbeddingLayer(input)
    # only the per-timestep outputs of the top layer are needed; final states are discarded
    lstmOutput, _ = self.LSTMLayer(embeddedInput)
    # average the outputs over the sequence dimension
    # NOTE(review): padded timesteps are included in this mean (no masking is applied)
    lstmOutsPooled = torch.mean(lstmOutput, dim=1)
    linearOutput = self.LinearLayer(lstmOutsPooled)
    return self.nnSoftMax(linearOutput)


In [None]:
#define function to check accuracy with a dataloader
def get_accuracy(dataloader, model):
  """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

  Args:
    dataloader: iterable of batches, each a dict with an ``'input'`` tensor
      (batch first) and a ``'label'`` tensor of class indices.
    model: module whose output has one score per class along dim 1.

  Returns:
    Accuracy as a float in [0, 1]; 0.0 when the dataloader yields no samples.

  The model is switched to eval mode for the duration of the call and its
  previous train/eval mode is restored afterwards, even if an error occurs.
  """
  # remember the caller's mode (default to training, matching the old unconditional train())
  wasTraining = getattr(model, 'training', True)
  model.eval()
  correctlyPredictedNum = 0
  acummulateLength = 0

  try:
    # no gradients are needed for evaluation; this avoids building the autograd graph
    with torch.no_grad():
      for data in dataloader:
        modelOutputOnSampleBatch = model(data['input'])
        classPredictions = modelOutputOnSampleBatch.argmax(dim=1)
        # compare on the predictions' device so this also works off-CPU
        labels = data['label'].to(classPredictions.device)
        correctlyPredictedNum += (classPredictions == labels).sum().item()
        acummulateLength += data['input'].size(0)
  finally:
    model.train(wasTraining)

  if acummulateLength == 0:
    # nothing was evaluated; avoid dividing by zero
    return 0.0
  return correctlyPredictedNum / acummulateLength

In [None]:
# special tokens used when building embeddings for the model
unknownToken = "<unk>"
padToken = "<pad>"
# load pretrained embeddings (gensim KeyedVectors)
model = gensim.models.KeyedVectors.load('drive/MyDrive/NLU Project/data/preProcessedEmbeddings/glove_vectors.kv')

# row index of the padding token in the embedding matrix
# gensim 4 removed KeyedVectors.vocab in favour of key_to_index; support both APIs
if hasattr(model, "key_to_index"):
    paddingID = model.key_to_index[padToken]
else:
    paddingID = model.vocab[padToken].index

In [None]:
# number of output classes = number of distinct author labels in the training split
outputSize = train_dataset.getAuthorCount()

In [None]:
# define hyperparameters
hiddnSize = 64
layersLstm = 3
learningRate = 0.001
numEpochs = 20
# wall-clock training budget in seconds (12 hours); training stops once it is exceeded
maxTrainingSeconds = 43200
# print the current batch loss every this many batches
logEveryNBatches = 100

# get the weights to be placed in embedding layer
preTrainedEmbeddings = torch.from_numpy(model.vectors)
# define the model
validationModel = LSTMModel(preTrainedEmbeddings, hiddnSize, layersLstm, outputSize, paddingID)
# convert parameters to float so all layers use a consistent dtype
validationModel.float()
# define loss function (the model outputs log-probabilities, hence NLLLoss)
lossFunction = nn.NLLLoss()
# define optimizer including the learning-rate hyperparameter
optimizer = torch.optim.Adam(validationModel.parameters(), lr=learningRate)

# define lists and variables for training tracking
epochTestAccuracyList = []
epochTrainAccuracyList = []

lossForBatch = 0
timeStart = time.time()

# train for up to numEpochs passes over the training data
for epoch in range(numEpochs):
  # stop starting new epochs once the time budget is exhausted
  if time.time() - timeStart > maxTrainingSeconds:
    break
  # set the model to training mode once per epoch
  validationModel.train()
  for i, data in enumerate(train_dataloader):
    if time.time() - timeStart > maxTrainingSeconds:
      break
    # clear the gradient calculations from the last backward pass
    optimizer.zero_grad()
    # run the model on the input
    modelOutputOnSampleBatch = validationModel(data['input'])
    # calculate the loss; NLLLoss requires int64 class indices, so cast the labels
    lossForBatch = lossFunction(modelOutputOnSampleBatch, data['label'].long())
    # calculate the gradient
    lossForBatch.backward()
    # update the model weights
    optimizer.step()
    if i % logEveryNBatches == 0:
      # .item() logs the plain number instead of the tensor repr with its grad_fn
      print("Step run on batch", i, "time:", (time.time() - timeStart), "loss:", lossForBatch.item())

  # NOTE(review): per-epoch monitoring uses the test split; validate_dataset is
  # loaded earlier but not used anywhere in the visible notebook - confirm intent.
  testAccuracyThisEpoch = get_accuracy(test_dataloader, validationModel)
  print("Accuracy epoch TESTING", epoch, ":", testAccuracyThisEpoch, "time:", (time.time() - timeStart))
  epochTestAccuracyList.append(testAccuracyThisEpoch)

  trainAccuracyThisEpoch = get_accuracy(train_dataloader, validationModel)
  print("Accuracy epoch TRAINING", epoch, ":", trainAccuracyThisEpoch, "time:", (time.time() - timeStart))
  epochTrainAccuracyList.append(trainAccuracyThisEpoch)



Step run on batch 0 time: 2.5277223587036133 loss: tensor(4.1382, grad_fn=<NllLossBackward0>)
Step run on batch 100 time: 221.509920835495 loss: tensor(3.9171, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: ignored

In [None]:
# # save the lists
# import pickle
# with open('drive/MyDrive/NLU Project/data/LSTMTestTrainingList', 'wb') as f:
#   pickle.dump(epochTestAccuracyList, f)
# with open('drive/MyDrive/NLU Project/data/LSTMTrainTrainingList', 'wb') as f:
#   pickle.dump(epochTrainAccuracyList, f)

In [None]:
# # save the model weights
# torch.save(validationModel.state_dict(), 'drive/MyDrive/NLU Project/models/IMDBLSTMmodel')
