In [None]:
from google.colab import drive
# Mount the whole Google Drive into the Colab filesystem at /content/drive.
# Later cells read their data through relative paths such as
# 'drive/MyDrive/NLU Project/...', so they only resolve while the working
# directory is the parent of the mount point
# (NOTE(review): presumably /content, the Colab default - confirm).
# Uncomment the next line to force a clean re-mount:
# drive.flush_and_unmount()
drive.mount('/content/drive')

#define necessary imports
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader

import gensim

Mounted at /content/drive


In [None]:
# define the dataset 
class IMBDDataset(Dataset):
    """Map-style dataset over a DataFrame of tokenised IMDB reviews.

    The wrapped frame must provide two columns:

    * ``review_tokens_reduced`` - a list of token ids per review;
    * ``user_id_transformed_to_label`` - the author label of the review.

    Construction right-pads every token list to the length of the longest
    review in the frame and then converts the token lists to tensors. Both
    steps write into ``df`` itself (no copy is taken), so the caller's frame
    is modified.

    The class name is left untouched (including the "IMBD" spelling) because
    this notebook loads previously saved dataset objects with ``torch.load``;
    presumably those were pickled under this name - TODO confirm.
    """

    def __init__(self, df, paddingId=None):
        """Pad and tensorise ``df`` in place.

        Args:
            df: DataFrame with the two columns described on the class.
            paddingId: token id appended to short reviews. When omitted the
                notebook-level global ``paddingID`` is used (the original
                behaviour), so that global must be defined before a dataset
                that needs padding is constructed.
        """
        self.df = df
        self.maxLenForDF = self.getMaximumLengthSequence()
        self.padReturningItems(self.maxLenForDF, paddingId)
        self.sendListToTensors()

    def padReturningItems(self, lengthToPadTo, paddingId=None):
        """Right-pad every token list shorter than ``lengthToPadTo``.

        Longer lists are left as they are (nothing is truncated).

        Args:
            lengthToPadTo: target sequence length.
            paddingId: token id used for padding; ``None`` falls back to the
                global ``paddingID``.
        """
        for index, row in self.df.iterrows():
            tokens = row['review_tokens_reduced']
            paddingNeeded = lengthToPadTo - len(tokens)
            if paddingNeeded > 0:
                if paddingId is None:
                    # Legacy fallback, resolved lazily so that frames which
                    # need no padding never touch the global.
                    paddingId = paddingID
                self.df.at[index, "review_tokens_reduced"] = tokens + [paddingId] * paddingNeeded

    def sendListToTensors(self):
        """Replace every token list in the frame with a ``torch.int`` tensor.

        Labels are deliberately left as they are in the frame: writing 0-d
        tensors into a numeric column depends on pandas' coercion rules, and
        tensor objects hash by identity, which would make ``unique()`` in
        ``getAuthorCount`` count every row. ``__getitem__`` converts the
        label when a sample is requested instead.
        """
        for index, row in self.df.iterrows():
            self.df.at[index, "review_tokens_reduced"] = torch.tensor(row['review_tokens_reduced'], dtype=torch.int)

    def __len__(self):
        """Return the number of reviews (rows) in the frame."""
        return self.df.shape[0]

    def getMaximumLengthSequence(self):
        """Return the length of the longest token list, or 0 for an empty frame."""
        return max((len(tokens) for tokens in self.df['review_tokens_reduced']), default=0)

    def __getitem__(self, idx):
        """Return ``{'input': tokens, 'label': label}`` for positional index ``idx``.

        For a scalar index the label is returned as a 0-d ``torch.long``
        tensor, the dtype ``nn.NLLLoss`` expects for class-index targets.
        A multi-element tensor index selects several rows at once; in that
        case the label column slice is returned unchanged.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        dfRowToReturn = self.df.iloc[idx]
        label = dfRowToReturn['user_id_transformed_to_label']
        if not isinstance(idx, list):
            # Accepts plain ints, numpy scalars and already-stored 0-d tensors.
            label = torch.as_tensor(label, dtype=torch.long)
        return {'input': dfRowToReturn['review_tokens_reduced'], 'label': label}

    def getAuthorCount(self):
        """Return the number of distinct author labels present in the frame."""
        return len(self.df["user_id_transformed_to_label"].unique())


In [None]:
# load in the datasets
# Each .pt file is read with torch.load and bound to a module-level name.
# train_dataset is later asked for getAuthorCount(), so the files are expected
# to hold whole dataset objects rather than plain tensors
# (NOTE(review): presumably pickled IMBDDataset instances - confirm).
# torch.load unpickles arbitrary Python objects, so only load files you trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects custom classes - confirm against the installed version.
# The paths are relative and therefore resolve against the current working directory.
# NOTE(review): validate_dataset is not referenced by any cell visible here.
train_dataset = torch.load('drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBtrain.pt')
validate_dataset = torch.load('drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBvalidate.pt')
test_dataset = torch.load('drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBtest.pt')

In [None]:
# place the datasets in the dataloader
# Only the train and test splits get a loader here. The training loader is
# shuffled; the test loader keeps the stored order. train_dataloader is also
# the loader used to measure training accuracy in the training cell.

# Number of reviews per batch, shared by both loaders.
batchSize = 64
train_dataloader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batchSize)

In [None]:
# Define GRU model
class GRUModel(torch.nn.Module):
  """GRU sequence classifier on top of pretrained word embeddings.

  Pipeline: embedding lookup -> (stacked) GRU -> mean over the time axis ->
  linear layer -> log-softmax. The output is log-probabilities, so it pairs
  with ``nn.NLLLoss``.
  """

  def __init__(self, pretrainedEmbeddingWeights, gruHiddenDimensions, gruLayerNumber, outputClassNumber, paddingId):
    """Build the layers.

    Args:
      pretrainedEmbeddingWeights: 2-D float tensor (vocabulary size x
        embedding size). ``Embedding.from_pretrained`` keeps these weights
        frozen by default.
      gruHiddenDimensions: hidden size of the GRU.
      gruLayerNumber: number of stacked GRU layers. Previously this argument
        was accepted but ignored (the GRU always had one layer); checkpoints
        saved from that single-layer model will not load when this is > 1.
      outputClassNumber: number of classes scored by the linear layer.
      paddingId: index of the padding token in the embedding table.
    """
    super().__init__()
    self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrainedEmbeddingWeights, padding_idx=paddingId)
    self.EmbeddingDimensions = pretrainedEmbeddingWeights.size(1)
    self.GRULayer = nn.GRU(
        self.EmbeddingDimensions,
        gruHiddenDimensions,
        num_layers=gruLayerNumber,
        batch_first=True,
    )
    self.LinearLayer = nn.Linear(gruHiddenDimensions, outputClassNumber)
    # Explicit class dimension; the implicit form is deprecated and warns.
    self.nnSoftMax = nn.LogSoftmax(dim=1)

  def forward(self, input):
    """Return per-class log-probabilities for a batch of token-id sequences.

    Args:
      input: integer tensor of token ids, batch first.

    Returns:
      Tensor with one row of ``outputClassNumber`` log-probabilities per
      sequence in the batch.
    """
    embeddedInput = self.EmbeddingLayer(input)
    # Only the per-step outputs of the last GRU layer are used; the final
    # hidden states are discarded.
    gruOutput, _ = self.GRULayer(embeddedInput)
    # Mean-pool over every time step. No mask is applied, so padded
    # positions contribute to the average as well.
    gruOutsPooled = torch.mean(gruOutput, dim=1)
    linearOutput = self.LinearLayer(gruOutsPooled)
    return self.nnSoftMax(linearOutput)

In [None]:
#define function to check accuracy with a dataloader
def get_accuracy(dataloader, model):
  """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

  Args:
    dataloader: iterable of batches, each a dict with an ``'input'`` tensor
      (batch first) and a ``'label'`` tensor of class indices.
    model: module mapping ``batch['input']`` to per-class scores with the
      classes along dimension 1.

  Returns:
    Accuracy in [0, 1] as a Python float; 0.0 when the dataloader yields no
    samples (previously this case raised ZeroDivisionError).

  The model is put in eval mode for the duration of the call and is returned
  to whichever mode (train or eval) it was in beforehand, even if a batch
  raises.
  """
  wasTraining = model.training
  model.eval()
  correctlyPredictedNum = 0
  accumulatedLength = 0

  try:
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
      for data in dataloader:
        modelOutputOnSampleBatch = model(data['input'])
        classPredictions = modelOutputOnSampleBatch.argmax(dim=1)
        # Compare on the prediction's device so this also works off-CPU.
        labels = data['label'].to(classPredictions.device)
        correctlyPredictedNum += (classPredictions == labels).sum().item()
        accumulatedLength += data['input'].size(0)
  finally:
    model.train(wasTraining)

  if accumulatedLength == 0:
    return 0.0
  return correctlyPredictedNum / accumulatedLength

In [None]:
# Special tokens expected in the embedding vocabulary.
# NOTE(review): unknownToken is not used by any cell visible here.
unknownToken = "<unk>"
padToken = "<pad>"
# Load the pretrained embeddings (a gensim KeyedVectors file). The path is
# relative, so it resolves against the current working directory.
# NOTE: despite its name, `model` is the embedding table, not the classifier.
model = gensim.models.KeyedVectors.load('drive/MyDrive/NLU Project/data/preProcessedEmbeddings/glove_vectors.kv')

# Row index of the padding token in the embedding matrix.
# gensim >= 4.0 removed `.vocab` in favour of the `key_to_index` mapping;
# support both so the notebook runs on either major version.
if hasattr(model, "key_to_index"):
  paddingID = model.key_to_index[padToken]
else:
  paddingID = model.vocab[padToken].index

In [None]:
# Number of output classes for the classifier: the count of distinct author
# labels found in the training set. It is passed to GRUModel as the size of
# the final linear layer.
# NOTE(review): this assumes labels are 0..outputSize-1 and that every author
# appears in the training split - confirm against the preprocessing step.
outputSize = train_dataset.getAuthorCount()

In [None]:
# ---- hyper-parameters and run limits ----
hiddnSize = 64
layersGru = 3
learningRate = 0.001
numEpochs = 20
maxTrainingSeconds = 12 * 60 * 60  # wall-clock budget (43200 s)
logEveryNBatches = 100

# ---- model, loss and optimizer ----
# model.vectors is the embedding matrix of the KeyedVectors loaded earlier.
preTrainedEmbeddings = torch.from_numpy(model.vectors)
# GRUvalidationModel is kept as an alias of validationModel because the
# original cell defined both names.
validationModel = GRUvalidationModel = GRUModel(preTrainedEmbeddings, hiddnSize, layersGru, outputSize, paddingID)
validationModel.float()
# NLLLoss pairs with the log-probabilities returned by GRUModel.
lossFunction = nn.NLLLoss()
optimizer = torch.optim.Adam(validationModel.parameters(), lr=learningRate)

# One accuracy value is appended to each list per epoch.
# NOTE(review): per-epoch monitoring uses the *test* split; validate_dataset
# is loaded in an earlier cell but never used - confirm this is intended.
epochTestAccuracyList = []
epochTrainAccuracyList = []

lossForBatch = 0
timeStart = time.time()
for epoch in range(numEpochs):
  # Stop starting new epochs once the wall-clock budget is spent.
  if time.time() - timeStart > maxTrainingSeconds:
    break
  # Set train mode once per epoch rather than on every batch.
  validationModel.train()
  for i, data in enumerate(train_dataloader):
    # The budget is also checked per batch; a partially trained epoch is
    # still evaluated below.
    if time.time() - timeStart > maxTrainingSeconds:
      break
    optimizer.zero_grad()
    modelOutputOnSampleBatch = validationModel(data['input'])
    lossForBatch = lossFunction(modelOutputOnSampleBatch, data['label'])
    lossForBatch.backward()
    optimizer.step()
    if i % logEveryNBatches == 0:
      # .item() logs the plain number instead of the tensor repr with grad_fn.
      print("Step run on batch", i, "time:", (time.time() - timeStart), "loss:", lossForBatch.item())

  testAccuracyThisEpoch = get_accuracy(test_dataloader, validationModel)
  print("Accuracy epoch TESTING", epoch, ":", testAccuracyThisEpoch, "time:", (time.time() - timeStart))
  epochTestAccuracyList.append(testAccuracyThisEpoch)

  trainAccuracyThisEpoch = get_accuracy(train_dataloader, validationModel)
  print("Accuracy epoch TRAINING", epoch, ":", trainAccuracyThisEpoch, "time:", (time.time() - timeStart))
  epochTrainAccuracyList.append(trainAccuracyThisEpoch)



Step run on batch 0 time: 2.6353447437286377 loss: tensor(4.1496, grad_fn=<NllLossBackward0>)
Step run on batch 100 time: 199.2576777935028 loss: tensor(3.9683, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: ignored

In [None]:
# # save the lists
# import pickle
# with open('drive/MyDrive/NLU Project/data/GRUTestTrainingList', 'wb') as f:
#   pickle.dump(epochTestAccuracyList, f)
# with open('drive/MyDrive/NLU Project/data/GRUTrainTrainingList', 'wb') as f:
#   pickle.dump(epochTrainAccuracyList, f)

In [None]:
# # save the model weights
# torch.save(validationModel.state_dict(), 'drive/MyDrive/NLU Project/models/IMDBGRUmodel')
