In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files stored in Drive are reachable from this runtime.
# The project directory then lives somewhere below the mount point, e.g. /content/drive/NLUProject.
# To reset a stale mount, run drive.flush_and_unmount() before mounting again.
drive.mount('/content/drive')

#define necessary imports
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader

import gensim

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Siamese Dataset class definition

class IMBDSiameseDataset(Dataset):
    """IMDB dataset for siamese implementation.

    Wraps a DataFrame that has a ``review_tokens_reduced`` column (one list of
    token ids per row) and a ``user_id_transformed_to_label`` column (one
    integer author label per row).  On construction every token list is
    right-padded to the length of the longest one and converted to an int
    tensor.  Each item is a *pair* of documents together with a flag telling
    whether both were written by the same author.
    """

    def __init__(self, df, paddingId=None):
        """Build the dataset from ``df``.

        Args:
            df: DataFrame with the two columns described on the class.  It is
                copied, so the caller's frame is left untouched.
            paddingId: token id used for padding.  When omitted, the
                module-level ``paddingID`` is used (legacy behaviour), which
                then has to exist before the dataset is constructed.
        """
        # Work on a copy: padding / tensor conversion must not leak into the caller's frame.
        self.df = df.copy()
        self.paddingId = paddingId
        self.maxLenForDF = self.getMaximumLengthSequence()
        self.padReturningItems(self.maxLenForDF)
        self.sendListToTensors()

    def padReturningItems(self, lengthToPadTo, padId=None):
        """Right-pad every token list in ``review_tokens_reduced`` to ``lengthToPadTo``.

        Args:
            lengthToPadTo: target length; longer sequences are left unchanged.
            padId: padding token id.  Falls back to the id given to the
                constructor and finally to the module-level ``paddingID``.
        """
        if padId is None:
            # getattr: instances unpickled from older files have no paddingId attribute.
            padId = getattr(self, 'paddingId', None)
        if padId is None:
            padId = paddingID  # legacy fallback on the notebook-level global

        column = self.df['review_tokens_reduced']
        # An object array filled element by element keeps each list as ONE cell value.
        padded = np.empty(len(column), dtype=object)
        for position, tokens in enumerate(column):
            missing = lengthToPadTo - len(tokens)
            padded[position] = tokens + [padId] * missing if missing > 0 else tokens
        self.df['review_tokens_reduced'] = padded

    def sendListToTensors(self):
        """Convert every token list to an int tensor and normalise the labels to int64."""
        column = self.df['review_tokens_reduced']
        asTensors = np.empty(len(column), dtype=object)
        for position, tokens in enumerate(column):
            asTensors[position] = torch.tensor(tokens, dtype=torch.int)
        self.df['review_tokens_reduced'] = asTensors
        # Labels stay plain integers: they remain comparable/hashable for pandas, and the
        # DataLoader's default collate function batches Python ints into an int64 tensor.
        self.df['user_id_transformed_to_label'] = (
            self.df['user_id_transformed_to_label'].map(int).astype('int64')
        )

    def __len__(self):
        """Number of documents (rows) in the dataset."""
        return self.df.shape[0]

    def getMaximumLengthSequence(self):
        """Length of the longest token sequence, or 0 for an empty dataset."""
        return max((len(tokens) for tokens in self.df['review_tokens_reduced']), default=0)

    def _getLabelArray(self):
        """Return all author labels as an int64 numpy array (built once, then cached)."""
        cached = getattr(self, '_labelArrayCache', None)
        if cached is None or len(cached) != len(self.df):
            cached = np.fromiter(
                (int(value) for value in self.df['user_id_transformed_to_label']),
                dtype=np.int64,
                count=len(self.df),
            )
            self._labelArrayCache = cached
        return cached

    def __getitem__(self, idx):
        """Return a document pair anchored at row position ``idx``.

        The anchor is the document at ``idx``.  With probability 1/2 the
        partner is drawn from the documents of the same author (the anchor
        itself may be drawn), otherwise from the documents of any other author.
        If the dataset holds no other author, a same-author partner is
        returned instead of searching forever.

        Returns:
            dict with keys ``input1``/``input2`` (token tensors),
            ``label1``/``label2`` (int author labels) and ``same_class``
            (1 if both labels are equal, else 0).

        Raises:
            IndexError: if ``idx`` is out of range.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        anchorPos = int(idx)

        labels = self._getLabelArray()
        anchorLabel = labels[anchorPos]
        sameAuthor = labels == anchorLabel

        wantSameClass = bool(np.random.randint(0, 2))
        candidates = np.flatnonzero(sameAuthor if wantSameClass else ~sameAuthor)
        if candidates.size == 0:
            # Only one author in the data: a different-author partner does not exist.
            candidates = np.flatnonzero(sameAuthor)
        partnerPos = int(np.random.choice(candidates))
        partnerLabel = labels[partnerPos]

        tokens = self.df['review_tokens_reduced']
        return {'input1': tokens.iloc[anchorPos], 'label1': int(anchorLabel),
                'input2': tokens.iloc[partnerPos], 'label2': int(partnerLabel),
                'same_class': int(anchorLabel == partnerLabel)}

    def getAuthorCount(self):
        """Number of distinct author labels in the dataset."""
        return int(np.unique(self._getLabelArray()).size)


In [3]:
# Load the three pre-built siamese dataset splits (train / validate / test) from Drive.
# NOTE: torch.load unpickles the stored objects, so only load files from a trusted source.
siamese_train_dataset, siamese_validate_dataset, siamese_test_dataset = (
    torch.load(datasetPath)
    for datasetPath in (
        'drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBtrainSiamese.pt',
        'drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBvalidateSiamese.pt',
        'drive/MyDrive/NLU Project/data/PyTorchDataset/IMDBtestSiamese.pt',
    )
)

In [4]:
# Wrap the train and test splits in shuffling DataLoaders that share one batch size.
batchSize = 64
siamese_train_dataloader, siamese_test_dataloader = (
    DataLoader(split, batch_size=batchSize, shuffle=True)
    for split in (siamese_train_dataset, siamese_test_dataset)
)

In [5]:
# Define GRU model
class SiameseModel(torch.nn.Module):
  """Siamese author classifier with one shared branch.

  Both inputs go through the same embedding -> GRU -> mean-pool -> linear ->
  log-softmax branch.  The training loss combines the classification loss of
  each branch with a cosine-similarity term that pulls the outputs of
  same-author pairs together and pushes different-author pairs apart.
  """

  def __init__(self, pretrainedEmbeddingWeights, gruHiddenDimensions, gruLayerNumber, outputClassNumber, paddingId):
    """
    Args:
      pretrainedEmbeddingWeights: 2-D float tensor (vocabulary size x embedding size).
      gruHiddenDimensions: hidden size of the GRU.
      gruLayerNumber: number of stacked GRU layers.
      outputClassNumber: number of author classes predicted by the linear head.
      paddingId: index of the padding token in the embedding matrix.
    """
    super(SiameseModel, self).__init__()
    # from_pretrained keeps the embedding weights frozen (its default is freeze=True).
    self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrainedEmbeddingWeights, padding_idx=paddingId)
    self.EmbeddingDimensions = pretrainedEmbeddingWeights.size()[1]
    # gruLayerNumber used to be accepted but never forwarded, so the GRU always had one layer.
    # NOTE: state_dicts saved from that single-layer version only load with gruLayerNumber=1.
    self.GRULayer = nn.GRU(self.EmbeddingDimensions, gruHiddenDimensions,
                           num_layers=gruLayerNumber, batch_first=True)
    self.LinearLayer = nn.Linear(gruHiddenDimensions, outputClassNumber)
    # Explicit class dimension; relying on the implicit dim is deprecated and warns.
    self.nnSoftMax = nn.LogSoftmax(dim=1)

  def network_forward(self, input):
    """Pass one batch of token ids (batch x sequence) through the shared branch.

    Returns:
      Log-probabilities over the author classes, shape (batch x outputClassNumber).
    """
    embeddedInput = self.EmbeddingLayer(input)
    gruOutput, finalHiddenStates = self.GRULayer(embeddedInput)
    # Mean-pool the top-layer GRU outputs over the sequence dimension (padded steps included).
    gruOutsPooled = torch.mean(gruOutput, dim=1)
    linearOutput = self.LinearLayer(gruOutsPooled)
    return self.nnSoftMax(linearOutput)

  def forward(self, input1, input2):
    """Return the branch outputs for both inputs of a pair."""
    output1 = self.network_forward(input1)
    output2 = self.network_forward(input2)
    return output1, output2

  def getLoss(self, input1, input2, label, real1, real2, similarityWeight=5):
    """Compute the combined training loss for a batch of document pairs.

    Args:
      input1, input2: token-id batches of the two documents of each pair.
      label: per-pair flag, 1 when both documents share an author, else 0.
      real1, real2: integer author labels of input1 / input2.
      similarityWeight: factor applied to the similarity term (default 5).

    Returns:
      Scalar tensor: nll(input1) + nll(input2) + similarityWeight * similarity loss.
    """
    output1, output2 = self(input1, input2)

    # Negative log-likelihood of the true author for each branch; nll_loss needs int64 targets.
    out1ModelPredictionLoss = torch.nn.functional.nll_loss(output1, real1.long())
    out2ModelPredictionLoss = torch.nn.functional.nll_loss(output2, real2.long())

    # Cosine similarity between the two output vectors of every pair.
    similarityMeasures = torch.nn.functional.cosine_similarity(output1, output2, dim=1)
    label = label.to(similarityMeasures.dtype)

    # Different authors: the similarity itself is the penalty.
    # Same author: the penalty is the distance of the similarity from 1.
    differentAuthorPenalty = (1 - label) * similarityMeasures
    sameAuthorPenalty = label * torch.absolute(similarityMeasures - 1)
    similarityLoss = torch.mean(differentAuthorPenalty + sameAuthorPenalty)

    # Scale the similarity term so it carries weight next to the two classification losses.
    normalizedLoss = similarityWeight * similarityLoss
    return out1ModelPredictionLoss + out2ModelPredictionLoss + normalizedLoss

      

In [6]:
#load pretrained embeddings
# NOTE: despite its name, `model` holds the gensim KeyedVectors (word vectors), not a neural network.
model = gensim.models.KeyedVectors.load('drive/MyDrive/NLU Project/data/preProcessedEmbeddings/glove_vectors.kv')
unknownToken = "<unk>"
padToken = "<pad>"
# Row index of the padding token inside the embedding matrix.
# gensim >= 4 exposes the token -> index mapping as `key_to_index`; gensim 3.x only has `vocab[token].index`.
if hasattr(model, 'key_to_index'):
    paddingID = model.key_to_index[padToken]
else:
    paddingID = model.vocab[padToken].index
# Embedding matrix as a tensor; from_numpy shares memory with model.vectors instead of copying it.
preTrainedEmbeddings = torch.from_numpy(model.vectors)

In [7]:
#define hyperparameters
hiddnSize = 64   # GRU hidden size
layersGru = 3    # number of GRU layers handed to SiameseModel
# Size the classifier head from every loaded split rather than from the test split alone, so a
# split that happens to miss some authors cannot make the head too small for the training labels.
# NOTE(review): this assumes labels are contiguous ids 0..C-1 — confirm against the preprocessing step.
outputSizeSiamese = max(
    split.getAuthorCount()
    for split in (siamese_train_dataset, siamese_validate_dataset, siamese_test_dataset)
)

In [8]:
# Train siamese model

siameseValidationModel = SiameseModel(preTrainedEmbeddings, hiddnSize, layersGru, outputSizeSiamese, paddingID)

siameseValidationModel.float()
optimizer = torch.optim.Adam(siameseValidationModel.parameters(), lr = 0.002)


epochTestAccuracyList = []
epochTrainAccuracyList = []

lossForBatch = 0

# Training budget, named once instead of repeating the raw numbers inside the loops.
maxEpochs = 50
maxTrainingSeconds = 39600   # 11 hours of wall-clock time
logEveryNBatches = 25

# `time` is imported in the notebook's import cell; no second import is needed here.
timeStart = time.time()
for epoch in range(maxEpochs):
  # Stop before starting another epoch once the time budget is used up.
  if time.time() - timeStart > maxTrainingSeconds:
    break
  # Train mode only has to be set once per epoch, not for every batch.
  siameseValidationModel.train()
  for i, data in enumerate(siamese_train_dataloader):
    # Also stop in the middle of an epoch when the budget runs out.
    if time.time() - timeStart > maxTrainingSeconds:
      break
    optimizer.zero_grad()

    # Pass both inputs, the same-author flag and the author labels into the combined loss
    combinedLoss = siameseValidationModel.getLoss(data['input1'], data['input2'], data['same_class'], data['label1'], data['label2'])

    # Backpropagate and update the weights
    combinedLoss.backward()
    optimizer.step()

    if i % logEveryNBatches == 0:
      print(f"Epoch number {epoch}\n Current loss {combinedLoss.item()}\n")




Epoch number 0
 Current loss 11.473751068115234



KeyboardInterrupt: ignored

In [None]:
# #save model weights
# torch.save(siameseValidationModel.state_dict(), 'drive/MyDrive/NLU Project/data/modelSiameseIMDB')