In [2]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
# The data paths in later cells are relative ('drive/MyDrive/...'), so they only resolve
# when the working directory is /content -- presumably the Colab default; TODO confirm.
# To force a clean remount, call drive.flush_and_unmount() before mounting again.
drive.mount('/content/drive')

#define necessary imports
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

from torch.utils.data import Dataset, DataLoader

import gensim

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class lyricDataset(Dataset):
    """Dataset of padded lyric token-id sequences paired with artist labels.

    The wrapped DataFrame must provide two columns:
      * 'lyric_tokens_reduced' -- one list of integer token ids per row
      * 'artist_label'         -- one integer class id per row

    On construction every token list is right-padded to the longest sequence in
    the frame and both columns are converted to tensors. The DataFrame passed in
    is modified in place (it is not copied).

    Args:
        df: the DataFrame described above.
        padId: token id used for padding. When omitted, the notebook-level
            global ``paddingID`` is used, which must then be defined before the
            dataset is built.
    """

    def __init__(self, df, padId=None):
        self.df = df
        self.padId = padId
        self.maxLenForDF = self.getMaximumLengthSequence()
        self.padReturningItems(self.maxLenForDF)
        self.sendListToTensors()

    @staticmethod
    def _asObjectColumn(items):
        """Pack arbitrary Python objects into a 1-D object array, one object per cell."""
        column = np.empty(len(items), dtype=object)
        # Fill cell by cell: bulk assignment would make numpy try to broadcast
        # equal-length sequences into a 2-D array instead of storing them as objects.
        for position, item in enumerate(items):
            column[position] = item
        return column

    def _resolvePadId(self):
        """Return the padding id given at construction, else the global ``paddingID``."""
        # getattr: instances unpickled from files written by an older version of
        # this class have no `padId` attribute.
        padId = getattr(self, 'padId', None)
        if padId is None:
            padId = paddingID
        return padId

    def padReturningItems(self, lengthToPadTo):
        """Right-pad every token list to ``lengthToPadTo``; longer lists are left unchanged."""
        tokenLists = list(self.df['lyric_tokens_reduced'])
        if all(len(tokens) >= lengthToPadTo for tokens in tokenLists):
            # Nothing to pad, so the padding id is never looked up.
            return
        padId = self._resolvePadId()
        padded = [
            list(tokens) + [padId] * max(lengthToPadTo - len(tokens), 0)
            for tokens in tokenLists
        ]
        self.df['lyric_tokens_reduced'] = self._asObjectColumn(padded)

    def sendListToTensors(self):
        """Convert token lists to int32 tensors and labels to int64 scalar tensors."""
        tokenTensors = [
            torch.as_tensor(tokens, dtype=torch.int)
            for tokens in self.df['lyric_tokens_reduced']
        ]
        # Class-index targets for NLLLoss / CrossEntropyLoss must be int64.
        labelTensors = [
            torch.as_tensor(label, dtype=torch.long)
            for label in self.df['artist_label']
        ]
        # Whole-column assignment of object arrays keeps the tensors as-is instead of
        # relying on how pandas coerces a tensor written into a typed column cell.
        self.df['lyric_tokens_reduced'] = self._asObjectColumn(tokenTensors)
        self.df['artist_label'] = self._asObjectColumn(labelTensors)

    def __len__(self):
        return len(self.df)

    def getMaximumLengthSequence(self):
        """Return the length of the longest token list (0 for an empty frame)."""
        return max((len(tokens) for tokens in self.df['lyric_tokens_reduced']), default=0)

    def __getitem__(self, idx):
        """Return ``{'input': token tensor, 'label': label tensor}`` for row position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        dfRowToReturn = self.df.iloc[idx]
        return {
            'input': dfRowToReturn['lyric_tokens_reduced'],
            'label': dfRowToReturn['artist_label'],
        }

    #get the number of unique authors
    def getAuthorCount(self):
        """Return the number of distinct artist labels present in this dataset."""
        # Normalise to Python ints first: tensors hash by identity, so counting
        # distinct tensor objects would report every row as its own author.
        return len({int(label) for label in self.df["artist_label"]})

In [4]:
#load datasets back in
# The loaded objects are used as lyricDataset instances further down (getAuthorCount is
# called on the training set), so presumably the class above must be defined before this
# cell runs for unpickling to succeed.
train_dataset, validate_dataset, test_dataset = (
    torch.load(datasetPath)
    for datasetPath in (
        'drive/MyDrive/NLU Project/data/PyTorchDataset/LyricTrain.pt',
        'drive/MyDrive/NLU Project/data/PyTorchDataset/LyricValidate.pt',
        'drive/MyDrive/NLU Project/data/PyTorchDataset/LyricTest.pt',
    )
)

In [5]:
#define special tokens
unknownToken = "<unk>"
padToken = "<pad>"
#Load gensim model in
model = gensim.models.KeyedVectors.load('drive/MyDrive/NLU Project/data/preProcessedEmbeddings/lyric-glove_vectors.kv')
# Look up the embedding row of the padding token.
# gensim >= 4.0 exposes the token -> row mapping as `key_to_index`; gensim 3.x only has
# `vocab[token].index` (removed in 4.0). Support both so the cell runs on either release.
if hasattr(model, 'key_to_index'):
    paddingID = model.key_to_index[padToken]
else:
    paddingID = model.vocab[padToken].index

In [6]:
# Mini-batch size shared by the training and test loaders.
batchSize = 64
# Only the training loader reshuffles every epoch; the test loader keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batchSize, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batchSize, shuffle=False)

In [7]:
#define LSTM model
class LSTMModel(torch.nn.Module):
  """Author classifier: frozen pretrained embeddings -> LSTM -> mean pooling -> linear -> log-softmax.

  Args:
    pretrainedEmbeddingWeights: 2-D float tensor (vocabulary size x embedding size).
    lstmHiddenDimensions: hidden size of the LSTM.
    lstmLayerNumber: number of stacked LSTM layers.
    outputClassNumber: number of target classes.
    paddingId: embedding row index used for padding tokens.

  Note: `lstmLayerNumber` is now forwarded to `nn.LSTM`. A state dict saved while the
  argument was ignored holds a single layer and must be loaded with `lstmLayerNumber=1`.
  """

  def __init__(self, pretrainedEmbeddingWeights, lstmHiddenDimensions, lstmLayerNumber, outputClassNumber, paddingId):
    super(LSTMModel, self).__init__()
    self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrainedEmbeddingWeights, padding_idx = paddingId)
    self.EmbeddingDimensions = pretrainedEmbeddingWeights.size()[1]
    self.LSTMLayer = nn.LSTM(
        self.EmbeddingDimensions,
        lstmHiddenDimensions,
        num_layers=lstmLayerNumber,
        batch_first=True,
    )
    self.LinearLayer = nn.Linear(lstmHiddenDimensions, outputClassNumber)
    # Normalise over the class axis explicitly; the implicit-dim form is deprecated.
    self.nnSoftMax = nn.LogSoftmax(dim=1)

  def forward(self, input):
    """Map a (batch, sequence) tensor of token ids to (batch, classes) log-probabilities."""
    embeddedInput = self.EmbeddingLayer(input)
    lstmOutput, (finalHiddenStates, finalCellStates) = self.LSTMLayer(embeddedInput)

    # Average the top-layer outputs over the time axis.
    # NOTE(review): padded positions are included in this mean -- confirm that is intended.
    lstmOutsPooled = torch.mean(lstmOutput, dim=1)
    linearOutput = self.LinearLayer(lstmOutsPooled)
    softMaxOut = self.nnSoftMax(linearOutput)

    return softMaxOut
# Define GRU model
class GRUModel(torch.nn.Module):
  """Author classifier: frozen pretrained embeddings -> GRU -> mean pooling -> linear -> log-softmax.

  Args:
    pretrainedEmbeddingWeights: 2-D float tensor (vocabulary size x embedding size).
    gruHiddenDimensions: hidden size of the GRU.
    gruLayerNumber: number of stacked GRU layers.
    outputClassNumber: number of target classes.
    paddingId: embedding row index used for padding tokens.

  Note: `gruLayerNumber` is now forwarded to `nn.GRU`. A state dict saved while the
  argument was ignored holds a single layer and must be loaded with `gruLayerNumber=1`.
  """

  def __init__(self, pretrainedEmbeddingWeights, gruHiddenDimensions, gruLayerNumber, outputClassNumber, paddingId):
    super(GRUModel, self).__init__()
    self.EmbeddingLayer = nn.Embedding.from_pretrained(pretrainedEmbeddingWeights, padding_idx = paddingId)
    self.EmbeddingDimensions = pretrainedEmbeddingWeights.size()[1]
    self.GRULayer = nn.GRU(
        self.EmbeddingDimensions,
        gruHiddenDimensions,
        num_layers=gruLayerNumber,
        batch_first=True,
    )
    self.LinearLayer = nn.Linear(gruHiddenDimensions, outputClassNumber)
    # Normalise over the class axis explicitly; the implicit-dim form is deprecated.
    self.nnSoftMax = nn.LogSoftmax(dim=1)

  def forward(self, input):
    """Map a (batch, sequence) tensor of token ids to (batch, classes) log-probabilities."""
    embeddedInput = self.EmbeddingLayer(input)
    gruOutput, finalHiddenStates = self.GRULayer(embeddedInput)
    # Average the top-layer outputs over the time axis.
    # NOTE(review): padded positions are included in this mean -- confirm that is intended.
    gruOutsPooled = torch.mean(gruOutput, dim=1)
    linearOutput = self.LinearLayer(gruOutsPooled)

    softMaxOut = self.nnSoftMax(linearOutput)
    return softMaxOut


In [8]:
#define function to check accuracy with a dataloader
def get_accuracy(dataloader, model):
  """Return the fraction of samples in `dataloader` that `model` classifies correctly.

  Args:
    dataloader: iterable of batches; each batch is a dict with an 'input' tensor and a
      'label' tensor holding one class index per sample.
    model: module mapping an 'input' batch to per-class scores of shape (batch, classes).

  Returns:
    Accuracy as a float in [0, 1]; 0.0 when the dataloader yields no samples.

  The model is evaluated in eval mode without gradient tracking, and is put back into
  whichever mode (train or eval) it was in when the function was called.
  """
  wasTraining = model.training
  model.eval()
  correctlyPredictedNum = 0
  acummulateLength = 0

  try:
    # No graph is needed for evaluation; skipping it saves memory and time.
    with torch.no_grad():
      for data in dataloader:
        modelOutputOnSampleBatch = model(data['input'])
        classPredictions = modelOutputOnSampleBatch.argmax(dim=1)
        labels = data['label'].to(classPredictions.device)
        correctlyPredictedNum += (classPredictions == labels).sum().item()
        acummulateLength += data['input'].size(0)
  finally:
    # Restore the caller's mode even if a batch raised.
    model.train(wasTraining)

  if acummulateLength == 0:
    return 0.0

  return correctlyPredictedNum / acummulateLength

In [9]:
#Define model hyperparameters
# Recurrent hidden size and layer count passed to both the LSTM and GRU constructors.
hiddnSize, layers = 64, 3
# Number of output classes = distinct artist labels found in the training set.
outputSize = train_dataset.getAuthorCount()

In [13]:
# Embedding matrix as a tensor (shares memory with the gensim vectors array).
preTrainedEmbeddings = torch.from_numpy(model.vectors)
# Hand-pinned class count; this replaces the value computed from
# train_dataset.getAuthorCount() in the hyperparameter cell.
# NOTE(review): presumably the total number of artist labels across all splits -- confirm
# and derive it from the data instead of hard-coding.
outputSize = 14691

In [13]:
#train the LSTM model


# Training budget for the LSTM run.
LSTM_EPOCHS = 20
LSTM_TIME_BUDGET_SECONDS = 43200  # 12 hours of wall-clock time
LSTM_LOG_EVERY_N_BATCHES = 100

LSTMTrainedModel = LSTMModel(preTrainedEmbeddings, hiddnSize, layers, outputSize, paddingID)
LSTMTrainedModel.float()
# The model emits log-probabilities, so NLLLoss is the matching criterion.
lossFunction = nn.NLLLoss()
optimizer = torch.optim.Adam(LSTMTrainedModel.parameters(), lr=0.001)

# Per-epoch accuracy history, appended to after every epoch.
epochTestAccuracyListLSTM = []
epochTrainAccuracyListLSTM = []

lossForBatch = 0
import time
timeStart = time.time()
for epoch in range(LSTM_EPOCHS):
  # Stop starting new epochs once the wall-clock budget is spent.
  if(time.time() - timeStart > LSTM_TIME_BUDGET_SECONDS):
      break
  LSTMTrainedModel.train()
  for i, data in enumerate(train_dataloader):
    # The budget is also checked per batch; the epoch is then evaluated as-is.
    if(time.time() - timeStart > LSTM_TIME_BUDGET_SECONDS):
      break
    optimizer.zero_grad()
    modelOutputOnSampleBatch = LSTMTrainedModel(data['input'])
    lossForBatch = lossFunction(modelOutputOnSampleBatch, data['label'])
    lossForBatch.backward()
    optimizer.step()
    if(i % LSTM_LOG_EVERY_N_BATCHES == 0):
      # .item() logs the plain number instead of the graph-attached tensor repr.
      print("Step run on batch", i, "time:",(time.time() - timeStart), "loss:", lossForBatch.item())

  # NOTE(review): the test split is used for per-epoch monitoring while validate_dataset is
  # loaded but unused in the visible cells -- confirm this is intended.
  testAccuracyThisEpochLSTM = get_accuracy(test_dataloader, LSTMTrainedModel)
  print("Accuracy epoch TESTING", epoch, ":", testAccuracyThisEpochLSTM, "time:", (time.time() - timeStart))
  epochTestAccuracyListLSTM.append(testAccuracyThisEpochLSTM)

  trainAccuracyThisEpochLSTM = get_accuracy(train_dataloader, LSTMTrainedModel)
  print("Accuracy epoch TRAINING", epoch, ":", trainAccuracyThisEpochLSTM, "time:", (time.time() - timeStart))
  epochTrainAccuracyListLSTM.append(trainAccuracyThisEpochLSTM)



Step run on batch 0 time: 1.104745864868164 loss: tensor(9.6015, grad_fn=<NllLossBackward0>)
Accuracy epoch TESTING 0 : 0.00096 time: 314.0166404247284


KeyboardInterrupt: ignored

In [11]:
# # # COMMENTED To prevent overwrite during testing
# #save model weights
# torch.save(LSTMTrainedModel.state_dict(), 'drive/MyDrive/NLU Project/data/modelLyricsLSTM')
# #save the lists
# import pickle
# with open('drive/MyDrive/NLU Project/data/LyricsLSTMTestList', 'wb') as f:
#   pickle.dump(epochTestAccuracyListLSTM, f)
# with open('drive/MyDrive/NLU Project/data/LyricsLSTMTrainList', 'wb') as f:
#   pickle.dump(epochTrainAccuracyListLSTM, f)
# #plot train vs. test accuracy per epoch
# import matplotlib.pyplot as plt
# epochRange = np.arange(len(epochTrainAccuracyListLSTM))
# plt.plot(epochRange, epochTrainAccuracyListLSTM, label = "Train Accuracy")
# plt.plot(epochRange, epochTestAccuracyListLSTM, label = "Test Accuracy")
# plt.legend()
# plt.show()

In [14]:
#Train for the GRU model

# Training budget for the GRU run.
GRU_EPOCHS = 20
# NOTE(review): 20 seconds looks like a smoke-test value (the LSTM cell allows 43200) --
# confirm and raise before a real training run.
GRU_TIME_BUDGET_SECONDS = 20
GRU_LOG_EVERY_N_BATCHES = 100

GRUTrainedModel = GRUModel(preTrainedEmbeddings, hiddnSize, layers, outputSize, paddingID)
GRUTrainedModel.float()
# The model emits log-probabilities, so NLLLoss is the matching criterion.
lossFunction = nn.NLLLoss()
optimizer = torch.optim.Adam(GRUTrainedModel.parameters(), lr=0.001)

# Per-epoch accuracy history, appended to after every epoch.
epochTestAccuracyListGRU = []
epochTrainAccuracyListGRU = []

lossForBatch = 0
import time
timeStart = time.time()
for epoch in range(GRU_EPOCHS):
  # Stop starting new epochs once the wall-clock budget is spent.
  if(time.time() - timeStart > GRU_TIME_BUDGET_SECONDS):
      break
  GRUTrainedModel.train()
  for i, data in enumerate(train_dataloader):
    # The budget is also checked per batch; the epoch is then evaluated as-is.
    if(time.time() - timeStart > GRU_TIME_BUDGET_SECONDS):
      break
    optimizer.zero_grad()
    modelOutputOnSampleBatch = GRUTrainedModel(data['input'])
    lossForBatch = lossFunction(modelOutputOnSampleBatch, data['label'])
    lossForBatch.backward()
    optimizer.step()
    if(i % GRU_LOG_EVERY_N_BATCHES == 0):
      # .item() logs the plain number instead of the graph-attached tensor repr.
      print("Step run on batch", i, "time:",(time.time() - timeStart), "loss:", lossForBatch.item())

  # NOTE(review): the test split is used for per-epoch monitoring while validate_dataset is
  # loaded but unused in the visible cells -- confirm this is intended.
  testAccuracyThisEpochGRU = get_accuracy(test_dataloader, GRUTrainedModel)
  print("Accuracy epoch TESTING", epoch, ":", testAccuracyThisEpochGRU, "time:", (time.time() - timeStart))
  epochTestAccuracyListGRU.append(testAccuracyThisEpochGRU)

  trainAccuracyThisEpochGRU = get_accuracy(train_dataloader, GRUTrainedModel)
  print("Accuracy epoch TRAINING", epoch, ":", trainAccuracyThisEpochGRU, "time:", (time.time() - timeStart))
  epochTrainAccuracyListGRU.append(trainAccuracyThisEpochGRU)



Step run on batch 0 time: 0.9917905330657959 loss: tensor(9.6059, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: ignored

In [15]:
# #save model weights
# torch.save(GRUTrainedModel.state_dict(), 'drive/MyDrive/NLU Project/models/modelLyricsGRU')
# #save the lists
# import pickle
# with open('drive/MyDrive/NLU Project/evaluation-lists/LyricsGRUTestList', 'wb') as f:
#   pickle.dump(epochTestAccuracyListGRU, f)
# with open('drive/MyDrive/NLU Project/evaluation-lists/LyricsGRUTrainList', 'wb') as f:
#   pickle.dump(epochTrainAccuracyListGRU, f)
# #plot train vs. test accuracy per epoch
# import matplotlib.pyplot as plt
# epochRange = np.arange(len(epochTrainAccuracyListGRU))
# plt.plot(epochRange, epochTrainAccuracyListGRU, label = "Train Accuracy")
# plt.plot(epochRange, epochTestAccuracyListGRU, label = "Test Accuracy")
# plt.legend()
# plt.show()