In [1]:
import torch
import torchtext.vocab as vocab
import numpy as np
from torch import nn
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
import math
import re
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.probability import FreqDist
import sys
from sklearn.model_selection import train_test_split
from matplotlib.pylab import plt
from numpy import arange
from torchsummary import summary
import optuna
from utilities import cleanLine
import pickle
import gc

In [None]:
# Ask PyTorch to release its cached CUDA memory, then run Python's garbage
# collector. gc.collect() is the last expression, so its return value is the
# cell output.
torch.cuda.empty_cache()
gc.collect()

In [3]:
# GloVe variant name and vector width handed to vocab.GloVe below; `dimensions`
# is also the column count of the embedding matrix built later.
embeddingName = "840B"
dimensions = 300
# Special tokens (presumably end-/start-of-sequence markers - confirm against the
# preprocessing step). They get all-zero vectors when missing from GloVe.
eostoken="<|eos|>"
sostoken="<|sos|>"
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Load the pretrained GloVe vectors selected by embeddingName / dimensions.
dataset = vocab.GloVe(name=embeddingName,dim=dimensions)

In [None]:
# Show which device (GPU or CPU) was selected above.
print(device)

In [6]:
def getWordEmbedding(word):
  """Return the pretrained GloVe vector stored for `word`.

  The word is first mapped to its row index through `dataset.stoi`; a word
  that is missing from that mapping raises KeyError.
  """
  token_index = dataset.stoi[word]
  return dataset.vectors[token_index]

In [7]:
#Loading the preprocessed data
# NOTE: pickle.load can execute arbitrary code, so only load files produced by
# this project's own preprocessing step.
# The context manager closes the file even when unpickling fails.
with open("preprocessing/preprocessedLines", 'rb') as file:
    augmentedLines = pickle.load(file)

In [None]:
#Define vocabulary, size and create embedding matrix
# Sorted unique tokens of the whole corpus; a token's position is its index.
uniq_words = np.unique(np.array(" ".join(augmentedLines).split(" ")))
uniq_words_idx = np.arange(uniq_words.size)

word_to_idx = {token: index for token, index in zip(uniq_words.tolist(), uniq_words_idx.tolist())}
idx_to_word = {index: token for token, index in word_to_idx.items()}

vocab_size = len(word_to_idx)

# One row per vocabulary entry, filled from GloVe where possible.
embedding_matrix = np.zeros((vocab_size, dimensions))

unknown_words = []

for word, idx in word_to_idx.items():
  try:
    vector = getWordEmbedding(word)
  except KeyError:
    if word in (eostoken, sostoken):
      # The special tokens are represented by an all-zero vector.
      vector = np.zeros(dimensions)
    else:
      # Any other word without a GloVe vector gets a random one and is reported.
      vector = np.random.normal(scale=0.6, size=(dimensions, ))
      unknown_words.append(word)
  embedding_matrix[idx] = vector

unknown_count = len(unknown_words)
print({ 'Number of unknown words': unknown_count })
print({ 'Number of known words': vocab_size-unknown_count })
print(unknown_words)

In [9]:
# Create inputs and targets
# Every line yields an input (all tokens except the last) and a target
# (all tokens except the first), i.e. the target is the input shifted by one.
x = []
y = []

for s in augmentedLines:
    tokens = s.split()
    x.append(" ".join(tokens[:-1]))
    y.append(" ".join(tokens[1:]))

def get_seq_idx(seq):
    """Translate a whitespace-separated token string into a list of vocabulary indices."""
    indices = []
    for token in seq.split():
        indices.append(word_to_idx[token])
    return indices

x_idx = np.array(list(map(get_seq_idx, x)))
y_idx = np.array(list(map(get_seq_idx, y)))

In [10]:
# Splitting train, validation and test
train_ratio=0.8
test_ratio=0.10
val_ratio=0.10

# Two-stage split: first hold out the test share, then take the validation share
# out of what is left. val_ratio/(val_ratio + train_ratio) rescales the
# validation share to the size of that remainder.
# No random_state is passed, so the partition differs from run to run.
x_idx_train, x_idx_test, y_idx_train, y_idx_test = train_test_split(x_idx, y_idx, test_size=test_ratio)
x_idx_train, x_idx_val, y_idx_train, y_idx_val = train_test_split(x_idx_train, y_idx_train, test_size=val_ratio/(val_ratio + train_ratio))

In [None]:
# Dataset exploring
def countWord(words, word):
  """Count how many entries of `words` are equal to `word`.

  Args:
    words: iterable of tokens to scan.
    word: the token to count.
  Returns:
    The number of matching entries (0 for an empty iterable).
  """
  # Iterate the `words` argument; the previous version read the notebook
  # global `allWords` and silently ignored its first parameter.
  count=0
  for entry in words:
    if(entry==word):
      count+=1
  return count

def filterStopwords(words, stopwords):
  """Return the entries of `words` that do not occur in `stopwords`.

  Args:
    words: iterable of tokens to filter.
    stopwords: iterable of hashable tokens to drop.
  Returns:
    A new list with the surviving tokens in their original order.
  """
  # A set gives constant-time membership tests; the previous
  # `stopwords.count(word)` rescanned the whole stop-word list for every token.
  blocked = set(stopwords)
  return [word for word in words if word not in blocked]

def removeEmptyWords(words):
  """Return a new list holding every entry of `words` that is not the empty string."""
  return [word for word in words if word != ""]


# Pass the word-cloud default stop words through cleanLine and add the two
# special tokens so they are excluded from the statistics as well.
defaultStopwords = list(STOPWORDS)
stopwords = [cleanLine(word, pickle, re) for word in defaultStopwords]
stopwords.extend([eostoken, sostoken])

# All corpus tokens, minus stop words and empty strings.
allWords = np.array(" ".join(augmentedLines).split(" "))
allWords = removeEmptyWords(filterStopwords(allWords, stopwords))

# Frequency plot of the 20 most common remaining words.
frequency_distribution = FreqDist(allWords)
frequency_distribution.most_common(20)
frequency_distribution.plot(20)

words = list(allWords)
wordtext = " ".join(words)

# Word cloud over the same filtered tokens.
wordcloud = WordCloud(width=1000, height=500, max_words=200, stopwords=stopwords).generate(wordtext)
plt.figure(figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

In [13]:
class KanyeDataset(torch.utils.data.Dataset):
  """Dataset of paired index sequences (input, shifted target) kept on `device`.

  Args:
    x: numpy array of input token indices, one row per sequence.
    y: numpy array of target token indices, one row per sequence.
    vocab_size: size of the vocabulary; stored for reference only.
  Raises:
    ValueError: if `x` and `y` hold a different number of sequences.
  """
  def __init__(self, x, y, vocab_size):
    # The previous `super(KanyeDataset, self)` only built a proxy object and
    # never ran the parent initialiser.
    super().__init__()

    if len(x) != len(y):
      raise ValueError("x and y must contain the same number of sequences")

    self.vocab_size = vocab_size
    # Both tensors are converted to int64 and moved to the notebook-level device.
    self.x = torch.from_numpy(x).type(torch.LongTensor).to(device)
    self.y = torch.from_numpy(y).type(torch.LongTensor).to(device)

  def __len__(self):
    """Number of sequences in the dataset."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the (input, target) pair stored at position `idx`."""
    return self.x[idx], self.y[idx]

In [14]:
class LSTMRNN(nn.Module):
  """Embedding -> multi-layer LSTM -> two linear layers, each followed by batch norm.

  Args:
    pretrained_emb: float tensor of shape (vocab, embedding_dim) used to
      initialise the embedding layer.
    freeze_emb: truthy to keep the embedding weights fixed during training.
    vocab_size: number of output classes of the final linear layer.
    drop_out: dropout applied between stacked LSTM layers.
    hidden_size: LSTM hidden width, also the width of the first linear layer.
    lstm_layers: number of stacked LSTM layers.
  """
  def __init__(self, pretrained_emb, freeze_emb, vocab_size, drop_out, hidden_size, lstm_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = lstm_layers

    self.embed = nn.Embedding.from_pretrained(pretrained_emb, freeze=freeze_emb)
    # Take the LSTM input width from the embedding itself rather than from the
    # notebook-level `dimensions` constant, so the two can never disagree.
    self.lstm = nn.LSTM(self.embed.embedding_dim, self.hidden_size, num_layers=self.num_layers, dropout=drop_out, batch_first = True)
    self.lin1 = nn.Linear(self.hidden_size, self.hidden_size)
    self.lin2 = nn.Linear(self.hidden_size, vocab_size)
    # Created on the default device like every other layer; callers move the
    # whole module with `.to(device)`, which previously left the model split
    # across devices until that call.
    self.bn1 = nn.BatchNorm1d(self.hidden_size)
    self.bn2 = nn.BatchNorm1d(vocab_size)
    self.relu = nn.ReLU()

  def forward(self, x, prev_hidden):
    """Run one forward pass.

    Args:
      x: integer index tensor; with batch_first=True it is (batch, seq).
      prev_hidden: (h, c) tuple as returned by `initHiddenLayer` or by a
        previous call.
    Returns:
      (scores, hidden): scores of shape (batch, seq, vocab_size) and the new
      (h, c) tuple.
    """
    out = self.embed(x)

    lstm_output, hidden = self.lstm(out, prev_hidden)

    # BatchNorm1d normalises over dimension 1, so the feature axis is swapped
    # into that position and swapped back afterwards.
    lin_output = self.lin1(lstm_output)
    lin_output = self.bn1(lin_output.transpose(1,2)).transpose(1,2)
    lin_output = self.relu(lin_output)

    lin_output = self.lin2(lin_output)
    lin_output = self.bn2(lin_output.transpose(1,2)).transpose(1,2)

    return lin_output, hidden

  def initHiddenLayer(self, batch_size):
    """Return zero-initialised (h, c) states for `batch_size` sequences.

    The states are allocated on the device the model currently lives on.
    """
    model_device = self.lin1.weight.device
    shape = (self.num_layers, batch_size, self.hidden_size)
    return (torch.zeros(shape, device=model_device),
            torch.zeros(shape, device=model_device))

In [15]:
# Wrap each split in a KanyeDataset (train, validation, test - in that order).
trainSet, valSet, testSet = (
    KanyeDataset(inputs, targets, vocab_size)
    for inputs, targets in (
        (x_idx_train, y_idx_train),
        (x_idx_val, y_idx_val),
        (x_idx_test, y_idx_test),
    )
)

In [16]:
def plotTrainingSession(train_loss, val_loss):
    """Plot training and validation loss per epoch on one figure.

    Args:
        train_loss: dict whose values are the per-epoch training losses.
        val_loss: dict whose values are the per-epoch validation losses.
    """
    num_epochs = len(train_loss)
    # Epochs are shown 1-based on the x axis.
    epochs = range(1, num_epochs+1)

    curves = ((train_loss, 'Training Loss'), (val_loss, 'Validation Loss'))
    for losses, label in curves:
        plt.plot(epochs, losses.values(), label=label)

    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')

    # One tick every second epoch.
    plt.xticks(arange(0, num_epochs+1, 2))

    plt.legend(loc='best')
    plt.show()

In [17]:
import copy
def trainModel(model, dataloader, valloader, num_epochs, batch_size, loss_fn, optim, lr_scheduler, printresults = False, printsessionreport = False):
    """Train `model`, stopping at the first epoch whose validation loss does not improve.

    Args:
        model: module exposing `initHiddenLayer(batch_size)` and a forward
            pass `model(x, (h, c)) -> (output, (h, c))`.
        dataloader: iterable of (x, y) training batches; must support len().
        valloader: iterable of (x, y) validation batches; must support len()
            and be non-empty.
        num_epochs: maximum number of epochs.
        batch_size: batch size used to size the initial hidden state.
        loss_fn: callable applied to `output.transpose(1,2)` and the targets.
        optim: optimizer updating the model parameters.
        lr_scheduler: scheduler stepped once after every completed epoch.
        printresults: print the average losses after each epoch.
        printsessionreport: plot the loss curves when training ends.
    Returns:
        (bestModel, best_avg_val_loss): a deep copy of the state dict with the
        lowest validation loss and that loss averaged over validation batches.
        If no epoch ever improves (e.g. NaN loss or num_epochs == 0) the
        initial weights and `np.inf` are returned.
    """
    best_val_loss = np.inf
    best_avg_val_loss = np.inf
    # Snapshot the initial weights so there is always something to return;
    # previously `bestModel` was unbound when no epoch improved.
    bestModel = copy.deepcopy(model.state_dict())
    train_loss = {}
    val_loss = {}

    for epoch in range(num_epochs):
        # Initialize the hidden layer
        h, c = model.initHiddenLayer(batch_size)
        epoch_train_loss = 0

        model.train()
        for xbatch, ybatch in dataloader:
            model.zero_grad()
            output, (h, c) = model(xbatch, (h,c))

            # Cut the graph so gradients do not flow into earlier batches.
            h = h.detach()
            c = c.detach()

            # Compute loss
            loss = loss_fn(output.transpose(1,2), ybatch)

            epoch_train_loss += loss.item()

            # Do backward propagation
            loss.backward()
            optim.step()

        epoch_val_loss = 0
        model.eval()

        # Run validation epoch without building autograd graphs; the hidden
        # state carried over from training is reused, as before.
        with torch.no_grad():
            for xbatch, ybatch in valloader:
                output, (h, c) = model(xbatch, (h,c))
                loss = loss_fn(output.transpose(1,2), ybatch)
                epoch_val_loss += loss.item()

        #If result is the best so far, save it - else terminate
        if(epoch_val_loss<best_val_loss):
            best_val_loss = epoch_val_loss
            best_avg_val_loss = epoch_val_loss/len(valloader)
            bestModel=copy.deepcopy(model.state_dict())
        else:
            print("Early stopping due to lack of validation improvement")
            break

        average_train_loss = epoch_train_loss/len(dataloader)
        average_val_loss = epoch_val_loss/len(valloader)

        train_loss[epoch] = average_train_loss
        val_loss[epoch] = average_val_loss
        if(printresults):
            print({ 'Number epoch': epoch+1, 'Average train loss': average_train_loss, 'Average val loss': average_val_loss })

        # Run step on the LR scheduler
        lr_scheduler.step()

    if(printsessionreport):
        plotTrainingSession(train_loss, val_loss)
    return bestModel, best_avg_val_loss

In [18]:
# Returns a model based on hyper parameters
def buildModel(params):
    """Construct an LSTMRNN from a hyper-parameter dict and move it to `device`.

    Reads the keys 'drop_out_lstm', 'lstm_layers', 'hidden_size' and
    'freeze_emb'; the embedding is frozen only when 'freeze_emb' equals 1.
    """
    drop_out_lstm, lstm_layers, hidden_size = (
        params[key] for key in ('drop_out_lstm', 'lstm_layers', 'hidden_size')
    )
    freeze_emb = bool(params['freeze_emb'] == 1)

    pretrained = torch.from_numpy(embedding_matrix).float()
    model = LSTMRNN(pretrained, freeze_emb, vocab_size, drop_out_lstm, hidden_size, lstm_layers)
    model.to(device)

    return model

In [19]:
# Objective function for Optuna to minimize
def objective(trial):
    """Optuna objective: train one model for the sampled hyper-parameters.

    Args:
        trial: Optuna trial used to sample the search space below.
    Returns:
        The best average validation loss reached during training.
    """
    params = {
        'drop_out_lstm': trial.suggest_float("drop_out_lstm", 0.2, 0.6, step=0.2),
        'lstm_layers': trial.suggest_int("lstm_layers", 2, 4, step=1),
        'hidden_size': trial.suggest_int("hidden_size", 128, 512, step=128),
        'freeze_emb': trial.suggest_int("freeze_emb", 0, 1, step=1),
        'batch_size': trial.suggest_int("batch_size", 16, 64, step=16),
        # The step must be smaller than the range: the former step=0.2 over
        # [0.80, 0.96] left 0.80 as the only value that could be sampled.
        'decay_rate': trial.suggest_float("decay_rate", 0.80, 0.96, step=0.02),
        'lr': trial.suggest_float("lr", 0.001, 0.004, step=0.0005),
        # Same problem here: step=0.5 over [0.0, 0.1] always yielded 0.0.
        'weight_decay': trial.suggest_float("weight_decay", 0.0, 0.1, step=0.05),
    }

    model = buildModel(params)

    num_epochs = 30
    batch_size = params['batch_size']
    decayRate = params['decay_rate']
    lr = params['lr']
    weight_decay = params['weight_decay']

    # drop_last keeps every batch at exactly batch_size, which the hidden
    # state created inside trainModel relies on.
    trainloader = torch.utils.data.DataLoader(trainSet,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              drop_last=True)
    valloader = torch.utils.data.DataLoader(valSet,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            drop_last=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay=weight_decay)
    lr_sched = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
    loss_func = nn.CrossEntropyLoss()

    # Only the validation loss is needed here; the returned weights are discarded.
    bestModel, val_loss = trainModel(model,
                                     trainloader,
                                     valloader,
                                     num_epochs=num_epochs,
                                     batch_size=batch_size,
                                     loss_fn=loss_func,
                                     optim=optimizer,
                                     lr_scheduler = lr_sched,
                                     printresults=True,
                                     printsessionreport=True)

    return val_loss

In [None]:
# Run hypertuning and save results
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

studyname = "Study"
# The context manager closes the file even if pickling the study fails.
with open(studyname, 'wb') as file:
    pickle.dump(study, file)

In [None]:
# Train with best parameters
params = {
    'drop_out_lstm': 0.2,
    'lstm_layers': 2,
    'hidden_size': 512,
    'freeze_emb': 0,
    'num_epochs': 40,
    'batch_size': 20,
    'decay_rate': 0.86,
    'lr': 0.0035,
    'weight_decay': 0.05,
}

model = buildModel(params)

num_epochs = params['num_epochs']
batch_size = params['batch_size']
decayRate = params['decay_rate']
lr = params['lr']
weight_decay = params['weight_decay']

# drop_last keeps every batch at exactly batch_size, matching the hidden state
# that trainModel creates for that size.
trainloader = torch.utils.data.DataLoader(trainSet,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          drop_last=True)
valloader = torch.utils.data.DataLoader(valSet,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        drop_last=True)

optimizer = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay=weight_decay)
lr_sched = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=decayRate)
loss_func = nn.CrossEntropyLoss()

bestModel, val_loss = trainModel(model,
                                 trainloader,
                                 valloader,
                                 num_epochs=num_epochs,
                                 batch_size=batch_size,
                                 loss_fn=loss_func,
                                 optim=optimizer,
                                 lr_scheduler = lr_sched,
                                 printresults=True,
                                 printsessionreport=True)

# Save model parameters and the testset for it.
# Context managers close both files even when pickling raises.
with open("testset", 'wb') as file:
    pickle.dump(testSet, file)

with open("model", 'wb') as file:
    pickle.dump(bestModel, file)

In [20]:
# Reload the saved state dict and test set.
# NOTE: pickle.load can execute arbitrary code; only load files written by this
# notebook. Context managers close the files even when unpickling fails.
with open("model", 'rb') as file:
    bestModel = pickle.load(file)

with open("testset", 'rb') as file:
    testSet = pickle.load(file)

In [None]:
# One sequence per batch for evaluation.
testloader = torch.utils.data.DataLoader(testSet, batch_size=1, shuffle=True, drop_last=True)

# Rebuild the architecture with the layout of the saved weights, then load them.
model = LSTMRNN(
    pretrained_emb=torch.from_numpy(embedding_matrix).float(),
    freeze_emb=0,
    vocab_size=vocab_size,
    drop_out=0.2,
    hidden_size=512,
    lstm_layers=2,
)
model.to(device)
model.load_state_dict(bestModel)

In [22]:
import random
# Generate one line
def makeline(model,text,linelength):
    """Greedily extend a prompt until `linelength` tokens or the end token.

    Args:
        model: trained model exposing `initHiddenLayer` and the usual forward.
        text: non-empty list of prompt tokens, all present in `word_to_idx`;
            generated tokens are appended to this same list.
        linelength: maximum total number of tokens in the returned line.
    Returns:
        `text`, extended in place with the generated tokens (including the
        end token when it was produced).
    """
    model.eval()
    h,c = model.initHiddenLayer(1)

    # The first step feeds the whole prompt. After that the hidden state already
    # encodes everything seen so far, so only the newest token is fed; feeding
    # the full prefix again (as before) made the LSTM see the context twice.
    pending = list(text)
    with torch.no_grad():
        for _ in range(len(text), linelength):
            x = torch.tensor([[word_to_idx[w] for w in pending]], device=h.device)
            y_pred, (h, c) = model(x, (h, c))
            # Greedy choice: argmax over the scores of the last position
            # (softmax is monotonic, so it does not change the argmax).
            next_idx = int(torch.argmax(y_pred[0][-1]))
            nextword = idx_to_word[next_idx]

            text.append(nextword)
            if(nextword==eostoken):
                break
            pending = [nextword]

    return text

In [None]:
# Wrapper to format input output
def makelyric(input, model, sentencemaxlength):
    """Prefix the prompt with the start token, generate a line and print both.

    Args:
        input: prompt text with whitespace-separated words.
        model: model passed through to makeline.
        sentencemaxlength: maximum token count passed through to makeline.
    """
    input = " ".join([sostoken, input])
    generated = makeline(model, input.split(), sentencemaxlength)
    print({'Prompt    ': input})
    print({'Generated:': " ".join(generated)})
    print("\n")

# Three sample prompts, each limited to 20 tokens.
for prompt in ("all i want is", "can i get a", "and i know what they"):
    makelyric(prompt, model, 20)

In [24]:
from nltk.translate.bleu_score import corpus_bleu

#Calculate BLEU on generated and reference sentences
def bleu(generated, target):
    """Corpus-level BLEU of generated sentences against one reference each.

    Args:
        generated: iterable of generated sentences (whitespace-separated strings).
        target: iterable of reference sentences, aligned with `generated`.
    Returns:
        The value returned by nltk's corpus_bleu.
    """
    hypotheses = [sentence.split() for sentence in generated]
    # corpus_bleu expects a list of references per hypothesis; here exactly one.
    references = [[sentence.split()] for sentence in target]
    return corpus_bleu(references, hypotheses)

In [25]:
# Run evaluation
def evaluate(testloader, model, promptlength, linelength=16):
    """Generate a continuation for every test sequence and pair it with its reference.

    Args:
        testloader: loader yielding (x, y) index batches with batch size 1.
        model: trained model passed through to makeline.
        promptlength: number of leading tokens of x used as the prompt.
        linelength: maximum total length handed to makeline (default 16, the
            value that used to be hard-coded).
    Returns:
        (gens, refs): two aligned lists of space-joined strings, both without
        the prompt tokens. Each reference is cut after its first end token.
    """
    model.eval()
    refs = []
    gens = []
    for xbatch, ybatch in testloader:
        # squeeze(0) removes only the batch axis; tolist() gives plain ints.
        x = [idx_to_word[token] for token in xbatch.squeeze(0).cpu().tolist()]
        y = [idx_to_word[token] for token in ybatch.squeeze(0).cpu().tolist()]

        # y is x shifted by one, so the full line is x plus the last token of y.
        refsentence = (x + [y[-1]])[promptlength:]

        # The slice is a copy, so makeline's in-place appends do not touch x.
        gensentence = makeline(model, x[:promptlength], linelength)[promptlength:]

        # Keep the reference only up to and including its first end token.
        if eostoken in refsentence:
            refsentence = refsentence[:refsentence.index(eostoken)+1]

        refs.append(" ".join(refsentence))
        gens.append(" ".join(gensentence))
    return gens, refs

In [48]:
# One evaluation run per prompt length from 2 to 7, in ascending order.
p2, p3, p4, p5, p6, p7 = (
    evaluate(testloader, model, promptlength) for promptlength in range(2, 8)
)

In [49]:
# Collect the runs (prompt lengths 2..7) and store them on disk.
p = [p2,p3,p4,p5,p6,p7]
# The context manager closes the file even if pickling fails.
with open("evaluations", 'wb') as file:
    pickle.dump(p, file)

In [36]:
# Reload the stored evaluation runs.
# NOTE: pickle.load can execute arbitrary code; only load files written by this
# notebook. The context manager closes the file even when unpickling fails.
with open("evaluations", 'rb') as file:
    p = pickle.load(file)

In [37]:
#Calculate BLEU
# Each entry of p is a (generated, reference) pair for prompt lengths 2..7.
b2, b3, b4, b5, b6, b7 = (bleu(run[0], run[1]) for run in p[:6])

In [38]:
def plotScores(bleuscores, roguescores, promptlengths):
    """Plot BLEU and ROUGE scores against prompt length.

    Args:
        bleuscores: BLEU score per prompt length.
        roguescores: ROUGE score per prompt length (parameter name kept as is
            for existing callers).
        promptlengths: prompt lengths, same length as both score lists.
    Returns:
        Always 0.
    Raises:
        AssertionError: if the three sequences differ in length.
    """
    assert(len(bleuscores)==len(promptlengths))
    assert(len(roguescores)==len(bleuscores))
    plt.plot(promptlengths, bleuscores, label='BLEU')
    # The legend previously read 'ROGUE'; the metric is called ROUGE.
    plt.plot(promptlengths, roguescores, label='ROUGE')

    plt.title('Scores')
    plt.xlabel('Prompt length')
    plt.ylabel('Score')

    # One tick per prompt length; scores are shown on a 0..1 scale.
    plt.xticks(arange(promptlengths[0], promptlengths[-1]+1, 1))
    plt.yticks(arange(0, 1.1, 0.1))

    plt.legend(loc='best')
    plt.show()

    return 0

In [39]:
# Taken directly from https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1

import itertools
def _split_into_words(sentences):
  """Splits multiple sentences into words and flattens the result"""
  return list(itertools.chain(*[_.split(" ") for _ in sentences]))

#supporting function
def _get_word_ngrams(n, sentences):
  """Return the set of word n-grams over all sentences taken together.

  The sentences are first flattened into one word list, so n-grams may span
  sentence boundaries. Both arguments are checked with assertions.
  """
  assert len(sentences) > 0
  assert n > 0

  return _get_ngrams(n, _split_into_words(sentences))

#supporting function
def _get_ngrams(n, text):
  """Calcualtes n-grams.
  Args:
    n: which n-grams to calculate
    text: An array of tokens
  Returns:
    A set of n-grams
  """
  ngram_set = set()
  text_length = len(text)
  max_index_ngram_start = text_length - n
  for i in range(max_index_ngram_start + 1):
    ngram_set.add(tuple(text[i:i + n]))
  return ngram_set

def rouge_n(reference_sentences, evaluated_sentences, n=2):
  """
  Computes the ROUGE-N recall of two text collections of sentences.
  Source: http://research.microsoft.com/en-us/um/people/cyl/download/
  papers/rouge-working-note-v1.3.1.pdf

  Both collections are reduced to their sets of distinct n-grams (built over
  all sentences of a collection together); recall is the share of reference
  n-grams that also occur in the evaluated n-grams.

  Args:
    reference_sentences: The sentences from the reference set
    evaluated_sentences: The sentences that are being scored
    n: Size of ngram.  Defaults to 2.
  Returns:
    recall rouge score (float); 0.0 when the reference has no n-grams
  Raises:
    ValueError: raises exception if a param has len <= 0
  """
  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
    raise ValueError("Collections must contain at least 1 sentence.")

  evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
  reference_ngrams = _get_word_ngrams(n, reference_sentences)

  # Precision and F1 used to be computed here but were never returned; only
  # recall is part of this function's contract.
  if not reference_ngrams:
    return 0.0

  overlapping_count = len(evaluated_ngrams & reference_ngrams)
  return overlapping_count / len(reference_ngrams)

# Compute scores
# rouge_n takes the references first; each entry of p is (generated, reference).
r2, r3, r4, r5, r6, r7 = (rouge_n(run[1], run[0]) for run in p[:6])

In [None]:
# Plot both scores
plotScores(
    [b2, b3, b4, b5, b6, b7],
    [r2, r3, r4, r5, r6, r7],
    list(range(2, 8)),
)
# Last expression: show the BLEU scores as the cell output.
[b2, b3, b4, b5, b6, b7]