In [318]:
import tensorflow as tf
tf.config.experimental.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [329]:
import torch
import spacy
import pandas as pd
import numpy as np
from collections import Counter
from Token import Clean
from Token import Tokenise,paddingString
from nltk.util import ngrams
from torch.utils.data import DataLoader
import argparse

filePath = '../DATA/Pride and Prejudice - Jane Austen.txt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
class Dataset(torch.utils.data.Dataset):
    def __init__(self, train_data, batch_size, min_freq=5):
        self.data = train_data
        self.max_len = 30
        self.min_freq = min_freq
        self.vocab = []
        self.batch_size = batch_size
        self.ngramList = []
        sents = self.loadingWords()
        self.wordToIndex = {w: i for i, w in enumerate(self.vocab)}
        self.indexToWord = {i: w for i, w in enumerate(self.vocab)}
        self.padIndex = self.wordToIndex['<PAD>']
        self.unKnownIndex = self.wordToIndex['<UNK>']
        self.startIndex = self.wordToIndex['<START>']
        self.endIndex = self.wordToIndex['<END>']
        for sent in sents:
            tokens = sent
            prefix_seqs = []
            try:
                pfx = [tokens[0]]
                for token in tokens[1:]:
                    pfx.append(token)
                    prefix_seqs.append(pfx.copy())
            
                for i in range(len(prefix_seqs)):
                    currSeq = [self.wordToIndex.get(w,self.unKnownIndex) for w in prefix_seqs[i]]
                    pref_sq = [self.startIndex] + [self.padIndex]*(self.max_len-len(currSeq)) + [w for w in currSeq]
                    self.ngramList.append(list(pref_sq))
            except IndexError:
                continue

    def loadingWords(self):
        text = [line for line in self.data if line.strip()]
        sentences = []
        wordFreq = {}
        mx = 0
        for line in text:
            pd = paddingString(line)
            tokens = Tokenise(pd)
            sentences.append(tokens)
            self.vocab += tokens
            mx = max(mx, len(tokens))
            for token in tokens:
                if token in wordFreq:
                    wordFreq[token] += 1
                else:
                    wordFreq[token] = 1

        # wordCount = Counter(wordFreq)
        wordCount = {}
        self.vocab = list(filter(lambda w: wordFreq[w] >= self.min_freq, self.vocab))
        print(wordFreq['kind'])
        self.vocab = ['<PAD>', '<UNK>', '<START>', '<END>'] + self.vocab
        self.vocab = set(self.vocab)
        self.vocabSize = len(self.vocab)
        print(self.vocabSize)
        self.max_len = max(mx,self.max_len)
        # print(sentences)
        return sentences



cuda


In [297]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [330]:
def splitData(corpus,train_ratio,valid_ratio,test_ratio):
    with open(corpus, 'r') as f:
        text = f.readlines()
    text = [line.strip() for line in text if line.strip()]
    train_size = int(len(text) * train_ratio)
    valid_size = int(len(text) * valid_ratio)
    test_size = int(len(text) * test_ratio)
    train_data = text[:train_size]
    valid_data = text[train_size:train_size + valid_size]
    test_data = text[train_size + valid_size:]
    return train_data, valid_data, test_data

train_data, valid_data, test_data = splitData(filePath,0.7,0.15,0.15)
TRAINSET = Dataset(train_data, 256)
VLAIDSET = Dataset(valid_data, 256)
TESTSET = Dataset(test_data, 256)
#TRAINSET.wordToIndex['king']

40
1700
15
518
9
525


In [331]:
def generateBatch(dataset):
    input_ngram,trg = [],[]
    for tg in dataset:
        input_ngram.append(tg[:-1])
        trg.append(tg[-1])
    return torch.tensor(input_ngram,dtype=torch.long),torch.tensor(trg,dtype=torch.long)

train_loader = DataLoader(TRAINSET.ngramList, batch_size=256, shuffle=True, collate_fn=generateBatch)
valid_loader = DataLoader(VLAIDSET.ngramList, batch_size=256, shuffle=True, collate_fn=generateBatch)
test_loader = DataLoader(TESTSET.ngramList, batch_size=256, shuffle=True, collate_fn=generateBatch)


In [332]:
import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}")

class LSTMmodel(nn.Module):
    def __init__(self, embedding_size, hidden_size, num_layers, vocabSize,dropout):
        super(LSTMmodel, self).__init__()
        self.vocab_size = vocabSize
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(
            self.vocab_size, self.embedding_size, device=device)
        self.lstm = nn.LSTM(input_size=self.embedding_size,
                            hidden_size=self.hidden_size, batch_first=True, device=device)
        self.dropLayer = nn.Dropout(p=self.dropout)
        self.output = nn.Linear(
            self.hidden_size, self.vocab_size, bias=False, device=device)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, xContext):
        xembed = self.dropLayer(self.embedding(xContext))
        out, hidden = self.lstm(xembed)
        out = self.log_softmax(self.output(out[:,-1]))
        #out = self.output(out[:,-1])
        return out, hidden



Using device cuda


In [333]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
# from dataset import Dataset
from torch.utils.data import DataLoader
# from Model import LSTMmodel
import sys
# from Token import Clean
# from Token import Tokenise

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device {device}")


class Evaluation:
    def __init__(self, model:nn.Module,epochs,datasetTrain:torch.utils.data.DataLoader,datasetValid:torch.utils.data.DataLoader,datasetTest:torch.utils.data.DataLoader):
        self.model = model
        self.datasetTrain = datasetTrain
        self.datasetValid = datasetValid
        self.datasetTest = datasetTest
        self.criterion = nn.NLLLoss()
        self.epochs = epochs
        self.clip = 1
        self.patience = 10
        self.learning_rate = 1e-3
        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=self.learning_rate,amsgrad = True)
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer, step_size=6, gamma=0.1,last_epoch=-1,verbose=False)

    def train(self):
        maxValidLoss = math.inf
        ctr = 0

        for epoch in range(self.epochs):
            epochAcc = 0
            epochLoss = 0
            self.model.train()
            # hidden = self.model.init_hidden(24)
            for i, (x, y) in enumerate(tqdm(self.datasetTrain)):
                x = x.to(device)
                y = y.to(device)
                self.optimizer.zero_grad()
                outputs, hidden = self.model(x)
                y = y.view(-1)
                loss = self.criterion(outputs, y)
                loss.backward()

                epochAcc += 100*(outputs.argmax(dim=1)==y).sum().item()/y.shape[0]
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
                epochLoss += loss.item()
                self.optimizer.step()
                if i % 100 == 0:
                    print(
                        f"Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}")
                    
            print(f"Epoch: {epoch}, Loss: {epochLoss/len(self.datasetTrain)}, Accuracy: {epochAcc/len(self.datasetTrain)}")
            # validate
            valid_loss = self.validate()
            self.scheduler.step()
            if valid_loss < maxValidLoss:
                maxValidLoss = valid_loss
                torch.save(self.model.state_dict(), 'model2.pt')
                print("Model saved")
                ctr = 0
            else:
                ctr += 1
                print(f"Validation loss not improved for {ctr} epochs")
            if ctr > self.patience:
                print("Early stopping")
                break

    def validate(self):
        self.model.eval()
        epochAcc = 0
        epochLoss = 0
        for i, (x, y) in enumerate(tqdm(self.datasetValid)):
            x = x.to(device)
            y = y.to(device)
            outputs, hidden = self.model(x)
            y = y.view(-1)
            loss = self.criterion(outputs, y)
            epochAcc += 100*(outputs.argmax(dim=1)==y).sum().item()/y.shape[0]
            epochLoss += loss.item()

        print(f"Validation Loss: {epochLoss/len(self.datasetValid)}, Validation Accuracy: {epochAcc/len(self.datasetValid)}")
        return epochLoss/len(self.datasetValid)
    
    def test(self):
        self.model.eval()
        epochAcc = 0
        epochLoss = 0
        for i, (x, y) in enumerate(tqdm(self.datasetTest)):
            x = x.to(device)
            y = y.to(device)
            outputs, hidden = self.model(x)
            y = y.view(-1)
            loss = self.criterion(outputs, y)
            epochAcc += 100*(outputs.argmax(dim=1)==y).sum().item()/y.shape[0]
            epochLoss += loss.item()

        print(f"Test Loss: {epochLoss/len(self.datasetTest)}, Test Accuracy: {epochAcc/len(self.datasetTest)}")
        return epochLoss/len(self.datasetTest)





Using device cuda


In [334]:
VOCAB_SIZE = TRAINSET.vocabSize
EMBEDDING_DIM = 512
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROP_OUT = 0.5
BATCH_SIZE = 256

lngMOD = LSTMmodel(EMBEDDING_DIM,HIDDEN_DIM,NUM_LAYERS, VOCAB_SIZE, DROP_OUT)
eval = Evaluation(lngMOD,20,train_loader,valid_loader,test_loader)


In [335]:
eval.train()

  0%|                                                   | 0/373 [00:00<?, ?it/s]


RuntimeError: CUDA error: unspecified launch failure

In [282]:
eval.test()

100%|█████████████████████████████████████████| 157/157 [00:02<00:00, 53.80it/s]

Test Loss: 7.395227067789454, Test Accuracy: 14.464143539581437





7.395227067789454

In [290]:
globalVocab = TRAINSET.vocab
globalWordToIndex = TRAINSET.wordToIndex
PAD_INDEX = TRAINSET.padIndex
START_INDEX = TRAINSET.startIndex
UNK_INDEX = TRAINSET.unKnownIndex
MAX_LEN = TRAINSET.max_len

def getProbPerplexity(model,dataset):
    model.eval()
    perplexity_list = []
    with torch.no_grad():
        for line in dataset.data:
            perplexity = perpForSentence(model,line)
            if perplexity != -1:
            	perplexity_list.append({'line':line,'perplexity':perplexity})
            
    # averagePerplexty
    avgPerplexity = sum([line['perplexity'] for line in perplexity_list])/len(perplexity_list)
    return perplexity_list,avgPerplexity

def writeToFile(filePath,perplexity_list,avg):
    with open(filePath,'w') as f:
        f.write(f"Average Perplexity: {avg}\n")
        for line in perplexity_list:
            f.write(f"{line['line']}\t {line['perplexity']}\n")
            
            
def perpForSentence(model,sentence):
    model.eval()
    with torch.no_grad():
        prob_gram = 1
        tokens = Tokenise(sentence)
        print(tokens)
        if len(tokens)==0:
            return -1
        elif len(tokens) == 1:  # handle unigram case
            input_gram = torch.tensor([globalWordToIndex.get(tokens[0],UNK_INDEX)],dtype=torch.long).to(device)
            output,hidden = model(input_gram.unsqueeze(dim=0))
            output = torch.exp(output.view(-1))
            prob_gram = prob_gram*output[input_gram.item()].cpu().numpy()
            perplexity = (1/prob_gram)**(1/len(tokens))
            return perplexity
        elif len(tokens)>1:
            prefix_seqs = []
            gramList = []
            try:
                pfx = [tokens[0]]
                #prefix_seqs.append(pfx.copy())
                for token in tokens[1:]:
                    pfx.append(token)
                    prefix_seqs.append(pfx.copy())
                for i in range(len(prefix_seqs)):
                    currSeq = [globalWordToIndex.get(w,UNK_INDEX) for w in prefix_seqs[i]]
                    pref_sq = [START_INDEX] + [PAD_INDEX]*(MAX_LEN-len(currSeq)) + [w for w in currSeq]
                    gramList.append(list(pref_sq))
            except IndexError:
                print(tokens)
                print("idhr 2")
                return float("NaN")

            if len(gramList)>0:
                for gram in gramList:
                    input_gram = torch.tensor(gram[:-1],dtype=torch.long).to(device)
                    output_gram = gram[-1]
                    output,hidden = model(input_gram.unsqueeze(dim=0))
                    output = torch.exp(output.view(-1))
                    prob_gram = prob_gram * output[output_gram].cpu().numpy()

                    perplexity = (1/prob_gram)**(1/len(gramList))
                return perplexity
            else:
                print("idhr")
                return float("NaN")


In [285]:


perplexity_list, avgPerplexity = getProbPerplexity(lngMOD,TESTSET)
path = 'test2_perplexity.txt'
writeToFile(path,perplexity_list,avgPerplexity)

In [294]:
str1 = "—How long is Haines going to stay in this tower?"
str2 = "warrior"

print(perpForSentence(lngMOD,str1))
print(perpForSentence(lngMOD,str2))

['how', 'long', 'is', 'haines', 'going', 'to', 'stay', 'in', 'this', 'tower']
90.7964180689187
['warrior']
56.674560294432325


In [142]:
MAX_LEN = TRAINSET.max_len
MAX_LEN

30