In [1]:
import torch
import numpy as np
from datasets import load_dataset
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

from warnings import filterwarnings
filterwarnings('ignore')

import torch._dynamo
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision('high')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 500
n = 25
embedding_dims = 10 # how many dimensional vector should represent each word in the vocabulory
num_epochs = 100
lr = 0.1
lr_coeff = 0.1
max_words = 30
train_data = 10000
print(device)

cuda


# Creating vocabulory

In [3]:
ds = load_dataset("MuskumPillerum/General-Knowledge")
df = ds['train'][:train_data]
ds.shape

{'train': (37635, 2)}

In [4]:
def tokenize(sentence):
    data = sentence
    split_terms = [',', '.', '!', '?', '(', ')', '&', '$', '+', '-', '/', '*', ';', ':']
    for split_term in split_terms:
        if split_term in sentence:
            data = data.replace(split_term, f' {split_term} ')
    data = data.split()
    return data

tokenize('what is your !! ! name ?    ')

['what', 'is', 'your', '!', '!', '!', 'name', '?']

In [5]:
vocab_list = set()
X = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'Question: {x} Answer: {y}'
    data = data.lower().replace('\\n', '')
    vocab_list.update(tokenize(data))
    X.append(data)

In [6]:
vocab_list.add('<PAD>')
vocab_list.add('<start>')
vocab_list.add('<end>')
vocab = {v:i for v,i in zip(vocab_list, range(1, len(vocab_list)+1))}
vocab['<UNK>'] = 0
len(vocab)

23447

In [7]:
embedding_matrix = torch.randn(len(vocab),embedding_dims)
embedding_matrix.shape

torch.Size([23447, 10])

In [8]:
def get_word_embedding(word, vocab=vocab, embedding_matrix=embedding_matrix):
    if word not in vocab:
        word = '<UNK>'
    embedding = embedding_matrix[vocab[word]]
    return embedding

get_word_embedding('as')

tensor([-1.1711,  1.0565, -1.7252, -1.8323, -0.7994,  0.2763, -0.2164, -1.1071,
        -0.5422, -0.9592])

# Creating the model

In [9]:
class NeuralNetwork(nn.Module):
    def __init__(self, n=n, num_hidden_layer=1024, vocab = vocab, dim_embedding=10):
        super(NeuralNetwork, self).__init__()
        self.rev_vocab = {v:k for k,v in vocab.items()}
        vocab_len = len(vocab)
        self.n = n
        self.dim_embedding = dim_embedding
        self.embedding = nn.Embedding(vocab_len, dim_embedding).to(device)

        self.hidden_layer = nn.Linear((n-1)*dim_embedding, num_hidden_layer)
        self.relu = nn.ReLU()
        self.output = nn.Linear(num_hidden_layer, vocab_len)

    def forward(self,x):
        # x will be the indices of embedding representing the input words
        x = torch.tensor(x).to(device)
        # x = x[-(n-1):]
        # print(x.shape)
        x_embeddings = self.embedding(x).view(-1,(self.n-1)*self.dim_embedding).to(device)
        out = self.hidden_layer(x_embeddings)
        out = self.relu(out)
        out = self.output(out)
        # print(out.shape)
        return out
    

    def generate(self, x):
        x = torch.tensor(x).to(device)

        # print(x.shape)
        # assert x.shape == 0 # fix shape here, and write a loop to do autoregressive text gen
        x = x[-(n-1):]
        # assert len(x) == self.n - 1
        logits = self.forward(x)
        probs = torch.softmax(logits, dim=1)
        prob, predicted_token_index = torch.max(probs, dim=1)
        # print(self.embedding(x).shape)  # Shape before reshaping
        # print((self.n-1)*self.dim_embedding)  # Expected shape size

        return predicted_token_index

model = NeuralNetwork(n=n).to(device)

# Creating dataset

In [10]:
def split_sentece_to_n_grams(tokenized_sentence, n, vocab):
    res = []
    data_init = ['<PAD>' for i in range(n-1)]
    for word in tokenized_sentence:
        data_init.append(word)
        sentence = data_init[-n:]
        # print(sentence)
        res.append([vocab[word] if word in vocab else vocab['<UNK>'] for word in sentence])
    return res

split_sentece_to_n_grams(['what', 'is', 'your'], 9 ,vocab)

[[10428, 10428, 10428, 10428, 10428, 10428, 10428, 10428, 12159],
 [10428, 10428, 10428, 10428, 10428, 10428, 10428, 12159, 8808],
 [10428, 10428, 10428, 10428, 10428, 10428, 12159, 8808, 786]]

In [11]:
dataset = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'<start> Question: {x} Answer: {y} <end>'.lower()
    tokenized_data = tokenize(data)
    dataset.extend(split_sentece_to_n_grams(tokenized_data, n, vocab))

dataset_np = np.array(dataset)

In [13]:
class QuestionAnswerDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.x = dataset[:,[i for i in range(n-1)]]
        self.y = dataset[:,-1]
        self.m, self.n = self.x.shape
        self.standardize()
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.m

    def standardize(self):
        scaler = StandardScaler()
        self.dataset = scaler.fit_transform(self.dataset) 
dataset = QuestionAnswerDataset(dataset=dataset_np)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Training

In [14]:
model = model.to(device)
model = torch.compile(model)
lossCategory = nn.CrossEntropyLoss()
optimiser = torch.optim.SGD(model.parameters(), lr=lr)


In [15]:
for epoch in range(num_epochs):
    for i, (inp, label) in enumerate(dataloader):
        x = inp.to(device)
        y = label.to(device)
        y_pred = model(x)
        loss = lossCategory(y_pred, y)
        if epoch%10==0 and i==0: 
            print(f'Epoch {epoch} Loss: {loss.item()}')
        lr = lr/(1+lr_coeff*epoch)
        loss.backward()
        optimiser.step()
        optimiser.zero_grad() # The loss has to go below .5 for the model to be good


Epoch 0 Loss: 10.081502914428711
Epoch 10 Loss: 4.284249782562256
Epoch 20 Loss: 3.6533846855163574
Epoch 30 Loss: 2.7321884632110596
Epoch 40 Loss: 2.239819288253784
Epoch 50 Loss: 1.6325013637542725
Epoch 60 Loss: 1.5693230628967285
Epoch 70 Loss: 1.5376567840576172
Epoch 80 Loss: 1.6682907342910767
Epoch 90 Loss: 1.5232880115509033


# Inference

In [16]:
vocab_rev = {v:k for k,v in vocab.items()}

In [32]:
text = '<start> Question: what is deep learning?'


In [33]:
tokenized_text = tokenize(text.lower())
embedding_indices = np.array(split_sentece_to_n_grams(tokenized_text, n, vocab))
gen_word = ''
data_stream = list(embedding_indices[-1])
words_generated = 0
print(text.lower(), end = ' ')
while gen_word != vocab['<end>'] and words_generated <= max_words:
    data_stream = data_stream[-(n-1):]
    gen_word = model.generate(data_stream)
    words_generated += 1
    data_stream.append(gen_word.item())
    
    print(vocab_rev[gen_word.item()], end= ' ')



<start> question: what is deep learning? answer : an electrophile is the theory of an issue <UNK> <UNK> ( <UNK> <UNK> <UNK> <UNK> <UNK> function <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> anagrams <UNK> <UNK> <UNK> <UNK> <UNK> 

In [20]:
# TO DO
# proper init, weight decay, hyperparameter tunign, diff optimiser test
# add direct connections, see if it is useful
# plot unigram, bi, tri, etc perplexity score, word error rate, etc