In [1]:
import torch
import numpy as np
from datasets import load_dataset
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
batch_size = 256

# Creating vocabulory

In [3]:
n = 3
embedding_dims = 10 # how many dimensional vector should represent each word in the vocabulory

In [4]:
ds = load_dataset("MuskumPillerum/General-Knowledge")
df = ds['train'][:10]

In [5]:
def tokenize(sentence):
    data = sentence
    split_terms = [',', '.', '!', '?', '(', ')', '&', '$', '+', '-', '/', '*', ';', ':']
    for split_term in split_terms:
        if split_term in sentence:
            data = data.replace(split_term, f' {split_term} ')
    data = data.split()
    return data

tokenize('what is your !! ! name ?    ')

['what', 'is', 'your', '!', '!', '!', 'name', '?']

In [6]:
vocab_list = set()
X = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'Question: {x} Answer: {y}'
    data = data.lower().replace('\\n', '')
    vocab_list.update(tokenize(data))
    X.append(data)

In [7]:
vocab_list.add('<UNK>')

In [8]:
vocab = {v:i for v,i in zip(vocab_list, range(0, len(vocab_list)+1))}
len(vocab)

141

In [9]:
vocab

{'such': 0,
 'typically': 1,
 'as': 2,
 '-': 3,
 'us': 4,
 'question': 5,
 'refers': 6,
 'speech': 7,
 'and': 8,
 'algorithm': 9,
 'over': 10,
 'its': 11,
 'form': 12,
 'deep': 13,
 'systems': 14,
 'labeled': 15,
 'tasks': 16,
 'receives': 17,
 'general': 18,
 'make': 19,
 'subset': 20,
 'based': 21,
 'network': 22,
 'natural': 23,
 'two': 24,
 'answer': 25,
 'task': 26,
 'many': 27,
 'perform': 28,
 'a': 29,
 'computers': 30,
 'maximize': 31,
 'process': 32,
 'ai': 33,
 'around': 34,
 'while': 35,
 'consists': 36,
 'feedback': 37,
 'human': 38,
 'narrow': 39,
 'information': 40,
 '<UNK>': 41,
 'data': 42,
 'signal': 43,
 'what': 44,
 'decision': 45,
 'interconnected': 46,
 'development': 47,
 'that': 48,
 'is': 49,
 'computing': 50,
 'it': 51,
 'require': 52,
 'learns': 53,
 'an': 54,
 'translation': 55,
 'way': 56,
 'world': 57,
 'without': 58,
 'neural': 59,
 '.': 60,
 'on': 61,
 'given': 62,
 'recognition': 63,
 'various': 64,
 'vision': 65,
 'where': 66,
 '?': 67,
 'by': 68,
 'act

In [10]:
embedding_matrix = torch.randn(len(vocab),embedding_dims)
embedding_matrix.shape

torch.Size([141, 10])

In [11]:
embedding_matrix[[1,2,3]]

tensor([[ 0.4601, -1.0276, -0.7393, -1.4246,  1.2579, -0.0627, -0.6399, -0.6654,
          0.8230,  0.0858],
        [ 1.1210, -0.1327,  1.7362, -0.8127,  1.5506, -0.7988, -2.0659, -0.1187,
         -0.7267, -1.2480],
        [ 2.1399, -1.2401, -0.5483,  0.2006, -0.2095,  0.7800, -1.6163, -0.0790,
          0.0962, -1.1288]])

In [12]:
def get_word_embedding(word, vocab=vocab, embedding_matrix=embedding_matrix):
    if word not in vocab:
        word = '<UNK>'
    embedding = embedding_matrix[vocab[word]]
    return embedding

get_word_embedding('as')

tensor([ 1.1210, -0.1327,  1.7362, -0.8127,  1.5506, -0.7988, -2.0659, -0.1187,
        -0.7267, -1.2480])

# Creating the model

In [13]:
from print_color import print

In [14]:
class NeuralNetwork(nn.Module):
    def __init__(self, n=n, num_hidden_layer=1024, vocab_len = len(vocab_list), dim_embedding=10):
        super(NeuralNetwork, self).__init__()
        self.n = n
        self.dim_embedding = dim_embedding
        self.embedding = nn.Embedding(vocab_len, dim_embedding).to(device)

        self.hidden_layer = nn.Linear((n-1)*dim_embedding, num_hidden_layer)
        self.relu = nn.ReLU()
        self.output = nn.Linear(num_hidden_layer, vocab_len)
        # self.softmax = nn.Softmax(dim=1)

    def forward(self,x):
        # x will be the indices of embedding representing the input words
        x_embeddings = self.embedding(x).view(-1,(self.n-1)*self.dim_embedding).to(device)
        out = self.hidden_layer(x_embeddings)
        out = self.relu(out)
        out = self.output(out)
        return out

model = NeuralNetwork(n=3).to(device)

# Creating dataset

In [44]:
dataset = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'Question: {x} Answer: {y}'.lower()
    tokenized_data = tokenize(data)
    for i in range(len(tokenized_data)-n):
        # print(i)
        data_i = tokenized_data[i:i+n]
        dataset.append([vocab[i] if i in vocab else vocab['<UNK>'] for i in data_i])
    # print()

dataset_np = np.array(dataset)
dataset_np.shape

(419, 3)

In [45]:
dataset

[[5, 83, 44],
 [83, 44, 49],
 [44, 49, 129],
 [49, 129, 85],
 [129, 85, 67],
 [85, 67, 25],
 [67, 25, 83],
 [25, 83, 129],
 [83, 129, 85],
 [129, 85, 6],
 [85, 6, 110],
 [6, 110, 100],
 [110, 100, 47],
 [100, 47, 125],
 [47, 125, 136],
 [125, 136, 14],
 [136, 14, 48],
 [14, 48, 91],
 [48, 91, 28],
 [91, 28, 16],
 [28, 16, 48],
 [16, 48, 118],
 [48, 118, 1],
 [118, 1, 52],
 [1, 52, 38],
 [52, 38, 85],
 [38, 85, 114],
 [85, 114, 0],
 [114, 0, 2],
 [0, 2, 95],
 [2, 95, 74],
 [95, 74, 114],
 [74, 114, 7],
 [114, 7, 63],
 [7, 63, 114],
 [63, 114, 45],
 [114, 45, 3],
 [45, 3, 132],
 [3, 132, 114],
 [132, 114, 8],
 [114, 8, 94],
 [8, 94, 55],
 [94, 55, 60],
 [5, 83, 44],
 [83, 44, 73],
 [44, 73, 100],
 [73, 100, 24],
 [100, 24, 88],
 [24, 88, 122],
 [88, 122, 125],
 [122, 125, 129],
 [125, 129, 85],
 [129, 85, 67],
 [85, 67, 25],
 [67, 25, 83],
 [25, 83, 100],
 [83, 100, 24],
 [100, 24, 88],
 [24, 88, 122],
 [88, 122, 125],
 [122, 125, 129],
 [125, 129, 85],
 [129, 85, 73],
 [85, 73, 39],
 [7

In [16]:
class QuestionAnswerDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.x = dataset[:,[i for i in range(n-1)]]
        self.y = dataset[:,-1]
        self.m, self.n = self.x.shape
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.m
    
dataset = QuestionAnswerDataset(dataset=dataset_np)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Training

In [17]:
num_epochs = 100
lr = 0.01
print(device)

cuda[0m


In [18]:
model = model.to(device)
model = torch.compile(model)

In [19]:
lossCategory = nn.CrossEntropyLoss()
optimiser = torch.optim.SGD(model.parameters(), lr=lr)

In [20]:
for epoch in range(num_epochs):
    for i, (input, label) in enumerate(dataloader):
        x = input.to(device)
        y = label.to(device)
        y_pred = model(x)
        loss = lossCategory(y_pred, y)
        if epoch%10==0 and i==0: 
            # print(epoch)
            print(loss.item())
        loss.backward()
        optimiser.step()
        optimiser.zero_grad()




4.981821060180664[0m
4.734204292297363[0m
4.493097305297852[0m
4.2728271484375[0m
4.112992286682129[0m
3.974346160888672[0m
3.8212594985961914[0m
3.779200553894043[0m
3.601888418197632[0m
3.5059938430786133[0m


# Inference

In [28]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [33]:
vocab['is']

49

In [53]:
text = 'What is this?'
tokenized_text = tokenize(text.lower())
embedding_indices = torch.tensor(np.array([vocab[word] for word in tokenized_text])).to(device)
embedding_indices

tensor([44, 49, 77, 67], device='cuda:0')

In [56]:
embedding_indices

tensor([44, 49, 77, 67], device='cuda:0')

In [55]:
model(embedding_indices)

tensor([[-5.4498e-01, -7.1713e-01, -4.2650e-01,  4.3660e-01, -2.8130e-01,
         -1.2373e+00, -3.9052e-01, -4.0208e-01,  1.9653e-01,  6.2154e-01,
         -6.2180e-01, -1.4936e-01, -5.1768e-01,  7.0551e-01, -3.2258e-01,
         -4.7212e-01, -1.3177e-01, -5.7923e-01, -3.6617e-01, -2.7128e-01,
          7.0688e-02, -5.3920e-01,  5.7039e-01,  3.0758e-01, -1.4865e-01,
          5.4150e-01, -5.4369e-01, -1.3179e-01, -5.0321e-01,  3.1005e+00,
          5.0916e-02, -3.7448e-01, -3.5911e-03, -1.8104e-01, -6.0373e-01,
         -2.1384e-01, -5.1528e-01,  4.8394e-02,  6.1310e-01, -3.3543e-01,
         -7.0480e-02, -4.2851e-01,  9.7074e-01, -3.4443e-01, -1.9998e-01,
         -1.5703e-01, -3.3473e-01,  2.5693e-01,  1.5371e+00,  1.9967e+00,
         -4.3060e-01, -5.8848e-01, -4.9204e-01,  2.6934e-02,  1.8756e+00,
         -5.3164e-01, -2.8458e-01, -4.4690e-01, -4.1385e-01,  3.2349e-01,
          1.3680e+00,  5.8576e-01, -5.1179e-01, -4.8101e-01, -2.5816e-01,
         -3.6412e-01, -1.1374e-01,  1.

In [58]:
model.embedding

Embedding(141, 10)

In [22]:
# TO DO
# normalisation, standardise, proper init, weight decay, hyperparameter tunign, diff optimiser test, inference, earlyy stopping
# add direct connections, see if it is useful
# plot unigram, bi, tri, etc perplexity score, word error rate, etc