In [1]:
import torch
import numpy as np
from datasets import load_dataset
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
batch_size = 256

# Creating vocabulory

In [3]:
n = 5
embedding_dims = 10 # how many dimensional vector should represent each word in the vocabulory

In [4]:
ds = load_dataset("MuskumPillerum/General-Knowledge")
df = ds['train'][:10]

In [5]:
def tokenize(sentence):
    data = sentence
    split_terms = [',', '.', '!', '?', '(', ')', '&', '$', '+', '-', '/', '*', ';', ':']
    for split_term in split_terms:
        if split_term in sentence:
            data = data.replace(split_term, f' {split_term} ')
    data = data.split()
    return data

tokenize('what is your !! ! name ?    ')

['what', 'is', 'your', '!', '!', '!', 'name', '?']

In [6]:
vocab_list = set()
X = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'<s> Question: {x} Answer: {y} </s>'
    data = data.lower().replace('\\n', '')
    vocab_list.update(tokenize(data))
    X.append(data)

In [7]:
vocab_list.add('<UNK>')

In [8]:
vocab = {v:i for v,i in zip(vocab_list, range(0, len(vocab_list)+1))}
len(vocab)

145

In [9]:
vocab

{'algorithms': 0,
 'analyze': 1,
 'refers': 2,
 'information': 3,
 'this': 4,
 'without': 5,
 'learning': 6,
 'a': 7,
 'reward': 8,
 'and': 9,
 'main': 10,
 'speech': 11,
 'predictions': 12,
 'processing': 13,
 'signal': 14,
 'ability': 15,
 'maximize': 16,
 'transmit': 17,
 'environment': 18,
 'aims': 19,
 'answer': 20,
 'to': 21,
 'such': 22,
 'network': 23,
 'its': 24,
 'would': 25,
 'focuses': 26,
 'by': 27,
 'based': 28,
 'learn': 29,
 '-': 30,
 'patterns': 31,
 'question': 32,
 '<UNK>': 33,
 'designed': 34,
 'decision': 35,
 'subset': 36,
 'interconnected': 37,
 'feedback': 38,
 'using': 39,
 'data': 40,
 'allows': 41,
 'tasks': 42,
 'labeled': 43,
 'narrow': 44,
 'typically': 45,
 'being': 46,
 'examples': 47,
 'development': 48,
 '/': 49,
 'ai': 50,
 'interpret': 51,
 'brain': 52,
 'form': 53,
 'deep': 54,
 'penalty': 55,
 'intelligence': 56,
 '?': 57,
 'relationships': 58,
 'vision': 59,
 'an': 60,
 'system': 61,
 'structure': 62,
 'learns': 63,
 'process': 64,
 'generate': 65

In [10]:
embedding_matrix = torch.randn(len(vocab),embedding_dims)
embedding_matrix.shape

torch.Size([145, 10])

In [11]:
embedding_matrix[[1,2,3]]

tensor([[-0.2772, -0.3281,  0.3344, -0.6267,  1.0277,  0.3134,  0.0751, -1.0227,
         -1.3620,  0.6619],
        [ 2.1767, -0.4258,  0.3887,  1.1047,  0.0862,  0.7653,  0.6838, -0.3058,
          0.0825,  1.4358],
        [-0.0103, -0.9212, -0.9862, -1.1666,  0.0087, -0.8620, -0.8208, -1.8808,
          0.4522,  1.8570]])

In [12]:
def get_word_embedding(word, vocab=vocab, embedding_matrix=embedding_matrix):
    if word not in vocab:
        word = '<UNK>'
    embedding = embedding_matrix[vocab[word]]
    return embedding

get_word_embedding('as')

tensor([ 0.1123,  1.7515, -3.0045, -1.6307,  0.0911,  1.1416, -0.6479,  0.3158,
        -0.7174, -0.1784])

# Creating the model

In [13]:
class NeuralNetwork(nn.Module):
    def __init__(self, n=n, num_hidden_layer=1024, vocab = vocab, dim_embedding=10):
        super(NeuralNetwork, self).__init__()
        vocab_len = len(vocab)
        self.n = n
        self.dim_embedding = dim_embedding
        self.embedding = nn.Embedding(vocab_len, dim_embedding).to(device)

        self.hidden_layer = nn.Linear((n-1)*dim_embedding, num_hidden_layer)
        self.relu = nn.ReLU()
        self.output = nn.Linear(num_hidden_layer, vocab_len)

    def forward(self,x):
        # x will be the indices of embedding representing the input words
        x_embeddings = self.embedding(x).view(-1,(self.n-1)*self.dim_embedding).to(device)
        out = self.hidden_layer(x_embeddings)
        out = self.relu(out)
        out = self.output(out)
        return out
    

    def generate(self, x):
        print(x.shape)
        assert x.shape == 0 # fix shape here, and write a loop to do autoregressive text gen
        logits = self.forward(x)
        probs = torch.softmax(logits, dim=1)
        prob, predicted_token_index = torch.max(probs, dim=1)
        return predicted_token_index

model = NeuralNetwork(n=n).to(device)

# Creating dataset

In [14]:
dataset = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'Question: {x} Answer: {y}'.lower()
    tokenized_data = tokenize(data)
    for i in range(len(tokenized_data)-n):
        # print(i)
        data_i = tokenized_data[i:i+n]
        dataset.append([vocab[i] if i in vocab else vocab['<UNK>'] for i in data_i])
    # print()

dataset_np = np.array(dataset)
dataset_np.shape

(399, 5)

In [15]:
dataset_np[1]

array([140,  90, 109,  96,  56])

In [16]:
class QuestionAnswerDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.x = dataset[:,[i for i in range(n-1)]]
        self.y = dataset[:,-1]
        self.m, self.n = self.x.shape
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.m
    
dataset = QuestionAnswerDataset(dataset=dataset_np)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Training

In [17]:
num_epochs = 100
lr = 0.001
print(device)

cuda


In [18]:
model = model.to(device)
model = torch.compile(model)

In [19]:
lossCategory = nn.CrossEntropyLoss()
optimiser = torch.optim.SGD(model.parameters(), lr=lr)

In [20]:
torch.set_float32_matmul_precision('high')

In [21]:
for epoch in range(num_epochs):
    for i, (input, label) in enumerate(dataloader):
        x = input.to(device)
        y = label.to(device)
        y_pred = model(x)
        loss = lossCategory(y_pred, y)
        if epoch%10==0 and i==0: 
            # print(epoch)
            print(loss.item())
        loss.backward()
        optimiser.step()
        optimiser.zero_grad()


5.033625602722168
5.013422966003418
4.9808454513549805
4.938394546508789
4.942917823791504
4.915888786315918
4.884629249572754
4.873202800750732
4.850069046020508
4.822998046875


# Inference

In [22]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [23]:
vocab['is']

109

In [31]:
text = '<s> Question: What is deep learning?'
tokenized_text = tokenize(text.lower())
embedding_indices = torch.tensor(np.array([vocab[word] for word in tokenized_text])).to(device)
embedding_indices

tensor([121,  32, 140,  90, 109,  54,   6,  57], device='cuda:0')

In [32]:
embedding_indices

tensor([121,  32, 140,  90, 109,  54,   6,  57], device='cuda:0')

In [33]:
model.generate(embedding_indices)

tensor([ 7, 20], device='cuda:0')

In [34]:
vocab_rev = {v:k for k,v in vocab.items()}

In [36]:
vocab_rev[20]

'answer'

In [29]:
model.embedding

Embedding(145, 10)

In [30]:
# TO DO
# normalisation, standardise, proper init, weight decay, hyperparameter tunign, diff optimiser test, inference, earlyy stopping
# add direct connections, see if it is useful
# plot unigram, bi, tri, etc perplexity score, word error rate, etc