In [13]:
import torch
import numpy as np
from datasets import load_dataset
import torch.nn as nn

# Creating vocabulory and embeddings

In [2]:
n = 3
embedding_dims = 10 # how many dimensional vector should represent each word in the vocabulory

In [3]:
ds = load_dataset("MuskumPillerum/General-Knowledge")
df = ds['train'][:10]

In [4]:
def tokenize(sentence):
    data = sentence
    split_terms = [',', '.', '!', '?', '(', ')', '&', '$', '+', '-', '/', '*', ';', ':']
    for split_term in split_terms:
        if split_term in sentence:
            data = data.replace(split_term, f' {split_term} ')
    data = data.split()
    return data

tokenize('what is your !! ! name ?    ')

['what', 'is', 'your', '!', '!', '!', 'name', '?']

In [5]:
vocab_list = set()
X = []
for x,y in zip(df['Question'], df['Answer']):
    data = f'Question: {x} Answer: {y}'
    data = data.lower().replace('\\n', '')
    vocab_list.update(tokenize(data))
    X.append(data)
len(vocab_list)

140

In [6]:
vocab_list.add('<UNK>')

In [7]:
vocab = {v:i for v,i in zip(vocab_list, range(1, len(vocab_list)+1))}
len(vocab)

141

In [8]:
vocab

{'recognition': 1,
 'works': 2,
 'decisions': 3,
 'focuses': 4,
 '<UNK>': 5,
 'consists': 6,
 'take': 7,
 'improve': 8,
 'data': 9,
 'is': 10,
 'perform': 11,
 'by': 12,
 'ai': 13,
 'intelligence': 14,
 'penalty': 15,
 'relationships': 16,
 'process': 17,
 'can': 18,
 'typically': 19,
 'algorithms': 20,
 'many': 21,
 '-': 22,
 'are': 23,
 'inspired': 24,
 'learning': 25,
 'it': 26,
 'answer': 27,
 'given': 28,
 'perception': 29,
 'and': 30,
 'being': 31,
 'type': 32,
 'environment': 33,
 'systems': 34,
 'receives': 35,
 'allows': 36,
 'generate': 37,
 'that': 38,
 'over': 39,
 'designed': 40,
 'vision': 41,
 'computer': 42,
 'natural': 43,
 'feedback': 44,
 'would': 45,
 'takes': 46,
 'have': 47,
 'around': 48,
 'information': 49,
 'signal': 50,
 'or': 51,
 'system': 52,
 'nodes': 53,
 'function': 54,
 'various': 55,
 'form': 56,
 'with': 57,
 'artificial': 58,
 'without': 59,
 'brain': 60,
 'understand': 61,
 'two': 62,
 'patterns': 63,
 'neurons': 64,
 '?': 65,
 'examples': 66,
 'tas

In [9]:
embedding_matrix = torch.randn(len(vocab),embedding_dims)
embedding_matrix.shape

torch.Size([141, 10])

In [10]:
def get_word_embedding(word, vocab=vocab, embedding_matrix=embedding_matrix):
    if word not in vocab:
        word = '<UNK>'
    embedding = embedding_matrix[vocab[word]]
    return embedding

get_word_embedding('as')

tensor([ 0.4681, -0.6751,  1.4149,  0.6815,  0.1955,  1.8689,  0.1731,  1.3575,
        -0.0200,  0.0611])

# Creating the model

In [27]:
class NeuralNetwork(nn.Module):
    def __init__(self, n=n, dim_embedding=embedding_dims, vocab_length=len(vocab), num_hidden_layer=100):
        super(NeuralNetwork, self).__init__()
        self.hidden_layer = nn.Linear(dim_embedding, num_hidden_layer)
        self.relu = nn.ReLU()
        self.output = nn.Linear(num_hidden_layer, vocab_length)
        self.softmax = nn.Softmax()

    def forward(self,x):
        # shape of x: (n, dim_embedding) -> (3, 10)
        out = self.hidden_layer(x)
        out = self.relu(out)
        out = self.output(out)
        out = self.softmax(out)
        return out

model = NeuralNetwork()

In [29]:
# test_x = torch.randn((3,10))
# model(test_x).shape

torch.Size([3, 141])

In [30]:
num_epochs = 100
lr = 0.01

In [33]:
lossCategory = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=lr)

# Creating dataset

# Training