In [123]:
import re

import pandas as pd
import torch
from torch import nn
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader

In [171]:
# Read the question/answer CSV and preview its first ten rows.
df = pd.read_csv("./100_Unique_QA_Dataset.csv")
df.head(n=10)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
5,Who painted the Mona Lisa?,Leonardo-da-Vinci
6,What is the square root of 64?,8
7,What is the chemical symbol for gold?,Au
8,Which year did World War II end?,1945
9,What is the longest river in the world?,Nile


In [125]:
import spacy
# Load the 'en_core_web_sm' spaCy pipeline.
# NOTE(review): `nlp` is not called by any of the visible cells — confirm it is
# still needed before keeping this (potentially slow) load.
nlp=spacy.load('en_core_web_sm')
import numpy as np

In [228]:
# Token -> integer id mapping shared by questions and answers.
# Id 0 is reserved for tokens that are not in the vocabulary.
vocab={'<UNKNOWN>':0}
def preprocess(text:str, update_vocab:bool=True):
    """Convert a sentence into a NumPy array of vocabulary ids.

    The text is lower-cased, every '?' and "'" is removed, and the result is
    split on single spaces; each resulting token is mapped to its id in the
    module-level ``vocab``.

    Args:
        text: Raw question or answer string.
        update_vocab: When True (the default, used while building the training
            data) a token seen for the first time is appended to ``vocab`` with
            the next free id. When False, ``vocab`` is left untouched and
            unseen tokens are mapped to the reserved ``'<UNKNOWN>'`` id, so the
            ids never exceed the size the vocabulary had when a model was built.

    Returns:
        A 1-D ``np.ndarray`` holding one id per token.
    """
    text=text.lower().replace('?','').replace("'","")
    unknown_id=vocab['<UNKNOWN>']
    vectorized_text=[]
    for token in text.split(" "):
        # Only grow the vocabulary when explicitly allowed to.
        if update_vocab and token not in vocab:
            vocab[token]=len(vocab)
        # Falls back to the reserved id for tokens that were not registered.
        vectorized_text.append(vocab.get(token,unknown_id))
    return np.array(vectorized_text)
# Encode every question, then every answer, as an array of token ids.
# The order matters: ids are handed out in first-seen order while `vocab` fills up.
X = df['question'].apply(preprocess).to_numpy()
Y = df['answer'].apply(preprocess).to_numpy()
# Pair them column-wise: data[i, 0] is question i, data[i, 1] is its answer.
data = np.concatenate([X[:, np.newaxis], Y[:, np.newaxis]], axis=1)
data.shape

(90, 2)

In [242]:
class QADataSet(Dataset):
    """Dataset of (question ids, answer ids) pairs backed by a two-column array."""

    def __init__(self,data):
        # Row i holds the encoded question in column 0 and the encoded answer in column 1.
        self.data=data

    def __len__(self):
        """Return the number of rows in ``data``."""
        return self.data.shape[0]

    def __getitem__(self,idx):
        """Return row ``idx`` as ``(int32 question tensor, int64 answer tensor)``."""
        question = torch.tensor(self.data[idx][0], dtype=torch.int)
        answer = torch.tensor(self.data[idx][1], dtype=torch.long)
        return question, answer

In [243]:
# Wrap the encoded pairs and serve them one pair per batch, in shuffled order.
# Presumably batch_size=1 because the encoded questions differ in length and
# nothing here pads them.
dataset = QADataSet(data)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [244]:
# Scratch experiment: one 4-dimensional vector per vocabulary entry.
embedding_layer = nn.Embedding(num_embeddings=len(vocab), embedding_dim=4)

In [245]:
# Embed the encoded question of row 1, then build an RNN whose input size
# matches the 4-dimensional embeddings.
embeddings = embedding_layer(torch.tensor(data[1, 0]))
rnn = nn.RNN(input_size=4, hidden_size=64)

In [246]:
# nn.RNN returns (per-step outputs, final hidden state), in that order, so
# `hidden_states` receives the per-step outputs and `output` the final hidden state.
(hidden_states, output) = rnn(embeddings)

In [247]:
# Scratch experiment: project a 64-dimensional hidden state onto one score per vocabulary entry.
fc = nn.Linear(in_features=64, out_features=len(vocab))

In [248]:
class QAClassifier(nn.Module):
    """Embedding -> RNN -> linear head that scores every vocabulary entry.

    Args:
        vocab_size: Number of rows in the embedding table and number of output scores.
        hidden_layer_size: Used both as the embedding width and the RNN hidden size.
    """

    def __init__(self,vocab_size:int,hidden_layer_size:int):
        super().__init__()
        width = hidden_layer_size
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=width)
        self.rnn = nn.RNN(input_size=width, hidden_size=width, batch_first=True)
        self.fc = nn.Linear(in_features=width, out_features=vocab_size)

    def forward(self,sequence):
        """Score the vocabulary for a sequence of token ids.

        Only the RNN's final hidden state is fed to the linear head, so the
        result keeps that state's leading layer axis (size 1 for this
        single-layer RNN); callers squeeze it away.
        """
        token_vectors = self.embedding_layer(sequence)
        # nn.RNN returns (per-step outputs, final hidden state); the per-step outputs are unused.
        _, final_hidden = self.rnn(token_vectors)
        return self.fc(final_hidden)

In [251]:
# Build the model, loss and optimiser. The model's embedding table and output
# layer are sized from the vocabulary as it stands when this cell runs.
classifier=QAClassifier(len(vocab),64)
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.AdamW(lr=0.001,params=classifier.parameters())
epochs=100
for epoch in range(epochs):
    # Running sum of the per-sample losses, kept as a plain Python float.
    total_loss=0.0
    for features,output in data_loader:
        predicted_output=classifier(features)
        # squeeze() drops the size-1 axes of both tensors before the loss is computed.
        loss=criterion(predicted_output.squeeze(),output.squeeze())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # .item() extracts the number only. Adding the tensor itself would carry
        # autograd history into `total_loss` and keep per-step graphs alive for
        # the whole epoch.
        total_loss+=loss.item()
    # Report the mean loss over the epoch every tenth epoch.
    if epoch%10 ==0:
        print(f"Loss: {total_loss/len(data_loader)}")

Loss: 5.885409832000732
Loss: 0.6183919310569763
Loss: 0.10979039967060089
Loss: 0.040421344339847565
Loss: 0.020134711638092995
Loss: 0.011285902000963688
Loss: 0.006717788986861706
Loss: 0.004134269431233406
Loss: 0.0025895931757986546
Loss: 0.0016436452278867364


In [259]:
# Encode the question, run it through the trained model and take the index of the highest score.
# NOTE(review): preprocess() registers words it has not seen before in `vocab`; such ids
# would lie outside the embedding table the classifier was built with.
classifier(torch.tensor(preprocess("Who wrote 'To Kill a Mockingbird'?"))).argmax()

tensor(247)

In [261]:
# Ids are assigned in insertion order, so position 247 in the dict is the token with id 247.
list(vocab)[247]

'harper-lee'

In [262]:
# Encode the question, run it through the trained model and take the index of the highest score.
# NOTE(review): preprocess() registers words it has not seen before in `vocab`; such ids
# would lie outside the embedding table the classifier was built with.
classifier(torch.tensor(preprocess("Which year did World War II end?"))).argmax()

tensor(253)

In [263]:
# Ids are assigned in insertion order, so position 253 in the dict is the token with id 253.
list(vocab)[253]

'1945'