In [1]:
import torch
from torch.utils.data import DataLoader,Dataset
import torch.nn as nn
import pandas as pd
import numpy as np

In [2]:
# Load the question/answer pairs from the CSV export and preview the first rows.
df = pd.read_csv(
  filepath_or_buffer='/content/100_Unique_QA_Dataset - 100_Unique_QA_Dataset.csv',
)
df.head(5)

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
def preprocessing(sen):
  """Turn a sentence into a list of lower-cased tokens.

  The text is lower-cased, every '?' and ',' character is deleted, and the
  remainder is split on whitespace.

  Args:
    sen: the sentence to tokenize (a str).

  Returns:
    list[str]: the tokens, in their original order.
  """
  # One translation table removes both punctuation marks in a single pass.
  strip_table = str.maketrans('', '', '?,')
  return sen.lower().translate(strip_table).split()

In [4]:
vocab = {'<UNK>' : 0}
def build_vocab(row):
  """Add every unseen token of one question/answer row to the global `vocab`.

  Tokens from row['question'] are processed first, then those from
  row['answer']. Each new token receives the next free integer id (the
  current size of `vocab`); tokens already present keep their id.

  Args:
    row: a mapping-like object with 'question' and 'answer' entries.

  Returns:
    None. The function is used for its side effect on `vocab`.
  """
  tokens = preprocessing(row['question']) + preprocessing(row['answer'])

  for token in tokens:
    # setdefault only inserts when the token is missing, so existing ids are stable.
    vocab.setdefault(token, len(vocab))

In [5]:
# Run build_vocab over every row purely for its side effect of filling `vocab`.
# build_vocab returns nothing, so apply() yields a Series of None values; the
# trailing semicolon stops the notebook from rendering that Series as output.
df.apply(build_vocab,axis=1);

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [6]:
def text_to_index(sen,vocab):
  """Convert a sentence into the list of vocabulary ids of its tokens.

  The sentence is tokenized with `preprocessing`; each token is replaced by
  its id in `vocab`, and tokens missing from `vocab` map to vocab['<UNK>'].

  Args:
    sen: the sentence to encode.
    vocab: mapping from token to integer id; must contain '<UNK>' whenever
      an unknown token is encountered.

  Returns:
    list[int]: one id per token, in sentence order.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in preprocessing(sen)
  ]

In [7]:
class make_dataset(Dataset):
  """Dataset that serves (question ids, answer ids) tensor pairs from a DataFrame.

  Args:
    vocab: mapping from token to integer id, passed through to `text_to_index`.
    df: DataFrame with 'question' and 'answer' columns.
  """

  def __init__(self,vocab,df):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Return the number of question/answer rows."""
    return len(self.df)

  def __getitem__(self,index):
    """Return the `index`-th row as a pair of 1-D long tensors.

    Rows are addressed by position (0 .. len-1), which is what samplers
    produce, so the dataset also works when the frame's index labels are not
    a contiguous range (for example after filtering rows).
    """
    row = self.df.iloc[index]
    indexed_ques = text_to_index(row['question'],self.vocab)
    indexed_ans = text_to_index(row['answer'],self.vocab)

    # An explicit integer dtype keeps empty token lists from becoming float tensors.
    return (
      torch.tensor(indexed_ques, dtype=torch.long),
      torch.tensor(indexed_ans, dtype=torch.long),
    )

In [18]:
# Wrap the loaded frame and the vocabulary built from it in the torch Dataset.
dataset = make_dataset(vocab=vocab, df=df)

In [19]:
# One sample per batch, visited in a fresh random order each epoch.
# NOTE(review): batch_size=1 presumably avoids having to pad questions of
# different lengths, since no collate function is supplied; confirm before raising it.
dataloader = DataLoader(
  dataset,
  shuffle=True,
  batch_size=1,
)

In [20]:
class MyRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of distinct token ids; used both as the embedding
      table size and as the number of output classes.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 128).
  """

  def __init__(self,vocab_size,embedding_dim=50,hidden_size=128):
    super().__init__()
    self.embeddings = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.rnn = nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.output = nn.Linear(hidden_size,vocab_size)

  def forward(self,ques):
    """Score every vocabulary entry as the answer to `ques`.

    Args:
      ques: integer tensor of token ids, shaped (batch, seq_len) because the
        RNN is built with batch_first=True.

    Returns:
      Tensor of unnormalized scores (logits) shaped (batch, vocab_size).
    """
    embedded_ques = self.embeddings(ques)
    # Only the final hidden state is needed; the per-step outputs are discarded.
    _,final = self.rnn(embedded_ques)
    # final is (num_layers, batch, hidden); take the last (here: only) layer.
    output = self.output(final[-1])

    return output

In [22]:
learning_rate = 0.01
epochs = 20
seed = 42

# Seed the global torch RNG before the model is created so that weight
# initialisation (and any sampling that draws from that RNG) is repeatable
# on a fresh Restart & Run All.
torch.manual_seed(seed)

model = MyRNN(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = learning_rate)

# Make the training mode explicit in case the model was switched to eval elsewhere.
model.train()

for epoch in range(epochs):
  total_loss = 0.0
  num_batches = 0
  for question,answer in dataloader:
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()

    output = model(question)
    # answer carries a leading batch dimension; answer[0] is the id sequence
    # of the single sample in the batch.
    # NOTE(review): assumes every answer is exactly one token so the target
    # matches the single row of logits; confirm against the dataset.
    loss = criterion(output,answer[0])

    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    num_batches += 1
  # Guard against an empty dataloader so the average never divides by zero.
  average_epoch_loss = total_loss / max(num_batches, 1)
  print(f'Epoch : {epoch+1}, Loss : {average_epoch_loss:.4f}')

Epoch : 1, Loss : 6.2773
Epoch : 2, Loss : 6.0779
Epoch : 3, Loss : 5.1968
Epoch : 4, Loss : 3.1989
Epoch : 5, Loss : 1.9376
Epoch : 6, Loss : 1.6169
Epoch : 7, Loss : 0.8500
Epoch : 8, Loss : 0.7797
Epoch : 9, Loss : 0.3800
Epoch : 10, Loss : 0.2425
Epoch : 11, Loss : 0.4131
Epoch : 12, Loss : 0.2185
Epoch : 13, Loss : 0.2877
Epoch : 14, Loss : 0.2980
Epoch : 15, Loss : 0.1827
Epoch : 16, Loss : 0.2119
Epoch : 17, Loss : 0.2732
Epoch : 18, Loss : 0.2170
Epoch : 19, Loss : 0.4147
Epoch : 20, Loss : 0.3682


In [25]:
def predict(model,ques,threshold=0.5):
  """Print the model's one-word answer to `ques`, or "I don't know".

  The question is encoded with `text_to_index` and the global `vocab`, run
  through `model` without gradient tracking, and the most probable
  vocabulary entry is printed. When the question yields no tokens, or the
  top probability is below `threshold`, only "I don't know" is printed.

  Args:
    model: callable mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) logits.
    ques: the question text.
    threshold: minimum softmax probability required to print an answer.

  Returns:
    None. The result is printed.
  """
  indexed_ques = text_to_index(ques,vocab)
  if not indexed_ques:
    # Nothing to feed the model (empty or punctuation-only question).
    print("I don't know")
    return

  ques_tensor = torch.tensor(indexed_ques, dtype=torch.long).unsqueeze(0)
  with torch.no_grad():
    output = model(ques_tensor)
    probs = nn.functional.softmax(output,dim=1)
    max_prob,index = torch.max(probs,dim=1)

  if max_prob.item() < threshold:
    # Stop here: a low-confidence guess must not be printed after the refusal.
    print("I don't know")
    return

  # Invert the token -> id mapping explicitly instead of relying on the
  # dict's insertion order lining up with the ids.
  index_to_word = {idx: word for word, idx in vocab.items()}
  print(index_to_word[index.item()])

In [42]:
# Ask the trained model a question phrased differently from the training data.
predict(model, ques="Capital of Brazil?")

brasilia
