In [1]:
import pandas as pd
df=pd.read_csv("100_Unique_QA_Dataset.csv")

In [2]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
##tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
from nltk.tokenize import word_tokenize

df[["question_tokens", "answer_tokens"]] = df[["question", "answer"]].applymap(word_tokenize)


  df[["question_tokens", "answer_tokens"]] = df[["question", "answer"]].applymap(word_tokenize)


In [5]:
print(df)

                                             question        answer  \
0                      What is the capital of France?         Paris   
1                     What is the capital of Germany?        Berlin   
2                  Who wrote 'To Kill a Mockingbird'?    Harper-Lee   
3     What is the largest planet in our solar system?       Jupiter   
4      What is the boiling point of water in Celsius?           100   
..                                                ...           ...   
85                  Who directed the movie 'Titanic'?  JamesCameron   
86  Which superhero is also known as the Dark Knight?        Batman   
87                     What is the capital of Brazil?      Brasilia   
88        Which fruit is known as the king of fruits?         Mango   
89       Which country is known for the Eiffel Tower?        France   

                                      question_tokens   answer_tokens  
0             [What, is, the, capital, of, France, ?]         [Paris]  
1  

In [6]:
# vocab form
vocab = {"<unk>": 0}

for col in ["question_tokens", "answer_tokens"]:
    for tokens in df[col]:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)


In [9]:
len(vocab)

333

In [10]:
# to numerical indices
from nltk.tokenize import word_tokenize

def tokens_to_indices(text, vocab):
    indexed_text = []

    for token in word_tokenize(text):
        if token in vocab:
            indexed_text.append(vocab[token])
        else:
            indexed_text.append(vocab["<unk>"])

    return indexed_text


In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

In [13]:
class rnnDataset(Dataset):
  def __init__(self,df,vocab):
    self.df=df
    self.vocab=vocab
  def __len__(self):
    return len(self.df)
  def __getitem__(self,idx):
    num_question=tokens_to_indices(self.df.iloc[idx]["question"],self.vocab)
    num_answer=tokens_to_indices(self.df.iloc[idx]["answer"],self.vocab)
    return torch.tensor(num_question),torch.tensor(num_answer)

In [14]:
dataset=rnnDataset(df,vocab)

In [15]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6, 7]), tensor([254]))

In [16]:
dataloader=DataLoader(dataset,batch_size=1,shuffle=True)

In [18]:
for question,answer in dataloader:
  print(question)
  print(answer)


tensor([[  1,   2,   3,   4,   5, 185, 186,   7]])
tensor([[312]])
tensor([[  9,  62, 162,   7]])
tensor([[307]])
tensor([[ 36,  71,  72, 188, 189,  18,  34, 190,   7]])
tensor([[314]])
tensor([[ 36, 227, 228,  93, 229, 126, 230, 231,   7]])
tensor([[326]])
tensor([[  1,   2,   3, 117,  71,  18,  70, 150, 151,   7]])
tensor([[303]])
tensor([[  1,   2,   3,   4,   5, 161,   7]])
tensor([[306]])
tensor([[ 9, 26,  3, 27, 28,  7]])
tensor([[259]])
tensor([[36, 82,  2,  3, 16,  7]])
tensor([[277]])
tensor([[ 1,  2,  3, 76, 77, 78,  7]])
tensor([[274]])
tensor([[ 1,  2,  3, 32, 33, 34, 35,  7]])
tensor([[261]])
tensor([[ 36, 156,   2,  13, 157, 158, 159, 160,   7]])
tensor([[305]])
tensor([[ 9, 79,  3, 80,  7]])
tensor([[275]])
tensor([[ 64,  65, 225,  67,  18,  13, 226,   7]])
tensor([[272]])
tensor([[  1,   2,   3,  76, 108,  18,   3,  45,   7]])
tensor([[300]])
tensor([[  1,   2,   3,   4,   5, 107,   7]])
tensor([[288]])
tensor([[ 36, 249,   2,  52,  53,   3, 250,   5, 251,   7]])
tensor

In [19]:
import torch.nn as nn

In [28]:
class vanillarnn(nn.Module):
  def __init__(self,vocab_size):
    super().__init__()
    self.embedding=nn.Embedding(vocab_size,50)
    self.rnn=nn.RNN(50,64)
    self.fc=nn.Linear(64,vocab_size)

  def forward(self,x):
    embedded_question=self.embedding(question)
    hidden,final=self.rnn(embedded_question)
    output=self.fc(final)
    return output


In [29]:
learning_rate=0.0001
epochs=20

In [30]:
model=vanillarnn(len(vocab))

In [31]:
criterion=nn.CrossEntropyLoss()
optim=torch.optim.Adam(model.parameters(),lr=learning_rate)

In [37]:
model.eval()

vanillarnn(
  (embedding): Embedding(333, 50)
  (rnn): RNN(50, 64)
  (fc): Linear(in_features=64, out_features=333, bias=True)
)

In [38]:
total_loss = 0.0
correct = 0
total = 0

with torch.no_grad():   # no gradient tracking
    for question, answer in dataloader:

        # forward pass
        output = model(question)        # (B, T, V)
        output = output[:, -1, :]       # (B, V)

        # fix target shape
        answer = answer.view(-1)        # (B,)

        # loss
        loss = criterion(output, answer)
        total_loss += loss.item()

        # prediction
        preds = torch.argmax(output, dim=1)  # (B,)

        # accuracy
        correct += (preds == answer).sum().item()
        total += answer.size(0)

avg_loss = total_loss / len(dataloader)
accuracy = correct / total

print(f"Eval Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")


Eval Loss: 4.4658, Accuracy: 0.0333
