# RNN using PyTorch | Question Answering System using PyTorch

## Importing Libraries

In [173]:
import sys
import subprocess
import importlib.util

package_name = 'torchinfo'

# Install the package with the kernel's own interpreter only when it cannot
# be located in the current environment.
if importlib.util.find_spec(package_name) is not None:
    print(f"{package_name} is already installed. Skipping.")
else:
    print(f"Installing {package_name}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

torchinfo is already installed. Skipping.


In [174]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchinfo import summary 

## Loading Dataset

In [175]:
# Google Drive file id of the question/answer CSV.
file_id = "1X4Hcj72NK7J2JYvgjICFj0R1XwUq1w0a"
# Direct-download URL built from the file id.
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
# Read the CSV straight from the URL (requires network access).
df = pd.read_csv(download_url)
print(f"Dataset Downloaded")

Dataset Downloaded


## Taking a Look at the Data

In [176]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [177]:
df.shape

(90, 2)

## Tokenizing the Data

In [178]:
def tokenize(text):
    """Lower-case `text`, drop '?' and "'" characters, and split on whitespace.

    Returns:
        list[str]: the resulting tokens (empty list for blank input).
    """
    # One translation pass deletes both punctuation characters.
    strip_table = str.maketrans('', '', "?'")
    return text.lower().translate(strip_table).split()

In [179]:
tokenize("Who wrote 'To Kill a Mockingbird'?")

['who', 'wrote', 'to', 'kill', 'a', 'mockingbird']

## Vocabulary Creation

In [180]:
# Token -> integer id mapping; id 0 is reserved for the "<UNK>" placeholder.
vocab = {"<UNK>":0}

In [181]:
def build_vocab(row):
    """Add every unseen token from a row's question and answer to `vocab`.

    Mutates the module-level `vocab` dict in place: each new token receives
    the next free integer id (the dict's current size). Returns None.
    """
    for token in tokenize(row['question']) + tokenize(row['answer']):
        # setdefault leaves existing ids untouched; len(vocab) is evaluated
        # before the insert, so a new token gets the next consecutive id.
        vocab.setdefault(token, len(vocab))

In [182]:
# Run build_vocab on every row for its side effect on `vocab`; the values
# returned by apply are all None and are not used.
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [183]:
# Number of entries in the vocabulary, including the "<UNK>" placeholder.
vocab_size = len(vocab)

In [184]:
from itertools import islice

# Preview the first 10 (token, id) pairs in insertion order.
for key, val in islice(vocab.items(), 10):
    print(key,val)
    

<UNK> 0
what 1
is 2
the 3
capital 4
of 5
france 6
paris 7
germany 8
berlin 9


## Converting Words to Indices

In [185]:
def words_to_indices(text, vocab):
    """Convert `text` into a list of vocabulary ids.

    The text is tokenized with `tokenize`; each token found in `vocab` maps
    to its id and every other token maps to the id stored under "<UNK>".
    """
    return [
        vocab[token] if token in vocab else vocab["<UNK>"]
        for token in tokenize(text)
    ]

In [186]:
words_to_indices(text="Who wrote 'To Kill a mani'?", vocab=vocab)

[10, 11, 12, 13, 14, 0]

## Creating Dataset and DataLoader

In [187]:
from torch.utils.data import Dataset, DataLoader

In [188]:
class CustomDataset(Dataset):
    """Serve rows of a question/answer DataFrame as pairs of index tensors.

    Args:
        df: DataFrame with 'question' and 'answer' text columns.
        vocab: token -> id mapping passed through to `words_to_indices`.
    """

    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows in the DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return (question_ids, answer_ids) for the row at position `index`."""
        # Look the row up once instead of once per column.
        row = self.df.iloc[index]

        numerical_question = words_to_indices(text=row['question'], vocab=self.vocab)
        numerical_answer = words_to_indices(text=row['answer'], vocab=self.vocab)

        # Force an integer dtype: torch.tensor([]) would otherwise infer
        # float32, which is not a valid index tensor for nn.Embedding.
        return (
            torch.tensor(numerical_question, dtype=torch.long),
            torch.tensor(numerical_answer, dtype=torch.long),
        )

In [189]:
# Wrap the DataFrame so each row is served as a (question, answer) pair of index tensors.
dataset = CustomDataset(df, vocab)

In [190]:
# batch_size=1: each batch is one (question, answer) pair, reshuffled every epoch.
# NOTE(review): no collate_fn/padding is defined, so presumably larger batches of
# variable-length questions would not stack — confirm before raising batch_size.
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

## Model Building, Training and Evaluation

In [191]:
class Model(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocab_size: number of token ids; sizes both the embedding table and
            the output layer.
        embedding_dim: width of each token embedding (default 50).
        hidden_size: width of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()

        # embedding layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        # rnn layer
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        # fully connected layer
        self.fcl = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, question):
        """Return one row of vocabulary logits per question in the batch.

        Args:
            question: integer tensor of token ids, shaped (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size) with un-normalised scores.
        """
        embedded_question = self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state) in that order;
        # only the final hidden state feeds the classifier.
        _, final_hidden = self.rnn(embedded_question)
        # final_hidden is (num_layers=1, batch, hidden_size); drop the layer axis.
        output = self.fcl(final_hidden.squeeze(0))

        return output

In [192]:
# Training hyperparameters: optimizer step size and number of passes over the data.
learning_rate = 0.001
epochs = 20

In [193]:
# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffling are repeatable across "Restart & Run All".
seed = 42
torch.manual_seed(seed)

model = Model(vocab_size=vocab_size)

# CrossEntropyLoss takes raw logits, which is what Model.forward returns.
loss_function = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [194]:
model

Model(
  (embedding): Embedding(324, 50)
  (rnn): RNN(50, 64, batch_first=True)
  (fcl): Linear(in_features=64, out_features=324, bias=True)
)

In [195]:
summary(model)

Layer (type:depth-idx)                   Param #
Model                                    --
├─Embedding: 1-1                         16,200
├─RNN: 1-2                               7,424
├─Linear: 1-3                            21,060
Total params: 44,684
Trainable params: 44,684
Non-trainable params: 0

In [196]:
len(dataloader)

90

In [197]:
# training loop
# Runs `epochs` passes over `dataloader`, updating `model` after every batch
# and printing the mean batch loss at the end of each epoch.

for epoch in range(epochs):

    total_loss = 0

    for question, answer in dataloader:

        # clear gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass
        y_pred = model(question)

        # compute the loss; answer[0] drops the leading batch dimension
        # NOTE(review): presumably every answer is a single token so the target
        # length matches y_pred's first dimension — confirm against the dataset
        loss = loss_function(y_pred, answer[0])

        # backward pass
        loss.backward()

        # update model parameters
        optimizer.step()

        # accumulate this batch's loss for the epoch average
        total_loss += loss.item()

    avg_loss = total_loss/len(dataloader)

    # NOTE(review): `:2f` is a minimum width of 2 with the default 6 decimals;
    # `.2f` may have been intended — confirm before changing the output format
    print(f"Epoch: {epoch+1}, Loss:{avg_loss:2f}")

Epoch: 1, Loss:5.783175
Epoch: 2, Loss:5.049709
Epoch: 3, Loss:4.212570
Epoch: 4, Loss:3.469476
Epoch: 5, Loss:2.880734
Epoch: 6, Loss:2.344067
Epoch: 7, Loss:1.859088
Epoch: 8, Loss:1.449507
Epoch: 9, Loss:1.111735
Epoch: 10, Loss:0.853565
Epoch: 11, Loss:0.667172
Epoch: 12, Loss:0.523060
Epoch: 13, Loss:0.421015
Epoch: 14, Loss:0.343363
Epoch: 15, Loss:0.283865
Epoch: 16, Loss:0.239178
Epoch: 17, Loss:0.200554
Epoch: 18, Loss:0.171466
Epoch: 19, Loss:0.148110
Epoch: 20, Loss:0.127287


In [198]:
def predict(model, question, threshold=0.5):
    """Print the model's answer to `question`, or "I don't know".

    Args:
        model: module mapping a (1, seq_len) tensor of token ids to a
            (1, vocab_size) tensor of logits.
        question: raw question text.
        threshold: minimum softmax probability the top answer must reach
            to be printed.

    Prints the predicted token when its probability is at least `threshold`;
    otherwise, or when the question yields no tokens, prints "I don't know".
    Returns None.
    """

    # convert question to numbers
    numerical_question = words_to_indices(question, vocab)

    # A question with no tokens gives the model nothing to read.
    if not numerical_question:
        print("I don't know")
        return

    # convert numerical_question to an integer tensor with a batch dimension
    question_tensor = torch.tensor(numerical_question, dtype=torch.long).unsqueeze(0)

    # inference only: skip building the autograd graph
    with torch.no_grad():
        # finding logits
        logits = model(question_tensor)

        # convert logits to probabilities
        probabs = nn.functional.softmax(logits, dim=1)

        # find the max probability and its index
        value, index = torch.max(probabs, dim=1)

    if value.item() >= threshold:
        # vocab ids were assigned in insertion order, so position == id
        print(list(vocab.keys())[index.item()])

    else:
        print("I don't know")


In [201]:
predict(model, 'What is the capital of France?	')

paris


In [202]:
predict(model, 'What is the capital of Germany?')

berlin


In [203]:
predict(model, 'What is the boiling point of water in Celsius?')

100


In [200]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100
