#### **Question & Answering System using PyTorch RNN**

Build a model that takes English questions as input and returns correct answers — using RNN and Embedding layers.

In [48]:
import pandas as pd 

# Load the question/answer pairs. Every backslash of the Windows path is escaped:
# the previous "\D" was an invalid escape sequence and raised a SyntaxWarning.
# The resulting path string is unchanged.
df = pd.read_csv("E:\\Notes\\Python'\\PyTorch\\Datasets\\100_Unique_QA_Dataset.csv")
df.head()

  df = pd.read_csv("E:\\Notes\\Python'\\PyTorch\Datasets\\100_Unique_QA_Dataset.csv")


Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [49]:
# Tokenize the questions and answers
def tokenize(text):
    """Lower-case ``text`` and split it into whitespace-separated tokens.

    Question marks and apostrophes are replaced by spaces before splitting,
    so they never appear inside a token. Other punctuation (e.g. hyphens)
    is kept as part of the token.
    """
    separators = str.maketrans({"?": " ", "'": " "})
    return text.lower().translate(separators).split()

In [50]:
tokenize("What is the capital of France?")  

['what', 'is', 'the', 'capital', 'of', 'france']

In [51]:
# Token -> integer id lookup. Id 0 is reserved for tokens that are not in the vocabulary.
vocab = {'<UNK>': 0}


def build_vocab(row):
    """Add every unseen token of one row's question and answer to ``vocab``.

    ``row`` must support ``row['question']`` and ``row['answer']`` (both text).
    New tokens receive the next free id, i.e. the current size of ``vocab``.
    The function mutates the module-level ``vocab`` and returns nothing.
    """
    tokens = tokenize(row['question']) + tokenize(row['answer'])

    for token in tokens:
        # setdefault leaves known tokens untouched and assigns the next id to new ones
        vocab.setdefault(token, len(vocab))

In [52]:
# Populate `vocab` by calling build_vocab once per DataFrame row (axis=1 means row-wise).
# build_vocab returns nothing, so the displayed result is a Series of None values;
# the call is made only for its side effect on `vocab`.
df.apply(build_vocab, axis=1)


0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [53]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [54]:
# Converting Words to Indices

def text_to_indices(text, vocab):
    """Translate ``text`` into a list of vocabulary ids.

    The text is tokenised with ``tokenize``; each token is looked up in
    ``vocab`` and tokens that are missing map to the id of ``'<UNK>'``.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in tokenize(text)
    ]

In [55]:
import torch 
from torch.utils.data import Dataset, DataLoader

In [67]:
# Pick the GPU only when CUDA is actually usable. is_available must be *called*:
# the bare function object is always truthy, which selected "cuda" even on
# CPU-only machines and made later .to(device) calls fail.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("USing Device:", device)

USing Device: cuda


In [56]:
class QADataset(Dataset):
    """Dataset that serves (question, answer) pairs as tensors of vocabulary ids.

    Each item is built on demand from one row of the wrapped DataFrame, using
    its ``'question'`` and ``'answer'`` columns and ``text_to_indices``.
    """

    def __init__(self, df, vocab):
        # Keep references only; rows are converted lazily in __getitem__
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Number of rows (question/answer pairs) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(question_ids, answer_ids)`` tensors for the row at position ``index``."""
        row = self.df.iloc[index]

        question_ids = text_to_indices(row['question'], self.vocab)
        answer_ids = text_to_indices(row['answer'], self.vocab)

        return torch.tensor(question_ids), torch.tensor(answer_ids)
    

In [57]:
# Wrap the DataFrame so every item is a (question ids, answer ids) tensor pair.
dataset = QADataset(df=df, vocab=vocab)

# One question per batch, drawn in random order each pass.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

In [58]:
# Sanity check: iterate once over the loader and print every (question, answer) batch.
# Because batch_size=1, each printed tensor has a leading batch dimension of size 1.
for question, answer in dataloader:
    print(question, answer)

tensor([[ 10,  29, 130, 131]]) tensor([[132]])
tensor([[ 42,   2,   3, 210, 137, 168, 211, 169]]) tensor([[113]])
tensor([[ 10, 140,   3, 141, 171,   5,   3,  70, 172]]) tensor([[173]])
tensor([[ 1,  2,  3, 50, 51, 19,  3, 45]]) tensor([[52]])
tensor([[  1,   2,   3,   4,   5, 236, 237]]) tensor([[238]])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([[99]])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([[121]])
tensor([[ 42, 312,   2, 313,  62,  63,   3, 314, 315]]) tensor([[316]])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([[149]])
tensor([[  1,   2,   3,  69,   5, 155]]) tensor([[156]])
tensor([[10, 55,  3, 56,  5, 57]]) tensor([[58]])
tensor([[  1,   2,   3,   4,   5, 279]]) tensor([[280]])
tensor([[ 42, 137,   2, 226,  12,   3, 227, 228]]) tensor([[155]])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([[295]])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([[65]])
tensor([[42, 86, 87, 88, 89, 39, 90]]) tensor([[91]])
tensor([[10,  2,  3, 66,  5

##### **RNN Architecture**
There are 3 Main Layers in this model - Input -> Hidden (Recurrent) -> Output

1) Input Layer (50 Neurons) - Each word is converted into a 50-dimensional vector by an embedding layer. Because the network processes one word per time step, the input layer has 50 neurons.

2) Hidden Layer (64 Neurons) - The main recurrent layer (nn.RNN). The number of hidden neurons is a hyperparameter that can be tuned.

3) Output Layer (324 Neurons) - 324 is the number of unique words in our vocabulary; there is one output neuron per word because the model predicts the answer word.

How Prediction Works -  
When a question is passed to the RNN (as a sequence of word embeddings), the final hidden state of the RNN is fed to the output layer, which produces a score for each of the 324 words; softmax turns these scores into a probability distribution.  
This is a multi-class classification problem.  
The word with the highest probability is selected as the final answer.

In [59]:
import torch.nn as nn

In [None]:
class SimpleRNN(nn.Module):
    """Single-layer RNN that maps a tokenised question to logits over the vocabulary.

    Pipeline: embedding -> nn.RNN -> linear layer applied to the final hidden state.

    Args:
        vocab_size: Number of vocabulary entries; sets both the embedding table
            size and the number of output logits.
        embedding_dim: Size of each word vector (default 50).
        hidden_size: Size of the RNN hidden state (default 64).
    """

    def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        # No padding index: the notebook feeds one question per batch, so sequences are never padded
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        # batch_first=True keeps the batch dimension in front: [batch, seq_len, features]
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        # Fully connected layer producing one logit per vocabulary entry
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, question):
        """Return logits of shape [batch, vocab_size] for ``question`` ids of shape [batch, seq_len]."""
        embedded_question = self.embedding(question)
        # nn.RNN returns (hidden state at every time step, final hidden state); only the latter is used
        _, final_hidden_state = self.rnn(embedded_question)
        # final_hidden_state is [1, batch, hidden]; squeeze(0) drops the layer dimension
        output = self.fc(final_hidden_state.squeeze(0))

        return output

**Why not use nn.Sequential?** The RNN layer returns two outputs: (i) the hidden state at every time step and (ii) the final hidden state.  
nn.Sequential expects each layer to return a single output that is passed to the next layer, so the forward() function must be written manually.

In [None]:
# Stand-alone layers mirroring SimpleRNN, used only to inspect tensor shapes.
# Sizes come from len(vocab) instead of a hard-coded 324 so the cell keeps working if the data changes.
x = nn.Embedding(len(vocab), embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)  # batch_first -> input shape is [batch_size, seq_len, input_size]
z = nn.Linear(64, len(vocab))

a = dataset[0][0].unsqueeze(0)  # add the batch dimension: [seq_len] -> [1, seq_len], whatever the question length
print("shape of a:", a.shape)
b = x(a)  # token ids -> embeddings
print("shape of b:", b.shape)
c, d = y(b)  # c = hidden state at every time step, d = final hidden state
print("shape of c:", c.shape)
print("shape of d:", d.shape)  # only d is used for the prediction

# d is [1, batch_size, hidden]; squeeze(0) removes the leading dimension so the
# linear layer returns [batch_size, vocab_size]
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [66]:
# Training hyperparameters
learning_rate = 0.001  # step size handed to the optimizer
epochs = 50  # number of full passes over the dataloader

In [68]:
# Size the network from the vocabulary and move its parameters to the selected device.
model = SimpleRNN(len(vocab)).to(device)

In [69]:
# Cross-entropy over the vocabulary logits: answer prediction is treated as multi-class classification.
criterion = nn.CrossEntropyLoss()
# Adam updates all model parameters using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [71]:
# Training Loop

# One epoch = one pass over every (question, answer) pair; the mean loss is reported per epoch.
for epoch in range(epochs):
    # Sum of the per-batch losses seen in this epoch
    total_loss=0

    for question, answer in dataloader:
        # Inputs must live on the same device as the model
        question, answer = question.to(device), answer.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass: logits over the vocabulary
        output = model(question)

        # answer carries a leading batch dimension of 1 (batch_size=1); answer[0] drops it.
        # NOTE(review): this assumes every answer tokenises to exactly one token,
        # as in the batches printed earlier — confirm for the full dataset.
        loss = criterion(output, answer[0])
        # Backward pass, then parameter update
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python number from the loss tensor
        total_loss += loss.item()

    # len(dataloader) is the number of batches in one epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

Epoch: 1, Loss: 5.7820
Epoch: 2, Loss: 5.0084
Epoch: 3, Loss: 4.1360
Epoch: 4, Loss: 3.4614
Epoch: 5, Loss: 2.8840
Epoch: 6, Loss: 2.3584
Epoch: 7, Loss: 1.8726
Epoch: 8, Loss: 1.4489
Epoch: 9, Loss: 1.1084
Epoch: 10, Loss: 0.8434
Epoch: 11, Loss: 0.6469
Epoch: 12, Loss: 0.5041
Epoch: 13, Loss: 0.3997
Epoch: 14, Loss: 0.3260
Epoch: 15, Loss: 0.2699
Epoch: 16, Loss: 0.2260
Epoch: 17, Loss: 0.1914
Epoch: 18, Loss: 0.1648
Epoch: 19, Loss: 0.1405
Epoch: 20, Loss: 0.1222
Epoch: 21, Loss: 0.1059
Epoch: 22, Loss: 0.0929
Epoch: 23, Loss: 0.0830
Epoch: 24, Loss: 0.0739
Epoch: 25, Loss: 0.0658
Epoch: 26, Loss: 0.0593
Epoch: 27, Loss: 0.0538
Epoch: 28, Loss: 0.0488
Epoch: 29, Loss: 0.0446
Epoch: 30, Loss: 0.0407
Epoch: 31, Loss: 0.0375
Epoch: 32, Loss: 0.0345
Epoch: 33, Loss: 0.0318
Epoch: 34, Loss: 0.0295
Epoch: 35, Loss: 0.0273
Epoch: 36, Loss: 0.0254
Epoch: 37, Loss: 0.0237
Epoch: 38, Loss: 0.0220
Epoch: 39, Loss: 0.0206
Epoch: 40, Loss: 0.0192
Epoch: 41, Loss: 0.0179
Epoch: 42, Loss: 0.0168
E

In [83]:
# Evaluation of the Model

def predict(model, question, threshold=0.5):
    """Print the model's answer to ``question``, or "I don't know" when unsure.

    Args:
        model: Trained network. It is called with a [1, seq_len] tensor of
            token ids and must return logits of shape [1, vocab_size].
        question: Question text; it is converted to ids with ``text_to_indices``
            and the module-level ``vocab``.
        threshold: Minimum softmax probability the best candidate needs in
            order to be reported as the answer.

    Returns:
        None. The result is printed.
    """
    # Convert the question into vocabulary ids
    numerical_question = text_to_indices(question, vocab)

    # A question without tokens would produce an empty tensor the model cannot process
    if not numerical_question:
        print("I don't know")
        return

    # Add the batch dimension and move the ids to the model's device
    question_tensor = torch.tensor(numerical_question).unsqueeze(0).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        output = model(question_tensor)

    # Convert the logits into probabilities
    probs = nn.functional.softmax(output, dim=1)

    # Probability and vocabulary id of the most likely word
    value, index = torch.max(probs, dim=1)

    if value.item() < threshold:
        print("I don't know")
        # Stop here; without this return the low-confidence guess was printed as well
        return

    # Look the word up by its id rather than relying on dict insertion order
    index_to_token = {token_id: token for token, token_id in vocab.items()}
    print(index_to_token[index.item()])
    

In [84]:
predict(model, "What is the largest planet in our solar system?")

jupiter
