In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/unique-qa-dataset/100_Unique_QA_Dataset.csv


In [2]:
df = pd.read_csv('/kaggle/input/unique-qa-dataset/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [3]:
# tokenization
def tokenize(s):
    s = s.lower()
    s = s.replace('?','')
    s = s.replace("'", "")
    return s.split()

tokenize(df['question'][0])

['what', 'is', 'the', 'capital', 'of', 'france']

In [4]:
# build vocab
vocabulary = {"<UNK>": 0}

def build_vocab(row):
    tokenized_que = tokenize(row['question'])
    tokenized_ans = tokenize(row['answer'])

    merged_token = tokenized_que + tokenized_ans

    for token in merged_token:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)

In [5]:
df.apply(build_vocab, axis=1)

0     None
1     None
2     None
3     None
4     None
      ... 
85    None
86    None
87    None
88    None
89    None
Length: 90, dtype: object

In [6]:
# indexing
def text_to_indices(text, vocabulary):
    indexed_text = []
    for token in tokenize(text):
        if token in vocabulary:
            indexed_text.append(vocabulary[token])
        else:
            indexed_text.append(vocabulary['<UNK>'])
    return indexed_text

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        idx_que = text_to_indices(self.df.iloc[index]['question'], self.vocab)
        idx_ans = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

        return torch.tensor(idx_que), torch.tensor(idx_ans)

In [8]:
dataset = QADataset(df, vocabulary)

In [9]:
dataset[1]

(tensor([1, 2, 3, 4, 5, 8]), tensor([9]))

In [10]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

for question, answer in dataloader:
    print(question, answer)

tensor([[ 42, 101,   2,   3,  17]]) tensor([[102]])
tensor([[ 78,  79, 129,  81,  19,   3,  21,  22]]) tensor([[36]])
tensor([[  1,   2,   3,  17, 115,  83,  84]]) tensor([[116]])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([[136]])
tensor([[  1,   2,   3,  33,  34,   5, 245]]) tensor([[246]])
tensor([[ 42, 167,   2,   3,  17, 168, 169]]) tensor([[170]])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[ 78,  79, 195,  81,  19,   3, 196, 197, 198]]) tensor([[199]])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([[273]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[  1,   2,   3,   4,   5, 236, 237]]) tensor([[238]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[260]])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([[268]])
tensor([[ 10,  75, 111]]) tensor([[112]])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([[149]])
tensor([[ 42, 137,   2, 138,  39, 175, 269]]) tensor([[99]])
tensor([[42, 86, 87, 

### Architecture

input layer (50N) ---> hidden layer (feedback loop) (64N) ---> output layer (324N)

In [11]:
import torch.nn as nn

# can't use sequential container as RNN contains the feedback loop
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim = 50)
        self.rnn = nn.RNN(50, 64, batch_first=True)
        self.fc = nn.Linear(64, vocab_size)

    def forward(self, question):
        embedded_que = self.embedding(question)
        hidden_state, final_state = self.rnn(embedded_que)
        output = self.fc(final_state.squeeze(0))
        return output

In [12]:
# architecture debug

x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True) # indicating first dim will be the batch size (ie.,1)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("Before sending to Embedding layer\nshape of a: ", a.shape)
b = x(a)
print("\nAfter sending to Embedding layer\nshape of b: ", b.shape)
c, d = y(b)
print('\n Intermidiate hidden states\nshape of c: ', c.shape)
print('\n Final output state\nshape of d: ', d.shape)

e = z(d.squeeze(0))
print('\n probabilities\nshape of e: ', e.shape)

Before sending to Embedding layer
shape of a:  torch.Size([1, 6])

After sending to Embedding layer
shape of b:  torch.Size([1, 6, 50])

 Intermidiate hidden states
shape of c:  torch.Size([1, 6, 64])

 Final output state
shape of d:  torch.Size([1, 1, 64])

 probabilities
shape of e:  torch.Size([1, 324])


In [13]:
learning_rate = 0.001
epochs = 20

In [14]:
model = SimpleRNN(len(vocabulary))

lossFunction = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
# training
for epoch in range(epochs):
    total_loss = 0
    for question, answer in dataloader:
        optimizer.zero_grad()
        
        pred = model(question)
        # need to calculate the numerical ans as we get the prob now
        loss = lossFunction(pred, answer[0])

        loss.backward() # calc. grads
        optimizer.step() # update grads
        total_loss = total_loss + loss.item()

    print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

Epoch: 1, Loss: 520.952395
Epoch: 2, Loss: 452.768232
Epoch: 3, Loss: 375.116854
Epoch: 4, Loss: 310.436860
Epoch: 5, Loss: 257.495051
Epoch: 6, Loss: 209.023701
Epoch: 7, Loss: 165.199552
Epoch: 8, Loss: 127.763757
Epoch: 9, Loss: 97.596924
Epoch: 10, Loss: 74.569146
Epoch: 11, Loss: 57.970388
Epoch: 12, Loss: 45.692397
Epoch: 13, Loss: 36.641284
Epoch: 14, Loss: 29.997220
Epoch: 15, Loss: 24.905350
Epoch: 16, Loss: 20.962511
Epoch: 17, Loss: 17.730291
Epoch: 18, Loss: 15.193464
Epoch: 19, Loss: 12.999302
Epoch: 20, Loss: 11.334754


In [16]:
def predict(model, question, threshold=0.5):
    # convert question text to vector
    vector = text_to_indices(question, vocabulary)
    ques_tensor = torch.tensor(vector).unsqueeze(0)

    output = model(ques_tensor)
    # convert logits to probs
    probs = nn.functional.softmax(output, dim=1)
    # find max prob
    max_val, idx = torch.max(probs, dim=1)

    if max_val < threshold:
        print("I don't know")

    output = list(vocabulary.keys())[idx]
    print(output)

In [17]:
predict(model, 'What is the chemical symbol for gold?')

au
