In [None]:
# %pip install -q nltk pandas sentence-transformers scikit-learn torch torchvision

## Imports

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Check for GPU

In [2]:
# Prefer the CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


## Data Ingestion

In [3]:
# Load the question/answer pairs from the CSV file into a DataFrame.
# NOTE(review): this is an absolute path, so the notebook only runs where that file exists.
df = pd.read_csv("/content/100_Unique_QA_Dataset.csv")
# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [4]:
# Tokenizer shared by vocabulary construction and text encoding.
def tokenize(sentence):
    """Lower-case ``sentence`` and split it into tokens with NLTK's ``word_tokenize``."""
    lowered = sentence.lower()
    return word_tokenize(lowered)

In [5]:
# Token -> integer index mapping; index 0 is reserved for unknown tokens.
vocabulary = {'<UNK>': 0}


def build_vocabulary(row):
    """Add every unseen token of a row's question and answer to ``vocabulary``.

    ``row`` must support ``row['question']`` and ``row['answer']``. Each new
    token receives the next free index (the vocabulary's current size);
    tokens that are already present keep their index. Returns ``None``.
    """
    question_tokens = tokenize(row['question'])
    answer_tokens = tokenize(row['answer'])
    for token in question_tokens + answer_tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)


In [6]:
# Call build_vocabulary once per row (axis=1) so the module-level `vocabulary` is filled.
# build_vocabulary returns nothing, so the Series displayed below holds only empty values.
df.apply(build_vocabulary, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [7]:
# Encode text as a list of vocabulary indices.
def text_to_num(text, vocabulary):
    """Tokenize ``text`` and map each token to its index in ``vocabulary``.

    Tokens that are missing from ``vocabulary`` are mapped to the index stored
    under ``'<UNK>'``. Returns a list with one integer per token.
    """
    return [
        vocabulary[token] if token in vocabulary else vocabulary['<UNK>']
        for token in tokenize(text)
    ]

In [8]:
class CustomDataset(Dataset):
    """Dataset of (question, answer) pairs encoded as vocabulary-index tensors.

    Args:
        df: DataFrame with ``'question'`` and ``'answer'`` text columns.
        vocab: mapping from token to integer index; must contain ``'<UNK>'``
            for tokens that are not in the mapping.
    """

    def __init__(self, df, vocab):
        self.df = df
        # Store the vocabulary that was passed in. The previous code ignored
        # `vocab` and silently read a module-level global instead.
        self.vocabulary = vocab

    def __len__(self):
        """Return the number of rows (question/answer pairs)."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``(question_indices, answer_indices)`` for row ``index``.

        Both items are 1-D ``torch.long`` tensors with one entry per token.
        """
        row = self.df.iloc[index]
        question = text_to_num(row['question'], self.vocabulary)
        answer = text_to_num(row['answer'], self.vocabulary)
        # An explicit integer dtype keeps empty token lists from becoming float
        # tensors, which cannot be used as embedding indices.
        return (
            torch.tensor(question, dtype=torch.long),
            torch.tensor(answer, dtype=torch.long),
        )

In [9]:
# Wrap the loaded DataFrame and the populated vocabulary in the Dataset class.
dataset = CustomDataset(df, vocabulary)

In [10]:
# One sample per batch: each question is encoded to its own token count and no
# padding/collate function is supplied, so samples are not stacked together.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    # torch reports CUDA devices with type "cuda" (never "gpu"); the previous
    # comparison was always False, so pinned memory was never enabled.
    pin_memory=device.type == "cuda",
)

## RNN Design

In [11]:
class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

    Args:
        vocabulary_size: number of entries in the vocabulary; used both as the
            embedding table size and as the number of output classes.
        embedding_dim: size of each token embedding (default 50).
        hidden_size: size of the RNN hidden state (default 64).
    """

    def __init__(self, vocabulary_size, embedding_dim=50, hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocabulary_size)

    def forward(self, question):
        """Return vocabulary logits for a batch of token-index sequences.

        Args:
            question: integer tensor of shape ``(batch, seq_len)``
                (the RNN is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, vocabulary_size)`` with unnormalised scores.
        """
        embedded_question = self.embedding(question)
        # nn.RNN returns (per-step outputs, final hidden state); only the final
        # hidden state is used for classification.
        _, final_hidden = self.rnn(embedded_question)
        # Drop the leading num_layers dimension (size 1 for this single-layer RNN).
        return self.fc(final_hidden.squeeze(0))

## Model Training

In [12]:
# Training hyperparameters: the optimizer's learning rate and the number of
# full passes over the dataloader.
learning_rate = 0.001
epochs = 50

In [13]:
# Build the model with one output class per vocabulary entry and move it to `device`.
model = RNN(len(vocabulary)).to(device)
# Cross-entropy between the vocabulary-sized logits and the target token index.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters, using the learning rate configured above.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
# The Training Loop
# Performs one optimizer step per batch from `dataloader`, for `epochs` passes,
# and prints the mean batch loss after each pass.
for epoch in range(epochs):
    # running sum of per-batch losses for this epoch
    epoch_loss = 0
    for Q, A in dataloader:
        # move the batch to the selected device
        Q = Q.to(device)
        A = A.to(device)

        # forward pass
        outputs = model(Q)

        # calculate loss
        # A[0] drops the batch dimension that the DataLoader added.
        # NOTE(review): presumably every answer encodes to exactly one token, so
        # the target length matches the single row of logits — confirm against the data.
        loss = loss_fn(outputs, A[0])

        # backward pass (clear gradients left over from the previous step first)
        optimizer.zero_grad()
        loss.backward()

        # update weights
        optimizer.step()

        epoch_loss += loss.item()

    # report the average loss per batch for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(dataloader)}")

Epoch 1/50, Loss: 5.921916580200195
Epoch 2/50, Loss: 4.82476610077752
Epoch 3/50, Loss: 4.219364629851447
Epoch 4/50, Loss: 3.7819679048326282
Epoch 5/50, Loss: 3.3388798051410253
Epoch 6/50, Loss: 2.896216940879822
Epoch 7/50, Loss: 2.4662009888225134
Epoch 8/50, Loss: 2.06169234779146
Epoch 9/50, Loss: 1.6944254312250349
Epoch 10/50, Loss: 1.3725722647375531
Epoch 11/50, Loss: 1.1002529614501528
Epoch 12/50, Loss: 0.8776591138707267
Epoch 13/50, Loss: 0.7010260706146558
Epoch 14/50, Loss: 0.5635285642411973
Epoch 15/50, Loss: 0.45737808876567415
Epoch 16/50, Loss: 0.37544551144043603
Epoch 17/50, Loss: 0.3118136181599564
Epoch 18/50, Loss: 0.2618289166026645
Epoch 19/50, Loss: 0.2221282775203387
Epoch 20/50, Loss: 0.1902682203385565
Epoch 21/50, Loss: 0.16454540408319898
Epoch 22/50, Loss: 0.14348921589553357
Epoch 23/50, Loss: 0.12618121554454167
Epoch 24/50, Loss: 0.11171999524037043
Epoch 25/50, Loss: 0.0995940708865722
Epoch 26/50, Loss: 0.08927302418483628
Epoch 27/50, Loss: 0.

## Prediction

In [15]:
def predict(model, question, vocabulary, threshold=0.5, device="cpu"):
    """Predict a single answer token for ``question``.

    Args:
        model: callable mapping a ``(1, seq_len)`` index tensor to
            ``(1, vocabulary_size)`` logits.
        question: question text; encoded with ``text_to_num``.
        vocabulary: mapping from token to integer index.
        threshold: minimum softmax probability needed to return the
            predicted token.
        device: device on which the input tensor is created.

    Returns:
        The token whose vocabulary index has the highest probability, or
        ``"I don't know"`` when the question produces no tokens or the
        top probability is below ``threshold``.
    """
    indices = text_to_num(question, vocabulary)
    # With no tokens there is nothing to feed the model, so answer as "unknown"
    # instead of passing it a zero-length sequence.
    if not indices:
        return "I don't know"
    tensor = torch.tensor(indices, dtype=torch.long, device=device).unsqueeze(0)

    # Run model & get logits ---SoftMax()---> probabilities
    with torch.no_grad():
        probs = torch.softmax(model(tensor), dim=1)
        conf, pred_idx = torch.max(probs, dim=1)

    # Return prediction if confident
    if conf.item() < threshold:
        return "I don't know"

    # Decode through the stored index values rather than the dict's key order,
    # so the result stays correct even if insertion order and indices diverge.
    index_to_token = {index: token for token, index in vocabulary.items()}
    return index_to_token[pred_idx.item()]

In [16]:
question = "Which year did World War II end?"
predict(model, question, vocabulary, device=device)

'1945'