In [None]:
import pandas as pd
import torch
import torch.nn as nn

# Load the question/answer pairs and peek at one sample question.
df = pd.read_csv(filepath_or_buffer='/content/100_Unique_QA_Dataset.csv')
df.loc[4, 'question']

'What is the boiling point of water in Celsius?'

In [None]:
def tokenize(text):
  """Lower-case ``text``, delete every ``?`` and ``'``, and split on whitespace.

  Returns the list of remaining tokens (an empty list when nothing is left).
  """
  # One translate pass removes both punctuation characters at once.
  strip_table = str.maketrans('', '', "?'")
  return text.lower().translate(strip_table).split()
# Sanity-check the tokenizer on one question from the dataset.
st = tokenize(df.loc[4, 'question'])
st

['what', 'is', 'the', 'boiling', 'point', 'of', 'water', 'in', 'celsius']

In [None]:
# Token -> integer id mapping; id 0 is reserved for out-of-vocabulary tokens.
vocab = {'<UNK>':0}
def build_vocab(row):
  """Register every unseen token of one row in the module-level ``vocab``.

  ``row`` must provide ``'question'`` and ``'answer'`` text entries. Question
  tokens are visited before answer tokens, and each new token receives the
  next free id (the current size of ``vocab``).
  """
  row_tokens = [*tokenize(row['question']), *tokenize(row['answer'])]
  for tok in row_tokens:
    # setdefault leaves already-known tokens untouched.
    vocab.setdefault(tok, len(vocab))
# Feed every row through build_vocab, then report the vocabulary size.
df.apply(build_vocab, axis='columns')
print(len(vocab))

324


In [None]:
def text_to_indices(text,vocab):
  """Tokenize ``text`` and translate each token into its id from ``vocab``.

  Tokens that are not keys of ``vocab`` are replaced by the id stored under
  ``'<UNK>'``. Returns a list of ids, one per token.
  """
  return [
    vocab[tok] if tok in vocab else vocab['<UNK>']
    for tok in tokenize(text)
  ]

# Encode one sample question to check the mapping.
text_to_indices(df.loc[4, 'question'], vocab)

[1, 2, 3, 24, 25, 5, 26, 19, 27]

In [None]:
from torch.utils.data import Dataset,DataLoader
class QADataset(Dataset):
  """Map-style dataset that yields (question, answer) token-id tensors.

  Args:
    df: DataFrame with ``'question'`` and ``'answer'`` text columns.
    vocab: token -> id mapping handed to ``text_to_indices``.
  """

  def __init__(self,df,vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    """Return the number of question/answer pairs."""
    return len(self.df)

  def __getitem__(self,idx):
    """Return the pair at position ``idx`` as two 1-D ``torch.long`` tensors."""
    row = self.df.iloc[idx]
    question = text_to_indices(row['question'],self.vocab)
    answer = text_to_indices(row['answer'],self.vocab)
    # Pin the dtype: torch.tensor([]) would otherwise come out as float32,
    # which nn.Embedding does not accept as an index tensor.
    return (torch.tensor(question, dtype=torch.long),
            torch.tensor(answer, dtype=torch.long))

# Wrap the frame in a Dataset and iterate it in random order. With
# batch_size=1 every batch holds a single variable-length sequence.
dataset = QADataset(df=df, vocab=vocab)

dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)


In [None]:
# Shape walk-through on one sample: token ids -> embeddings -> RNN.
x = nn.Embedding(len(vocab), 10)
print(dataset[0][0])
a = x(dataset[0][0])
print(a.size())
b = nn.RNN(input_size=10, hidden_size=4)
# nn.RNN returns a tuple (output, h_n): `output` holds the hidden state at
# every time step, `h_n` is the hidden state after the last step. Because the
# result is a tuple rather than a single tensor, the layer cannot simply be
# chained inside nn.Sequential.
print(b(a))

tensor([1, 2, 3, 4, 5, 6])
torch.Size([6, 10])
(tensor([[ 0.8095, -0.0420,  0.5664,  0.5459],
        [ 0.8267,  0.1697,  0.8288,  0.4065],
        [ 0.7428, -0.0391, -0.4393, -0.1213],
        [ 0.7679,  0.4496,  0.2577,  0.0430],
        [ 0.2064, -0.3706,  0.1613,  0.0528],
        [ 0.2760, -0.8273,  0.7246, -0.3887]], grad_fn=<SqueezeBackward1>), tensor([[ 0.2760, -0.8273,  0.7246, -0.3887]], grad_fn=<SqueezeBackward1>))


In [None]:
class simpleRNN(nn.Module):
  """Embedding -> single-layer RNN -> linear classifier over the vocabulary.

  Args:
    vocab_size: number of distinct token ids; also the number of output classes.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """
  def __init__(self,vocab_size,embedding_dim=50,hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=embedding_dim)
    self.RNN = nn.RNN(embedding_dim,hidden_size,batch_first=True)
    self.fc = nn.Linear(hidden_size,vocab_size)
  def forward(self,x):
    """Return logits of shape (batch, vocab_size) for token ids ``x`` of shape (batch, seq_len)."""
    embedded = self.embedding(x)
    # nn.RNN returns (per-step outputs, final hidden state); only the final
    # hidden state, shaped (1, batch, hidden_size), feeds the classifier.
    _, final = self.RNN(embedded)
    return self.fc(final.squeeze(0))

# Hyperparameters.
lr = 0.001
epochs = 20
SEED = 42

# Seed the global RNG so weight initialisation and DataLoader shuffling are
# the same on every fresh run; without it each run trains a different model.
torch.manual_seed(SEED)

model = simpleRNN(len(vocab))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)

model.train()
for epoch in range(epochs):
  loss_per =  0  # loss summed (not averaged) over all batches of the epoch
  for batch_features,batch_labels in dataloader:
    optimizer.zero_grad()
    y_pred = model(batch_features)
    # With batch_size=1 the labels arrive as (1, answer_len); squeeze(1) turns
    # a one-token answer into the (1,) class-index target CrossEntropyLoss
    # needs. Multi-token answers are not supported by this loop.
    loss = criterion(y_pred,batch_labels.squeeze(1))
    loss.backward()
    optimizer.step()
    loss_per += loss.item()
  print(f"epoch :{epoch} , loss : {loss_per}")


epoch :0 , loss : 526.3599934577942
epoch :1 , loss : 461.7219581604004
epoch :2 , loss : 385.3423066139221
epoch :3 , loss : 319.2793312072754
epoch :4 , loss : 265.3177890777588
epoch :5 , loss : 215.34896862506866
epoch :6 , loss : 170.37253379821777
epoch :7 , loss : 131.9568995833397
epoch :8 , loss : 99.4663667678833
epoch :9 , loss : 75.6221244931221
epoch :10 , loss : 57.34659454226494
epoch :11 , loss : 44.16183368861675
epoch :12 , loss : 34.604509860277176
epoch :13 , loss : 27.678925216197968
epoch :14 , loss : 22.59835061430931
epoch :15 , loss : 18.6551845818758
epoch :16 , loss : 15.762975797057152
epoch :17 , loss : 13.324132144451141
epoch :18 , loss : 11.502157889306545
epoch :19 , loss : 9.960929203778505


In [None]:
def predict(model, question, threshold=0.5):
  """Print the model's one-token answer to ``question``, or "I don't know".

  Args:
    model: callable mapping a (1, seq_len) tensor of token ids to
      (1, vocab_size) logits.
    question: raw question text; encoded with ``text_to_indices`` and the
      module-level ``vocab``.
    threshold: minimum softmax probability required to print the top token.

  The result is printed; nothing is returned.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # A question without tokens would become an empty float tensor, which the
  # embedding layer cannot index with, so answer directly.
  if not numerical_question:
    print("I don't know")
    return

  # add the batch dimension: (seq_len,) -> (1, seq_len)
  question_tensor = torch.tensor(numerical_question, dtype=torch.long).unsqueeze(0)

  # Inference only: use eval mode, skip autograd bookkeeping, and put the
  # model back into training mode afterwards if that is how it was handed in.
  was_training = getattr(model, "training", False)
  if was_training:
    model.eval()
  try:
    with torch.no_grad():
      output = model(question_tensor)
  finally:
    if was_training:
      model.train()

  # convert logits to probs and pick the most likely token id
  probs = torch.nn.functional.softmax(output, dim=1)
  value, index = torch.max(probs, dim=1)

  if value.item() < threshold:
    print("I don't know")
  else:
    # Look the token up by its id instead of relying on dict insertion order.
    index_to_token = {idx: tok for tok, idx in vocab.items()}
    print(index_to_token[index.item()])



In [None]:
# Ask the trained model a question from the training data.
predict(model, question="What is the square root of 64")

8
