<a href="https://colab.research.google.com/github/Aarif-Mir/Pytorch/blob/main/14_pytorch_rnn_based_qa_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the question/answer pairs into a DataFrame.
# NOTE(review): the absolute path assumes the CSV was uploaded to Colab's /content directory.
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')

df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [None]:
df.shape

(90, 2)

In [None]:
# tokenize
def tokenize(text):
  """Lowercase `text`, remove '?' and "'" characters, and split on whitespace.

  Returns the resulting list of tokens (empty list for blank input).
  """
  cleaned = text.lower()
  # strip the punctuation that should not become part of a token
  for unwanted in ('?', "'"):
    cleaned = cleaned.replace(unwanted, '')
  return cleaned.split()

In [None]:
tokenize('What is the capital of France?')

['what', 'is', 'the', 'capital', 'of', 'france']

In [None]:
# vocab: maps each token to an integer id; id 0 is reserved for '<UNK>',
# which text_to_indices uses for tokens that are not in the vocabulary
vocab = {'<UNK>':0}

In [None]:
def build_vocab(row):
  """Register every unseen token of a row's question and answer in the global `vocab`.

  `row` must provide 'question' and 'answer' text entries. Each new token is
  assigned the next free integer id (the current size of `vocab`). Nothing is
  returned; the function is called for its side effect on `vocab`.
  """
  # tokenize both fields up front, question tokens first, then answer tokens
  tokens = [*tokenize(row['question']), *tokenize(row['answer'])]

  for word in tokens:
    # setdefault keeps the id of known tokens and gives new ones the next id
    vocab.setdefault(word, len(vocab))


In [None]:
# Run build_vocab on every row; it fills `vocab` in place and returns nothing,
# so the Series displayed below carries no useful values.
df.apply(build_vocab, axis=1)

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [None]:
len(vocab)

324

In [None]:
vocab

{'<UNK>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'capital': 4,
 'of': 5,
 'france': 6,
 'paris': 7,
 'germany': 8,
 'berlin': 9,
 'who': 10,
 'wrote': 11,
 'to': 12,
 'kill': 13,
 'a': 14,
 'mockingbird': 15,
 'harper-lee': 16,
 'largest': 17,
 'planet': 18,
 'in': 19,
 'our': 20,
 'solar': 21,
 'system': 22,
 'jupiter': 23,
 'boiling': 24,
 'point': 25,
 'water': 26,
 'celsius': 27,
 '100': 28,
 'painted': 29,
 'mona': 30,
 'lisa': 31,
 'leonardo-da-vinci': 32,
 'square': 33,
 'root': 34,
 '64': 35,
 '8': 36,
 'chemical': 37,
 'symbol': 38,
 'for': 39,
 'gold': 40,
 'au': 41,
 'which': 42,
 'year': 43,
 'did': 44,
 'world': 45,
 'war': 46,
 'ii': 47,
 'end': 48,
 '1945': 49,
 'longest': 50,
 'river': 51,
 'nile': 52,
 'japan': 53,
 'tokyo': 54,
 'developed': 55,
 'theory': 56,
 'relativity': 57,
 'albert-einstein': 58,
 'freezing': 59,
 'fahrenheit': 60,
 '32': 61,
 'known': 62,
 'as': 63,
 'red': 64,
 'mars': 65,
 'author': 66,
 '1984': 67,
 'george-orwell': 68,
 'currency': 69,
 'unit

In [None]:
# convert words to numerical indices
def text_to_indices(text, vocab):
  """Tokenize `text` and return the list of vocabulary ids of its tokens.

  Tokens that are missing from `vocab` are mapped to the id stored under
  the '<UNK>' key.
  """
  return [
    vocab[word] if word in vocab else vocab['<UNK>']
    for word in tokenize(text)
  ]

In [None]:
text_to_indices("what is transfer-learning ", vocab)

[1, 2, 0]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
df.shape

(90, 2)

In [None]:
class QADataset(Dataset):
  """Dataset yielding (question, answer) pairs encoded as tensors of vocabulary ids."""

  def __init__(self, df, vocab):
    # df supplies the 'question' / 'answer' text; vocab maps tokens to ids
    self.df, self.vocab = df, vocab

  def __len__(self):
    # one sample per DataFrame row
    return self.df.shape[0]

  def __getitem__(self, index):
    """Return the `index`-th pair as (question_ids, answer_ids) tensors."""
    row = self.df.iloc[index]

    # text_to_indices returns plain Python lists, so wrap them in tensors
    question_ids = text_to_indices(row['question'], self.vocab)
    answer_ids = text_to_indices(row['answer'], self.vocab)

    return torch.tensor(question_ids), torch.tensor(answer_ids)

In [None]:
dataset = QADataset(df, vocab)

In [None]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [None]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)    # padding required if batch_size > 1

In [None]:
for question, answer in dataloader:
  # The DataLoader adds a batch dimension to both tensors; answer[0] drops it
  # from the answer so only the target ids are shown next to the question.
  print(question, answer[0])

tensor([[ 10,  75, 208]]) tensor([209])
tensor([[ 42, 117, 118,   3, 119,  94, 120]]) tensor([121])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([16])
tensor([[ 42, 101,   2,   3,  17]]) tensor([102])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([260])
tensor([[ 10, 140,   3, 141, 270,  93, 271,   5,   3, 272]]) tensor([273])
tensor([[ 1,  2,  3, 69,  5,  3, 70, 71]]) tensor([72])
tensor([[ 10,  11, 157, 158, 159]]) tensor([160])
tensor([[ 42, 250, 251, 118, 252, 253]]) tensor([254])
tensor([[ 42, 263, 264,  14, 265, 266, 158, 267]]) tensor([268])
tensor([[ 10, 308,   3, 309, 310]]) tensor([311])
tensor([[  1,   2,   3,   4,   5, 236, 237]]) tensor([238])
tensor([[42, 18,  2, 62, 63,  3, 64, 18]]) tensor([65])
tensor([[ 1,  2,  3, 37, 38, 39, 40]]) tensor([41])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([149])
tensor([[ 78,  79, 261, 151,  14, 262, 153]]) tensor([36])
tensor([[ 42,  86,  87, 241, 242,  19,  39, 243]]) tensor([244])
tensor([[  1,   2,   3,  69,   5, 155]]) tensor([1

In [None]:
import torch.nn as nn

In [None]:
class SimpleRNN(nn.Module):
  """Single-layer RNN that maps a tokenized question to logits over the vocabulary.

  Pipeline: token ids -> embedding -> nn.RNN -> linear layer applied to the
  final hidden state.

  Args:
    vocab_size: number of vocabulary entries; used as the number of
      embeddings and as the size of the output layer.
    embedding_dim: size of each token embedding (default 50).
    hidden_size: size of the RNN hidden state (default 64).
  """

  def __init__(self, vocab_size, embedding_dim=50, hidden_size=64):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
    # nn.RNN returns a tuple (outputs for every time step, final hidden state).
    # nn.Sequential passes a single value from layer to layer, so the layers
    # are kept as separate attributes and wired together in forward().
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, question):
    """Return vocabulary logits for a batch of questions.

    Args:
      question: integer tensor of token ids with shape (batch, seq_len).

    Returns:
      Tensor of shape (batch, vocab_size) holding unnormalized scores.
    """
    embedded_question = self.embedding(question)
    # per-step outputs are not needed; only the final hidden state is used
    _all_outputs, final = self.rnn(embedded_question)
    # final is (num_layers=1, batch, hidden_size); drop the leading layer dim
    output = self.fc(final.squeeze(0))

    return output


### 📌 Important Points about `batch_first=True` and Batch Size

* We **never specify batch size inside the model**; it comes automatically from the input tensor during training.
* `batch_first=True` only tells PyTorch that the input shape should be **(batch_size, sequence_length, features)**.
* The input we pass to the model must already have a batch dimension.
  Example: shape `(1, 6)` for batch of 1, or `(32, 6)` for batch of 32.
* Layers like `Embedding`, `RNN`, and `Linear` automatically adapt to the batch size given in the forward pass.
* The RNN outputs two results:

  * `hidden` → outputs for **all time steps**
  * `final` → final hidden state used for prediction
* `final.squeeze(0)` removes the leading `num_layers` dimension (size 1 here) before passing it to `Linear`.



### 📌 Notes: RNN Input Shapes (Batch, Sequence, Input Size)

#### 🔷 Key Points

* In an RNN created with `batch_first=True`, the expected input shape is:

  ```
  [batch_size, sequence_length, input_size]
  ```

  (With the default `batch_first=False` it is `[sequence_length, batch_size, input_size]`.)
* We **do not specify** `batch_size` or `sequence_length` in the RNN constructor.

  * These values come **automatically from the input data** during the forward pass.
* We only specify:

  * `input_size` → dimensionality of the feature vector at each time step
  * `hidden_size` → number of units in the RNN's hidden state

---

#### 🔷 Relation with Embedding Layer

```python
embedding = nn.Embedding(vocab_size, embedding_dim=50)
rnn = nn.RNN(input_size=50, hidden_size=64, batch_first=True)
```

* The embedding converts each token (word index) into a 50-dimensional vector.
* Therefore:

  ```
  input_size = embedding_dim = 50
  ```
* `sequence_length` = number of tokens in each sentence (comes from data)
* `batch_size` = number of sentences processed together (comes from DataLoader)

---

#### 🔷 Visual Shape Flow

| Stage                       | Shape                                        |
| --------------------------- | -------------------------------------------- |
| Raw tokens from dataset     | `(batch_size, sequence_length)`              |
| After Embedding             | `(batch_size, sequence_length, 50)`          |
| RNN Output (all time steps) | `(batch_size, sequence_length, hidden_size)` |
| Final hidden state          | `(num_layers, batch_size, hidden_size)`      |

---

#### 🎯 Final Summary

> We only define `input_size` and `hidden_size` in an RNN.
> `input_size` equals the embedding dimension.
> `batch_size` and `sequence_length` are **not fixed values**, so they are **NOT passed manually** — they are determined automatically from the input tensor.



In [None]:
# Walk one question through standalone layers and print the shape after each step.
x = nn.Embedding(324, embedding_dim=50)
# With batch_first=True the input below gives d the shape torch.Size([1, 1, 64]).
y = nn.RNN(50, 64, batch_first=True)
# Alternative without batch_first: y = nn.RNN(50, 64)
# The same input would then give d the shape torch.Size([1, 6, 64]).
z = nn.Linear(64, 324)

# add the batch dimension: (6,) -> (1, 6)
a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
# c = output for each time step -> (batch, sequence, hidden)
# d = last hidden state -> (layers, batch, hidden)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

# drop the leading layers dimension of d before the linear layer
e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


**What happens if you don't add the batch dim (i.e., skip reshape)?**

- If a.shape == [6]
- then after b = x(a),
- b.shape == [6, 50].

**RNN input shapes. PyTorch's RNN accepts:**

- (seq_len, batch, input_size) when batch_first=False (default), or

- (batch, seq_len, input_size) when batch_first=True, or

- (seq_len, input_size) for a single unbatched sequence (recent PyTorch versions).

So if you pass [6, 50], recent PyTorch versions do not raise an error: the input is treated as one unbatched sequence of 6 steps, regardless of batch_first. The outputs then lose the batch dimension as well — the per-step outputs have shape [6, 64] and the final hidden state has shape [1, 64] — so the logits come out as [324] instead of [1, 324]. Older PyTorch versions reject 2-D input with an error.

So embedding alone works with [6], but to keep the batched shapes used in the rest of this notebook, reshape/unsqueeze the input to [1, 6] before passing it to the RNN.

In [None]:
dataset[0][0]

tensor([1, 2, 3, 4, 5, 6])

In [None]:
# x("Who painted the Mona Lisa?")   # embedding(): argument 'indices' (position 2) must be Tensor, not str
x(dataset[0][0].reshape(1,6))

tensor([[[-8.2547e-01, -1.6614e+00, -4.5921e-01,  1.7657e+00,  1.1599e+00,
           1.2349e+00, -8.7584e-02,  9.7212e-01,  2.6486e+00, -6.6074e-01,
          -1.1435e+00, -7.8254e-01,  2.8225e+00,  1.3831e+00,  9.3745e-01,
           9.8360e-02,  4.9870e-01,  8.9597e-01, -2.2822e+00,  1.9356e-01,
           4.8137e-01,  1.1546e+00,  6.8411e-01,  1.3812e+00,  3.1939e-01,
          -9.2934e-01, -5.8292e-01,  2.8390e-01, -2.1280e+00,  4.2512e-01,
           1.4613e+00,  1.0415e+00, -6.2212e-01,  8.8052e-01, -1.0555e+00,
          -5.1885e-01, -2.9878e-01, -3.9683e-01, -1.7984e+00,  6.8381e-01,
          -3.3902e-02,  2.9413e+00,  1.3617e+00, -5.9850e-01, -1.6009e+00,
           1.2908e+00, -3.5289e-01,  3.5460e-01, -9.1069e-01,  2.0511e-01],
         [ 2.4143e-01, -1.6102e+00, -7.7504e-01,  1.2313e-01,  2.8644e-01,
          -1.1484e-01,  4.5664e-01,  9.0020e-01, -5.0772e-01, -1.2571e+00,
           1.9566e-01, -5.8033e-01,  3.1949e-01, -3.8078e-01,  2.5643e-01,
          -1.9639e+00,  

In [None]:
learning_rate = 0.001
epochs = 20

In [None]:
model = SimpleRNN(len(vocab))

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# training loop: one optimizer step per (question, answer) pair

for epoch in range(epochs):

  # running sum of the per-sample losses for this epoch
  total_loss = 0.0

  for question, answer in dataloader:

    # clear gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass -> logits of shape (1, vocab_size) with batch_size=1
    output = model(question)

    # answer[0] drops the batch dimension so the target matches the logits' batch size
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss += loss.item()

  # ".4f" = four decimal places (the previous "4f" only set a minimum width)
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 522.839575
Epoch: 2, Loss: 459.539747
Epoch: 3, Loss: 383.053963
Epoch: 4, Loss: 315.046708
Epoch: 5, Loss: 260.704545
Epoch: 6, Loss: 211.276192
Epoch: 7, Loss: 166.976117
Epoch: 8, Loss: 128.469853
Epoch: 9, Loss: 97.677594
Epoch: 10, Loss: 74.540930
Epoch: 11, Loss: 57.334798
Epoch: 12, Loss: 44.420794
Epoch: 13, Loss: 35.437986
Epoch: 14, Loss: 28.225866
Epoch: 15, Loss: 23.409019
Epoch: 16, Loss: 19.423714
Epoch: 17, Loss: 16.354384
Epoch: 18, Loss: 13.874365
Epoch: 19, Loss: 11.956675
Epoch: 20, Loss: 10.406005


In [None]:
def predict(model, question, threshold=0.5):
  """Print the model's answer to `question`, or "I don't know" when it is unsure.

  Args:
    model: module that maps a (1, seq_len) tensor of token ids to logits of
      shape (1, vocab_size).
    question: raw question text; it is encoded with the global `vocab`.
    threshold: minimum softmax probability the best answer must reach to be
      printed.

  Returns:
    None. The answer (or "I don't know") is printed.
  """
  # convert question to numbers
  numerical_question = text_to_indices(question, vocab)

  # a question with no tokens cannot be embedded, so there is nothing to predict
  if not numerical_question:
    print("I don't know")
    return

  # tensor with a leading batch dimension: (1, seq_len)
  question_tensor = torch.tensor(numerical_question).unsqueeze(0)

  # inference only: skip building the autograd graph
  with torch.no_grad():
    output = model(question_tensor)

  # convert logits to probs
  probs = torch.nn.functional.softmax(output, dim=1)

  # best probability and its vocabulary id
  value, index = torch.max(probs, dim=1)

  if value.item() < threshold:
    print("I don't know")
    # stop here so the low-confidence answer is not printed as well
    return

  # vocab preserves insertion order, so the id doubles as a position in its keys
  print(list(vocab.keys())[index.item()])

In [None]:
predict(model, "What is the largest planet in our solar system?")

jupiter


In [None]:
list(vocab.keys())[7]

'paris'

When **each sequence in the batch has a different length**, you **cannot feed them directly into an RNN**.

Why? Because tensors in a batch must all have the *same shape*, and RNN needs shape:

```
[batch_size, max_sequence_length, input_size]
```

But if lengths differ, example:

```
seq1: [12,  8,  3,  9]           length = 4
seq2: [ 4,  7, 11]               length = 3
seq3: [ 5,  6,  1,  9, 10]       length = 5
```

These cannot form a single rectangular tensor unless we make them the **same length.**

---

## 🔴 Problem When Batch Lengths Differ

You **cannot stack them:**

```
torch.tensor([
  [12, 8, 3, 9],
  [4, 7, 11],        ❌ different length — rows cannot be stacked
  [5, 6, 1, 9, 10]
])
```

So RNN training fails.

---

# ✔️ Solutions

There are **two main correct methods**:

---

## 🍏 **Method 1: Padding + pack_padded_sequence**

### Step 1: Pad sequences to the same length

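For example, pad with a special `<PAD>` value (usually 0). Note that in this notebook index 0 is already used for `<UNK>`, so a separate index should be reserved for `<PAD>`:

```
seq1 → [12,  8,  3,  9,  0]
seq2 → [ 4,  7, 11,  0,  0]
seq3 → [ 5,  6,  1,  9, 10]
```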

```python
from torch.nn.utils.rnn import pad_sequence

padded = pad_sequence(batch, batch_first=True, padding_value=0)
# padded shape → [batch, max_len]
```

### Step 2: Use **pack_padded_sequence** so RNN ignores padded values

```python
from torch.nn.utils.rnn import pack_padded_sequence

embedded = x(padded)  # [batch, max_len, emb]

pack = pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
output, hidden = y(pack)  # RNN now ignores padded zeros
```

📌 **Advantage:** faster, ignores padding **correctly**
📌 **Recommended when training on batches of variable-length sequences**

---

## 🍎 **Method 2: Padding Only (no packing)**

You can pad and directly send into RNN:

```python
padded = pad_sequence(batch, batch_first=True)
output, hidden = y(x(padded))
```

⚠️ But the RNN **will compute on padding tokens**, which can **hurt training accuracy** and **wastes compute**.

👉 Use only if the model is simple or for quick inference tests.

---

# 🧠 Summary Table

| Method                 | Padding Needed? | RNN ignores padding? | Best use case                    |
| ---------------------- | --------------- | -------------------- | -------------------------------- |
| `pack_padded_sequence` | ✔ Yes           | ✔ Yes                | NLP training                     |
| Pad only               | ✔ Yes           | ❌ No                 | quick tests, inference           |
| No padding             | ❌               | ❌                    | ❌ impossible for batch_size > 1 |

---

# 🧩 Visualization

```
Before pad:      Different lengths ❌
Batch:
  [12, 8, 3, 9]
  [4, 7, 11]
  [5, 6, 1, 9, 10]

After pad:       Same length ✔
Batch:
  [12, 8, 3, 9,  0]
  [4, 7,11, 0,  0]
  [5, 6, 1, 9, 10]
```

Then pack ignoring padding.

