<a href="https://colab.research.google.com/github/Arpit1118/Pytorch/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [2]:
class SyntheticDataset(Dataset):
    def __init__(self, num_samples=10000, seq_length=20, vocab_size=50):
        self.num_samples = num_samples
        self.seq_length = seq_length
        self.vocab_size = vocab_size

        # Generate random sequences and labels
        self.sequences = np.random.randint(1, vocab_size, size=(num_samples, seq_length))
        self.labels = np.random.randint(0, 2, size=num_samples)  # Binary labels (0 or 1)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

# Generate a synthetic dataset with 10,000 samples, each with a sequence length of 20
dataset = SyntheticDataset(num_samples=10000, seq_length=20, vocab_size=100)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [3]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x shape: (batch_size, seq_length)
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        rnn_out, _ = self.rnn(embedded)  # RNN output shape: (batch_size, seq_length, hidden_size)
        out = rnn_out[:, -1, :]  # Get the output of the last time step (batch_size, hidden_size)
        out = self.fc(out)  # Shape: (batch_size, output_size)
        return out


In [4]:
# Parameters
vocab_size = 100  # Vocabulary size (number of unique words)
embedding_dim = 32  # Dimension of the word embeddings
hidden_size = 64  # Hidden size of the RNN
output_size = 2  # Binary classification (0 or 1)
num_epochs = 10
learning_rate = 0.001

# Instantiate the model, loss function, and optimizer
model = RNNModel(vocab_size, embedding_dim, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for seq, label in dataloader:
        # Forward pass
        optimizer.zero_grad()
        outputs = model(seq)

        # Compute loss and backpropagate
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == label).sum().item()
        total_samples += label.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples * 100
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')


Epoch [1/10], Loss: 0.6988, Accuracy: 50.43%
Epoch [2/10], Loss: 0.6939, Accuracy: 51.96%
Epoch [3/10], Loss: 0.6908, Accuracy: 53.45%
Epoch [4/10], Loss: 0.6892, Accuracy: 53.41%
Epoch [5/10], Loss: 0.6872, Accuracy: 53.91%
Epoch [6/10], Loss: 0.6849, Accuracy: 55.46%
Epoch [7/10], Loss: 0.6819, Accuracy: 55.89%
Epoch [8/10], Loss: 0.6794, Accuracy: 56.79%
Epoch [9/10], Loss: 0.6737, Accuracy: 58.15%
Epoch [10/10], Loss: 0.6698, Accuracy: 58.02%


In [5]:
# Testing the model
model.eval()
test_sequence = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]], dtype=torch.long)
with torch.no_grad():
    output = model(test_sequence)
    predicted_label = torch.argmax(output, dim=1)
    print(f'Predicted label: {predicted_label.item()}')


Predicted label: 1


**Question and Answer Project using RNN**

In [6]:
import pandas as pd
df = pd.read_csv('/content/100_Unique_QA_Dataset.csv')
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [7]:
#tokenize
def tokenize (text):
  text = text.lower()
  text = text.replace('?', '')
  text = text.replace("'", '')
  return text.split()

In [8]:
tokenize(df['question'][0])

['what', 'is', 'the', 'capital', 'of', 'france']

In [9]:
#vocab
vocab = {'<UNK>': 0}

In [10]:
def build_vocab(row):
  print(row['question'], row['answer'])
  tokenized_question = tokenize(row['question'])
  tokenized_answer = tokenize(row['answer'])
  merged_tokens = tokenized_question + tokenized_answer
  for token in merged_tokens:
    if token not in vocab:
      vocab[token] = len(vocab)
df.apply(build_vocab, axis=1)

What is the capital of France? Paris
What is the capital of Germany? Berlin
Who wrote 'To Kill a Mockingbird'? Harper-Lee
What is the largest planet in our solar system? Jupiter
What is the boiling point of water in Celsius? 100
Who painted the Mona Lisa? Leonardo-da-Vinci
What is the square root of 64? 8
What is the chemical symbol for gold? Au
Which year did World War II end? 1945
What is the longest river in the world? Nile
What is the capital of Japan? Tokyo
Who developed the theory of relativity? Albert-Einstein
What is the freezing point of water in Fahrenheit? 32
Which planet is known as the Red Planet? Mars
Who is the author of '1984'? George-Orwell
What is the currency of the United Kingdom? Pound
What is the capital of India? Delhi
Who discovered gravity? Newton
How many continents are there on Earth? 7
Which gas do plants use for photosynthesis? CO2
What is the smallest prime number? 2
Who invented the telephone? Alexander-Graham-Bell
What is the capital of Australia? Canber

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
85,
86,
87,
88,


In [11]:
len(vocab)

324

In [12]:
#convert words to numerical indices
def text_to_indices(text, vocab):
  indexed_text = []
  for token in tokenize(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])
  return indexed_text

In [13]:
text_to_indices(df['question'][0], vocab)

[1, 2, 3, 4, 5, 6]

In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

In [15]:
class QADatset(Dataset):
  def __init__(self,df, vocab):
    self.df = df
    self.vocab = vocab

  def __len__(self):
    return self.df.shape[0]

  def __getitem__(self, index):
     numerical_question = text_to_indices(self.df.iloc[index]['question'], self.vocab)
     numerical_answer = text_to_indices(self.df.iloc[index]['answer'], self.vocab)

     return torch.tensor(numerical_question), torch.tensor(numerical_answer)

In [16]:
dataset = QADatset(df, vocab)

In [17]:
dataset[0]

(tensor([1, 2, 3, 4, 5, 6]), tensor([7]))

In [18]:
dataset[10]

(tensor([ 1,  2,  3,  4,  5, 53]), tensor([54]))

In [19]:
dataloader = DataLoader(dataset, batch_size=1,shuffle = True)

In [20]:
for question, answer in dataloader:
  print(question, answer)


tensor([[  1,   2,   3,  37,  38,  39, 161]]) tensor([[162]])
tensor([[  1,   2,   3, 163, 164, 165,  83,  84]]) tensor([[166]])
tensor([[  1,  87, 229, 230, 231, 232]]) tensor([[233]])
tensor([[ 1,  2,  3,  4,  5, 53]]) tensor([[54]])
tensor([[ 1,  2,  3, 69,  5, 53]]) tensor([[260]])
tensor([[  1,   2,   3, 212,   5,  14, 213, 214]]) tensor([[215]])
tensor([[  1,   2,   3,   4,   5, 109]]) tensor([[317]])
tensor([[ 42, 101,   2,   3,  17]]) tensor([[102]])
tensor([[ 42, 290, 291, 118, 292, 158, 293, 294]]) tensor([[295]])
tensor([[  1,   2,   3,  92, 137,  19,   3,  45]]) tensor([[185]])
tensor([[ 1,  2,  3, 59, 25,  5, 26, 19, 60]]) tensor([[61]])
tensor([[  1,   2,   3,   4,   5, 135]]) tensor([[136]])
tensor([[  1,   2,   3, 146, 147,  19, 148]]) tensor([[149]])
tensor([[ 42,  18,   2,   3, 281,  12,   3, 282]]) tensor([[205]])
tensor([[  1,   2,   3,   4,   5, 286]]) tensor([[287]])
tensor([[1, 2, 3, 4, 5, 6]]) tensor([[7]])
tensor([[10, 11, 12, 13, 14, 15]]) tensor([[16]])
tenso

In [33]:
class SimpleRNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim=50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, vocab_size)

  def forward(self, question):
    embedded_question = self.embedding(question)
    hidden, final = self.rnn(embedded_question)
    output = self.fc(final.squeeze(0))

    return output

In [34]:
x = nn.Embedding(324, embedding_dim=50)
y = nn.RNN(50, 64, batch_first=True)
z = nn.Linear(64, 324)

a = dataset[0][0].reshape(1,6)
print("shape of a:", a.shape)
b = x(a)
print("shape of b:", b.shape)
c, d = y(b)
print("shape of c:", c.shape)
print("shape of d:", d.shape)

e = z(d.squeeze(0))

print("shape of e:", e.shape)

shape of a: torch.Size([1, 6])
shape of b: torch.Size([1, 6, 50])
shape of c: torch.Size([1, 6, 64])
shape of d: torch.Size([1, 1, 64])
shape of e: torch.Size([1, 324])


In [35]:
learning_rate = 0.001
epochs = 20
model = SimpleRNN(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [36]:
#training loop
for epoch in range(epochs):

  total_loss = 0

  for question, answer in dataloader:

    optimizer.zero_grad()

    # forward pass
    output = model(question)

    # loss -> output shape (1,324) - (1)
    loss = criterion(output, answer[0])

    # gradients
    loss.backward()

    # update
    optimizer.step()

    total_loss = total_loss + loss.item()

  print(f"Epoch: {epoch+1}, Loss: {total_loss:4f}")

Epoch: 1, Loss: 527.442824
Epoch: 2, Loss: 450.401902
Epoch: 3, Loss: 372.538307
Epoch: 4, Loss: 314.403866
Epoch: 5, Loss: 262.642605
Epoch: 6, Loss: 215.114728
Epoch: 7, Loss: 170.816049
Epoch: 8, Loss: 133.988042
Epoch: 9, Loss: 103.086330
Epoch: 10, Loss: 79.212963
Epoch: 11, Loss: 60.857895
Epoch: 12, Loss: 47.842462
Epoch: 13, Loss: 38.288151
Epoch: 14, Loss: 30.724669
Epoch: 15, Loss: 25.302699
Epoch: 16, Loss: 21.125778
Epoch: 17, Loss: 17.889012
Epoch: 18, Loss: 15.288737
Epoch: 19, Loss: 13.155446
Epoch: 20, Loss: 11.413942
