<a href="https://colab.research.google.com/github/Arpit1118/Pytorch/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install --upgrade torchtext



In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data import Field, BucketIterator
import numpy as np

# Step 1: Preprocessing the Data
# include_lengths=True makes batch.text a (tokens, lengths) pair.
# batch_first=True makes the token tensor [batch, seq_len]; RNNModel builds its
# nn.RNN and calls pack_padded_sequence with batch_first=True, so the lengths
# tensor must line up with dimension 0 of the token tensor.
TEXT = Field(sequential=True, tokenize='spacy', include_lengths=True, batch_first=True)
LABEL = Field(sequential=False, use_vocab=True, is_target=True)

# Load the IMDB dataset
train_data, test_data = IMDB.splits(TEXT, LABEL)

# Build vocab using pretrained embeddings
TEXT.build_vocab(train_data, vectors="glove.6B.100d", max_size=25000)
LABEL.build_vocab(train_data)

# Create iterators for batching
BATCH_SIZE = 64
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

# Step 2: Define the RNN model
class RNNModel(nn.Module):
    """Embedding -> multi-layer nn.RNN -> dropout -> linear classifier.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of output scores per example.
        n_layers: number of stacked RNN layers.
        dropout: dropout probability, used both between RNN layers and on
            the final hidden state before the linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text, text_lengths):
        """Score a batch of padded token-id sequences.

        Args:
            text: batch-first tensor of token ids (batch_first=True is passed
                to both the RNN and pack_padded_sequence).
            text_lengths: per-example sequence lengths; moved to CPU before packing.

        Returns:
            The linear layer applied to the last RNN layer's final hidden state.
        """
        # Packing lets the RNN skip padded positions; enforce_sorted=False means
        # the batch does not have to be ordered by length.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(text),
            text_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, final_states = self.rnn(packed)
        # final_states is indexed by layer; [-1] selects the last layer.
        top_layer_state = final_states[-1]
        return self.fc(self.dropout(top_layer_state))

# Hyperparameters (vocabulary-dependent sizes are read from the built vocabs)
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
DROPOUT = 0.5

# Model initialization
model = RNNModel(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
)

# Load pretrained word embeddings: overwrite the embedding weights in place
# with the vectors attached to the text vocabulary.
pretrained_vectors = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_vectors)

# Step 3: Training the model
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Adam with its default settings, cross-entropy over the class scores.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

def binary_accuracy(preds, y):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: tensor of class scores; the predicted class is the argmax over dim 1.
        y: tensor of target class indices, compared element-wise with the predictions.

    Returns:
        A scalar tensor: number of matches divided by the number of examples.
    """
    predicted_classes = preds.argmax(dim=1)
    hits = predicted_classes.eq(y).float()
    return hits.sum() / len(hits)

# Training loop: N_EPOCHS full passes over train_iterator, reporting the
# per-batch mean loss and accuracy after each pass.
N_EPOCHS = 5
for epoch in range(N_EPOCHS):
    model.train()  # training mode, so dropout is active
    batch_losses = []
    batch_accs = []

    for batch in train_iterator:
        # TEXT was built with include_lengths=True, so batch.text is a pair.
        text, text_lengths = batch.text
        labels = batch.label

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        predictions = model(text, text_lengths)

        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc.item())

    epoch_loss = sum(batch_losses)
    epoch_acc = sum(batch_accs)

    # Averages are taken over the number of batches, not the number of examples.
    n_train_batches = len(train_iterator)
    avg_epoch_loss = epoch_loss / n_train_batches
    avg_epoch_acc = epoch_acc / n_train_batches

    print(f'Epoch {epoch+1}/{N_EPOCHS} | Loss: {avg_epoch_loss:.3f} | Accuracy: {avg_epoch_acc*100:.2f}%')

# Step 4: Evaluate the model on the test set
model.eval()  # evaluation mode, so dropout is disabled
test_batch_losses = []
test_batch_accs = []

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_iterator:
        text, text_lengths = batch.text
        labels = batch.label

        predictions = model(text, text_lengths)

        test_batch_losses.append(criterion(predictions, labels).item())
        test_batch_accs.append(binary_accuracy(predictions, labels).item())

test_loss = sum(test_batch_losses)
test_acc = sum(test_batch_accs)

# Averages are taken over the number of batches, not the number of examples.
n_test_batches = len(test_iterator)
avg_test_loss = test_loss / n_test_batches
avg_test_acc = test_acc / n_test_batches

print(f'Test Loss: {avg_test_loss:.3f} | Test Accuracy: {avg_test_acc*100:.2f}%')


OSError: /usr/local/lib/python3.11/dist-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs