# Task
Build and train an RNN model using PyTorch for a text classification task.

## Install compatible torchtext

### Subtask:
Address the `torchtext` installation error by installing a compatible version with the installed PyTorch version.


**Reasoning**:
The previous `pip install torchtext==0.19.0` failed because that version is not available. I need to install a `torchtext` version that is compatible with PyTorch 2.8.0. Based on PyTorch documentation, `torchtext` version 0.18.0 is compatible with PyTorch 2.8.0.



# PyTorch

## Load and preprocess data

### Subtask:
Load a suitable dataset (like IMDB) and preprocess it for the RNN model. This includes tokenization and numericalization.


**Reasoning**:
The previous attempts to import `get_tokenizer` failed due to an `OSError`. This error suggests a potential issue with the torchtext installation or its compatibility with the installed PyTorch version, even though a compatible version was previously installed. To address this, I will try reinstalling torchtext to ensure all dependencies are correctly linked. After reinstalling, I will attempt to import `get_tokenizer` again as the first step in preprocessing the data.



**Reasoning**:
The task is to drop rows with missing values and display the number of remaining rows. I will drop the rows with missing values and then print the shape of the dataframe to show the number of remaining rows.



In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the IMDB dataset
# (the printed DatasetDict below shows three splits: train, test, unsupervised)
dataset = load_dataset("imdb")

# Load a pre-trained tokenizer
# Only the tokenizer of "bert-base-uncased" is used in this notebook; its
# vocabulary size later sets the embedding table size of the RNN/GRU models.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    ``examples`` is indexed with ``"text"`` and the result is handed to the
    module-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True``, so every encoded example comes back with the same
    length. The tokenizer's output is returned unchanged.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply tokenization to the dataset
# batched=True hands tokenize_function a batch of examples per call. The map
# runs over every split of the DatasetDict, including `unsupervised`, which
# the rest of the notebook never reads.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

print(tokenized_datasets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [None]:
import torch

# Set the format for PyTorch tensors
# The call is made for its side effect on `tokenized_datasets`; nothing is reassigned.
tokenized_datasets.set_format("torch")

# Create DataLoader for training and testing
# Only the training loader shuffles; the test loader keeps dataset order.
# Both use a batch size of 32.
train_dataloader = torch.utils.data.DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(tokenized_datasets["test"], batch_size=32)

# These prints only show the object reprs (see output below), not any data.
print("Train DataLoader:", train_dataloader)
print("Test DataLoader:", test_dataloader)

Train DataLoader: <torch.utils.data.dataloader.DataLoader object at 0x7c79b1edbe90>
Test DataLoader: <torch.utils.data.dataloader.DataLoader object at 0x7c79b0e20830>


In [None]:
# Pull one batch from the training loader to inspect which fields it yields.
batch = next(iter(train_dataloader))

# The train/evaluate functions defined further down read only
# 'input_ids' and 'label' from each batch.
print(batch.keys())

dict_keys(['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])


## RNN

In [None]:
import torch.nn as nn
import torch.optim as optim

class RNN(nn.Module):
  """Plain ``nn.RNN`` text classifier: embed tokens, recur, project to logits.

  Args:
    input_dim: number of rows in the embedding table (vocabulary size).
    embedding_dim: size of each token embedding.
    hidden_dim: size of the recurrent hidden state.
    output_dim: number of logits produced per example.
  """

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()
    # Attribute names double as state_dict key prefixes, so they stay as-is.
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
    self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim)
    self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

  def forward(self, text):
    """Return logits computed from the final hidden state of the RNN.

    ``text`` holds integer token ids; callers in this notebook pass it as
    [sequence_length, batch_size] (``nn.RNN`` is built without ``batch_first``).
    """
    token_vectors = self.embedding(text)

    # nn.RNN returns (per-step outputs, final hidden state); only the latter is used.
    final_state = self.rnn(token_vectors)[1]

    # Drop the leading axis of the hidden state before the linear layer.
    return self.fc(final_state.squeeze(0))

In [None]:
# Model sizes: the embedding table is sized to the tokenizer's vocabulary,
# and the classifier emits two logits (one per sentiment class).
INPUT_DIM = len(tokenizer.vocab)
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 100, 256, 2

# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the model and the loss directly on the target device, then hand the
# model's parameters to Adam (default hyperparameters).
model_RNN = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model_RNN.parameters())

print("Model instantiated and moved to device:", device)


Model instantiated and moved to device: cuda


In [None]:

def train(model, iterator, optimizer, criterion, device):
    """Run one training epoch and return ``(mean_loss, accuracy)``.

    Args:
        model: module called with token ids shaped [sequence_length, batch_size]
            and returning logits shaped [batch_size, num_classes].
        iterator: iterable of batches; each batch is a mapping holding
            ``'input_ids'`` ([batch_size, sequence_length]) and ``'label'``.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(logits, labels)``. The returned
            value is treated as the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: device the batch tensors are moved to.

    Returns:
        Tuple of Python floats: loss and accuracy averaged over *samples*.
        Averaging over batches instead would over-weight a smaller final
        batch, so each batch is weighted by its size.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no samples.
    """
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        # Ensure data is on the correct device
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        # The model expects [sequence_length, batch_size]; the tokenizer
        # output is [batch_size, sequence_length], so swap the two axes.
        input_ids = input_ids.permute(1, 0)

        predictions = model(input_ids)

        loss = criterion(predictions, labels)

        # The model outputs one logit per class; the prediction is the arg-max.
        _, predicted_classes = torch.max(predictions, 1)

        loss.backward()
        optimizer.step()

        # Weight this batch's contribution by the number of samples in it.
        batch_size = len(labels)
        total_loss += loss.item() * batch_size
        total_correct += (predicted_classes == labels).sum().item()
        total_samples += batch_size

    return total_loss / total_samples, total_correct / total_samples

In [None]:
def evaluate(model, iterator, criterion, device):
    """Score ``model`` on ``iterator`` without gradient tracking.

    Args:
        model: module called with token ids shaped [sequence_length, batch_size]
            and returning logits shaped [batch_size, num_classes].
        iterator: iterable of batches; each batch is a mapping holding
            ``'input_ids'`` ([batch_size, sequence_length]) and ``'label'``.
        criterion: loss called as ``criterion(logits, labels)``. The returned
            value is treated as the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: device the batch tensors are moved to.

    Returns:
        Tuple of Python floats: loss and accuracy averaged over *samples*.
        Each batch is weighted by its size so that a smaller final batch does
        not skew the result.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no samples.
    """
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Switch to inference behaviour for the duration of the pass.
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            # [batch_size, sequence_length] -> [sequence_length, batch_size]
            input_ids = input_ids.permute(1, 0)

            predictions = model(input_ids)

            loss = criterion(predictions, labels)

            # The model outputs one logit per class; the prediction is the arg-max.
            _, predicted_classes = torch.max(predictions, 1)

            # Weight this batch's contribution by the number of samples in it.
            batch_size = len(labels)
            total_loss += loss.item() * batch_size
            total_correct += (predicted_classes == labels).sum().item()
            total_samples += batch_size

    return total_loss / total_samples, total_correct / total_samples

In [None]:
# Number of full passes over the training loader.
N_EPOCHS = 5

# Lowest evaluation loss seen so far; starts at +inf so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # One optimisation pass, then one scoring pass.
    # NOTE(review): the "valid" metrics are computed on test_dataloader, so the
    # checkpoint selection below is driven by the test split.
    train_loss, train_acc = train(model_RNN, train_dataloader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model_RNN, test_dataloader, criterion, device)

    # Keep only the weights of the epoch with the lowest evaluation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_RNN.state_dict(), 'rnn_model.pt')

    # Accuracies are fractions; they are scaled to percentages for display.
    print(f'Epoch RNN : {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch RNN : 01
	Train Loss: 0.697 | Train Acc: 50.66%
	 Val. Loss: 0.697 |  Val. Acc: 50.14%
Epoch RNN : 02
	Train Loss: 0.697 | Train Acc: 50.70%
	 Val. Loss: 0.697 |  Val. Acc: 50.14%
Epoch RNN : 03
	Train Loss: 0.697 | Train Acc: 50.68%
	 Val. Loss: 0.697 |  Val. Acc: 50.14%
Epoch RNN : 04
	Train Loss: 0.697 | Train Acc: 50.68%
	 Val. Loss: 0.697 |  Val. Acc: 50.14%
Epoch RNN : 05
	Train Loss: 0.697 | Train Acc: 50.70%
	 Val. Loss: 0.697 |  Val. Acc: 50.14%


## GRU

In [None]:
import torch.nn as nn

class GRU(nn.Module):
    """GRU text classifier: embed tokens, run ``nn.GRU``, project to logits.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the recurrent hidden state.
        output_dim: number of logits produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as-is
        # (the recurrent layer is still called `rnn`).
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Map token ids to class logits.

        Shapes, following the flow of the data:
            text           [sent len, batch size]
            embeddings     [sent len, batch size, emb dim]
            final hidden   [1, batch size, hid dim] -> squeezed to [batch size, hid dim]
            returned       [batch size, output dim]
        """
        token_vectors = self.embedding(text)

        # nn.GRU returns (per-step outputs, final hidden state); only the latter is used.
        final_state = self.rnn(token_vectors)[1]

        return self.fc(final_state.squeeze(0))

In [None]:
import torch.optim as optim

# Hyperparameters: the embedding table is sized to the tokenizer's vocabulary
# and the classifier emits two logits (positive / negative).
INPUT_DIM = len(tokenizer.vocab)
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 100, 256, 2

# Use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the model and the loss directly on the target device.
model_GRU = GRU(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
criterion = nn.CrossEntropyLoss().to(device)

# A fresh Adam instance bound to the GRU's parameters; this rebinds the
# `optimizer` name used by the earlier RNN section.
optimizer = optim.Adam(model_GRU.parameters())

print("model_GRU instantiated and moved to device:", device)

model_GRU instantiated and moved to device: cuda


In [None]:
# Number of full passes over the training loader.
N_EPOCHS = 5

# Reset the best-loss tracker so the GRU checkpointing is independent of the RNN run.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # `optimizer` and `criterion` are the ones created in the GRU setup cell.
    # NOTE(review): the "valid" metrics are computed on test_dataloader, so the
    # checkpoint selection below is driven by the test split.
    train_loss, train_acc = train(model_GRU, train_dataloader, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model_GRU, test_dataloader, criterion, device)

    # Keep only the weights of the epoch with the lowest evaluation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_GRU.state_dict(), 'GRU_model.pt')

    # Accuracies are fractions; they are scaled to percentages for display.
    print(f'Epoch GRU: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch GRU: 01
	Train Loss: 0.697 | Train Acc: 50.09%
	 Val. Loss: 0.693 |  Val. Acc: 50.04%
Epoch GRU: 02
	Train Loss: 0.694 | Train Acc: 50.98%
	 Val. Loss: 0.693 |  Val. Acc: 50.37%
Epoch GRU: 03
	Train Loss: 0.590 | Train Acc: 63.82%
	 Val. Loss: 0.347 |  Val. Acc: 84.78%
Epoch GRU: 04
	Train Loss: 0.256 | Train Acc: 89.67%
	 Val. Loss: 0.274 |  Val. Acc: 88.45%
Epoch GRU: 05
	Train Loss: 0.142 | Train Acc: 95.00%
	 Val. Loss: 0.301 |  Val. Acc: 88.68%


## LSTM (PyTorch)

In [None]:
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader , TensorDataset
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Load the Keras copy of IMDB, restricted to num_words=10000; this must stay
# in step with the INPUT_DIM used for the LSTM embedding table below.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Bring every review to exactly 500 ids; padding='post' appends the padding at the end.
x_train = pad_sequences(x_train, maxlen=500, padding='post')
x_test = pad_sequences(x_test, maxlen=500, padding='post')

# Wrap inputs and labels as long tensors. Labels are long because they are
# used as class indices by the cross-entropy loss in the cells that follow.
train_data = TensorDataset(torch.tensor(x_train, dtype=torch.long),
                           torch.tensor(y_train, dtype=torch.long))

test_data = TensorDataset(torch.tensor(x_test, dtype=torch.long),
                          torch.tensor(y_test, dtype=torch.long))


# Each batch is an (inputs, labels) tuple; only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

In [None]:
class LSTM(nn.Module):
    """Batch-first LSTM text classifier with dropout before the output layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of logits produced per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay as-is.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=0.3)

    def forward(self, text):
        """Map token ids [batch_size, seq_len] to logits [batch, output_dim]."""
        token_vectors = self.embedding(text)  # [batch, seq_len, emb_dim]

        # nn.LSTM returns (outputs, (h_n, c_n)); only the final hidden state h_n is used.
        final_hidden = self.lstm(token_vectors)[1][0]  # [1, batch, hid_dim]

        # Last entry along the first axis -> [batch, hid_dim]
        last_state = final_hidden[-1]

        return self.fc(self.dropout(last_state))


In [None]:
# These constants rebind the names used by the RNN/GRU sections above.
# INPUT_DIM matches the num_words=10000 passed to imdb.load_data.
INPUT_DIM = 10000   # vocab size
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 2      # binary classification (pos/neg)

# Prefer the GPU when one is visible to PyTorch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model on the target device; `criterion` and `optimizer` are rebound
# here, so from this point on they refer to the LSTM's loss and Adam instance.
model_LSTM = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_LSTM.parameters())

In [None]:
def train_lstm(model, iterator, optimizer, criterion,device,epoch=5):
  """Train ``model`` for ``epoch`` passes over ``iterator``, printing metrics per pass.

  Args:
    model: module mapping inputs [batch_size, seq_len] to logits
      [batch_size, num_classes].
    iterator: iterable of ``(inputs, labels)`` tuples; it is iterated once per pass.
    optimizer: optimizer stepping ``model``'s parameters.
    criterion: loss called as ``criterion(logits, labels)``. The returned
      value is treated as the mean over the batch (the default reduction of
      ``nn.CrossEntropyLoss``).
    device: device the batch tensors are moved to.
    epoch: number of passes to run (the parameter keeps its original name).

  Returns:
    None. One line with the sample-averaged loss and accuracy is printed per
    pass. Each batch is weighted by its size so that a smaller final batch
    does not skew the reported numbers.

  Raises:
    ZeroDivisionError: if ``epoch`` > 0 and ``iterator`` yields no samples.
  """
  model.train()
  # A separate loop variable avoids shadowing the `epoch` parameter.
  for epoch_idx in range(epoch):
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    for x_batch , y_batch in iterator:
      x_batch, y_batch = x_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()

      predictions = model(x_batch)            # logits: [batch_size, num_classes]
      loss = criterion(predictions, y_batch)  # y_batch holds class indices: [batch_size]

      # Predicted class = index of the largest logit.
      _, preds = torch.max(predictions, 1)

      loss.backward()
      optimizer.step()

      # Weight this batch's contribution by the number of samples in it.
      batch_size = len(y_batch)
      total_loss += loss.item() * batch_size
      total_correct += (preds == y_batch).sum().item()
      total_samples += batch_size
    print(f"Epoch LSTM: {epoch_idx+1:02} | Train Loss: {total_loss/total_samples:.3f} | Train Acc: {total_correct/total_samples*100:.2f}%")


# Train the LSTM for 10 passes (overriding the function's default of 5);
# progress is printed by train_lstm itself, nothing is returned.
train_lstm(model_LSTM, train_loader, optimizer, criterion,device,epoch=10)

Epoch LSTM: 01 | Train Loss: 0.692 | Train Acc: 50.78%
Epoch LSTM: 02 | Train Loss: 0.682 | Train Acc: 52.12%
Epoch LSTM: 03 | Train Loss: 0.664 | Train Acc: 52.76%
Epoch LSTM: 04 | Train Loss: 0.607 | Train Acc: 63.26%
Epoch LSTM: 05 | Train Loss: 0.439 | Train Acc: 80.80%
Epoch LSTM: 06 | Train Loss: 0.333 | Train Acc: 86.47%
Epoch LSTM: 07 | Train Loss: 0.259 | Train Acc: 90.04%
Epoch LSTM: 08 | Train Loss: 0.205 | Train Acc: 92.60%
Epoch LSTM: 09 | Train Loss: 0.151 | Train Acc: 95.00%
Epoch LSTM: 10 | Train Loss: 0.117 | Train Acc: 96.38%


In [None]:
def train(model , iterator , optimizer , criterion,device,epochs=5):
  """Train ``model`` for ``epochs`` passes over ``iterator``, printing metrics per pass.

  This definition rebinds the name ``train`` that the RNN/GRU sections earlier
  in the notebook bound to a different function (one pass over dict batches,
  returning metrics). After this cell runs, ``train`` expects
  ``(inputs, labels)`` tuples and returns nothing.

  Args:
    model: module mapping inputs to logits [batch_size, num_classes].
    iterator: iterable of ``(inputs, labels)`` tuples; it is iterated once per pass.
    optimizer: optimizer stepping ``model``'s parameters.
    criterion: loss called as ``criterion(logits, labels)``. The returned
      value is treated as the mean over the batch (the default reduction of
      ``nn.CrossEntropyLoss``).
    device: device the batch tensors are moved to.
    epochs: number of passes to run.

  Returns:
    None. Two lines are printed per pass: the zero-based epoch index, then the
    sample-averaged loss and accuracy.

  Raises:
    ZeroDivisionError: if ``epochs`` > 0 and ``iterator`` yields no samples.
  """
  model.train()
  for epoch in range(epochs):
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    for x_batch ,y_batch in iterator :
      x_batch , y_batch = x_batch.to(device),y_batch.to(device)
      optimizer.zero_grad()

      prediction = model(x_batch)
      loss = criterion(prediction,y_batch)

      # Predicted class = index of the largest logit.
      preds = torch.max(prediction,1)[1]

      loss.backward()
      # Fixed: the original called the misspelled name `optimzer`, which raised NameError.
      optimizer.step()

      # Weight this batch's contribution by the number of samples in it.
      batch_size = len(y_batch)
      total_loss += loss.item() * batch_size
      total_correct += (preds == y_batch).sum().item()
      total_samples += batch_size

    # Report inside the epoch loop so every pass is printed, not only the last one.
    print(f'Epoch {epoch}')
    print(f'Loss :{total_loss/total_samples : .4f} || Train acc : {total_correct/total_samples*100 : .2f}%')



In [None]:
# 5. Evaluation function
# -----------------------------
def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without gradient tracking.

    This definition rebinds the name ``evaluate`` that the RNN/GRU sections
    earlier in the notebook bound to a dict-batch version; this one expects
    ``(inputs, labels)`` tuples.

    Args:
        model: module mapping inputs to logits [batch_size, num_classes].
        loader: iterable of ``(inputs, labels)`` tuples.
        criterion: loss called as ``criterion(logits, labels)``. The returned
            value is treated as the mean over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: device the batch tensors are moved to.

    Returns:
        Tuple of Python floats: loss and accuracy averaged over *samples*.
        Each batch is weighted by its size so that a smaller final batch does
        not skew the result.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()
    total_loss, total_correct, total_samples = 0.0, 0, 0

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            predictions = model(batch_x)
            loss = criterion(predictions, batch_y)

            # Predicted class = index of the largest logit.
            _, preds = torch.max(predictions, 1)

            # Weight this batch's contribution by the number of samples in it.
            batch_size = len(batch_y)
            total_loss += loss.item() * batch_size
            total_correct += (preds == batch_y).sum().item()
            total_samples += batch_size

    return total_loss / total_samples, total_correct / total_samples


# Score the trained LSTM on the test loader. `evaluate` here is the
# (inputs, labels)-tuple version defined directly above, and `criterion` is
# the CrossEntropyLoss created alongside model_LSTM.
test_loss, test_acc = evaluate(model_LSTM, test_loader, criterion, device)
# Accuracy is printed as a fraction with three decimals, not as a percentage.
print(f"\nTest Loss: {test_loss:.3f} | Test Accuracy: {test_acc:.3f}")



Test Loss: 0.432 | Test Accuracy: 0.858


# TensorFlow

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense , SimpleRNN , Dropout , Embedding ,GRU,LSTM

In [None]:
# Sequence length every review is padded/truncated to, and the vocabulary cap
# shared by load_data and the Embedding layers of the Keras models below.
maxlen=500
vocab_size = 50000


# These assignments rebind x_train / y_train / x_test / y_test, which the
# PyTorch LSTM section above also used (there with num_words=10000).
(x_train , y_train),(x_test , y_test) = imdb.load_data(num_words=vocab_size)

# padding='post' appends the padding at the end of each review.
x_train = pad_sequences(x_train,maxlen=maxlen,padding='post')
x_test = pad_sequences(x_test,maxlen=maxlen,padding='post')

# Expected to be (num_reviews, maxlen) for both splits; see the printed output.
print("Training Data shape : ", x_train.shape)
print("Testing Data shape : " ,x_test.shape)

Training Data shape :  (25000, 500)
Testing Data shape :  (25000, 500)


In [None]:
# Embedding -> SimpleRNN -> sigmoid unit for binary sentiment classification.
tf_model_RNN = Sequential([
    # mask_zero=True marks index 0 as padding. The inputs are post-padded with
    # zeros by pad_sequences, and without a mask the recurrent layer keeps
    # stepping through that padding, so the state handed to Dense reflects the
    # padding rather than the end of the review. The Keras IMDB encoding
    # reserves 0 for padding, so no real word is masked.
    Embedding(input_dim=vocab_size ,output_dim=128, mask_zero=True),
    # return_sequences=False: only the last (unmasked) state is passed on.
    SimpleRNN(128 , activation='relu',return_sequences=False),
    Dense(1,activation='sigmoid')
])

# A single sigmoid output pairs with binary cross-entropy; Adam runs with an
# explicit learning rate of 1e-4 here.
tf_model_RNN.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy',metrics=['accuracy'])
tf_model_RNN.summary()

In [None]:
# Hold out the last 20% of the training data for validation.
# Fixed: the original passed `validation_batch_size=0.2`, which sets a batch
# size for validation (and has no effect without validation data) instead of
# creating a validation split, so no val_* metrics were reported.
tf_model_RNN.fit(x_train,y_train,epochs=5,batch_size=32,validation_split=0.2)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 39ms/step - accuracy: 0.5069 - loss: 0.6833
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 40ms/step - accuracy: 0.5140 - loss: 0.6809
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 40ms/step - accuracy: 0.5229 - loss: 0.6780
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - accuracy: 0.5204 - loss: 0.6737
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 39ms/step - accuracy: 0.5343 - loss: 0.6646


<keras.src.callbacks.history.History at 0x78bf7b19dfd0>

In [None]:
# Score the SimpleRNN model on the test split; the cell displays the returned
# [loss, accuracy] list (accuracy is the only metric compiled in).
tf_model_RNN.evaluate(x_test,y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.5245 - loss: 0.6872


[0.6875472068786621, 0.5158399939537048]

In [None]:
# Embedding -> GRU -> sigmoid unit for binary sentiment classification.
tf_model_gru = Sequential([
    # mask_zero=True marks index 0 as padding. The inputs are post-padded with
    # zeros by pad_sequences, and without a mask the recurrent layer keeps
    # stepping through that padding before its final state is read. The Keras
    # IMDB encoding reserves 0 for padding, so no real word is masked.
    Embedding(input_dim=vocab_size , output_dim = 128, mask_zero=True),
    # NOTE(review): activation='relu' replaces the layer's default tanh; per the
    # Keras docs the cuDNN fast path requires tanh - confirm relu is intended.
    GRU(128, activation='relu', return_sequences=False),
    Dense(1, activation='sigmoid')
])

# A single sigmoid output pairs with binary cross-entropy; Adam uses its defaults.
tf_model_gru.compile(optimizer ='Adam', loss='binary_crossentropy' , metrics=['accuracy'])
tf_model_gru.summary()

In [None]:
# Train the GRU model for 5 epochs, holding out 20% of the training data for
# validation. `history_gru` receives the return value of fit (the training
# history), not the model itself.
history_gru = tf_model_gru.fit(x_train,y_train,epochs=5,batch_size=32,validation_split=0.2)

In [None]:
# Score the GRU model on the test split.
# Fixed: evaluate() is a method of the model; the original called it on
# `history_gru` (the History object returned by fit), which has no such method.
loss_gru , accuracy_gru = tf_model_gru.evaluate(x_test,y_test)
print(f'Loss GRU : {loss_gru :.5f} ||  accuracy GRU : {accuracy_gru :.5f}')

In [None]:
# Embedding -> LSTM -> sigmoid unit for binary sentiment classification.
tf_model_lstm = Sequential([
    # mask_zero=True marks index 0 as padding. The inputs are post-padded with
    # zeros by pad_sequences, and without a mask the recurrent layer keeps
    # stepping through that padding before its final state is read. The Keras
    # IMDB encoding reserves 0 for padding, so no real word is masked.
    Embedding(input_dim=vocab_size , output_dim = 128, mask_zero=True),
    # NOTE(review): activation='relu' replaces the layer's default tanh; per the
    # Keras docs the cuDNN fast path requires tanh - confirm relu is intended.
    LSTM(128, activation='relu', return_sequences=False),
    Dense(1, activation='sigmoid')
])

# A single sigmoid output pairs with binary cross-entropy; Adam uses its defaults.
tf_model_lstm.compile(optimizer ='Adam', loss='binary_crossentropy' , metrics=['accuracy'])
tf_model_lstm.summary()

In [None]:
# Train the LSTM model for 5 epochs, holding out 20% of the training data for
# validation. `history_lstm` receives the return value of fit (the training
# history), not the model itself.
history_lstm = tf_model_lstm.fit(x_train,y_train,epochs=5,batch_size=32,validation_split=0.2)

In [None]:
# Score the LSTM model on the test split.
# Fixed: evaluate() is a method of the model; the original called it on
# `history_lstm` (the History object returned by fit), which has no such method.
loss_lstm , accuracy_lstm = tf_model_lstm.evaluate(x_test,y_test)
print(f'Loss LSTM : {loss_lstm :.5f} ||  accuracy LSTM : {accuracy_lstm :.5f}')