

> **Load imdb datasets:**



In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

print(len(train_data), len(test_data))


  from .autonotebook import tqdm as notebook_tqdm


25000 25000


**Preprocess text data using tokenization:**

In [2]:
def tokenize(text):
    return text.lower().split()


**Vocabulary Creation:**

In [3]:
word_freq = {}

for item in train_data:
    for word in tokenize(item["text"]):
        word_freq[word] = word_freq.get(word, 0) + 1

MAX_VOCAB_SIZE = 20000
vocab = sorted(word_freq, key=word_freq.get, reverse=True)[:MAX_VOCAB_SIZE]

word2idx = {word: idx + 2 for idx, word in enumerate(vocab)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1

vocab_size = len(word2idx)
print("Vocab Size:", vocab_size)

import pickle

with open("word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

print(" word2idx.pkl saved successfully")


Vocab Size: 20002
 word2idx.pkl saved successfully


In [5]:
!dir


 Volume in drive C is OS
 Volume Serial Number is E2EC-D9F3

 Directory of C:\Users\Chanda Mishra\Downloads\campusx\Untitled Folder

02-01-2026  15:12    <DIR>          .
01-01-2026  22:14    <DIR>          ..
02-01-2026  14:54    <DIR>          .ipynb_checkpoints
01-01-2026  23:51        10,773,447 custom_lstm_sentiment.pth
02-01-2026  00:30    <DIR>          models
02-01-2026  14:55             3,679 my_code.py
02-01-2026  15:12            87,238 Sentiment_Analysis_Part1_Custom_LSTM.ipynb
02-01-2026  14:40           118,636 Sentiment_Analysis_Part2_AWD_LSTM_ULMFiT.ipynb
02-01-2026  14:56             5,573 Untitled.ipynb
02-01-2026  14:56               617 Untitled1.ipynb
02-01-2026  15:10           258,109 word2idx.pkl
               7 File(s)     11,247,299 bytes
               4 Dir(s)  356,885,422,080 bytes free


**Padding and truncation to fixed length:**

In [6]:
import torch

MAX_LEN = 150

def encode(text):
    tokens = tokenize(text)
    encoded = [word2idx.get(word, 1) for word in tokens]
    return torch.tensor(encoded[:MAX_LEN])


**Prepare data for model input:**

In [7]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class IMDbDataset(Dataset):
    def __init__(self, data):
        self.texts = [encode(item["text"]) for item in data]
        self.labels = [item["label"] for item in data]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return texts, torch.tensor(labels)


**Dataloader**:Train-test split handling

In [8]:
train_loader = DataLoader(
    IMDbDataset(train_data),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    IMDbDataset(test_data),
    batch_size=32,
    collate_fn=collate_fn
)


**Build a custom LSTM model in PyTorch:**

In [9]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return self.sigmoid(out)


**Training Setup:**

In [10]:
import torch.optim as optim

model = SentimentLSTM(vocab_size, 128, 128)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cpu")
model.to(device)


SentimentLSTM(
  (embedding): Embedding(20002, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

**Train custom LSTM:**

In [11]:
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for texts, labels in train_loader:
        texts = texts.to(device)
        labels = labels.float().to(device)

        optimizer.zero_grad()
        outputs = model(texts).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.2f}")


Epoch 1, Loss: 541.92
Epoch 2, Loss: 537.08
Epoch 3, Loss: 522.15


**Evaluation Metrics:** Evaluate both models using identical metrics.

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.eval()
preds, true = [], []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts).squeeze()
        preds.extend((outputs > 0.5).int().tolist())
        true.extend(labels.tolist())

print("Accuracy:", accuracy_score(true, preds))
print("Precision:", precision_score(true, preds))
print("Recall:", recall_score(true, preds))
print("F1:", f1_score(true, preds))


Accuracy: 0.58228
Precision: 0.5609481481481482
Recall: 0.75728
F1: 0.6444936170212766


In [14]:
torch.save(model.state_dict(), "custom_lstm_sentiment.pth")
