# **IMDB**

# **CNN**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Parameters from your paper
vocab_size = 5000
embedding_dim = 32
max_sequence_length = 500
kernel_sizes = [3, 5, 7, 9]  # Multiple branches
num_filters = 128
pool_size = 2
lstm_units = 128
dropout_rate = 0.5

class MultiBranchCNN_LSTM(nn.Module):
    def __init__(self):
        super(MultiBranchCNN_LSTM, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Create branches for each kernel size
        self.branches = nn.ModuleList()
        for k in kernel_sizes:
            branch = nn.ModuleDict({
                'conv': nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=k, padding='same'),
                'pool': nn.MaxPool1d(kernel_size=pool_size),
                'dropout': nn.Dropout(dropout_rate),
                'batch_norm': nn.BatchNorm1d(num_filters),
                'lstm': nn.LSTM(input_size=num_filters, hidden_size=lstm_units, batch_first=True)
            })
            self.branches.append(branch)

        # Final dense layer
        self.fc = nn.Linear(lstm_units * len(kernel_sizes), 1)

    def forward(self, x):
        # x: [batch_size, seq_len]
        x = self.embedding(x)                   # [batch_size, seq_len, embedding_dim]
        x = x.permute(0, 2, 1)                 # [batch_size, embedding_dim, seq_len] for Conv1d

        branch_outputs = []
        for branch in self.branches:
            out = F.relu(branch['conv'](x))      # Conv + ReLU
            out = branch['pool'](out)            # MaxPooling
            out = branch['dropout'](out)         # Dropout
            out = branch['batch_norm'](out)      # BatchNorm

            out = out.permute(0, 2, 1)           # [batch_size, seq_len//pool, num_filters] for LSTM
            out, (h_n, c_n) = branch['lstm'](out)
            out = out[:, -1, :]                  # Take last timestep
            branch_outputs.append(out)

        # Concatenate all branch outputs
        out = torch.cat(branch_outputs, dim=1)   # [batch_size, lstm_units * num_branches]
        out = torch.sigmoid(self.fc(out))        # Final output
        return out

# Example usage
model = MultiBranchCNN_LSTM()
print(model)

# Example input
batch_size = 32
dummy_input = torch.randint(0, vocab_size, (batch_size, max_sequence_length))
output = model(dummy_input)
print(output.shape)  # Should be [32, 1]


MultiBranchCNN_LSTM(
  (embedding): Embedding(5000, 32)
  (branches): ModuleList(
    (0): ModuleDict(
      (conv): Conv1d(32, 128, kernel_size=(3,), stride=(1,), padding=same)
      (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (dropout): Dropout(p=0.5, inplace=False)
      (batch_norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (lstm): LSTM(128, 128, batch_first=True)
    )
    (1): ModuleDict(
      (conv): Conv1d(32, 128, kernel_size=(5,), stride=(1,), padding=same)
      (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (dropout): Dropout(p=0.5, inplace=False)
      (batch_norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (lstm): LSTM(128, 128, batch_first=True)
    )
    (2): ModuleDict(
      (conv): Conv1d(32, 128, kernel_size=(7,), stride=(1,), padding=same)
      (pool): MaxPool1d(kernel_size=2, stride=2, p

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np

nltk.download("punkt")
nltk.download("punkt_tab")

# Load IMDB Dataset
dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

# Tokenization + Vocabulary Building
def tokenize(text):
    return word_tokenize(text.lower())

# Build vocab from training set
word_counts = Counter()
for item in train_data:
    tokens = tokenize(item["text"])
    word_counts.update(tokens)

vocab_size = 5000  # keep top 5000 words
most_common = word_counts.most_common(vocab_size - 2)

# Reserve special tokens
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
for i, (word, _) in enumerate(most_common, start=2):
    word_to_idx[word] = i

idx_to_word = {idx: w for w, idx in word_to_idx.items()}

# Text → Integer Sequence Conversion
max_len = 500

def encode_text(text):
    tokens = tokenize(text)
    sequence = [word_to_idx.get(token, 1) for token in tokens]  # 1 = <UNK>

    if len(sequence) > max_len:
        sequence = sequence[:max_len]
    else:
        sequence = sequence + [0] * (max_len - len(sequence))  # pad with <PAD>

    return sequence

# PyTorch Dataset Class
class IMDBDataset(Dataset):
    def __init__(self, imdb_split):
        self.texts = [encode_text(item["text"]) for item in imdb_split]
        self.labels = [item["label"] for item in imdb_split]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx]), torch.tensor(self.labels[idx]).float()


train_dataset = IMDBDataset(train_data)
test_dataset = IMDBDataset(test_data)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Multi-Branch CNN + LSTM Model
embedding_dim = 32
kernel_sizes = [3, 5, 7, 9]
num_filters = 128
pool_size = 2
lstm_units = 128
dropout_rate = 0.5

class MultiBranchCNN_LSTM(nn.Module):
    def __init__(self):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.branches = nn.ModuleList()
        for k in kernel_sizes:
            branch = nn.ModuleDict({
                'conv': nn.Conv1d(embedding_dim, num_filters, kernel_size=k, padding='same'),
                'batch_norm': nn.BatchNorm1d(num_filters),
                'dropout': nn.Dropout(dropout_rate),
                'pool': nn.MaxPool1d(pool_size),
                'lstm': nn.LSTM(num_filters, lstm_units, batch_first=True)
            })
            self.branches.append(branch)

        self.fc = nn.Linear(lstm_units * len(kernel_sizes), 1)

    def forward(self, x):
        x = self.embedding(x)              # [B, L, E]
        x = x.permute(0, 2, 1)            # [B, E, L]

        branch_outputs = []
        for branch in self.branches:
            out = F.relu(branch['conv'](x))
            out = branch['batch_norm'](out)
            out = branch['pool'](out)
            out = branch['dropout'](out)

            # For LSTM: [B, L', C]
            out = out.permute(0, 2, 1)
            out, (h, c) = branch['lstm'](out)
            out = out[:, -1, :]           # last timestep

            branch_outputs.append(out)

        combined = torch.cat(branch_outputs, dim=1)
        return torch.sigmoid(self.fc(combined))

model = MultiBranchCNN_LSTM()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Training Setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs = 2

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        preds = model(x_batch).squeeze()
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss = {total_loss/len(train_loader):.4f}")


# Evaluation
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        preds = model(x_batch).squeeze()
        preds = (preds >= 0.5).float()

        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

accuracy = correct / total
print(f"\nTest Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch 1/2, Loss = 0.6958
Epoch 2/2, Loss = 0.6919

Test Accuracy: 0.5060
