In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re

In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.split()

In [4]:
all_tokens = [token for txt in df['review'] for token in tokenize(txt)]
vocab = {word: i+2 for i, (word, _) in enumerate(Counter(all_tokens).most_common(20000))}
vocab["<pad>"] = 0
vocab["<unk>"] = 1

def encode(text):
    return [vocab.get(tok, 1) for tok in tokenize(text)]


In [5]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = [torch.tensor(encode(t)) for t in texts]
        self.labels = torch.tensor(labels)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [6]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True)
    return texts, torch.tensor(labels)

In [7]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [8]:
train_ds = IMDBDataset(X_train.tolist(), y_train.tolist())
test_ds = IMDBDataset(X_test.tolist(), y_test.tolist())
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(test_ds, batch_size=32, collate_fn=collate_fn)

In [9]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, nhead, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)
        encoded = self.transformer_encoder(x)
        out = encoded.mean(dim=0)
        return self.fc(out)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerClassifier(len(vocab), embed_dim=128, nhead=4, num_layers=2, num_classes=2).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()



In [11]:
for epoch in range(3):
    model.train()
    total_loss = 0
    for text, label in train_dl:
        text, label = text.to(device), label.to(device)
        optimizer.zero_grad()
        out = model(text)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_dl):.4f}")

Epoch 1: Loss = 0.5590
Epoch 2: Loss = 0.3831
Epoch 3: Loss = 0.3312


In [12]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for text, label in test_dl:
        text, label = text.to(device), label.to(device)
        preds = model(text).argmax(dim=1)
        correct += (preds == label).sum().item()
        total += label.size(0)
print(f"Accuracy: {correct/total:.4f}")

Accuracy: 0.8644
