In [34]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [35]:
# Load the IMDB reviews; the path is relative to the notebook's working directory.
# on_bad_lines='warn' still drops rows that cannot be parsed, but reports each one,
# whereas 'skip' would discard data without leaving any trace in the output.
df = pd.read_csv('IMDB_Dataset.csv', engine='python', on_bad_lines='warn')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [36]:
# Encode the string labels as integer class ids: positive -> 1, negative -> 0.
LABEL_TO_ID = {'positive': 1, 'negative': 0}

# Guard against re-running this cell: mapping an already-encoded column would turn
# every 1/0 into NaN, because the dict only has the string labels as keys.
if not df['sentiment'].isin(list(LABEL_TO_ID.values())).all():
    encoded_sentiment = df['sentiment'].map(LABEL_TO_ID)
    # .map() yields NaN for any value that is not a key of the dict; stop here
    # rather than letting NaN labels reach the training loop.
    assert encoded_sentiment.notna().all(), "unmapped sentiment label(s) found"
    df['sentiment'] = encoded_sentiment.astype('int64')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [37]:
# Trim leading and trailing whitespace from every review; nothing else in the
# text is altered by this step.
stripped_reviews = df['review'].str.strip()
df['review'] = stripped_reviews
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [38]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the class
# ratio the same in both parts, and the fixed random_state makes the split repeatable.
split_frames = train_test_split(
    df,
    stratify=df['sentiment'],
    random_state=42,
    test_size=0.2,
)
train_df, test_df = split_frames

In [39]:
# Hyper-parameters read by the cells below.
MAX_LEN = 128     # max_length handed to the tokenizer
BATCH_SIZE = 16   # batch_size handed to the DataLoaders
EPOCHS = 3        # number of passes of the training loop

# Tokenizer and encoder both come from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
# Evaluation mode: dropout inside the encoder is switched off.
bert_model.eval()

In [40]:
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review each time an item is requested.

    Args:
        df: DataFrame with a 'review' column (text) and a 'sentiment' column
            (values convertible to an integer tensor).
        tokenizer: Optional callable used to encode one text. It is invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and must return a mapping with
            'input_ids' and 'attention_mask' tensors whose first dimension has
            size 1. When omitted, the notebook-level ``tokenizer`` is looked up
            at item-access time, exactly as before.
        max_len: Optional padded/truncated sequence length. When omitted, the
            notebook-level ``MAX_LEN`` is used.
    """

    def __init__(self, df, tokenizer=None, max_len=None):
        # Plain Python lists: indexing them does not depend on the frame's index.
        self.texts = df['review'].tolist()
        self.labels = df['sentiment'].tolist()
        self._tokenizer = tokenizer
        self._max_len = max_len

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors for item ``idx``."""
        # Fall back to the notebook globals only when nothing was injected.
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        max_len = self._max_len if self._max_len is not None else MAX_LEN

        enc = encode(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='pt',
        )
        return {
            # squeeze(0) drops the leading size-1 dimension added by return_tensors='pt'
            # so that the DataLoader can stack items into a batch.
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [41]:
# Wrap each split in a dataset, then in a batching loader.
train_ds, test_ds = IMDBDataset(train_df), IMDBDataset(test_df)

# Training batches are reshuffled every epoch; the test loader keeps the
# dataset order (shuffle=False is also the DataLoader default).
train_loader = DataLoader(dataset=train_ds, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_ds, shuffle=False, batch_size=BATCH_SIZE)

In [42]:
@torch.no_grad()
def get_cls_embedding(input_ids, attention_mask):
    """Return the encoder's final hidden state at sequence position 0 for each input.

    Runs the notebook-level ``bert_model`` with gradient tracking disabled, so
    the result carries no autograd history.

    Args:
        input_ids: Token id tensor passed to the encoder as ``input_ids``.
        attention_mask: Mask tensor passed to the encoder as ``attention_mask``.

    Returns:
        ``last_hidden_state`` with its second dimension indexed at 0, i.e. one
        vector per input sequence (the [CLS] position when the ids come from a
        BERT tokenizer).
    """
    encoder_out = bert_model(attention_mask=attention_mask, input_ids=input_ids)
    hidden_states = encoder_out.last_hidden_state
    return hidden_states[:, 0]


In [43]:
class CLSClassifier(nn.Module):
    """Small feed-forward head that classifies a fixed-size embedding.

    Layout: Linear(in_features, hidden_size) -> ReLU -> Dropout(dropout)
    -> Linear(hidden_size, num_classes). The layers live in ``self.fc`` so the
    state_dict keys are ``fc.0.*`` and ``fc.3.*``.

    Args:
        in_features: Width of the input embedding. The default of 768 matches
            the embeddings this notebook feeds in (BERT-base hidden size).
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after the ReLU.

    Calling ``CLSClassifier()`` with no arguments builds the same
    768 -> 256 -> 2 head with dropout 0.3 as before.
    """

    def __init__(self, in_features=768, hidden_size=256, num_classes=2, dropout=0.3):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, x):
        """Map embeddings ``x`` of shape (..., in_features) to raw logits of shape (..., num_classes)."""
        return self.fc(x)

# Seed PyTorch's global RNG before anything stochastic runs: the classifier's
# weight initialisation below, and later the DataLoader shuffling and dropout
# during training. Without a seed every Restart & Run All trains a different
# model. (Bit-exact repeatability still depends on using the same
# hardware/software and on deterministic GPU kernels.)
SEED = 42  # same value as the random_state used for the train/test split
torch.manual_seed(SEED)

model = CLSClassifier().to(device)
# Cross-entropy on raw logits, with a small amount of label smoothing.
criterion = nn.CrossEntropyLoss(label_smoothing=0.05)
# Only the classifier's own parameters are handed to the optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [44]:
print("Training started...")
for epoch in range(1, EPOCHS + 1):
    model.train()  # training mode: dropout in the classifier is active
    batch_losses = []
    progress = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")

    for batch in progress:
        # Move the three tensors of the batch to the compute device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        # The embeddings are computed by get_cls_embedding; the optimizer below
        # only steps the parameters it was constructed with.
        cls_emb = get_cls_embedding(input_ids, attention_mask)

        optimizer.zero_grad()
        preds = model(cls_emb)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        batch_losses.append(batch_loss)
        progress.set_postfix(loss=batch_loss)

    # Mean of the per-batch losses over the epoch.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} complete. Avg Loss = {avg_loss:.4f}\n")


Training started...


Epoch 1/3: 100%|██████████| 2500/2500 [06:56<00:00,  6.00it/s, loss=0.385]


Epoch 1 complete. Avg Loss = 0.4958



Epoch 2/3: 100%|██████████| 2500/2500 [06:56<00:00,  6.00it/s, loss=0.47]


Epoch 2 complete. Avg Loss = 0.4582



Epoch 3/3: 100%|██████████| 2500/2500 [06:56<00:00,  6.00it/s, loss=0.544]

Epoch 3 complete. Avg Loss = 0.4486






In [45]:
# Score the classifier on the held-out split.
model.eval()  # evaluation mode: dropout is disabled
all_preds = []
all_labels = []

with torch.no_grad():
    eval_bar = tqdm(test_loader, desc="Evaluating")
    for batch in eval_bar:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        cls_emb = get_cls_embedding(input_ids, attention_mask)
        preds = model(cls_emb)

        # The predicted class is the index of the largest logit in each row.
        all_preds += torch.argmax(preds, dim=1).cpu().tolist()
        all_labels += labels.cpu().tolist()

acc = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {acc * 100:.2f}%")


Evaluating: 100%|██████████| 625/625 [01:42<00:00,  6.08it/s]

Test Accuracy: 81.73%





In [46]:
# A few hand-written reviews to sanity-check the trained classifier.
sample_texts = [
    "Absolutely loved it, fantastic performances!",
    "Terrible plot, wouldn't recommend it at all.",
    "Not bad, could have been better.",
    "Worst movie ever made.",
    "A touching story with great characters.",
]

print("Sample Predictions:")
label_map = {0: "Negative", 1: "Positive"}

# Do not rely on an earlier cell having switched the classifier to evaluation
# mode: if this cell runs right after training, dropout would still be active
# and the predictions below could change from run to run.
model.eval()

with torch.no_grad():
    for text in sample_texts:
        # Same tokenizer settings as the dataset, so the inputs match what the
        # classifier saw during training.
        enc = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=MAX_LEN,
            return_tensors='pt'
        )
        input_ids = enc['input_ids'].to(device)
        attention_mask = enc['attention_mask'].to(device)

        cls_emb = get_cls_embedding(input_ids, attention_mask)
        # Index of the largest logit -> class id (0 or 1).
        pred = model(cls_emb).argmax(dim=1).item()

        print(f"{text} -> {label_map[pred]}")


Sample Predictions:
Absolutely loved it, fantastic performances! -> Positive
Terrible plot, wouldn't recommend it at all. -> Negative
Not bad, could have been better. -> Negative
Worst movie ever made. -> Negative
A touching story with great characters. -> Positive
