In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from torch.utils.data import DataLoader

from transformers import BertModel, BertTokenizerFast, get_linear_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score
from tqdm import tqdm, trange
from torch.optim import AdamW

from datasets import load_dataset, Dataset, DatasetDict

In [None]:
from dataclasses import dataclass
@dataclass
class Config:
    """Hyperparameters and runtime settings shared by every cell of this notebook."""
    # Checkpoint name handed to both the tokenizer and the BERT encoder loader.
    model_name: str = "bert-base-uncased"
    # Token limit passed to the tokenizer together with truncation=True.
    max_length: int = 256
    # Batch size of the training DataLoader.
    train_batch_size: int = 12
    # Batch size of the evaluation DataLoader.
    eval_batch_size: int = 32
    # Divides the per-epoch batch count when the scheduler's total step count is computed.
    gradient_accumulation_steps: int = 1
    # Number of passes over the training DataLoader.
    epochs: int = 3
    # Learning rate passed to AdamW.
    learning_rate: float = 2e-5
    # Weight decay passed to AdamW.
    weight_decay: float = 0.01
    # Warm-up step count passed to the linear LR scheduler.
    warmup_steps: int = 0
    # Seed forwarded to set_seed().
    seed: int = 42
    # Evaluated once, when the class body runs: "cuda" if a GPU is visible at that moment, else "cpu".
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Directory that is created up front and receives the saved weights/tokenizer.
    output_dir: str = "./saved_model"
    # Width of the classifier's output layer.
    num_labels: int = 2  # IMDb is binary (pos/neg)
    # The training progress bar's loss postfix is refreshed every this many steps.
    logging_steps: int = 50
    # Max norm passed to clip_grad_norm_ during training.
    max_grad_norm: float = 1.0

In [None]:
# Single configuration instance (all defaults) that the later cells read from.
cfg = Config()

In [None]:
def set_seed(seed: int):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device rather than only the current one; silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different kernels from run to run, which would
    # undermine the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [None]:
# Seed the RNGs before the shuffling DataLoader and the model's new layers are created.
set_seed(cfg.seed)

In [None]:
# Create the checkpoint directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(cfg.output_dir, exist_ok=True)

In [None]:
# Load the IMDb dataset through the `datasets` library (downloads it on first use).
raw = load_dataset('imdb')

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Show the training split's columns and row count.
raw['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
# Load the fast tokenizer that belongs to the checkpoint named in cfg.model_name.
tokenizer = BertTokenizerFast.from_pretrained(cfg.model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def preprocess(examples):
    """Tokenize a batch of raw reviews for BERT.

    Sequences are truncated to ``cfg.max_length`` tokens but deliberately NOT
    padded here: padding happens per batch in the collate function, so short
    reviews are only padded up to the longest member of their batch instead of
    always being stretched to ``cfg.max_length``.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)``; its ``'text'``
            entry holds the review strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(examples['text'], truncation=True, max_length=cfg.max_length)

In [None]:
# Tokenize every split in batches and drop the raw 'text' column so only the
# label and the tokenizer outputs remain.
# NOTE(review): this also processes the 'unsupervised' split, which no cell
# visible here uses afterwards — confirm whether it can be skipped.
tokenized_datasets = raw.map(preprocess, batched=True, remove_columns=['text'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Show the columns of the tokenized training split.
tokenized_datasets['train']

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [None]:
# Collate function for the DataLoaders: pads the examples of a batch to the
# longest sequence in that batch using the tokenizer's padding settings.
data_collector = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

In [None]:
# Training batches are reshuffled on every pass; evaluation keeps dataset order.
train_dataloader = DataLoader(
    dataset=tokenized_datasets['train'],
    batch_size=cfg.train_batch_size,
    shuffle=True,
    collate_fn=data_collector,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets['test'],
    batch_size=cfg.eval_batch_size,
    shuffle=False,
    collate_fn=data_collector,
)

In [None]:
# classification model
class BertForSentimentAnalysis(nn.Module):
    """Pretrained BERT encoder topped with a small two-layer MLP classifier.

    Args:
        model_name: Checkpoint name/path passed to ``BertModel.from_pretrained``.
        num_labels: Number of output classes.
        dropout_prob: Dropout applied before the first linear layer; half of it
            is applied before the second one.
    """

    def __init__(self, model_name: str, num_labels: int = 2, dropout_prob: float = 0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        width = self.bert.config.hidden_size
        bottleneck = width // 2
        # Layer order is kept as-is so the state_dict keys stay `classifier.<idx>.*`.
        head_layers = [
            nn.Dropout(p=dropout_prob),
            nn.Linear(width, bottleneck),
            nn.ReLU(),
            nn.Dropout(p=dropout_prob / 2),
            nn.Linear(bottleneck, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Encode the inputs and classify the encoder's pooled output.

        Returns:
            Dict with ``"loss"`` (cross-entropy against ``labels``, or ``None``
            when no labels are given), ``"logits"`` (classifier output) and
            ``"hidden"`` (the pooled encoder output fed to the classifier).
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True,
        )
        sentence_repr = encoded.pooler_output
        scores = self.classifier(sentence_repr)

        if labels is None:
            return {"loss": None, "logits": scores, "hidden": sentence_repr}

        criterion = nn.CrossEntropyLoss()
        return {"loss": criterion(scores, labels), "logits": scores, "hidden": sentence_repr}

In [None]:
# Show which device the configuration resolved to.
cfg.device

'cuda'

In [None]:
# Build the classifier and move it to the configured device in one go
# (Module.to returns the module itself).
model = BertForSentimentAnalysis(
    model_name=cfg.model_name,
    num_labels=cfg.num_labels,
).to(cfg.device)
model  # last expression, so the notebook displays the module tree

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSentimentAnalysis(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [None]:
# Schedule length: batches per epoch divided by the accumulation factor, times epochs.
total_steps = (len(train_dataloader) // cfg.gradient_accumulation_steps) * cfg.epochs

optimizer = AdamW(
    params=model.parameters(),
    lr=cfg.learning_rate,
    weight_decay=cfg.weight_decay,
)
# Linear warm-up for cfg.warmup_steps, then linear decay over the remaining steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=total_steps,
)

TypeError: 'DataLoader' object is not subscriptable

In [None]:
# training
def train():
    """Fine-tune the global ``model`` for ``cfg.epochs`` epochs.

    Relies on the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``train_dataloader``, ``tokenizer`` and ``cfg`` objects. After every epoch
    it prints the mean training loss, runs ``evaluate()`` and writes the model
    weights and tokenizer files to ``cfg.output_dir``.

    Gradients are accumulated over ``cfg.gradient_accumulation_steps`` batches
    (the last, possibly shorter, group of an epoch is also applied), which
    matches how the scheduler's total step count is derived from that setting.
    """
    accum_steps = max(1, cfg.gradient_accumulation_steps)
    num_batches = len(train_dataloader)
    os.makedirs(cfg.output_dir, exist_ok=True)
    global_step = 0  # counts optimizer updates across all epochs

    for epoch in range(cfg.epochs):
        print(f"Epoch {epoch + 1}/{cfg.epochs}")
        # evaluate() leaves the model in eval mode, so dropout has to be
        # switched back on at the start of every epoch, not just the first.
        model.train()
        # Start each epoch from clean gradients (e.g. after an interrupted run).
        optimizer.zero_grad()
        epoch_loss = 0.0
        progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
        for step, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(cfg.device)
            attention_mask = batch['attention_mask'].to(cfg.device)
            # .get() so tokenizers that emit no token_type_ids do not raise KeyError.
            token_type_ids = batch.get('token_type_ids')
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(cfg.device)
            # The collator may expose the target as either "label" or "labels".
            labels = (batch["label"] if "label" in batch else batch["labels"]).to(cfg.device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)
            loss = outputs["loss"]
            loss_value = loss.item()
            epoch_loss += loss_value

            # Scale so the accumulated gradient is the mean over the group.
            (loss / accum_steps).backward()

            is_update_step = (step + 1) % accum_steps == 0 or (step + 1) == num_batches
            if not is_update_step:
                continue

            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            if global_step % cfg.logging_steps == 0:
                # Average over the batches of THIS epoch; global_step spans all
                # epochs and would understate the loss from epoch 2 onwards.
                avg = epoch_loss / (step + 1)
                progress_bar.set_postfix({"avg_loss": avg, "last_loss": loss_value})
        avg_epoch_loss = epoch_loss / max(1, num_batches)
        print(f"Epoch: {epoch+1} finished. Avg loss: {avg_epoch_loss}")

        eval_metrics = evaluate()
        print(f"Epoch: {epoch+1} finished. Eval metrics\n: {eval_metrics}")

        # save model
        model_to_save = model.module if hasattr(model, "module") else model
        # A plain nn.Module has no save_pretrained(); calling it unconditionally
        # aborted training after the first epoch before anything was written.
        if hasattr(model_to_save, "save_pretrained"):
            model_to_save.save_pretrained(cfg.output_dir)
        torch.save(model_to_save.state_dict(), os.path.join(cfg.output_dir, "pytorch_model.bin"))
        tokenizer.save_pretrained(cfg.output_dir)

In [None]:
def predict_dataloader(dataloader: DataLoader):
    """Run the global ``model`` over a dataloader and collect its predictions.

    The model is put into eval mode for the duration of the call and restored
    to whatever mode it was in before, so calling this between training epochs
    does not silently disable dropout for the rest of training.

    Args:
        dataloader: Yields batches containing ``input_ids``, ``attention_mask``,
            optionally ``token_type_ids``, and the target under ``"label"`` or
            ``"labels"``.

    Returns:
        Tuple ``(labels, preds, probs)``: two Python lists of ints and a NumPy
        array of softmax probabilities with one row per example (zero rows for
        an empty dataloader).
    """
    was_training = model.training
    model.eval()
    all_preds = []
    all_probs = []
    all_labels = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Predicting", leave=False):
                input_ids = batch["input_ids"].to(cfg.device)
                attention_mask = batch["attention_mask"].to(cfg.device)
                token_type_ids = batch.get("token_type_ids")
                if token_type_ids is not None:
                    token_type_ids = token_type_ids.to(cfg.device)
                # Labels are only collected, never fed to the model, so they
                # do not need a round trip through the device.
                labels = batch["label"] if "label" in batch else batch["labels"]

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                logits = outputs["logits"]  # shape (batch, num_labels)
                probs = torch.softmax(logits, dim=-1).cpu().numpy()

                all_preds.extend(np.argmax(probs, axis=-1).tolist())
                all_probs.append(probs)
                all_labels.extend(labels.tolist())
    finally:
        model.train(was_training)

    # np.vstack raises on an empty list; return an empty (0, num_labels) array instead.
    if all_probs:
        all_probs = np.vstack(all_probs)
    else:
        all_probs = np.empty((0, cfg.num_labels), dtype=np.float32)
    return all_labels, all_preds, all_probs


In [None]:
def evaluate():
    """Score the global model on ``eval_dataloader`` and report the results.

    Prints the confusion matrix and scikit-learn's classification report, then
    returns a dict holding accuracy, binary-averaged precision/recall/F1, the
    macro-averaged F1 and the confusion matrix.
    """
    y_true, y_pred, _ = predict_dataloader(eval_dataloader)

    # (precision, recall, f1, support) tuples for the two averaging modes.
    binary_scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    # Macro F1 is reported as well in case of class imbalance.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average="macro")
    cm = confusion_matrix(y_true, y_pred)

    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", classification_report(y_true, y_pred, digits=4))

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": binary_scores[0],
        "recall": binary_scores[1],
        "f1": binary_scores[2],
        "f1_macro": macro_scores[2],
        "confusion_matrix": cm,
    }

In [None]:
# Run the full training loop; it evaluates and saves after each epoch.
train()

Epoch 1/3




Epoch: 1 finished. Avg loss: 0.1735165485492845




Confusion Matrix:
 [[11634   866]
 [ 1116 11384]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9125    0.9307    0.9215     12500
           1     0.9293    0.9107    0.9199     12500

    accuracy                         0.9207     25000
   macro avg     0.9209    0.9207    0.9207     25000
weighted avg     0.9209    0.9207    0.9207     25000

Epoch: 1 finished. Eval metrics
: {'accuracy': 0.92072, 'precision': 0.9293061224489796, 'recall': 0.91072, 'f1': 0.9199191919191919, 'f1_macro': 0.9207120712071207, 'confusion_matrix': array([[11634,   866],
       [ 1116, 11384]])}


AttributeError: 'BertForSentimentAnalysis' object has no attribute 'save_pretrained'

In [None]:
# Evaluate the model in its current state on the test split and keep the metrics dict.
final_metrics = evaluate()
print("Final metrics:", final_metrics)



Confusion Matrix:
 [[11634   866]
 [ 1116 11384]]
Classification Report:
               precision    recall  f1-score   support

           0     0.9125    0.9307    0.9215     12500
           1     0.9293    0.9107    0.9199     12500

    accuracy                         0.9207     25000
   macro avg     0.9209    0.9207    0.9207     25000
weighted avg     0.9209    0.9207    0.9207     25000

Final metrics: {'accuracy': 0.92072, 'precision': 0.9293061224489796, 'recall': 0.91072, 'f1': 0.9199191919191919, 'f1_macro': 0.9207120712071207, 'confusion_matrix': array([[11634,   866],
       [ 1116, 11384]])}


In [None]:
def predict_sentences(sentences, neutral_threshold = (0.45, 0.55)):
    """Classify free-text sentences with the global ``model``.

    Args:
        sentences: A single string or an iterable of strings.
        neutral_threshold: ``(low, high)`` bounds on the class-1 probability,
            used only when ``cfg.num_labels == 2``. Below ``low`` the result is
            "negative", above ``high`` it is "positive", otherwise "neutral".

    Returns:
        One dict per input sentence with the keys ``"text"``,
        ``"predicted_label"``, ``"label_id"`` (``None`` for "neutral") and
        ``"probabilities"`` (softmax output as a list). Empty input yields ``[]``.
    """
    # A bare string would otherwise be indexed character by character when the
    # results are paired with their texts below.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)
    if not sentences:
        return []

    model.eval()
    enc = tokenizer(sentences, truncation=True, padding=True, max_length=cfg.max_length, return_tensors="pt")
    input_ids = enc["input_ids"].to(cfg.device)
    attention_mask = enc["attention_mask"].to(cfg.device)
    token_type_ids = enc.get("token_type_ids")
    if token_type_ids is not None:
        token_type_ids = token_type_ids.to(cfg.device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs["logits"]
        probs = torch.softmax(logits, dim=-1).cpu().numpy()  # shape (n, num_labels)

    results = []
    for text, p in zip(sentences, probs):
        # For binary: assume label 1 is 'positive', label 0 is 'negative'
        if cfg.num_labels == 2:
            pos_prob = float(p[1])
            low, high = neutral_threshold
            if pos_prob < low:
                label = "negative"
                label_id = 0
            elif pos_prob > high:
                label = "positive"
                label_id = 1
            else:
                label = "neutral"
                label_id = None
        else:
            # Multiclass: choose max
            label_id = int(np.argmax(p))
            label = f"label_{label_id}"

        results.append({
            "text": text,
            "predicted_label": label,
            "label_id": label_id,
            "probabilities": p.tolist(),
        })
    return results

In [None]:
# Three hand-written demo reviews: clearly positive, clearly negative, and mixed.
example_texts = [
        "This movie was fantastic! I loved the characters and the storyline.",
        "Terrible film. Waste of time and money.",
        "It was okay — some parts were good, some were boring."
    ]

In [None]:
# Classify the demo sentences and print each label with its class probabilities.
preds = predict_sentences(example_texts)
for result in preds:
    text, label, class_probs = result["text"], result["predicted_label"], result["probabilities"]
    print("Text:", text)
    print("Predicted:", label, "Probs:", class_probs)
    print("-" * 60)

Text: This movie was fantastic! I loved the characters and the storyline.
Predicted: positive Probs: [0.0023781508207321167, 0.9976218342781067]
------------------------------------------------------------
Text: Terrible film. Waste of time and money.
Predicted: negative Probs: [0.997158408164978, 0.00284164072945714]
------------------------------------------------------------
Text: It was okay — some parts were good, some were boring.
Predicted: negative Probs: [0.9868199229240417, 0.013180060312151909]
------------------------------------------------------------
