In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm


In [2]:
#Configs class
class Configs:
    model_name = "bert-base-uncased"
    epochs = 5
    num_workers = 2
    learning_rate = 2e-5
    batch_size = 16
    max_len = 256
    weight_decay = 0.05
    gradient_accumulation_steps = 1
    max_grad_norm = 1.0
    num_class = 2
    tokenizer = None

cfg = Configs()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using accelerator device: {device}')


Using accelerator device: cuda


In [5]:
!pip install --upgrade datasets fsspec

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency resolver 

In [3]:
# Load dataset
dataset = load_dataset("imdb")

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
cfg.tokenizer = tokenizer

# Custom Dataset class
class IMDBDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# Custom Model class
class IMDBClassifier(nn.Module):
    def __init__(self, model_name, num_labels, mlp_hidden_size=512):
        super(IMDBClassifier, self).__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.base_model = AutoModel.from_pretrained(model_name, config=self.config)

        # Custom MLP layer
        self.mlp = nn.Sequential(
            nn.Linear(self.config.hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        self.classifier = nn.Linear(mlp_hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        mlp_output = self.mlp(pooled_output)

        logits = self.classifier(mlp_output)
        return logits

In [5]:
# Prepare datasets

from torch.cuda.amp import GradScaler, autocast

train_texts = dataset["train"]["text"]
train_labels = dataset["train"]["label"]
test_texts = dataset["test"]["text"]
test_labels = dataset["test"]["label"]

train_dataset = IMDBDataset(train_texts, train_labels, cfg.tokenizer, cfg.max_len)
test_dataset = IMDBDataset(test_texts, test_labels, cfg.tokenizer, cfg.max_len)

# DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers)
test_dataloader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

# Initialize model optimizer and loss function
model = IMDBClassifier(cfg.model_name, num_labels=cfg.num_class).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate, weight_decay=cfg.weight_decay)
criterion = nn.CrossEntropyLoss()

scaler = GradScaler()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  scaler = GradScaler()


In [6]:
# Training loop

best_accuracy = 0.0
best_model_path = "/content/model best/best_model.pth"

for epoch in range(cfg.epochs):
    model.train()
    total_train_loss = 0
    for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        if batch_idx % cfg.gradient_accumulation_steps == 0:
             optimizer.zero_grad()

        with autocast():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)

        total_train_loss += loss.item()
        loss = loss / cfg.gradient_accumulation_steps

        scaler.scale(loss).backward()

        if (batch_idx + 1) % cfg.gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()


    if cfg.gradient_accumulation_steps > 1 and (batch_idx + 1) % cfg.gradient_accumulation_steps != 0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()


    print(f"Epoch {epoch+1} Training Loss: {total_train_loss / len(train_dataloader):.4f}")

    # Evaluation loop
    model.eval()
    predictions = []
    true_labels = []
    total_eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast():
              logits = model(input_ids=input_ids, attention_mask=attention_mask)

            eval_loss = criterion(logits, labels)
            total_eval_loss += eval_loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    avg_eval_loss = total_eval_loss / len(test_dataloader)
    print(f"Epoch {epoch+1} Test Loss: {avg_eval_loss:.4f} - Test Accuracy: {accuracy:.4f}")

    # Save the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), best_model_path)
        print(f"Saved best model with accuracy: {best_accuracy:.4f} to {best_model_path}")



Epoch 1 Training:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 1 Training Loss: 0.2878


Evaluation:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 1 Test Loss: 0.2657 - Test Accuracy: 0.9057
Saved best model with accuracy: 0.9057 to /content/model best/best_model.pth


Epoch 2 Training:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 2 Training Loss: 0.1780


Evaluation:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 2 Test Loss: 0.3434 - Test Accuracy: 0.9116
Saved best model with accuracy: 0.9116 to /content/model best/best_model.pth


Epoch 3 Training:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 3 Training Loss: 0.1052


Evaluation:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 3 Test Loss: 0.3680 - Test Accuracy: 0.9186
Saved best model with accuracy: 0.9186 to /content/model best/best_model.pth


Epoch 4 Training:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 4 Training Loss: 0.0638


Evaluation:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 4 Test Loss: 0.4363 - Test Accuracy: 0.9151


Epoch 5 Training:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 5 Training Loss: 0.0473


Evaluation:   0%|          | 0/1563 [00:00<?, ?it/s]

  with autocast():


Epoch 5 Test Loss: 0.5192 - Test Accuracy: 0.9178


In [7]:
print(f"Saved best model with accuracy: {best_accuracy:.4f} to {best_model_path}")

Saved best model with accuracy: 0.9186 to /content/model best/best_model.pth


In [8]:
import os

def predict_sentiment(text):

    inference_model = IMDBClassifier(cfg.model_name, num_labels=cfg.num_class).to(device)
    if os.path.exists(best_model_path):
        inference_model.load_state_dict(torch.load(best_model_path, map_location=device))
        print(f"Loaded best model from {best_model_path}")
    else:
        raise FileNotFoundError(f"Model file not found at {best_model_path}")

    inference_model.eval()
    encoding = cfg.tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=cfg.max_len,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        with autocast():
            logits = inference_model(input_ids=input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(logits, dim=-1).item()
    return "positive" if prediction == 1 else "negative"

# Example
print("\n--- Making predictions with the best saved model ---")
print(predict_sentiment("This movie was absolutely wonderful!"))
print(predict_sentiment("This movie was a complete waste of time."))
print(predict_sentiment("It was an average film, nothing too exciting or disappointing."))


--- Making predictions with the best saved model ---
Loaded best model from /content/model best/best_model.pth
positive


  with autocast():


Loaded best model from /content/model best/best_model.pth
negative
Loaded best model from /content/model best/best_model.pth
positive
