In [16]:
class Basket:
    def __init__(self, fruits: list):
        self.fruits = fruits

    def __len__(self) -> int:
        return len(self.fruits)

    def __getitem__(self, index: int) -> str:
        return self.fruits[index]

basket = Basket(['apple', 'banana', 'mango'])

print(len(basket))
print(basket[0])

3
apple


In [17]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer,
    BertTokenizerFast,
    BertForSequenceClassification,
    BertForTokenClassification,
    BertForQuestionAnswering,
    get_linear_schedule_with_warmup ###### Gradually warms up then decays learning rate for stable BERT training.
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score
import numpy as np
from tqdm import tqdm
from torch.optim import AdamW #Adam Optimizer with Weight Decay

In [18]:
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"using device {device}")

using device cuda


In [19]:
# dataset class

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.texts)


    def __getitem__(self, index: int) -> dict:
        text = str(self.texts[index])
        label = int(self.labels[index])

        encoding = self.tokenizer(
            text,
            truncation = True,
            padding = 'max_length',
            max_length = self.max_length,
            return_tensors = 'pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

        """
        {
            'input_ids': tensor([101, 2023, 2003, 1037, ...]),
            'attention_mask': tensor([1, 1, 1, 0, 0, ...]),
            'labels': tensor(1)
        }

        """




In [None]:
# BERT text classifier

class BERTTextClassifier:
    def __init__(self, model_name='bert-base-uncased', num_classes=2, max_length=512):
        self.model_name = model_name
        self.num_classes = num_classes
        self.max_length = max_length

        self.tokenizer = BertTokenizerFast.from_pretrained(         
            model_name
        )

        self.model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_classes
        )

        self.model.to(device)



    def load_imdb_data(self, sample_size=5000):
        print("Loading IMDB dataset ...")

        dataset = load_dataset("imdb")

        # sample data from faster training
        train_indices = np.random.choice(
            len(dataset['train']), # n, pick random int from 0 - (n-1)
            min(sample_size, len(dataset['train'])), # num of sample need (this logic - pick only what in dataset )
            replace=False # pick unique samples only
        )

        test_indices = np.random.choice(
            len(dataset['test']),
            min(sample_size//4, len(dataset['test'])),
            replace=False
        )

        # convert numpy.int64 -> int for indexing
        train_texts = [dataset['train'][int(i)]['text'] for i in train_indices]
        train_labels = [dataset['train'][int(i)]['label'] for i in train_indices]

        test_texts = [dataset['test'][int(i)]['text'] for i in test_indices]
        test_labels = [dataset['test'][int(i)]['label'] for i in test_indices]

        print(f"train samples: {len(train_texts)}")
        print(f"test samples: {len(test_texts)}")

        return train_texts, train_labels, test_texts, test_labels


    def train(self, train_texts, train_labels, epochs=1, batch_size=8, learning_rate=2e-5) -> None:
        train_dataset = TextClassificationDataset(
            train_texts, train_labels, self.tokenizer, self.max_length
        ) # get dataset, each get tokenized(truned into token id, attention maske)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # make batches

        optimizer = AdamW(self.model.parameters(), lr=learning_rate, weight_decay=0.01) # update model weights based on gradients

        total_steps = len(train_loader) * epochs # total number of training steps (batches) across all epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=total_steps//10, num_training_steps=total_steps
        ) # Adjusts learning rate as training progresses

        self.model.train()


        for epoch in range(epochs):
            total_loss = 0
            progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')

            for batch in progress_bar: # batch in train_loader
                optimizer.zero_grad() # clears out previous gradients

                # load everything to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # forward pass
                outputs = self.model(
                    input_ids = input_ids,
                    attention_mask = attention_mask,
                    labels = labels
                )

                # get the loss
                loss = outputs.loss
                total_loss += loss.item() # add to total loss

                # backward pass -
                loss.backward()

                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) # This prevents exploding gradients, especially in large models. If any gradient exceeds 1.0, it’s scaled down proportionally.

                optimizer.step() # update model weight - weight = weight - learning_rate * gradient

                scheduler.step() #

                progress_bar.set_postfix({'Loss': f'{loss.item():.4f}'})

            avg_loss = total_loss/len(train_loader)
            print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')


    def evaluate(self, test_texts, test_labels, batch_size=8):

        test_dataset = TextClassificationDataset(
            test_texts, test_labels, self.tokenizer, self.max_length
        )

        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        self.model.eval()

        predictions = []

        true_labels = []

        with torch.no_grad(): # prevents PyTorch from building a computational graph (no need backpropagration)
            for batch in tqdm(test_loader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # - The model predicts logits, which are raw, unnormalized scores (not yet probabilities).
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                preds = torch.argmax(logits, dim=1).cpu().numpy() # # preds = torch.argmax(logits, dim=1).cpu().numpy()

                predictions.extend(preds)
                true_labels.extend(labels.cpu().numpy())

            accuracy = accuracy_score(true_labels, predictions)
            f1 = f1_score(true_labels, predictions, average='weighted')
            report = classification_report(true_labels, predictions, target_names=['Nagative', 'Positive'])

            return accuracy, f1, report


    def predict(self, texts):
        predictions = [] # what model predict
        probabilities = [] # true values

        self.model.eval()

        for text in texts:
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            with torch.no_grad():
                # - The model predicts logits, which are raw, unnormalized scores (not yet probabilities).
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits


                probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
                pred = torch.argmax(logits, dim=1).cpu().numpy()[0]


                predictions.append(pred)
                probabilities.append(probs)

        return predictions, probabilities


In [21]:
# if batch size is 5 that mean something like this
"""
{
  'input_ids': tensor([
      [101, 2023, 2003, ..., 0, 0],
      [101, 2054, 2003, ..., 0, 0],
      [101, 1045, 2293, ..., 0, 0],
      [101, 2009, 2003, ..., 0, 0],
      [101, 2057, 2024, ..., 0, 0]
  ]),
  'attention_mask': tensor([
      [1, 1, 1, 1, 0, 0, ...],
      [1, 1, 1, 0, 0, 0, ...],
      [1, 1, 1, 1, 1, 0, ...],
      [1, 1, 1, 0, 0, 0, ...],
      [1, 1, 1, 1, 1, 0, ...]
  ]),
  'labels': tensor([1, 0, 1, 0, 1])
}

not ->

[
    { 'input_ids', 'attention_mask', 'labels' },
    { 'input_ids', 'attention_mask', 'labels' },
    { 'input_ids', 'attention_mask', 'labels' },
    { 'input_ids', 'attention_mask', 'labels' },
    { 'input_ids', 'attention_mask', 'labels' },
]


"""



"\n{\n  'input_ids': tensor([\n      [101, 2023, 2003, ..., 0, 0],\n      [101, 2054, 2003, ..., 0, 0],\n      [101, 1045, 2293, ..., 0, 0],\n      [101, 2009, 2003, ..., 0, 0],\n      [101, 2057, 2024, ..., 0, 0]\n  ]),\n  'attention_mask': tensor([\n      [1, 1, 1, 1, 0, 0, ...],\n      [1, 1, 1, 0, 0, 0, ...],\n      [1, 1, 1, 1, 1, 0, ...],\n      [1, 1, 1, 0, 0, 0, ...],\n      [1, 1, 1, 1, 1, 0, ...]\n  ]),\n  'labels': tensor([1, 0, 1, 0, 1])\n}\n\nnot ->\n\n[ \n    { 'input_ids', 'attention_mask', 'labels' }, \n    { 'input_ids', 'attention_mask', 'labels' }, \n    { 'input_ids', 'attention_mask', 'labels' }, \n    { 'input_ids', 'attention_mask', 'labels' }, \n    { 'input_ids', 'attention_mask', 'labels' }, \n]\n\n\n"

In [30]:
def run_text_classification_demo():

    """Demo for text classification"""

    print("\n" + "="*60)
    print("TEXT CLASSIFICATION (Sentiment Analysis) DEMO")
    print("="*60)

    classifier = BERTTextClassifier()

    # Load data
    train_texts, train_labels, test_texts, test_labels = classifier.load_imdb_data(sample_size=1000)

    # Show sample
    print(f"\nSample Review: {train_texts[0][:200]}...")
    print(f"Label: {'Positive' if train_labels[0] == 1 else 'Negative'}")

    # Train for 2 epochs (small for demo)
    classifier.train(train_texts, train_labels, epochs=1, batch_size=8)

    # Evaluate
    accuracy, f1, report = classifier.evaluate(test_texts, test_labels, batch_size=8)

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Test custom examples
    custom_reviews = [
        "This movie was fantastic! Amazing acting and great plot.",
        "Boring and terrible. Waste of time.",
        "Not bad, could be better though."
    ]

    predictions, probabilities = classifier.predict(custom_reviews)

    print(f"\nCustom Predictions:")

    for text, pred, prob in zip(custom_reviews, predictions, probabilities):

        sentiment = "Positive" if pred == 1 else "Negative"

        confidence = prob[pred] * 100

        print(f"'{text[:50]}...' -> {sentiment} ({confidence:.1f}%)")


In [31]:
run_text_classification_demo()


TEXT CLASSIFICATION (Sentiment Analysis) DEMO


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading IMDB dataset ...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

train samples: 1000
test samples: 250

Sample Review: I LOVE Jack's jokes like 'The cliché is...' or "Over the top cliché guy, black, oily skin, kinda spooky...". He is just hilarious! Daniel's starting to catch up on him to! Good thing Jack's not on the...
Label: Positive


Epoch 1/1: 100%|██████████| 125/125 [01:30<00:00,  1.39it/s, Loss=0.3125]


Epoch 1, Average Loss: 0.5465


Evaluating: 100%|██████████| 32/32 [00:06<00:00,  4.68it/s]


Accuracy: 0.8640
F1 Score: 0.8639

Custom Predictions:
'This movie was fantastic! Amazing acting and great...' -> Positive (84.0%)
'Boring and terrible. Waste of time....' -> Negative (82.2%)
'Not bad, could be better though....' -> Negative (65.4%)



