# Step 1: Environment setup

Install required libraries and set up the PyTorch device (CPU or GPU).


In [1]:
# Install required libraries (run once)
!pip install -q transformers datasets scikit-learn accelerate

import sys
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, f1_score, classification_report

print("Python version:", sys.version)
print("Torch version:", torch.__version__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

[0mPython version: 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0]
Torch version: 2.3.1+cu121


device(type='cuda')

# Step 2: Load AG News dataset and create splits

Load the AG News dataset from HuggingFace Datasets and split it into training, validation, and test sets for later use.

In [2]:
# Load AG News dataset from HuggingFace
raw_datasets = load_dataset("ag_news")
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [3]:
# Create train / validation / test splits
full_train = raw_datasets["train"]
full_test = raw_datasets["test"]

# Take 10% of train as validation
splits = full_train.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
valid_dataset = splits["test"]
test_dataset = full_test  

len(train_dataset), len(valid_dataset), len(test_dataset)

(108000, 12000, 7600)

# Step 3: Naïve keyword-based baseline

We implement a simple rule-based classifier that assigns each headline
to a class based on hand-crafted keyword lists, and evaluate it on the test set.


In [4]:
# Step 3.1: Define label mappings and keyword lists for the baseline
import re

# AG News label mapping: 0=World, 1=Sports, 2=Business, 3=Sci/Tech
label_names = ["World", "Sports", "Business", "Sci/Tech"]
id2label = {i: name for i, name in enumerate(label_names)}
label2id = {name: i for i, name in enumerate(label_names)}

# Hand-crafted keyword lists for each class
keyword_dict = {
    0: ["war", "peace", "election", "government", "president", "minister",
        "attack", "conflict", "talks", "summit", "union", "party"],
    1: ["game", "match", "season", "team", "coach", "league", "score",
        "win", "loss", "cup", "tournament", "player"],
    2: ["stock", "market", "share", "profit", "losses", "bank", "company",
        "deal", "trade", "merger", "economy", "business"],
    3: ["software", "internet", "computer", "technology", "phone", "chip",
        "research", "scientist", "space", "nasa", "online", "web"],
}


In [5]:
# Step 3.2: Text cleaning and keyword-based prediction function
def clean_text(text: str) -> str:
    """Lowercase and remove punctuation for simple keyword matching."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def baseline_predict(text: str) -> int:
    """Return predicted label id using simple keyword counts."""
    text_clean = clean_text(text)
    tokens = text_clean.split()

    scores = {label: 0 for label in keyword_dict}
    for label, keywords in keyword_dict.items():
        for kw in keywords:
            if kw in tokens:
                scores[label] += 1

    # If all scores are zero, fall back to most frequent class in training (World=0)
    if all(score == 0 for score in scores.values()):
        return 0

    # Otherwise choose the class with the highest score
    best_label = max(scores, key=scores.get)
    return best_label

In [6]:
# Step 3.3: Evaluate the keyword baseline on the test set
from sklearn.metrics import confusion_matrix

def evaluate_baseline(dataset, max_samples=None):
    texts = dataset["text"]
    labels = dataset["label"]

    if max_samples is not None:
        texts = texts[:max_samples]
        labels = labels[:max_samples]

    preds = [baseline_predict(t) for t in texts]

    acc = accuracy_score(labels, preds)
    macro = f1_score(labels, preds, average="macro")
    print(f"Baseline accuracy: {acc:.4f}")
    print(f"Baseline macro-F1: {macro:.4f}")
    print()
    print(classification_report(labels, preds, target_names=label_names))
    return preds

baseline_test_preds = evaluate_baseline(test_dataset)


Baseline accuracy: 0.5332
Baseline macro-F1: 0.5263

              precision    recall  f1-score   support

       World       0.39      0.92      0.55      1900
      Sports       0.82      0.53      0.64      1900
    Business       0.60      0.36      0.45      1900
    Sci/Tech       0.80      0.33      0.46      1900

    accuracy                           0.53      7600
   macro avg       0.65      0.53      0.53      7600
weighted avg       0.65      0.53      0.53      7600



# Step 4: Tokenize the dataset for DistilBERT

Prepare a DistilBERT tokenizer and convert the raw text into input IDs, attention masks, and labels so that the model can consume the data.

In [7]:
# Step 4.1: Prepare DistilBERT tokenizer and preprocessing function
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 64  # headlines are short; 64 tokens are enough

def preprocess_function(examples):
    """Tokenize the 'text' field and attach labels for Trainer."""
    result = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    result["labels"] = examples["label"]
    return result


In [8]:
# Step 4.2: Apply tokenization to train/validation/test splits
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_valid = valid_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=valid_dataset.column_names,
)

tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

example = tokenized_train[0]
print(example.keys())  
print("decoded text:", tokenizer.decode(example["input_ids"], skip_special_tokens=True))
print("label:", example["labels"])

dict_keys(['input_ids', 'attention_mask', 'labels'])
decoded text: despair and anger in small russian town after siege beslan, russia ( reuters ) - the killing of more than 320 children, parents and teachers during the bloody end to a 53 - hour school siege left barely a family untouched in the small russian town of beslan.
label: 0


# Step 5: Define DistilBERT model and training setup

Instantiate a DistilBERT-based sequence classification model and specify the evaluation metrics (accuracy and macro-F1) together with training hyperparameters

In [9]:
# Step 5.1: Create DistilBERT classification model
num_labels = 4

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

print("Loaded model:", model_name)
print("Number of labels:", num_labels)
print("Model device:", next(model.parameters()).device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: distilbert-base-uncased
Number of labels: 4
Model device: cuda:0


In [10]:
# Step 5.2: Define metric function for Trainer (accuracy + macro-F1)
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and macro-F1 for evaluation."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    macro = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "macro_f1": macro}

# Step 6: Fine-tune DistilBERT on AG News

Fine-tune the DistilBERT classifier on the tokenized training set, using the validation set to monitor performance and select the best checkpoint.

In [11]:
from transformers import TrainingArguments, Trainer

# Step 6.1: Set training hyperparameters (compatible version)
batch_size = 32

training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=5,                     
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=500,  
    warmup_ratio=0.05,
)

In [12]:
# Step 6.2: Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
# Step 6.3: Train the model
train_result = trainer.train()
train_result

Step,Training Loss
500,0.569
1000,0.2714
1500,0.254
2000,0.229
2500,0.2113
3000,0.208
3500,0.1899
4000,0.1433
4500,0.1465
5000,0.1377


TrainOutput(global_step=16875, training_loss=0.11609061434710467, metrics={'train_runtime': 816.0729, 'train_samples_per_second': 661.706, 'train_steps_per_second': 20.678, 'total_flos': 8941868328960000.0, 'train_loss': 0.11609061434710467, 'epoch': 5.0})

# Step 7: Evaluate DistilBERT and compare with the keyword baseline

We evaluate the fine-tuned DistilBERT model on the validation and test sets,
then compare its performance with the naïve keyword-based baseline on the same
AG News test set.

In [14]:
# Step 7.1: Evaluate on validation and test sets
val_metrics = trainer.evaluate(tokenized_valid)
print("Validation metrics:", val_metrics)

test_metrics = trainer.evaluate(tokenized_test)
print("Test metrics:", test_metrics)

Validation metrics: {'eval_loss': 0.3276974856853485, 'eval_accuracy': 0.942, 'eval_macro_f1': 0.9417004095170786, 'eval_runtime': 5.6497, 'eval_samples_per_second': 2123.991, 'eval_steps_per_second': 66.375, 'epoch': 5.0}
Test metrics: {'eval_loss': 0.32963278889656067, 'eval_accuracy': 0.9413157894736842, 'eval_macro_f1': 0.9413301112983662, 'eval_runtime': 3.8136, 'eval_samples_per_second': 1992.884, 'eval_steps_per_second': 62.409, 'epoch': 5.0}


In [15]:
# Step 7.2: Detailed classification report on the test set
from sklearn.metrics import classification_report
import numpy as np

pred_output = trainer.predict(tokenized_test)
test_preds = np.argmax(pred_output.predictions, axis=-1)
test_labels = pred_output.label_ids

print(classification_report(test_labels, test_preds, target_names=label_names))

              precision    recall  f1-score   support

       World       0.96      0.95      0.95      1900
      Sports       0.98      0.99      0.99      1900
    Business       0.91      0.92      0.91      1900
    Sci/Tech       0.92      0.91      0.91      1900

    accuracy                           0.94      7600
   macro avg       0.94      0.94      0.94      7600
weighted avg       0.94      0.94      0.94      7600



In [16]:
# Step 7.3: Recompute baseline metrics for comparison
baseline_test_preds = evaluate_baseline(test_dataset)

from sklearn.metrics import accuracy_score, f1_score

baseline_acc = accuracy_score(test_dataset["label"], baseline_test_preds)
baseline_macro = f1_score(test_dataset["label"], baseline_test_preds, average="macro")

print(f"Baseline accuracy: {baseline_acc:.4f}")
print(f"Baseline macro-F1: {baseline_macro:.4f}")

Baseline accuracy: 0.5332
Baseline macro-F1: 0.5263

              precision    recall  f1-score   support

       World       0.39      0.92      0.55      1900
      Sports       0.82      0.53      0.64      1900
    Business       0.60      0.36      0.45      1900
    Sci/Tech       0.80      0.33      0.46      1900

    accuracy                           0.53      7600
   macro avg       0.65      0.53      0.53      7600
weighted avg       0.65      0.53      0.53      7600

Baseline accuracy: 0.5332
Baseline macro-F1: 0.5263


In [17]:
# Step 7.4: Print a small comparison summary
bert_acc = test_metrics["eval_accuracy"]
bert_macro = test_metrics["eval_macro_f1"]

print("=== Test set comparison ===")
print(f"Keyword baseline  - acc: {baseline_acc:.4f}, macro-F1: {baseline_macro:.4f}")
print(f"DistilBERT model  - acc: {bert_acc:.4f}, macro-F1: {bert_macro:.4f}")
print(f"Accuracy gain     : {bert_acc - baseline_acc:.4f}")
print(f"Macro-F1 gain     : {bert_macro - baseline_macro:.4f}")


=== Test set comparison ===
Keyword baseline  - acc: 0.5332, macro-F1: 0.5263
DistilBERT model  - acc: 0.9413, macro-F1: 0.9413
Accuracy gain     : 0.4082
Macro-F1 gain     : 0.4150


# Step 8: Qualitative examples where baseline and DistilBERT disagree

We inspect a few test examples where the keyword-based baseline and the
DistilBERT model make different predictions. For each example, we show the
headline, the gold label, the baseline prediction, and the DistilBERT
prediction, and briefly discuss why the AI model works better (or worse).


In [18]:
# Step 8.1: Find indices where baseline and DistilBERT predictions differ
import random

diff_indices = [
    i for i, (b, m) in enumerate(zip(baseline_test_preds, test_preds))
    if b != m
]

len(diff_indices)

3514

In [19]:
# Step 8.2: Sample 3 examples where they disagree
sample_indices = random.sample(diff_indices, 5)
sample_indices

[5666, 928, 222, 6596, 2373]

In [20]:
# Step 8.3: Print the selected examples
for idx in sample_indices:
    text = test_dataset["text"][idx]
    true_label = label_names[test_dataset["label"][idx]]
    base_label = label_names[baseline_test_preds[idx]]
    bert_label = label_names[test_preds[idx]]

    print("=" * 80)
    print(f"Index        : {idx}")
    print(f"Headline     : {text}")
    print(f"Gold label   : {true_label}")
    print(f"Baseline pred: {base_label}")
    print(f"DistilBERT   : {bert_label}")

Index        : 5666
Headline     : Indian PM pledges to protect poor from oil-driven inflation NEW DELHI : Indian Prime Minister Manmohan Singh pledged to try to shield the poor by keeping down prices of essential goods amid rising inflation.
Gold label   : Business
Baseline pred: World
DistilBERT   : Business
Index        : 928
Headline     : Coming to a TV near you: Ads for desktop Linux Linspire CEO points out that recent TV ads serve as indication of acceptance in mainstream populace.
Gold label   : Sci/Tech
Baseline pred: World
DistilBERT   : Sci/Tech
Index        : 222
Headline     : Selling Houston Warts and All, Especially Warts Descriptions of urban afflictions and images of giant mosquitoes and cockroaches to convey a sense of how Houston is nevertheless beloved by many residents.
Gold label   : Business
Baseline pred: World
DistilBERT   : Business
Index        : 6596
Headline     : South Korea, Singapore seal free-trade pact Korea and Singapore sealed a free-trade agreement 