<a href="https://colab.research.google.com/github/Adeel777eng/TASK-No.1/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
"""
train_finetune_agnews.py

Fine-tunes 'bert-base-uncased' on AG News. A validation split is held out
from the official train split; the model is evaluated on the official test
split, and the model + tokenizer + label file are saved.

No local dataset files required (Hugging Face Datasets auto-downloads).
Output directory (default): ./models/bert-agnews/

Usage:
    python train_finetune_agnews.py
"""

import os
import numpy as np
from dataclasses import dataclass
from typing import Dict

import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)
from sklearn.metrics import accuracy_score, f1_score

# -----------------------
# Config
# -----------------------
# Module-level defaults; CFG uses them as the defaults of its first fields.
MODEL_NAME = "bert-base-uncased"
OUTPUT_DIR = "models/bert-agnews"
SEED = 42


@dataclass
class CFG:
    """Run configuration for the AG News fine-tuning script.

    Every field has a default, so ``CFG()`` yields the stock setup; override
    individual values by keyword, e.g. ``CFG(num_train_epochs=1)``.
    """

    # --- checkpoint, output location, reproducibility ---
    model_name: str = MODEL_NAME
    output_dir: str = OUTPUT_DIR
    seed: int = SEED

    # --- optimisation schedule ---
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 8  # conservative; lower it on OOM
    per_device_eval_batch_size: int = 16
    learning_rate: float = 2e-5
    weight_decay: float = 0.01

    # --- data handling ---
    max_length: int = 128  # token cap applied when truncating inputs
    validation_split_ratio: float = 0.1  # share of train held out for validation

    # --- checkpointing / logging ---
    save_total_limit: int = 2
    logging_steps: int = 100


# Shared configuration instance read by the rest of the script.
cfg = CFG()

def compute_metrics(eval_pred) -> Dict[str, float]:
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; it is unpacked into exactly
            those two items.

    Returns:
        A dict with ``"accuracy"`` and macro-averaged ``"f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="macro"),
    }

def main():
    """Fine-tune the configured checkpoint on AG News and save the results.

    Pipeline, all driven by the module-level ``cfg``:

    1. Seed the RNGs and report the available device.
    2. Load AG News and hold out ``cfg.validation_split_ratio`` of the train
       split as a validation set.
    3. Tokenize every split (truncating to ``cfg.max_length``).
    4. Train with per-epoch evaluation and checkpointing, restoring the
       checkpoint with the best validation F1 at the end.
    5. Evaluate on the test split and print the metrics.
    6. Save the model, tokenizer and ``label_names.txt`` to ``cfg.output_dir``.
    """
    # Local import: only needed to probe the Trainer signature further down.
    import inspect

    set_seed(cfg.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # -----------------------
    # Load AG News and create a validation split
    # -----------------------
    print("Loading AG News dataset...")
    raw = load_dataset("ag_news")  # has 'train' and 'test'
    # Carve the validation set out of train; seeded so the split is repeatable.
    split = raw["train"].train_test_split(
        test_size=cfg.validation_split_ratio, seed=cfg.seed
    )
    datasets = DatasetDict({
        "train": split["train"],
        "validation": split["test"],
        "test": raw["test"],
    })

    # Capture label names from the dataset's label feature.
    label_names = raw["train"].features["label"].names
    print("Label names:", label_names)

    # -----------------------
    # Tokenizer & Model
    # -----------------------
    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        cfg.model_name, num_labels=len(label_names)
    )

    # -----------------------
    # Preprocessing function
    # -----------------------
    def preprocess(examples):
        """Tokenize a batch of examples and attach the labels."""
        # examples["text"] is a list when batched=True
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            max_length=cfg.max_length,
        )
        # Keep labels under 'labels' key for Trainer
        tokenized["labels"] = examples["label"]
        return tokenized

    # Apply preprocessing (remove original columns to avoid duplication).
    print("Tokenizing datasets (this may take a while)...")
    tokenized = datasets.map(
        preprocess,
        batched=True,
        remove_columns=datasets["train"].column_names,  # removes 'text' & 'label'
    )

    # -----------------------
    # Data collator (dynamic padding)
    # -----------------------
    # No padding is requested at tokenization time; the collator pads each batch.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # -----------------------
    # Training arguments
    # -----------------------
    os.makedirs(cfg.output_dir, exist_ok=True)
    training_args = TrainingArguments(
        output_dir=cfg.output_dir,
        num_train_epochs=cfg.num_train_epochs,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.per_device_eval_batch_size,
        learning_rate=cfg.learning_rate,
        weight_decay=cfg.weight_decay,
        # Evaluate and checkpoint on the same schedule so that
        # load_best_model_at_end can pair each checkpoint with a score.
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=cfg.logging_steps,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=cfg.save_total_limit,
        fp16=torch.cuda.is_available(),  # only enable fp16 if CUDA available
        report_to=[],
    )

    # -----------------------
    # Trainer
    # -----------------------
    # Newer transformers releases deprecate Trainer's `tokenizer` keyword in
    # favour of `processing_class`. Probe the signature and use whichever
    # name the installed version accepts, so the script neither warns on new
    # releases nor breaks on old ones.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_params:
        tokenizer_kwarg = "processing_class"
    else:
        tokenizer_kwarg = "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    # -----------------------
    # Train
    # -----------------------
    print("Starting training...")
    trainer.train()

    # -----------------------
    # Evaluate on test
    # -----------------------
    print("Evaluating on test set...")
    test_metrics = trainer.evaluate(eval_dataset=tokenized["test"])
    print("\n=== Test set results ===")
    for k, v in test_metrics.items():
        # Only numeric values get fixed-precision formatting.
        print(f"{k}: {v:.4f}" if isinstance(v, (int, float)) else f"{k}: {v}")

    # -----------------------
    # Save model, tokenizer, and label names
    # -----------------------
    print(f"Saving model and tokenizer to: {cfg.output_dir}")
    trainer.save_model(cfg.output_dir)  # saves model and config
    tokenizer.save_pretrained(cfg.output_dir)

    # One label per line, in label-index order.
    label_file = os.path.join(cfg.output_dir, "label_names.txt")
    with open(label_file, "w", encoding="utf-8") as f:
        for name in label_names:
            f.write(name + "\n")
    print(f"Saved label names to: {label_file}")

    print("\n✅ Training + evaluation complete. Model saved.")


if __name__ == "__main__":
    main()

Using device: cpu
Loading AG News dataset...
Label names: ['World', 'Sports', 'Business', 'Sci/Tech']
Loading tokenizer and model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing datasets (this may take a while)...


Map:   0%|          | 0/108000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training...




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 