In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# BERT News Topic Classifier — AG News (End‑to‑End)

This mini‑project fine‑tunes **bert-base-uncased** on the **AG News** dataset, evaluates with **accuracy** and **macro F1**, and deploys a lightweight **Gradio** demo for live predictions.

---

## 📦 Project Structure

```
bert-agnews/
├── train.py              # Fine-tune & evaluate BERT
├── app.py                # Gradio demo for live inference
├── requirements.txt      # Dependencies
└── README.md             # (optional) Notes & tips
```

---

## 🔧 requirements.txt

```
transformers>=4.44.0
datasets>=2.19.0
torch>=2.1.0
scikit-learn>=1.1.0
gradio>=4.44.0
accelerate>=0.33.0
numpy>=1.24.0
```

> **Tip:** If using GPU, install the appropriate CUDA build of PyTorch from pytorch.org.

---

## 🧠 train.py (fine‑tune + evaluate + save)

```python

In [None]:
! pip install -q --upgrade transformers datasets accelerate


In [1]:

import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch

MODEL_NAME = "bert-base-uncased"
OUTPUT_DIR = "outputs/bert-agnews"
SAVE_DIR = "models/bert-agnews"
SEED = 42

# 1) Load AG News
# The HF "ag_news" dataset has 4 classes: 0=World, 1=Sports, 2=Business, 3=Sci/Tech
raw = load_dataset("ag_news")

# 2) Build a clean text field if needed (some variants have 'text'; others have 'title'/'description')

def build_text(example):
    if "text" in example and example["text"]:
        example["clean_text"] = example["text"]
    else:
        title = example.get("title", "") or ""
        desc = example.get("description", "") or ""
        example["clean_text"] = (title + ". " + desc).strip()
    return example

raw = raw.map(build_text)

# 3) Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

max_length = 256

def tokenize(batch):
    return tokenizer(
        batch["clean_text"],
        truncation=True,
        max_length=max_length,
    )

tok = raw.map(tokenize, batched=True, remove_columns=[c for c in raw["train"].column_names if c not in ["label"]])

# 4) Label maps
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {v: k for k, v in id2label.items()}

# 5) Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=4, id2label=id2label, label2id=label2id
)

# 6) Data collator (dynamic padding)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 7) Metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1_macro": f1}

# 8) Training args
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    seed=SEED,
    fp16=torch.cuda.is_available(),
    report_to=[]  # no wandb by default
)

# 9) Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok["train"],
    eval_dataset=tok["test"],  # AG News doesn't ship a validation split; we eval on test for simplicity
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# 10) Train
trainer.train()

# 11) Evaluate (accuracy & F1)
metrics = trainer.evaluate(tok["test"])  # returns dict with loss, accuracy, f1_macro
print("\n*** Test Metrics ***")
for k, v in metrics.items():
    if isinstance(v, float):
        print(f"{k}: {v:.4f}")
    else:
        print(f"{k}: {v}")

# 12) Save final model & tokenizer
os.makedirs(SAVE_DIR, exist_ok=True)
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"\nSaved fine-tuned model to: {SAVE_DIR}")



2025-09-03 05:48:43.450537: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756878523.496467     150 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756878523.506332     150 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
## 🚀 app.py (Gradio live demo)


import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr

MODEL_DIR = "models/bert-agnews"  # path saved by train.py
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

# Load once
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
model.eval()

@torch.inference_mode()
def predict(text: str):
    if not text or not text.strip():
        return {"label": "", "scores": {}}
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
    outputs = model(**enc)
    probs = torch.softmax(outputs.logits, dim=-1).detach().cpu().numpy()[0]
    top_id = int(np.argmax(probs))
    result = {id2label[i]: float(probs[i]) for i in range(len(probs))}
    return {"label": id2label[top_id], "scores": result}

# Gradio UI
with gr.Blocks(title="BERT News Topic Classifier") as demo:
    gr.Markdown("""
    # 📰 BERT News Topic Classifier
    Enter a news headline or short blurb to predict its topic.
    """)

    inp = gr.Textbox(label="News headline", placeholder="e.g., Apple unveils new AI features for iPhone...")
    btn = gr.Button("Classify")
    out_label = gr.Label(label="Predicted Topic")
    out_scores = gr.JSON(label="Class Probabilities")

    def _run(text):
        res = predict(text)
        return res["label"], res["scores"]

    btn.click(_run, inputs=inp, outputs=[out_label, out_scores])

    gr.Examples(
        examples=[
            ["Stocks surge as central bank hints at rate cuts"],
            ["NASA announces new mission to explore asteroid belt"],
            ["Premier League champions sign star striker"],
            ["UN condemns violence amid escalating border tensions"],
        ],
        inputs=inp,
    )

if __name__ == "__main__":
    demo.launch()



## ▶️ How to Run

```bash
# 1) Create environment (optional but recommended)
python -m venv .venv && source .venv/bin/activate  # (Linux/Mac)
# on Windows: python -m venv .venv && .venv\Scripts\activate

# 2) Install deps
pip install -r requirements.txt

# 3) Fine-tune + evaluate
python train.py
# You’ll see accuracy and macro F1 on AG News test set printed at the end.

# 4) Launch the demo
python app.py
# Open the local URL that Gradio prints in the terminal.
```

---

## 🧪 Notes on Metrics

* **Accuracy**: overall correctness across 4 classes.
* **Macro F1**: averages F1 across classes equally (robust when classes are imbalanced).

> Expect \~93–95% **accuracy** on AG News with this setup (varies with seed/epochs/GPU).

---

## ⚙️ Tweaks & Tips

* **Faster training**: try `distilbert-base-uncased`.
* **Longer texts**: increase `max_length` (trade‑off: speed/memory).
* **More epochs**: 3–5 often helps; watch validation metrics for overfitting.
* **Class weights**: AG News is fairly balanced; usually not needed.
* **Export**: package `models/bert-agnews` as a folder for deployment or push to Hugging Face Hub.

---

## ✅ Skills Practiced

* Tokenization & preprocessing with **Hugging Face Datasets**
* Fine‑tuning **BERT** for sequence classification
* Evaluating with **accuracy** & **macro F1**
* Lightweight deployment via **Gradio**