<a href="https://colab.research.google.com/github/Arzooilistan/task1/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ---------- 1) Install required packages (run once) ----------
!pip install -q transformers datasets evaluate sentencepiece accelerate gradio

# ---------- 2) Imports ----------
import os
from datasets import load_dataset, ClassLabel
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)
import numpy as np
import evaluate
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# ---------- 3) Load AG News dataset ----------
# Hugging Face Datasets provides AG News with 'train' and 'test' splits.
dataset = load_dataset("ag_news")
print(dataset)

# ---------- 4) Prepare tokenizer and model ----------
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Inspect the label feature to derive the number of classes (and their names when available).
label_feature = dataset["train"].features["label"]
print("Label feature:", label_feature)

model_kwargs = {}
if isinstance(label_feature, ClassLabel):
    num_labels = len(label_feature.names)
    # Store the class names in the model config so that the saved checkpoint
    # (and any pipeline loaded from it) reports readable labels instead of 'LABEL_<i>'.
    model_kwargs["id2label"] = dict(enumerate(label_feature.names))
    model_kwargs["label2id"] = {name: idx for idx, name in enumerate(label_feature.names)}
else:
    # Fallback: no class names available, so only count the distinct label values.
    num_labels = len(set(dataset["train"]["label"]))
print("num_labels =", num_labels)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    **model_kwargs,
)

# ---------- 5) Tokenization / Preprocessing ----------
# Maximum number of tokens per example; longer inputs are truncated by the tokenizer.
max_length = 128

def preprocess_fn(examples):
    """Tokenize one batch of examples.

    Uses the batch's 'text' column when it exists. Otherwise each example's
    text is rebuilt from the 'title' and 'description' columns: both joined by
    a single space when both are non-empty, else whichever one is non-empty,
    else an empty string.

    Returns the tokenizer output for the batch, truncated to `max_length`
    tokens and left unpadded (padding is deferred to the data collator).
    """
    if "text" in examples:
        batch_texts = examples["text"]
    else:
        titles = examples.get("title", [])
        descriptions = examples.get("description", [])
        batch_texts = []
        for title, desc in zip(titles, descriptions):
            if title and desc:
                batch_texts.append(title + " " + desc)
            else:
                batch_texts.append(title or desc or "")
    return tokenizer(batch_texts, padding=False, truncation=True, max_length=max_length)

# Apply tokenization (batched).
# Only the raw input columns are dropped: the "label" column must survive the map,
# otherwise the Trainer receives no labels and cannot compute a loss.
columns_to_drop = [name for name in dataset["train"].column_names if name != "label"]
tokenized = dataset.map(preprocess_fn, batched=True, remove_columns=columns_to_drop)
print(tokenized)

# ---------- 6) Train/validation split (optional) ----------
# Hold out 5% of the training split for validation; the fixed seed keeps the split reproducible.
split = tokenized["train"].train_test_split(test_size=0.05, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]
test_dataset = tokenized["test"]

print("Train size:", len(train_dataset), "Eval size:", len(eval_dataset), "Test size:", len(test_dataset))

# ---------- 7) Data collator ----------
# Examples were tokenized without padding, so each batch is padded dynamically here.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# ---------- 8) Metrics ----------
# NOTE: compute_metrics in this file relies on the sklearn metric functions;
# this object is loaded but not referenced by it.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn a (logits, labels) evaluation pair into a dict of classification metrics.

    The predicted class is the argmax over the last axis of the logits.
    Returns accuracy, macro and micro F1, and macro precision/recall
    (precision/recall report 0 instead of warning when a class has no samples).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    macro_safe = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1_macro": f1_score(labels, predicted, average="macro"),
        "f1_micro": f1_score(labels, predicted, average="micro"),
        "precision_macro": precision_score(labels, predicted, **macro_safe),
        "recall_macro": recall_score(labels, predicted, **macro_safe),
    }

# ---------- 9) Training arguments ----------
import inspect

output_dir = "./bert-agnews"

# transformers renamed `evaluation_strategy` to `eval_strategy`; newer releases reject the
# old keyword. Pick whichever name the installed version accepts so both keep working.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_steps=500,                     # adjust depending on dataset size / batch size
    per_device_train_batch_size=16,     # reduce if OOM
    per_device_eval_batch_size=32,
    num_train_epochs=2,                 # tune as needed (2-3 is common baseline)
    save_strategy="steps",              # must match the evaluation strategy for load_best_model_at_end
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",   # key produced by compute_metrics
    fp16=torch.cuda.is_available(),     # use mixed precision if GPU
    push_to_hub=False,
    **{_eval_strategy_key: "steps"},    # evaluate every `eval_steps`
)

# ---------- 10) Trainer ----------
import inspect

# Newer transformers releases take the tokenizer via `processing_class` and deprecate
# the `tokenizer` keyword; use whichever the installed Trainer signature exposes.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# ---------- 11) Train ----------
# Warning: training may take ~minutes to hours depending on GPU/time. Adjust epochs/batch size as needed.
trainer.train()

# ---------- 12) Evaluate on test set ----------
# Runs compute_metrics on the held-out test split (not the validation split used during training).
print("Evaluating on TEST set...")
test_metrics = trainer.evaluate(test_dataset)
print(test_metrics)

# ---------- 13) Save model & tokenizer ----------
# Model and tokenizer go to the same directory so both can later be loaded from one path.
# NOTE(review): with load_best_model_at_end=True, trainer.model is presumably the best
# checkpoint rather than the last one - confirm against the Trainer docs.
model_save_path = "./bert-agnews-final"
trainer.model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print("Model + tokenizer saved to", model_save_path)

# ---------- 14) Quick inference helper ----------
# Human-readable class names, indexed by integer label id.
label_names = dataset["train"].features["label"].names
print("Label names:", label_names)

from transformers import pipeline

# Load the saved checkpoint into a text-classification pipeline (GPU 0 if available, else CPU).
# The deprecated `return_all_scores=False` flag is omitted: the pipeline default already
# returns only the top-scoring label.
classif = pipeline(
    "text-classification",
    model=model_save_path,
    tokenizer=model_save_path,
    device=0 if torch.cuda.is_available() else -1,
)

def predict_label(text):
    """Classify `text` with the saved pipeline and return the pipeline's raw output.

    The input is truncated to `max_length` tokens. The result looks like
    [{'label': 'LABEL_0', 'score': 0.98}]; the label is either 'LABEL_<i>' or an
    actual class name, depending on the saved model config.
    """
    return classif(text, truncation=True, max_length=max_length)

# Example: smoke-test the inference helper on two headlines and print the raw pipeline output.
print(predict_label("Apple releases new iPhone features in latest update"))
print(predict_label("Stock markets tumble amid global economic uncertainty"))

# ---------- 15) Lightweight interactive demo using Gradio ----------
import gradio as gr

def classify_text(text):
    """Classify `text` and format the top prediction for display.

    Args:
        text: The headline or short text entered by the user.

    Returns:
        A string of the form "<label> (score=<0.000>)". Labels of the form
        'LABEL_<i>' are translated to `label_names[i]`; any other label is
        shown as-is. Blank input returns a short prompt instead of a prediction.
    """
    # Nothing meaningful to classify: avoid running the model on empty input.
    if not text or not text.strip():
        return "Please enter some text to classify."

    out = predict_label(text)
    # The pipeline normally returns a list of dicts; accept a bare dict as well.
    top = out[0] if isinstance(out, list) else out
    label = top["label"]
    score = top["score"]

    label_readable = label
    if label.startswith("LABEL_"):
        try:
            label_readable = label_names[int(label.split("_")[-1])]
        except (ValueError, IndexError):
            # Suffix is not an integer or is out of range: keep the raw label.
            pass
    return f"{label_readable} (score={score:.3f})"

import inspect

title = "AG News Topic Classifier (bert-base-uncased)"
description = "Enter a news headline or short text. Model fine-tuned on AG News (4 classes)."

# Gradio renamed `allow_flagging` to `flagging_mode`; use whichever keyword the
# installed version exposes so flagging stays disabled without a deprecation error.
_flagging_key = (
    "flagging_mode"
    if "flagging_mode" in inspect.signature(gr.Interface.__init__).parameters
    else "allow_flagging"
)

iface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=3, placeholder="Type a news headline..."),
    outputs="text",
    title=title,
    description=description,
    **{_flagging_key: "never"},
)

# share=True creates a temporary public link (needed to reach the app from Colab);
# set it to False to keep the demo local.
iface.launch(share=True)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h