In [29]:

# Task 1: News Topic Classifier using BERT
# Task by: Anoosha Ikram
# Step 1: Install required libraries
!pip install transformers datasets gradio torch scikit-learn

# Step 2: Import Libraries
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import gradio as gr




In [13]:
!pip install --upgrade datasets




In [31]:
import os

# Hugging Face runtime switches, applied together.
# NOTE(review): libraries imported in earlier cells may already have read
# these variables — confirm they still take effect when set here.
os.environ.update({
    "HF_HUB_DISABLE_SYMLINKS_WARNING": "1",    # hides symlink warning
    "HF_DATASETS_OFFLINE": "0",                # make sure online download works
    "TRANSFORMERS_NO_ADVISORY_WARNINGS": "1",
})


In [None]:
## Step 1: Load Dataset

We will use the AG News dataset from Hugging Face.  
It contains 4 classes of news topics (the dataset's label ids are 0–3, in this order):
1. World
2. Sports
3. Business
4. Sci/Tech


In [33]:
from datasets import load_dataset

# Fetch AG News from the Hugging Face hub
dataset = load_dataset("ag_news")

# Peek at the first training record
first_record = dataset["train"][0]
print("Sample record:", first_record)

# Class names, as declared by the dataset's "label" feature
label_feature = dataset["train"].features["label"]
labels = label_feature.names
print("Labels:", labels)


Sample record: {'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}
Labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
## Step 2: Tokenization

We use the `bert-base-uncased` tokenizer to preprocess the text.  
This converts raw text into token IDs and attention masks, padded or truncated to 128 tokens.


In [34]:
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()   # turn off widget-style progress bars

# Load BERT tokenizer
# The "bert-base-uncased" tokenizer is loaded by name via from_pretrained;
# the resulting `tokenizer` global is what tokenize_fn calls.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_fn(batch):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` entry of ``batch``.

    Sequences are truncated and padded to ``max_length=128``; whatever the
    tokenizer returns is passed straight back to the caller.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Tokenize the dataset in batches
tokenized_dataset = dataset.map(tokenize_fn, batched=True)

# Expose only the model inputs and the label, as PyTorch tensors
torch_columns = ["input_ids", "attention_mask", "label"]
tokenized_dataset.set_format(type="torch", columns=torch_columns)

# Sanity-check one training example
first_example = tokenized_dataset["train"][0]
print("Sample after tokenization:\n", first_example)
print("✅ Tokenization completed successfully!")


Sample after tokenization:
 {'label': tensor(2), 'input_ids': tensor([  101,  2813,  2358,  1012,  6468, 15020,  2067,  2046,  1996,  2304,
         1006, 26665,  1007, 26665,  1011,  2460,  1011, 19041,  1010,  2813,
         2395,  1005,  1055,  1040, 11101,  2989,  1032,  2316,  1997, 11087,
         1011, 22330,  8713,  2015,  1010,  2024,  3773,  2665,  2153,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
  

In [None]:
## Step 3: Model Setup

We load a BERT model for sequence classification with 4 output classes.


In [35]:
import huggingface_hub
# Overwrite the library's module-level download-timeout constant at runtime.
# NOTE(review): presumably this only affects code that reads
# constants.HF_HUB_DOWNLOAD_TIMEOUT at call time — confirm it takes effect
# for the installed huggingface_hub version.
huggingface_hub.constants.HF_HUB_DOWNLOAD_TIMEOUT = 1000  # seconds


In [36]:
import os

# Show which files the local checkpoint folder contains
local_model_dir = ".bert-ag-news"
print(os.listdir(local_model_dir))


['config.json', 'pytorch_model.bin', 'vocab.txt']


In [51]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

# Load config from your local folder
# Only the configuration is read from ".bert-ag-news"; the weights below are
# still loaded from "bert-base-uncased".
config = AutoConfig.from_pretrained(".bert-ag-news")

# Load tokenizer and model from HuggingFace (or local if you downloaded them too)
# NOTE(review): this rebinds the `tokenizer` created in the tokenization cell,
# and `model` is assigned again by a later cell via
# BertForSequenceClassification.from_pretrained — which object ends up in use
# depends on the order the cells are executed; confirm the intended one.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=config
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [45]:
# Size the classification head from the dataset's label names instead of a
# hard-coded 4, and store the id <-> name mapping in the model config so the
# config carries the class names alongside the weights.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    id2label=dict(enumerate(labels)),
    label2id={name: idx for idx, name in enumerate(labels)},
)

# Define metrics
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of ``logits``.
    Returns a dict with ``"accuracy"`` and the weighted-average ``"f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average="weighted"),
    }


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
    --------------------------------------- 0.3/11.6 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.6 MB 2.1 MB/s eta 0:00:06
   --- ------------------------------------ 1.0/11.6 MB 2.0 MB/s eta 0:00:06
   ----- ---------------------------------- 1.6/11.6 MB 2.0 MB/s eta 0:00:06
   ----- ---------------------------------- 1.6/11.6 MB 2.0 MB/s eta 0:00:06
   ------ --------------------------------- 1.8/11.6 MB 1.7 MB/s eta 0:00:06
   ------- -------------------------------- 2.1/11.6 MB 1.6 MB/s eta 0:00:07
   ------- -------------------------------- 2.1/11.6 MB 1.6 MB/s eta 0:00:07
   ------- --------------------------

  You can safely remove it manually.


In [None]:
# Training arguments
# `evaluate_during_training` is not accepted by current transformers releases
# (this notebook upgrades to 4.56.0), so passing it raises a TypeError.
# Periodic evaluation is requested through the eval-strategy argument instead;
# its name differs between releases (`eval_strategy` in newer ones,
# `evaluation_strategy` in older 4.x), so use whichever this install accepts.
import inspect

_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_strategy="steps",
    save_steps=500,
    eval_steps=500,                  # evaluate on the same cadence as checkpoints
    **{_eval_key: "steps"},          # evaluate periodically during training
)

# Work on slices so the run finishes quickly: a shuffled 10,000-example
# training subset and the first 2,000 test examples for evaluation.
train_subset = tokenized_dataset["train"].shuffle(seed=42).select(range(10000))
eval_subset = tokenized_dataset["test"].select(range(2000))

# Wire model, arguments, data and metrics together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)

# Fine-tune
trainer.train()


In [None]:
## Step 4: Evaluation

# Now we evaluate the fine-tuned model on the test dataset.
# trainer.evaluate() scores the eval_dataset handed to Trainer, i.e. the first
# 2,000 test examples rather than the full test split. The returned dict
# includes the "accuracy" and "f1" values produced by compute_metrics.
results = trainer.evaluate()
print(results)



In [None]:
## Step 5: Deployment with Gradio

We create a simple web interface where users can input a news headline and get a predicted category.


In [None]:
def predict_news(text):
    """Predict the news topic for ``text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``labels``.

    Args:
        text: The headline or article text to classify.

    Returns:
        The entry of ``labels`` whose index has the highest logit.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Training may have moved the model off the CPU; the inputs must live on
    # the same device as the weights or the forward pass fails.
    tokens = tokens.to(model.device)
    # Make sure dropout is disabled so predictions are deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model(**tokens)
    preds = torch.argmax(outputs.logits, dim=1).item()
    return labels[preds]

# Build the web UI around predict_news (text box in, label out), then serve it
demo = gr.Interface(fn=predict_news, inputs="text", outputs="label")
demo.launch()
