# ***News Topic Classifier Using BERT***

## ***Import Libraries***

In [None]:
# Install the necessary Hugging Face and processing libraries:
# transformers (with its "torch" extra), datasets, evaluate and accelerate.
# -q keeps pip's output quiet.
!pip install -q transformers[torch] datasets evaluate accelerate

In [None]:
# 1. Install Gradio (imported below as `gr` and used for the interactive demo
#    that steps 2-4 of this notebook build and launch)
!pip install -q gradio

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report

import evaluate

from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, pipeline
from datasets import load_dataset
import gradio as gr
from huggingface_hub import notebook_login




In [None]:
# Fetch the AG News corpus through the `datasets` library
dataset = load_dataset("ag_news")

# Read the class names off the training split's "label" feature.
# Later cells index this list with a predicted label id to get the class name.
train_label_feature = dataset["train"].features["label"]
labels = train_label_feature.names
num_labels = len(labels)

print(num_labels)
print(f"Labels: {labels}")


In [None]:
# Peek at the first five training examples (shown as the cell's output)
dataset["train"][:5]

In [None]:
# Tokenizer for the same "bert-base-uncased" checkpoint the model cell loads
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated or padded to exactly 128 tokens, so all rows
    come out with the same length.

    Args:
        examples: A batch handed over by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        Whatever the tokenizer produces for that batch of text.
    """
    batch_text = examples["text"]
    encoded = tokenizer(
        batch_text,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded


# Apply tokenization to the whole dataset, many rows per call (batched=True)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Load the "bert-base-uncased" checkpoint into a sequence-classification model
# whose number of output labels matches the dataset's label count.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)

In [None]:
# Accuracy scorer from the `evaluate` library, loaded once and reused on every evaluation
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn one evaluation pass into an accuracy result.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        The accuracy metric's result for the arg-max predictions against the
        reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score along the last axis
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)


In [None]:
# Fine-tuning configuration.
# NOTE(review): the test split doubles as the evaluation set below, so
# load_best_model_at_end selects a checkpoint using test data. Hold out a
# separate validation split if an unbiased test score is needed.
training_args = TrainingArguments(
    output_dir="./bert-news-classifier",
    eval_strategy="epoch",       # Evaluate at the end of every epoch
    save_strategy="epoch",       # Checkpoint on the same schedule as evaluation
    learning_rate=2e-5,          # Standard BERT fine-tuning rate (a float; the string '2e-5' is not coerced)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,          # BERT usually converges in 2-4 epochs
    weight_decay=0.01,
    load_best_model_at_end=True, # Per the TrainingArguments docs, "best" defaults to lowest eval loss
    fp16=True,                   # Enable mixed precision for faster training on GPU
    report_to="none"             # Prevents logging to external tools like W&B unless set up
)

# Wire the model, the tokenized splits and the accuracy callback into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

In [None]:
# Create a classification pipeline around the fine-tuned model.
# Run it on whichever device the model already lives on instead of hard-coding
# GPU 0, so this cell also works when no CUDA device is available.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

# Test headline
headline = "The central bank decided to raise interest rates to combat inflation."
prediction = classifier(headline)

# Map the label ID back to the class name.
# Assumes the pipeline reports labels as strings ending in "_<id>"; the trailing
# id is used as an index into `labels`.
label_idx = int(prediction[0]['label'].split('_')[-1])
print(f"Headline: {headline}")
print(f"Predicted Category: {labels[label_idx]} (Score: {prediction[0]['score']:.4f})")


In [None]:
# 1. Run the trained model over the test split and take the arg-max class per row
output = trainer.predict(tokenized_dataset["test"])
y_true, y_pred = output.label_ids, np.argmax(output.predictions, axis=-1)

# 2. Build the confusion matrix from true vs. predicted label ids
cm = confusion_matrix(y_true, y_pred)

# 3. Draw it as an annotated Seaborn heatmap on an explicit 10x8 figure
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix: BERT News Topic Classifier')
plt.show()

# 4. Print the per-class classification report
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=labels))


In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive (Colab only)
drive.mount('/content/drive')

# Save the model first, then the tokenizer, into the same Drive folder
drive_model_dir = "/content/drive/MyDrive/bert-news-classifier"
for artifact in (model, tokenizer):
    artifact.save_pretrained(drive_model_dir)

In [None]:
# 2. Define the prediction function
def classify_news(text):
    """Score a piece of news text against every category.

    Args:
        text: The headline or snippet to classify.

    Returns:
        A dictionary ``{"Category Name": probability_float}``, the shape
        Gradio's Label component expects.
    """
    # top_k=None asks the pipeline for a score for ALL categories, not only the best one
    scored = classifier(text, top_k=None)

    # Each label is expected to end in "_<id>"; that id indexes into `labels`
    return {
        labels[int(entry['label'].split('_')[-1])]: float(entry['score'])
        for entry in scored
    }



In [None]:
# 3. Create the Gradio Interface
# Input: a three-line text box. Output: a label widget showing up to four classes.
news_input = gr.Textbox(lines=3, placeholder="Enter a news headline or snippet here...", label="News Text")
topic_output = gr.Label(num_top_classes=4, label="Topic Prediction")

# One clickable example per row; each row is a single-item list because the
# interface has a single input component
sample_texts = [
    [snippet]
    for snippet in (
        "The local soccer team won the championship after a dramatic penalty shootout.",
        "Global markets tumbled today as investors reacted to new inflation data.",
        "NASA's latest rover has successfully landed on the Martian surface to look for signs of water.",
        "Diplomats are meeting in Geneva to discuss a new ceasefire agreement.",
    )
]

demo = gr.Interface(
    fn=classify_news,
    inputs=news_input,
    outputs=topic_output,
    title="BERT News Topic Classifier",
    description="This model uses BERT fine-tuned on the AG News dataset to categorize news into World, Sports, Business, or Sci/Tech.",
    examples=sample_texts,
)



In [None]:
# 4. Launch the app
# share=True requests a temporary public URL you can send to anyone.
# NOTE(review): the link lifetime depends on the Gradio version (72 hours when
# this notebook was written) — check the expiry printed by launch().
demo.launch(share=True)


In [None]:
# Archive the Trainer output directory (the output_dir set in the training cell)
# and download the archive through the browser. Colab only: uses google.colab.
# NOTE(review): this notebook saves the final model and tokenizer to Drive, not
# into ./bert-news-classifier — confirm the zipped directory holds what you
# intend to ship.
!zip -r model.zip ./bert-news-classifier
from google.colab import files
files.download('model.zip')

In [None]:
# 1. Log in (it will provide a link to get your Access Token)
# Ensure your token has 'WRITE' permissions
notebook_login()

# 2. Push the model and tokenizer to the Hub
# model_name is the repository name handed to push_to_hub; change it to the name you want.
# NOTE(review): no username prefix is given here — presumably the repository is
# created under the logged-in account; confirm before sharing the link.
model_name = "bert-news-classifier-agnews"
for artifact in (model, tokenizer):
    artifact.push_to_hub(model_name)