# **News Topic Classifier using BERT**

### **Import Libraries**

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch
import gradio as gr

### **Load Dataset and Tokenize the Dataset**

In [2]:
# Load dataset
dataset = load_dataset("ag_news")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)

# Tokenize datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'label'])


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

### **Load model and Initialize Training**

In [3]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="no",
    learning_rate=2.5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="none",
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Step,Training Loss
500,0.2945
1000,0.2051
1500,0.1786
2000,0.1639
2500,0.126
3000,0.1184
3500,0.1178


TrainOutput(global_step=3750, training_loss=0.1685925287882487, metrics={'train_runtime': 11618.0635, 'train_samples_per_second': 20.657, 'train_steps_per_second': 0.323, 'total_flos': 6.314778722304e+16, 'train_loss': 0.1685925287882487, 'epoch': 2.0})

### **Evaluate the model**

In [4]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.16561096906661987, 'eval_accuracy': 0.9465789473684211, 'eval_f1': 0.9466114276214136, 'eval_runtime': 126.6466, 'eval_samples_per_second': 60.01, 'eval_steps_per_second': 0.474, 'epoch': 2.0}


### **Save the model**

In [5]:
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

### **Gradio Interface**

In [6]:
# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained("./saved_model")
tokenizer = BertTokenizer.from_pretrained("./saved_model")
model.eval()  # set to evaluation mode

# Label mapping (AG News)
label_map = ["World", "Sports", "Business", "Sci/Tech"]

# Prediction function
def classify_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return label_map[prediction]

# Create Gradio interface
demo = gr.Interface(
    fn=classify_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a news headline..."),
    outputs="text",
    title="News Topic Classifier (BERT)",
    description="Classifies a news headline into one of four categories: World, Sports, Business, Sci/Tech."
)

# Launch interface
demo.launch(share=True)  # share=True will give a public link

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://46b529681d9735a217.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


