In [None]:
!pip install transformers datasets evaluate accelerate scikit-learn gradio torch

In [None]:
import os
import gc
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)


In [5]:
# Train on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Start from a clean slate: drop cached CUDA allocations and collect Python garbage.
# The cell's displayed value is the number returned by gc.collect().
torch.cuda.empty_cache()
gc.collect()

Using device: cuda


240

In [6]:
# Fetch AG News (the "train" and "test" splits are used below).
dataset = load_dataset("ag_news")

# Subsample BEFORE tokenizing so only the rows that are actually used get processed.
# Each split is shuffled with a fixed seed, then its first N rows are kept.
train_dataset = dataset["train"].shuffle(seed=42)
train_dataset = train_dataset.select(range(5000))

test_dataset = dataset["test"].shuffle(seed=42)
test_dataset = test_dataset.select(range(500))


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [7]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint
# (the same checkpoint name the classification model is loaded from later).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples share one fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        Whatever ``tokenizer`` returns for that text with the settings above.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Tokenize ONLY the sampled subsets, in batches. The raw "text" column is dropped
# once it has been encoded, so the strings are not kept alongside the token ids.
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Have both datasets return PyTorch tensors when indexed.
train_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")

# The full AG News object is no longer needed; release it and collect garbage.
# The cell's displayed value is the number returned by gc.collect().
del dataset
gc.collect()

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

150

In [9]:
# Pretrained BERT with a 4-output sequence-classification head, moved to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
model = model.to(device)

# ---------------------------
# 4. METRICS
# ---------------------------
# Metric objects consumed by compute_metrics during evaluation.
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy and weighted-F1 scores.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the arg-max over the last axis of ``logits``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (F1 uses ``average="weighted"``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    accuracy_result = metric_acc.compute(predictions=predicted_classes, references=labels)
    f1_result = metric_f1.compute(
        predictions=predicted_classes,
        references=labels,
        average="weighted",
    )

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

In [12]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",  # no external experiment-tracker reporting
    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): the test subset doubles as the per-epoch evaluation set, so it also
# drives best-checkpoint selection; consider a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [13]:
# Announce the start, then run fine-tuning. trainer.train() is the last expression,
# so its return value is displayed as the cell output.
print("🚀 Training started...")
trainer.train()



🚀 Training started...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4728,0.399734,0.896,0.896304
2,0.2691,0.401169,0.896,0.895944


TrainOutput(global_step=1250, training_loss=0.33977227478027344, metrics={'train_runtime': 222.0892, 'train_samples_per_second': 45.027, 'train_steps_per_second': 5.628, 'total_flos': 657789450240000.0, 'train_loss': 0.33977227478027344, 'epoch': 2.0})

In [14]:
# Persist the fine-tuned weights and the tokenizer side by side so the directory
# can be loaded later as a self-contained checkpoint.
model.save_pretrained("./news_classifier_bert")
tokenizer.save_pretrained("./news_classifier_bert")

print("✅ Model saved!")

# Free memory. NOTE: these names no longer exist after this cell runs, so the
# training cells must be re-run before this cell can be executed again.
del trainer, model, train_dataset, test_dataset, tokenizer
torch.cuda.empty_cache()
gc.collect()

✅ Model saved!


4574

In [15]:
import gradio as gr
from transformers import pipeline

# Rebuild an inference pipeline from the directory the fine-tuned model and
# tokenizer were saved to.
model_path = "./news_classifier_bert"
classifier = pipeline(
    task="text-classification",
    model=model_path,
    tokenizer=model_path,
)

# AG News topic names; position i is the display name for predicted class index i.
labels = ["World", "Sports", "Business", "Sci/Tech"]

def predict_news_topic(headline):
    """Classify a news headline and describe the predicted topic.

    Args:
        headline: Text typed by the user. May be empty or ``None``.

    Returns:
        ``"Topic: <name> (Confidence: <score>)"`` with the score rounded to two
        decimals, or a short prompt when no text was supplied.
    """
    # Nothing to classify: answer with a prompt instead of scoring an empty string.
    if headline is None or not headline.strip():
        return "Please enter a news headline."

    # truncation=True clips over-long pasted text to the model's maximum input
    # length instead of letting the forward pass fail on it.
    result = classifier(headline, truncation=True)[0]

    raw_label = result["label"]
    if raw_label.startswith("LABEL_"):
        # Generic pipeline label such as 'LABEL_1': map its index to a topic name.
        topic = labels[int(raw_label.split("_", 1)[1])]
    else:
        # The checkpoint already carries a readable label name; use it as-is.
        topic = raw_label

    return f"Topic: {topic} (Confidence: {result['score']:.2f})"

# Web UI: a two-line text box feeds predict_news_topic and its answer is shown as plain text.
interface = gr.Interface(
    title="News Topic Classifier (BERT)",
    description="Enter a news headline to classify it into World, Sports, Business, or Sci/Tech.",
    fn=predict_news_topic,
    inputs=gr.Textbox(placeholder="Enter news headline here...", lines=2),
    outputs="text",
)

# Start the app only when this code runs as the main program.
if __name__ == "__main__":
    interface.launch()

Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://588792606edd7c62c7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [16]:
from google.colab import files
import shutil

# Zip the saved model directory and hand the archive to the browser for download.
folder_path = "/content/news_classifier_bert"
zip_path = folder_path + ".zip"

# make_archive appends the ".zip" suffix itself, so it receives the path without it.
shutil.make_archive(folder_path, "zip", folder_path)

files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
from google.colab import files
import shutil

# Zip the Trainer output directory (checkpoints) and download the archive.
folder_path = "/content/results"
zip_path = f"{folder_path}.zip"

# The archive base name is the zip path minus its ".zip" suffix; make_archive adds it back.
shutil.make_archive(
    base_name=zip_path[: -len(".zip")],
    format="zip",
    root_dir=folder_path,
)

files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>