In [None]:
!pip install evaluate streamlit
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import streamlit as st

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit, evaluate
Successfully installed evaluate-0.4.5 pydeck-0.9.1 streamlit-1.49.1


In [None]:
# Load the AG News dataset; the result is indexed by split name below.
print("Loading AG News dataset...")
dataset = load_dataset("ag_news")

# Show the first training record as a quick look at the available fields.
print("Sample from training set:", dataset["train"][0], sep="\n")

Loading AG News dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Sample from training set:
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [None]:
# Tokenizer belonging to the same checkpoint that is fine-tuned later on.
print("🔧 Loading BERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the global tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 128
    tokens.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

# Run the tokenizer over the dataset, handing examples over in batches.
print("🧽 Tokenizing dataset...")
encoded_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

🔧 Loading BERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

🧽 Tokenizing dataset...


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
# Pretrained BERT checkpoint configured for 4 output labels.
print("🤖 Loading BERT model for classification...")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
)

# Prefer CUDA when PyTorch reports it as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"✅ Using device: {device}")

model.to(device)

# Metric objects used by compute_metrics during evaluation.
print("📊 Loading evaluation metrics...")
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predictions
    are reduced to class ids with an argmax over axis 1 before being scored
    against ``labels`` by the global accuracy and F1 metric objects.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    metric_inputs = {"predictions": predicted_ids, "references": labels}

    accuracy_result = accuracy_metric.compute(**metric_inputs)
    f1_result = f1_metric.compute(average="weighted", **metric_inputs)
    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

🤖 Loading BERT model for classification...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Using device: cuda
📊 Loading evaluation metrics...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
print("CUDA available:", torch.cuda.is_available())

# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="bert-news-classifier-finetuned",
    # Evaluate and save once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Enable fp16 only when CUDA is available.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=True,
    logging_dir="./logs",
    report_to="none",
)


CUDA available: True


In [None]:
# Build the Trainer. The tokenizer is passed as `processing_class`, which
# replaces the deprecated `tokenizer=` keyword of `Trainer.__init__`.
# NOTE(review): the "test" split doubles as the per-epoch evaluation set, so
# the reported test metrics are not from a held-out split — confirm intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
print("🚀 Starting training...")
trainer.train()


# Final evaluation pass; metric keys are the compute_metrics names prefixed
# with "eval_".
print("📊 Evaluating...")
metrics = trainer.evaluate()
print(f"Test Accuracy: {metrics['eval_accuracy']:.4f}")
print(f"Test F1-Score: {metrics['eval_f1']:.4f}")


# Persist model weights and tokenizer files side by side so the directory can
# later be loaded with `from_pretrained`.
print("💾 Saving fine-tuned model...")
model.save_pretrained("bert-news-classifier-finetuned")
tokenizer.save_pretrained("bert-news-classifier-finetuned")

🚀 Starting training...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2763,0.245027,0.935132,0.935314
2,0.1804,0.246242,0.943816,0.943824


📊 Evaluating...


Test Accuracy: 0.9438
Test F1-Score: 0.9438
💾 Saving fine-tuned model...


('bert-news-classifier-finetuned/tokenizer_config.json',
 'bert-news-classifier-finetuned/special_tokens_map.json',
 'bert-news-classifier-finetuned/vocab.txt',
 'bert-news-classifier-finetuned/added_tokens.json',
 'bert-news-classifier-finetuned/tokenizer.json')

In [None]:
def predict_topic(text):
    """Classify one news text and report the predicted topic.

    Uses the notebook-level ``tokenizer`` and ``model``. The text is truncated
    to 128 tokens, run through the model without gradient tracking, and the
    softmax of the logits is used as the confidence.

    Args:
        text: A single headline or article snippet.

    Returns:
        A ``(label, confidence)`` tuple, where ``label`` is one of
        "World", "Sports", "Business", "Science/Tech" and ``confidence`` is
        the softmax probability of that label.

    Side effects:
        Prints the input text and the prediction; switches ``model`` to
        evaluation mode.
    """
    labels = ["World", "Sports", "Business", "Science/Tech"]

    # Do not rely on whatever mode earlier cells left the model in: after
    # training, dropout would otherwise stay active and make predictions
    # non-deterministic.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move every input tensor to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    print(f"Text: {text}")
    print(f"Predicted Topic: {labels[predicted_class]} (Confidence: {confidence:.2f})")
    return labels[predicted_class], confidence


In [None]:
print("\n🔍 Running sample predictions:")

# predict_topic already prints its result. Calling it inside a loop, rather
# than as the last bare expression of the cell, avoids an extra raw tuple repr
# appearing in the cell output.
sample_headlines = [
    "Apple launches new AI-powered iPhone",
    "Lionel Messi scores winning goal in World Cup final",
    "Stock market hits new high as tech shares surge",
    "NASA launches new Mars rover mission",
]
for headline in sample_headlines:
    predict_topic(headline)



🔍 Running sample predictions:
Text: Apple launches new AI-powered iPhone
Predicted Topic: Science/Tech (Confidence: 0.99)
Text: Lionel Messi scores winning goal in World Cup final
Predicted Topic: World (Confidence: 0.98)
Text: Stock market hits new high as tech shares surge
Predicted Topic: Science/Tech (Confidence: 0.81)
Text: NASA launches new Mars rover mission
Predicted Topic: Science/Tech (Confidence: 0.99)


('Science/Tech', 0.9924407005310059)

In [None]:
@st.cache_resource
def load_model():
    """Load the fine-tuned tokenizer and classifier from the saved directory.

    The model is moved to CUDA when available, otherwise to the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint_dir = "bert-news-classifier-finetuned"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

    target = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(torch.device(target))
    return tokenizer, model

# Only build the UI when this file is the entry point AND a Streamlit runtime
# is active (i.e. it was started with `streamlit run`). Inside a notebook
# kernel `__name__` is also "__main__", so the name check alone would execute
# the Streamlit calls without a runtime and only emit warnings.
from streamlit import runtime as st_runtime

if __name__ == "__main__" and st_runtime.exists():
    st.title("📰 News Topic Classifier")
    st.write("Enter a news headline below to classify its topic using a fine-tuned BERT model.")

    text = st.text_area("Headline:", placeholder="E.g., NASA launches new Mars rover mission...")

    if st.button("Classify"):
        if not text.strip():
            st.warning("Please enter a headline to classify.")
        else:
            # load_model is decorated with st.cache_resource.
            tokenizer, model = load_model()
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
            # Move every input tensor to the device the model lives on.
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = model(**inputs)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                pred_class = torch.argmax(probs, dim=-1).item()
                confidence = probs[0][pred_class].item()

            labels = ["World", "Sports", "Business", "Science/Tech"]
            st.success(f"**Predicted Topic:** {labels[pred_class]}")
            st.info(f"**Confidence:** {confidence:.2f}")
            # Pair each label with its probability instead of a hard-coded
            # range(4), so the chart stays tied to the label list.
            st.bar_chart({label: prob.item() for label, prob in zip(labels, probs[0])})
    else:
        st.info("Enter a headline and click **Classify** to see the result.")

    st.markdown("---")
    st.markdown("Built with 🤗 Transformers & Streamlit | Fine-tuned on AG News dataset")

2025-09-03 12:20:14.006 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-09-03 12:20:14.016 Session state does not function when running a script without `streamlit run`


In [None]:
import json
import nbformat
import ipynbname

# Best-effort cleanup of widget metadata in the notebook file itself.
# Any failure (e.g. the notebook path cannot be resolved or opened) is
# reported and the cell continues instead of raising.
try:
    # Get current notebook path using ipynbname
    notebook_path = ipynbname.path()
    print(f"Found notebook at: {notebook_path}")

    # Read the notebook
    with open(notebook_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)

    # Widget state is normally stored in the notebook-level metadata under
    # the 'widgets' key, so remove it there as well; checking only the cells
    # leaves that entry in place.
    nb.metadata.pop('widgets', None)

    # Also remove any cell-level 'widgets' metadata if present
    for cell in nb.cells:
        if 'metadata' in cell and 'widgets' in cell.metadata:
            del cell.metadata['widgets']

    # Write back the cleaned notebook
    with open(notebook_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    print("✅ Fixed widget metadata. Notebook should now render correctly.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Could not automatically determine notebook path or fix metadata. Please try manually saving the notebook or ensure ipynbname is installed and working.")

Found notebook at: /fileId=1bjcUSBB_7_OWep-NKdOH0_yvvIS2LI63
An error occurred: [Errno 2] No such file or directory: '/fileId=1bjcUSBB_7_OWep-NKdOH0_yvvIS2LI63'
Could not automatically determine notebook path or fix metadata. Please try manually saving the notebook or ensure ipynbname is installed and working.


In [None]:
import json
import nbformat
import ipynbname

# NOTE(review): this cell repeats the widget-metadata cleanup that already
# appears earlier in the notebook; consider keeping a single copy.
# Best-effort cleanup: any failure is reported instead of raised.
try:
    # Get current notebook path using ipynbname
    notebook_path = ipynbname.path()
    print(f"Found notebook at: {notebook_path}")

    # Read the notebook
    with open(notebook_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)

    # Widget state is normally stored in the notebook-level metadata under
    # the 'widgets' key, so remove it there as well; checking only the cells
    # leaves that entry in place.
    nb.metadata.pop('widgets', None)

    # Also remove any cell-level 'widgets' metadata if present
    for cell in nb.cells:
        if 'metadata' in cell and 'widgets' in cell.metadata:
            del cell.metadata['widgets']

    # Write back the cleaned notebook
    with open(notebook_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    print("✅ Fixed widget metadata. Notebook should now render correctly.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Could not automatically determine notebook path or fix metadata. Please try manually saving the notebook or ensure ipynbname is installed and working.")

Found notebook at: /fileId=1bjcUSBB_7_OWep-NKdOH0_yvvIS2LI63
An error occurred: [Errno 2] No such file or directory: '/fileId=1bjcUSBB_7_OWep-NKdOH0_yvvIS2LI63'
Could not automatically determine notebook path or fix metadata. Please try manually saving the notebook or ensure ipynbname is installed and working.


In [None]:
!pip install ipynbname



In [None]:
import json
import nbformat
import ipynbname

# NOTE(review): this cell repeats the widget-metadata cleanup that already
# appears earlier in the notebook; consider keeping a single copy.
# Best-effort cleanup: any failure is reported instead of raised.
try:
    # Get current notebook path using ipynbname
    notebook_path = ipynbname.path()
    print(f"Found notebook at: {notebook_path}")

    # Read the notebook
    with open(notebook_path, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)

    # Widget state is normally stored in the notebook-level metadata under
    # the 'widgets' key, so remove it there as well; checking only the cells
    # leaves that entry in place.
    nb.metadata.pop('widgets', None)

    # Also remove any cell-level 'widgets' metadata if present
    for cell in nb.cells:
        if 'metadata' in cell and 'widgets' in cell.metadata:
            del cell.metadata['widgets']

    # Write back the cleaned notebook
    with open(notebook_path, 'w', encoding='utf-8') as f:
        nbformat.write(nb, f)

    print("✅ Fixed widget metadata. Notebook should now render correctly.")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Could not automatically determine notebook path or fix metadata. Please try manually saving the notebook or ensure ipynbname is installed and working.")

Found notebook at: /fileId=1bjcUSBB_7_OWep-NKdOH0_yvvIS2LI63
An error occurred: [Errno 2] No such file or directory: '/fileId=1bjcUSBB_7_OWep-NKdOH0_yvvIS2LI63'
Could not automatically determine notebook path or fix metadata. Please try manually saving the notebook or ensure ipynbname is installed and working.
