In [1]:
import torch
from datasets import load_dataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, precision_score, recall_score
from google.colab import files, drive
import pandas as pd

# ============================================================
# BEST HYPERPARAMETERS (Manually Applied)
# ============================================================
# Based on your experimentation results:
# learning_rate: 3e-05
# batch_size: 16
# weight_decay: 0.05
# num_train_epochs: 3 (from Run 3)
# Expected Performance: F1 ~0.9971, Accuracy ~0.9934
# ============================================================

BEST_LEARNING_RATE = 3e-05
BEST_BATCH_SIZE = 16
BEST_WEIGHT_DECAY = 0.05
BEST_NUM_EPOCHS = 3

print("=" * 60)
print("CLICKBAIT DETECTION - BEST HYPERPARAMETERS")
print("=" * 60)
print(f"Learning Rate: {BEST_LEARNING_RATE}")
print(f"Batch Size: {BEST_BATCH_SIZE}")
print(f"Weight Decay: {BEST_WEIGHT_DECAY}")
print(f"Number of Epochs: {BEST_NUM_EPOCHS}")
print("=" * 60)

# Check GPU availability
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"\n✓ Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("\n⚠ GPU not available, using CPU.")

# ============================================================
# 1. DATA LOADING AND PREPROCESSING
# ============================================================
print("\n--- Loading and Preprocessing Data ---")

# Upload the CSV from your computer
uploaded = files.upload()

# Load dataset
dataset = load_dataset("csv", data_files=list(uploaded.keys()))
print(f"Dataset loaded: {len(dataset['train'])} total samples")

# Split into train/test (80/20 split)
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Use subset for faster training (adjust as needed)
train_data = dataset["train"].select(range(min(2000, len(dataset["train"]))))
eval_data = dataset["test"].select(range(min(500, len(dataset["test"]))))

print(f"Training samples: {len(train_data)}")
print(f"Evaluation samples: {len(eval_data)}")

# ============================================================
# 2. TOKENIZATION
# ============================================================
print("\n--- Tokenizing Data ---")

MODEL_NAME = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize headlines for model input"""
    return tokenizer(examples["headline"], truncation=True, padding=True, max_length=128)

# Apply tokenization
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_eval = eval_data.map(tokenize_function, batched=True)

# Set format to PyTorch tensors
tokenized_train.set_format("torch", columns=['input_ids', 'attention_mask', 'clickbait'])
tokenized_eval.set_format("torch", columns=['input_ids', 'attention_mask', 'clickbait'])

# Rename 'clickbait' column to 'label' (expected by Trainer)
tokenized_train = tokenized_train.rename_column("clickbait", "label")
tokenized_eval = tokenized_eval.rename_column("clickbait", "label")

print("✓ Tokenization complete")

# ============================================================
# 3. MODEL INITIALIZATION
# ============================================================
print("\n--- Loading Pre-trained Model ---")

model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
).to(device)

print(f"✓ Model loaded: {MODEL_NAME}")
print(f"  Number of parameters: {model.num_parameters():,}")

# ============================================================
# 4. DEFINE METRICS
# ============================================================
def compute_metrics(p):
    """Calculate accuracy, F1, precision, and recall"""
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="binary")
    precision = precision_score(labels, preds, average="binary")
    recall = recall_score(labels, preds, average="binary")

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# ============================================================
# 5. TRAINING CONFIGURATION (BEST HYPERPARAMETERS)
# ============================================================
print("\n--- Configuring Training with Best Hyperparameters ---")

training_args = TrainingArguments(
    output_dir="./clickbait_model_best",

    # BEST HYPERPARAMETERS
    learning_rate=BEST_LEARNING_RATE,
    per_device_train_batch_size=BEST_BATCH_SIZE,
    per_device_eval_batch_size=BEST_BATCH_SIZE,
    weight_decay=BEST_WEIGHT_DECAY,
    num_train_epochs=BEST_NUM_EPOCHS,

    # Evaluation and saving strategy
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",

    # Logging
    logging_dir="./logs_best",
    logging_steps=50,

    # Optimization
    warmup_steps=100,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available

    # Disable external logging
    report_to=[],
)

print("✓ Training configuration complete")

# ============================================================
# 6. INITIALIZE TRAINER
# ============================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

# ============================================================
# 7. TRAIN THE MODEL
# ============================================================
print("\n" + "=" * 60)
print("TRAINING MODEL WITH BEST HYPERPARAMETERS")
print("=" * 60)

trainer.train()

print("\n✓ Training complete!")

# ============================================================
# 8. FINAL EVALUATION
# ============================================================
print("\n" + "=" * 60)
print("FINAL EVALUATION ON TEST SET")
print("=" * 60)

eval_results = trainer.evaluate()

print("\n📊 FINAL TEST SET RESULTS:")
print(f"  Accuracy:  {eval_results['eval_accuracy']:.4f}")
print(f"  F1 Score:  {eval_results['eval_f1']:.4f}")
print(f"  Precision: {eval_results['eval_precision']:.4f}")
print(f"  Recall:    {eval_results['eval_recall']:.4f}")

# ============================================================
# 9. DETAILED CLASSIFICATION REPORT
# ============================================================
print("\n--- Generating Detailed Classification Report ---")

predictions = trainer.predict(tokenized_eval)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print("\n" + "=" * 60)
print("CLASSIFICATION REPORT")
print("=" * 60)
print(classification_report(y_true, y_pred, target_names=["Not Clickbait", "Clickbait"]))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_true, y_pred)
print(cm)
print(f"\nTrue Negatives:  {cm[0][0]}")
print(f"False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]}")
print(f"True Positives:  {cm[1][1]}")

# ============================================================
# 10. SAVE RESULTS TO GOOGLE DRIVE
# ============================================================
print("\n--- Saving Results ---")

# Mount Google Drive
drive.mount('/content/drive')

# Prepare results dictionary
results_dict = {
    "Hyperparameter": ["Learning Rate", "Batch Size", "Weight Decay", "Num Epochs"],
    "Value": [BEST_LEARNING_RATE, BEST_BATCH_SIZE, BEST_WEIGHT_DECAY, BEST_NUM_EPOCHS]
}

metrics_dict = {
    "Metric": ["Accuracy", "F1 Score", "Precision", "Recall"],
    "Score": [
        eval_results['eval_accuracy'],
        eval_results['eval_f1'],
        eval_results['eval_precision'],
        eval_results['eval_recall']
    ]
}

# Create DataFrames
results_df = pd.DataFrame(results_dict)
metrics_df = pd.DataFrame(metrics_dict)

# Save to Excel with multiple sheets
with pd.ExcelWriter('/content/drive/MyDrive/clickbait_best_results.xlsx') as writer:
    results_df.to_excel(writer, sheet_name='Hyperparameters', index=False)
    metrics_df.to_excel(writer, sheet_name='Metrics', index=False)

print("✓ Results saved to Google Drive: clickbait_best_results.xlsx")

# ============================================================
# 11. TEST ON NEW HEADLINES
# ============================================================
print("\n" + "=" * 60)
print("TESTING ON NEW HEADLINES")
print("=" * 60)

from transformers import pipeline

# Create prediction pipeline
clickbait_detector = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test headlines
new_headlines = [
    "MMDA Launches New Traffic Scheme to Ease Congestion Along EDSA",
    "DepEd Confirms Opening of Classes Will Proceed as Scheduled",
    "DOH Reports Steady Decline in Dengue Cases Nationwide",
    "You Won't Believe What This Filipino Celebrity Did After Winning the Lottery!",
    "This One Trick Can Help You Save Thousands on Your Meralco Bill!",
    "Students in Manila Tried This Study Method—The Results Will Shock You!",
    "Comelec Prepares for 2025 Elections With Improved Voter Registration System",
    "PH Economy Grows by 5.8% in Third Quarter, Says PSA",
    "A Mayor's Secret Finally Revealed—The Whole Town Is Talking About It!",
    "Here's Why Everyone Is Rushing to Try This New Food Trend in Quezon City!",
]

results = clickbait_detector(new_headlines)

print("\n📰 PREDICTIONS ON NEW HEADLINES:\n")
for text, result in zip(new_headlines, results):
    label = result["label"]
    prediction = "🎣 Clickbait" if label in ["LABEL_1", "1"] else "✓ Not Clickbait"
    confidence = result['score']
    print(f"{prediction} ({confidence:.2%})")
    print(f"   {text}\n")

# ============================================================
# 12. SAVE THE MODEL
# ============================================================
print("\n--- Saving Trained Model ---")

model.save_pretrained("./clickbait_model_final")
tokenizer.save_pretrained("./clickbait_model_final")

print("✓ Model saved to: ./clickbait_model_final")

print("\n" + "=" * 60)
print("ALL TASKS COMPLETED SUCCESSFULLY!")
print("=" * 60)

CLICKBAIT DETECTION - BEST HYPERPARAMETERS
Learning Rate: 3e-05
Batch Size: 16
Weight Decay: 0.05
Number of Epochs: 3

✓ Using GPU: Tesla T4

--- Loading and Preprocessing Data ---


Saving archive (5).zip to archive (5).zip


Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded: 32000 total samples
Training samples: 2000
Evaluation samples: 500

--- Tokenizing Data ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✓ Tokenization complete

--- Loading Pre-trained Model ---


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded: distilbert-base-multilingual-cased
  Number of parameters: 135,326,210

--- Configuring Training with Best Hyperparameters ---
✓ Training configuration complete

TRAINING MODEL WITH BEST HYPERPARAMETERS


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1621,0.058024,0.984,0.984314,0.972868,0.996032
2,0.0435,0.031453,0.99,0.99006,0.992032,0.988095
3,0.0095,0.034366,0.99,0.990138,0.984314,0.996032



✓ Training complete!

FINAL EVALUATION ON TEST SET



📊 FINAL TEST SET RESULTS:
  Accuracy:  0.9900
  F1 Score:  0.9901
  Precision: 0.9843
  Recall:    0.9960

--- Generating Detailed Classification Report ---

CLASSIFICATION REPORT
               precision    recall  f1-score   support

Not Clickbait       1.00      0.98      0.99       248
    Clickbait       0.98      1.00      0.99       252

     accuracy                           0.99       500
    macro avg       0.99      0.99      0.99       500
 weighted avg       0.99      0.99      0.99       500


--- Confusion Matrix ---
[[244   4]
 [  1 251]]

True Negatives:  244
False Positives: 4
False Negatives: 1
True Positives:  251

--- Saving Results ---


ValueError: mount failed