In [None]:
# ======== CONFIGURATION CELL ========
# Experiment grid: 2 learning rates x 3 epoch counts = 6 configurations.
# IDs are numbered in grid order (learning rate is the outer axis, epochs the inner one).

EXPERIMENT_CONFIGS = [
    {
        "id": f"CONFIGURATION-{grid_position}",
        "model_name": "distilbert-base-uncased",
        "learning_rate": lr,
        "num_epochs": n_epochs,
    }
    for grid_position, (lr, n_epochs) in enumerate(
        [(lr, n_epochs) for lr in (2e-5, 5e-5) for n_epochs in (1, 2, 3)],
        start=1,
    )
]

# Number of repeated training runs for every configuration
NUM_RUNS_PER_CONFIG = 5

# Identifier of the person / machine running the notebook; change it to your own
# (e.g. Karol -> "PC_Karol").
MACHINE_ID = "PC_Birk"

print("Configured experiments:")
for cfg in EXPERIMENT_CONFIGS:
    print("  {id}: lr={learning_rate}, epochs={num_epochs}".format(**cfg))

In [None]:
import numpy as np
import pandas as pd
import os

# Confirm the kernel's working directory can reach the data folder.
# os.listdir returns entries in arbitrary order; sorting keeps the cell output
# identical between runs and machines.
print(sorted(os.listdir("../data")))

In [None]:
# Read the pre-split train / validation / test CSV files
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/val.csv", "../data/test.csv")
)

# Preview the first training rows
train_df.head()

In [None]:
import re

# Sanity check: Reuters source tags should have been removed in preprocessing.
# The pattern matches "(Reuters)" / "[Reuters]" anywhere in the string, or a
# "Reuters - " prefix at the very start of the string.
pattern = r'[\(\[]\s*Reuters\s*[\)\]]|^\s*Reuters\s*-\s*'


def check_reuters(df, split_name):
    """Report how many rows of ``df`` still contain a Reuters source tag.

    The columns "text_full", "text" and "title" are checked when present.
    One line with the match count is printed per checked column, followed by a
    verdict for the whole split.

    Args:
        df: DataFrame holding one data split.
        split_name: Label used in the printed messages (e.g. "train").

    Returns:
        int: Number of distinct rows matching the pattern in at least one of
        the checked columns (0 when no column could be checked).
    """
    cols_to_check = [c for c in ["text_full", "text", "title"] if c in df.columns]
    if not cols_to_check:
        # Nothing was inspected, so do not print the "no tags found" all-clear.
        print(f"{split_name}: no text columns to check "
              f"(looked for 'text_full', 'text', 'title').\n")
        return 0

    any_hit = None
    for col in cols_to_check:
        # na=False keeps the mask strictly boolean even if missing values survive astype(str)
        col_hits = (
            df[col].astype(str).str.contains(pattern, regex=True, na=False).to_numpy(dtype=bool)
        )
        print(f"{split_name}: {int(col_hits.sum())} rows still contain Reuters-tag pattern in '{col}'")
        any_hit = col_hits if any_hit is None else (any_hit | col_hits)

    # Count each row once, even when the tag shows up in several columns of that row.
    total = int(any_hit.sum())
    if total == 0:
        print(f"‚úÖ No Reuters source tags found in {split_name} split.\n")
    else:
        print(f"‚ö†Ô∏è WARNING: Found {total} Reuters-tagged rows in {split_name}.\n"
              f"   ‚Üí You may need to re-run preprocessing.py to regenerate the CSVs.\n")
    return total


# Run the Reuters-tag check on every split, in train / val / test order
for split_label, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    check_reuters(split_df, split_label)


In [None]:
# Model inputs (the text_full column, cast to str) and labels for every split
X_train_text, y_train = train_df["text_full"].astype(str).tolist(), train_df["label"].tolist()
X_val_text, y_val = val_df["text_full"].astype(str).tolist(), val_df["label"].tolist()
X_test_text, y_test = test_df["text_full"].astype(str).tolist(), test_df["label"].tolist()

# Number of examples per split (train, val, test)
tuple(len(texts) for texts in (X_train_text, X_val_text, X_test_text))

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy / precision / recall / F1.

    Predictions are the argmax of the logits along axis 1; precision, recall
    and F1 are computed with ``average="binary"``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    prf_support = precision_recall_fscore_support(labels, preds, average="binary")

    scores = {"accuracy": accuracy_score(labels, preds)}
    # The first three entries are precision, recall and F1; the fourth (support) is dropped
    scores.update(zip(("precision", "recall", "f1"), prf_support[:3]))
    return scores

In [None]:
################################################### NOTES ###################################################
# We use our preprocessed CSVs -> X_train_text, y_train, etc.
# No more transformed_text_title_combined or Kaggle paths.
# DistilBERT sees: text_full (title + body) and label (0 = fake, 1 = real).
# We use train + val for training/validation; test stays untouched for final evaluation.
#############################################################################################################

import os
os.environ["WANDB_DISABLED"] = "true"  # disable Weights & Biases spam

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import torch

# Prefer MPS (Apple Silicon), then CUDA, then CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

print(f"Final device: {device}")


# 1. Load tokenizer.
# The name must come from EXPERIMENT_CONFIGS: the per-run variable CONFIG is only
# bound once the training loop starts, so reading it here fails on a fresh kernel.
# The datasets below are tokenized once and shared by every configuration, which
# is only valid when all configurations use the same model name.
tokenizer_model_names = {entry["model_name"] for entry in EXPERIMENT_CONFIGS}
if len(tokenizer_model_names) != 1:
    raise ValueError(
        "All EXPERIMENT_CONFIGS must share one model_name because the data is "
        f"tokenized once; found: {sorted(tokenizer_model_names)}"
    )
tokenizer = AutoTokenizer.from_pretrained(EXPERIMENT_CONFIGS[0]["model_name"])

# 2. Convert our lists -> Hugging Face Dataset objects
train_ds = Dataset.from_dict({"text": X_train_text, "label": y_train})
val_ds   = Dataset.from_dict({"text": X_val_text,   "label": y_val})
test_ds  = Dataset.from_dict({"text": X_test_text,  "label": y_test})

# 3. Tokenization: every example is truncated / padded to exactly MAX_LENGTH tokens
MAX_LENGTH = 256

def tokenize_function(batch):
    """Tokenize the "text" field of a batch to fixed-length (MAX_LENGTH) inputs."""
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_val   = val_ds.map(tokenize_function,   batched=True)
tokenized_test  = test_ds.map(tokenize_function,  batched=True)


# 4. Data collator used by the Trainer to assemble batches.
# Examples are already padded to MAX_LENGTH by tokenize_function, so no
# additional per-batch padding should be needed here.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import set_seed
import csv
import gc
import time
import shutil

log_csv_path = "../experiment_results.csv"

# Root folder for the human-readable TXT logs
txt_logs_root = "../run_logs_results"
os.makedirs(txt_logs_root, exist_ok=True)

BATCH_SIZE = 32

csv_fieldnames = [
    "timestamp",
    "experiment_id",
    "run_index",
    "model_name",
    "machine_id",
    "learning_rate",
    "num_epochs",
    "val_accuracy",
    "val_precision",
    "val_recall",
    "val_f1",
]

# Write the header when the CSV is missing OR empty: a zero-byte file (e.g. left
# behind by an interrupted run) would otherwise receive rows without a header.
csv_exists = os.path.isfile(log_csv_path) and os.path.getsize(log_csv_path) > 0
if not csv_exists:
    with open(log_csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=csv_fieldnames)
        writer.writeheader()


def pretty_print_metrics(name, metrics):
    """Print accuracy / precision / recall / F1 from a Trainer.evaluate() result dict.

    Missing keys are printed as 0.
    """
    print(f"\n{name} metrics:")
    print(f"  Accuracy : {metrics.get('eval_accuracy', 0):.4f}")
    print(f"  Precision: {metrics.get('eval_precision', 0):.4f}")
    print(f"  Recall   : {metrics.get('eval_recall', 0):.4f}")
    print(f"  F1-score : {metrics.get('eval_f1', 0):.4f}")


# =========================
# Loop over configurations, NUM_RUNS_PER_CONFIG runs each
# =========================
for CONFIG in EXPERIMENT_CONFIGS:
    print("\n" + "#" * 80)
    print(f"STARTING {CONFIG['id']}  (lr={CONFIG['learning_rate']}, "
          f"epochs={CONFIG['num_epochs']})")
    print("#" * 80)

    for run_idx in range(NUM_RUNS_PER_CONFIG):
        print("=" * 80)
        print(f"RUN {run_idx + 1}/{NUM_RUNS_PER_CONFIG}  |  {CONFIG['id']}")
        print("=" * 80)

        # 0) Different seed per run (the same seeds are reused for every configuration)
        seed_value = 42 + run_idx
        set_seed(seed_value)

        # 1) Reinitialize the model from the pretrained checkpoint for every run
        model = AutoModelForSequenceClassification.from_pretrained(
            CONFIG["model_name"],
            num_labels=2,
        )
        model.to(device)

        # 2) Temporary directories for this run's checkpoints / logs
        run_output_dir = f"../results/{CONFIG['id']}/run_{run_idx + 1}"
        run_logging_dir = f"../logs/{CONFIG['id']}/run_{run_idx + 1}"

        # 3) TrainingArguments
        # NOTE(review): with eval_strategy="no" and save_strategy="no",
        # save_total_limit and metric_for_best_model presumably have no effect;
        # confirm against the installed transformers version before relying on them.
        training_args = TrainingArguments(
            output_dir=run_output_dir,
            learning_rate=CONFIG["learning_rate"],
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=CONFIG["num_epochs"],
            logging_dir=run_logging_dir,
            report_to=[],
            seed=seed_value,
            eval_strategy="no",
            save_strategy="no",
            save_total_limit=1,
            metric_for_best_model="eval_f1",
            logging_strategy="epoch",
        )

        # 4) Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        # 5) Training (the model at the end is the one from the last epoch)
        trainer.train()
        print("Training finished.")

        # 6) Evaluate on the validation split only, with the last-epoch model
        val_results = trainer.evaluate()

        pretty_print_metrics("Validation", val_results)
        print()

        # 7) Save this run's (last-epoch) model and tokenizer in a stable folder
        final_model_dir = os.path.join("../models", CONFIG["id"], f"run_{run_idx + 1}")
        os.makedirs(final_model_dir, exist_ok=True)

        trainer.save_model(final_model_dir)
        tokenizer.save_pretrained(final_model_dir)

        print(f"üíæ Last epoch model for this run: {final_model_dir}")

        # 7.1) Delete the temporary results folder to keep the workspace clean
        shutil.rmtree(run_output_dir, ignore_errors=True)

        # 8) Append to the CSV right after the run, so finished runs survive a later crash
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

        row = {
            "timestamp": timestamp,
            "experiment_id": CONFIG["id"],
            "run_index": run_idx + 1,
            "model_name": CONFIG["model_name"],
            "machine_id": MACHINE_ID,
            "learning_rate": CONFIG["learning_rate"],
            "num_epochs": CONFIG["num_epochs"],
            "val_accuracy": val_results.get("eval_accuracy"),
            "val_precision": val_results.get("eval_precision"),
            "val_recall": val_results.get("eval_recall"),
            "val_f1": val_results.get("eval_f1"),
        }

        with open(log_csv_path, "a", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=csv_fieldnames)
            writer.writerow(row)

        print("üìù Logged validation metrics to experiment_results.csv")

        # 9) Human-readable TXT log, one file per run
        config_logs_dir = os.path.join(txt_logs_root, CONFIG["id"])
        os.makedirs(config_logs_dir, exist_ok=True)

        txt_path = os.path.join(config_logs_dir, f"run_{run_idx + 1}.txt")
        with open(txt_path, "w") as f:
            f.write(f"Timestamp: {timestamp}\n")
            f.write(f"Experiment ID: {CONFIG['id']}\n")
            f.write(f"Run index: {run_idx + 1}\n")
            f.write(f"Model: {CONFIG['model_name']}\n")
            f.write(f"Machine: {MACHINE_ID}\n")
            f.write(f"Learning rate: {CONFIG['learning_rate']}\n")
            f.write(f"Epochs: {CONFIG['num_epochs']}\n")

            f.write("Validation metrics:\n")
            f.write(f"  Accuracy:  {val_results.get('eval_accuracy')}\n")
            f.write(f"  Precision: {val_results.get('eval_precision')}\n")
            f.write(f"  Recall:    {val_results.get('eval_recall')}\n")
            f.write(f"  F1:        {val_results.get('eval_f1')}\n")

        print(f"üìÑ Saved TXT log to: {txt_path}")
        print(f"‚úÖ Finished run {run_idx + 1}/{NUM_RUNS_PER_CONFIG} for {CONFIG['id']}\n")

        # 10) Release this run's model / trainer before the next run builds new ones,
        #     so memory does not pile up across the whole experiment grid.
        del trainer, model
        gc.collect()
        if device.type == "cuda":
            torch.cuda.empty_cache()


In [None]:
import pandas as pd

# Read back every run logged so far
results = pd.read_csv("../experiment_results.csv")

# One summary row per configuration (id plus its hyperparameters)
group_cols = ["experiment_id", "model_name", "learning_rate", "num_epochs"]

# Per configuration: number of distinct run indices, and mean / std of the
# validation accuracy and validation F1 over the logged rows
summary = (
    results
    .groupby(group_cols)
    .agg(**{
        "runs": ("run_index", "nunique"),
        "val_accuracy_mean": ("val_accuracy", "mean"),
        "val_accuracy_std": ("val_accuracy", "std"),
        "val_f1_mean": ("val_f1", "mean"),
        "val_f1_std": ("val_f1", "std"),
    })
    .reset_index()
    .sort_values(["experiment_id"])
)

summary

In [None]:
import matplotlib.pyplot as plt

def plot_mean_metric(summary_df, column, ylabel, title):
    """Draw a bar plot of one aggregated metric column per configuration.

    Args:
        summary_df: DataFrame with an "experiment_id" column and ``column``.
        column: Name of the aggregated metric column to plot.
        ylabel: Label of the y axis.
        title: Figure title.
    """
    plt.figure(figsize=(8, 4))
    plt.bar(summary_df["experiment_id"], summary_df[column])
    plt.xticks(rotation=45)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    plt.show()


# Mean validation F1 per configuration. Only validation metrics are logged and
# aggregated into `summary`, so there is no "test_f1_mean" column to plot.
plot_mean_metric(
    summary,
    "val_f1_mean",
    "Mean Validation F1",
    "Mean Validation F1 score per configuration",
)

# Mean validation accuracy per configuration
plot_mean_metric(
    summary,
    "val_accuracy_mean",
    "Mean Validation Accuracy",
    "Mean Validation accuracy per configuration",
)

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import os

# 1) Load all the logged results
results = pd.read_csv("../experiment_results.csv")
if results.empty:
    raise ValueError("experiment_results.csv contains no runs; run the training cell first.")

# 2) Choose the configuration to evaluate from the validation results: the one
#    with the highest mean validation F1 across its runs. This avoids relying on
#    the training loop's leftover CONFIG variable, which is simply the last
#    configuration trained and is undefined when the training cell was skipped.
mean_val_f1 = results.groupby("experiment_id")["val_f1"].mean()
config_id = mean_val_f1.idxmax()
config_rows = results[results["experiment_id"] == config_id]

# 3) Within that configuration, choose the run with the best validation F1
best_row = config_rows.sort_values("val_f1", ascending=False).iloc[0]
best_run = int(best_row["run_index"])

print(f"üèÜ Best run for {config_id} based on validation F1:")
display(best_row)

# 4) Load that run's saved model
best_model_dir = os.path.join("../models", config_id, f"run_{best_run}")
if not os.path.isdir(best_model_dir):
    # The CSV may list runs whose model folder is not present on this machine.
    raise FileNotFoundError(
        f"No saved model at {best_model_dir}; the CSV lists this run but its model "
        "folder is missing on this machine."
    )
print(f"\nLoading best model from: {best_model_dir}")

best_model = AutoModelForSequenceClassification.from_pretrained(best_model_dir)
best_model.to(device)

# 5) Trainer used only to evaluate on the test split (no training dataset is passed)
test_args = TrainingArguments(
    output_dir="./tmp_best_eval",
    per_device_eval_batch_size=BATCH_SIZE,
    report_to=[],
)

best_trainer = Trainer(
    model=best_model,
    args=test_args,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

test_results = best_trainer.evaluate()

print("\nüìä FINAL TEST METRICS (single evaluation for this configuration):")
print(f"  Accuracy : {test_results.get('eval_accuracy', 0):.4f}")
print(f"  Precision: {test_results.get('eval_precision', 0):.4f}")
print(f"  Recall   : {test_results.get('eval_recall', 0):.4f}")
print(f"  F1-score : {test_results.get('eval_f1', 0):.4f}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load a fine-tuned model + tokenizer from disk.
# The training cell saves models under ../models/<experiment id>/run_<n>, so the
# legacy folder below only exists if it was created outside this notebook. When
# it is absent, fall back to the best run selected in the test-evaluation cell.
model_path = "../models/distilbert_finetuned"
if not os.path.isdir(model_path):
    model_path = best_model_dir
print(f"Loading model for sample predictions from: {model_path}")

loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Inference mode (eval() turns off training-only behaviour such as dropout)
loaded_model.eval()

# Label mapping: 0 = Fake, 1 = Real (our convention)
label_names = ["Fake", "Real"]

# Example news texts (you can change these to whatever you like) (ChatGPT's idea)
sample_texts = [
    "Government announces new education reform to support low-income students.",
    "Shocking! Scientists prove that drinking only coffee for a week makes you immortal.",
    "Major tech company releases open-source AI model for medical diagnosis.",
    "Experts claim that the moon will crash into Earth next year according to secret documents.",
]

for text in sample_texts:
    # Tokenize with the same truncation / padding settings used for training
    inputs = loaded_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Forward pass without gradient tracking; the predicted class is the argmax logit
    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=-1).item()

    print("Text:", text)
    print("Prediction:", label_names[predicted_class_id])
    print("-" * 80)

In [None]:
# Take the first training row of each class as a known real / known fake example
true_example = train_df.loc[train_df["label"] == 1, "text_full"].iloc[0]
real_example = true_example
fake_example = train_df.loc[train_df["label"] == 0, "text_full"].iloc[0]

def predict_text(text):
    """Classify a single text with ``loaded_model`` and return its label name.

    The text is tokenized with ``loaded_tokenizer`` (truncated / padded to
    MAX_LENGTH) and the class with the largest logit is looked up in
    ``label_names``.
    """
    encoded = loaded_tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        return_tensors="pt",
    )
    # No gradients are needed for inference
    with torch.no_grad():
        logits = loaded_model(**encoded).logits
    return label_names[torch.argmax(logits, dim=-1).item()]

print("REAL example pred:", predict_text(real_example))
print("FAKE example pred:", predict_text(fake_example))