In [1]:
# ======== CONFIGURATION CELL ========

# To try a different configuration: change the values in this cell, restart the kernel, and re-run all subsequent cells

# Hyperparameters for this run; the cells below read these values from CONFIG.
CONFIG = dict(
    model_name="distilbert-base-uncased",
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=3,
    max_length=256,
)
print("Current configuration:", CONFIG)

Current configuration: {'model_name': 'distilbert-base-uncased', 'learning_rate': 2e-05, 'batch_size': 16, 'num_epochs': 3, 'max_length': 256}


In [2]:
import numpy as np
import pandas as pd
import os

# Sanity check: show what is inside the data folder, as seen from the notebook's working directory.
data_dir_entries = os.listdir("../data")
print(data_dir_entries)

['Fake.csv', 'val.csv', 'test.csv', 'merged_dataset.csv', 'README.md', 'train.csv', 'True.csv']


In [3]:
# Read the three pre-split CSV files (train, validation, test — in that order).
split_paths = ("../data/train.csv", "../data/val.csv", "../data/test.csv")
train_df, val_df, test_df = map(pd.read_csv, split_paths)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,title,text,subject,date,label,text_full
0,SocGen says no wrongdoing in handling of Natio...,PARIS (Reuters) - French bank Societe Generale...,worldnews,"November 22, 2017",1,SocGen says no wrongdoing in handling of Natio...
1,North Carolina governor concedes election to D...,"WINSTON-SALEM, N.C. (Reuters) - North Carolina...",politicsNews,"December 5, 2016",1,North Carolina governor concedes election to D...
2,TRUMP FEVER! W. VA Dem Senator Says He Won’t V...,Civil political discourse took a beating in We...,left-news,"Aug 7, 2017",0,TRUMP FEVER! W. VA Dem Senator Says He Won’t V...
3,New York vows to sue Trump over immigrant chil...,(Reuters) - New York and Washington state on M...,politicsNews,"September 4, 2017",1,New York vows to sue Trump over immigrant chil...
4,Orlando killer expressed support for multiple ...,"ORLANDO, Fla. (Reuters) - Orlando nightclub ki...",politicsNews,"June 12, 2016",1,Orlando killer expressed support for multiple ...


In [4]:
# Model inputs come from the "text_full" column (cast to str); targets from "label".
X_train_text, X_val_text, X_test_text = (
    frame["text_full"].astype(str).tolist() for frame in (train_df, val_df, test_df)
)
y_train, y_val, y_test = (
    frame["label"].tolist() for frame in (train_df, val_df, test_df)
)

# Number of examples in each split.
len(X_train_text), len(X_val_text), len(X_test_text)

(35918, 4490, 4490)

In [5]:
################################################### NOTES ###################################################
# We use our preprocessed CSVs → X_train_text, y_train, etc.
# No more transformed_text_title_combined or Kaggle paths.
# DistilBERT sees: text_full (title + body) and label (0 = fake, 1 = real).
# We use train + val for training/validation; test stays untouched for final evaluation.
#############################################################################################################

import os
# Switch off the Weights & Biases integration before transformers is imported below.
# NOTE(review): the log emitted by this cell reports WANDB_DISABLED as deprecated
# (to be removed in v5) and points to `report_to` instead — consider migrating.
os.environ["WANDB_DISABLED"] = "true"  # disable Weights & Biases spam

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import torch

# Report which device is available: CUDA when torch can see a GPU, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using device: {device}')

# 1. Tokenizer for the checkpoint named in CONFIG
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

# 2. Wrap each split's texts and labels in a Hugging Face Dataset
train_ds, val_ds, test_ds = (
    Dataset.from_dict({"text": texts, "label": labels})
    for texts, labels in (
        (X_train_text, y_train),
        (X_val_text, y_val),
        (X_test_text, y_test),
    )
)

# 3. Tokenization
def tokenize_function(batch):
    """Tokenize a batch of examples, truncating each to CONFIG["max_length"] tokens.

    Args:
        batch: Mapping with a "text" entry holding the raw strings to encode
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer's encoding for the batch. Sequences are truncated but
        deliberately NOT padded here: padding is left to the
        `DataCollatorWithPadding` given to the Trainer, which pads each batch
        only up to its own longest sequence. Padding every example to
        `max_length` at this stage would make that dynamic padding a no-op and
        force every forward pass to process full-length inputs.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=CONFIG["max_length"],
    )

# Apply the tokenizer to every split in batched mode (train, then val, then test).
tokenized_train, tokenized_val, tokenized_test = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_ds, val_ds, test_ds)
)

# 4. Pre-trained checkpoint from CONFIG with a two-class head (0 = Fake, 1 = Real)
model = AutoModelForSequenceClassification.from_pretrained(CONFIG["model_name"], num_labels=2)

# 5. Collator that pads the examples of a batch when the batch is assembled
data_collator = DataCollatorWithPadding(tokenizer)

# 6. Training arguments (tunable hyperparameters are read from CONFIG)
training_args = TrainingArguments(
    # Output locations
    output_dir="../results",
    logging_dir="../logs",
    # Optimisation settings
    num_train_epochs=CONFIG["num_epochs"],
    learning_rate=CONFIG["learning_rate"],
    weight_decay=0.01,
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    # Evaluate, log and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# 7. Trainer – IMPORTANT: use validation set as eval_dataset, not test
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `processing_class` is the current name of the argument formerly spelled
    # `tokenizer`; the old keyword is deprecated in recent transformers releases
    # and produces a warning when the Trainer is constructed.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# 8. Train (fine-tuning)
trainer.train()

# 9. Save fine-tuned model (the inference cell reloads model and tokenizer from this directory)
trainer.save_model("../models/distilbert_finetuned")

Using device: cpu


Map:   0%|          | 0/35918 [00:00<?, ? examples/s]

Map:   0%|          | 0/4490 [00:00<?, ? examples/s]

Map:   0%|          | 0/4490 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0115,0.001356




KeyboardInterrupt: 

In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Metric callback handed to the evaluation Trainer
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy, precision, recall and F1.

    Args:
        eval_pred: A `(logits, labels)` pair; the predicted class of each row
            is the argmax of its logits along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use `average="binary"`.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    prf_support = precision_recall_fscore_support(labels, predicted, average="binary")
    # The fourth element (support) is not reported.
    prf_scores = dict(zip(("precision", "recall", "f1"), prf_support[:3]))
    return {"accuracy": accuracy_score(labels, predicted), **prf_scores}

# Re-create a Trainer JUST for evaluation, using the trained model
eval_trainer = Trainer(
    model=model,                  # this is the fine-tuned model from Cell 5
    args=training_args,           # same TrainingArguments
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,   # default eval dataset = validation set
    # `processing_class` replaces the deprecated `tokenizer` keyword, which
    # produces a warning when the Trainer is constructed.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 1) Evaluate on validation set (uses the default eval_dataset given above)
val_results = eval_trainer.evaluate()
print("Validation results:")
print(val_results)

# 2) Evaluate on test set (explicitly pass tokenized_test)
test_results = eval_trainer.evaluate(eval_dataset=tokenized_test)
print("\nTest results:")
print(test_results)

  eval_trainer = Trainer(


Validation results:
{'eval_loss': 4.3559000914683565e-05, 'eval_model_preparation_time': 0.004, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 41.6157, 'eval_samples_per_second': 107.892, 'eval_steps_per_second': 6.752}





Test results:
{'eval_loss': 0.0003635552420746535, 'eval_model_preparation_time': 0.004, 'eval_accuracy': 0.9997772828507795, 'eval_precision': 1.0, 'eval_recall': 0.9995331465919701, 'eval_f1': 0.999766518795237, 'eval_runtime': 40.2702, 'eval_samples_per_second': 111.497, 'eval_steps_per_second': 6.978}


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the tokenizer and the fine-tuned model from the directory written by the training cell
model_path = "../models/distilbert_finetuned"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
# .eval() returns the module itself, so it can be chained onto the load
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

# Class index -> display name, following our convention: 0 = Fake, 1 = Real
label_names = ["Fake", "Real"]

# A few hand-written headlines to try the model on (edit freely) (ChatGPT's idea)
sample_texts = [
    "Government announces new education reform to support low-income students.",
    "Shocking! Scientists prove that drinking only coffee for a week makes you immortal.",
    "Major tech company releases open-source AI model for medical diagnosis.",
    "Experts claim that the moon will crash into Earth next year according to secret documents.",
]

for sample in sample_texts:
    # Encode the single headline as PyTorch tensors, truncated to the configured length
    encoded = loaded_tokenizer(
        sample,
        max_length=CONFIG["max_length"],
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    # Forward pass without gradient tracking; keep the index of the largest logit
    with torch.no_grad():
        class_id = loaded_model(**encoded).logits.argmax(dim=-1).item()

    print("Text:", sample)
    print("Prediction:", label_names[class_id])
    print("-" * 80)

Text: Government announces new education reform to support low-income students.
Prediction: Fake
--------------------------------------------------------------------------------
Text: Shocking! Scientists prove that drinking only coffee for a week makes you immortal.
Prediction: Fake
--------------------------------------------------------------------------------
Text: Major tech company releases open-source AI model for medical diagnosis.
Prediction: Fake
--------------------------------------------------------------------------------
Text: Experts claim that the moon will crash into Earth next year according to secret documents.
Prediction: Fake
--------------------------------------------------------------------------------


In [8]:
# Take the first training row of each class: label 1 (real) and label 0 (fake)
true_example = train_df.loc[train_df["label"] == 1, "text_full"].iloc[0]
real_example = true_example
fake_example = train_df.loc[train_df["label"] == 0, "text_full"].iloc[0]

def predict_text(text):
    """Classify one piece of text with the reloaded model and return its label name.

    The text is encoded with `loaded_tokenizer` (truncated to
    CONFIG["max_length"]), passed through `loaded_model` without gradient
    tracking, and the index of the largest logit is looked up in `label_names`.
    """
    encoded = loaded_tokenizer(
        text,
        max_length=CONFIG["max_length"],
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        class_id = loaded_model(**encoded).logits.argmax(dim=-1).item()
    return label_names[class_id]

# Check the model against one example of each class taken from the training data
for caption, example in (
    ("REAL example pred:", real_example),
    ("FAKE example pred:", fake_example),
):
    print(caption, predict_text(example))

REAL example pred: Real
FAKE example pred: Fake


In [None]:
import csv
import time
import os

# Prepare a summary of this experiment: when it ran, the hyperparameters from
# CONFIG, and the accuracy / F1 reported for the validation and test sets.
experiment_row = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "model_name": CONFIG["model_name"],
    "learning_rate": CONFIG["learning_rate"],
    "batch_size": CONFIG["batch_size"],
    "num_epochs": CONFIG["num_epochs"],
    "max_length": CONFIG["max_length"],
    "val_accuracy": val_results.get("eval_accuracy", None),
    "val_f1": val_results.get("eval_f1", None),
    "test_accuracy": test_results.get("eval_accuracy", None),
    "test_f1": test_results.get("eval_f1", None)
}

# Append the row to the shared experiment log.
log_path = "../experiment_results.csv"
file_exists = os.path.isfile(log_path)
# The header is needed when the file is missing OR present but empty (e.g. it
# was created by hand or truncated); otherwise the log would have no column names.
needs_header = (not file_exists) or os.path.getsize(log_path) == 0

# newline="" is what the csv module expects for files it writes; the encoding is
# pinned so the log does not depend on the platform default.
with open(log_path, "a", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=list(experiment_row))

    if needs_header:
        writer.writeheader()   # first write into this file: emit column names

    writer.writerow(experiment_row)

print("Experiment logged!")
print(experiment_row)