In [2]:
import os
import pandas as pd
import tqdm
from tqdm.auto import tqdm

# For Hugging Face datasets + transformers
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score

import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

PyTorch version: 2.7.1+cu118
CUDA available: True


### Load the same splits from TF-IDF

In [3]:
train_df = pd.read_csv("../data/train.csv")
val_df = pd.read_csv("../data/val.csv")
test_df = pd.read_csv("../data/test.csv")

# Ensure no NaNs in text or labels
for df in (train_df, val_df, test_df):
    df["content"] = df["content"].fillna("")
    df["label"] = df["label"].fillna(-1).astype(int)

train_df.head()

Unnamed: 0,title,text,subject,date,label,content,word_count,clean_content
0,FANTASTIC! TRUMP BUDGET DIRECTOR Rips Into Rep...,WOW! THIS IS FANTASTIC! WE HIGHLY RECOMMEND TH...,politics,"May 24, 2017",0,FANTASTIC! TRUMP BUDGET DIRECTOR Rips Into Rep...,119,fantastic! trump budget director rips into rep...
1,Trump Just Got Ripped To SHREDS After Blaming...,"As you know, Donald Trump put aside his disdai...",News,"January 7, 2017",0,Trump Just Got Ripped To SHREDS After Blaming...,1150,trump just got ripped to shreds after blaming ...
2,GLOBAL CLIMATE CHANGE LIARS Ignore Truth About...,Global Climate cooling warming change frauds a...,Government News,"Apr 22, 2016",0,GLOBAL CLIMATE CHANGE LIARS Ignore Truth About...,878,global climate change liars ignore truth about...
3,WILL VILE LEFTISTS Turn Democrats Away?…WATCH ...,A group of mostly pro-life college students ca...,left-news,"Feb 24, 2017",0,WILL VILE LEFTISTS Turn Democrats Away?…WATCH ...,287,will vile leftists turn democrats away? watch ...
4,No-confidence vote against Pennsylvania approv...,"HARRISBURG, Pa. (Reuters) - The city council o...",politicsNews,"January 20, 2016",1,No-confidence vote against Pennsylvania approv...,406,no confidence vote against pennsylvania approv...


### Convert to Hugging Face Dataset

In [4]:
# Keep only the needed columns
train_ds = Dataset.from_pandas(train_df[["content", "label"]], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[["content", "label"]], preserve_index=False)
test_ds = Dataset.from_pandas(test_df[["content", "label"]], preserve_index=False)

train_ds, val_ds, test_ds

(Dataset({
     features: ['content', 'label'],
     num_rows: 31428
 }),
 Dataset({
     features: ['content', 'label'],
     num_rows: 6735
 }),
 Dataset({
     features: ['content', 'label'],
     num_rows: 6735
 }))

### Load tokenizer & model

In [5]:
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenization function

In [6]:
max_length = 512  # BERT max sequence length

def tokenize_function(batch):
    return tokenizer(
        batch["content"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

train_tokenized = train_ds.map(tokenize_function, batched=True)
val_tokenized = val_ds.map(tokenize_function, batched=True)
test_tokenized = test_ds.map(tokenize_function, batched=True)

# Remove the original text column to keep things clean
train_tokenized = train_tokenized.remove_columns(["content"])
val_tokenized = val_tokenized.remove_columns(["content"])
test_tokenized = test_tokenized.remove_columns(["content"])

train_tokenized[0]

Map: 100%|██████████| 31428/31428 [00:09<00:00, 3435.21 examples/s]
Map: 100%|██████████| 6735/6735 [00:02<00:00, 3029.60 examples/s]
Map: 100%|██████████| 6735/6735 [00:02<00:00, 3010.29 examples/s]


{'label': 0,
 'input_ids': [101,
  10392,
  999,
  8398,
  5166,
  2472,
  10973,
  2015,
  2046,
  6398,
  11242,
  19044,
  7659,
  1024,
  1523,
  2057,
  1521,
  2128,
  2025,
  2183,
  2000,
  2079,
  2070,
  1997,
  1996,
  4689,
  2477,
  1996,
  2627,
  3447,
  2106,
  1524,
  1031,
  2678,
  1033,
  10166,
  999,
  2023,
  2003,
  10392,
  999,
  2057,
  3811,
  16755,
  1996,
  2972,
  2678,
  1024,
  2436,
  1997,
  2968,
  1998,
  26178,
  1006,
  18168,
  2497,
  1007,
  2472,
  10872,
  14163,
  22144,
  5420,
  22106,
  1996,
  3252,
  1010,
  7848,
  2015,
  1998,
  5682,
  1997,
  1996,
  8398,
  3447,
  10807,
  2095,
  2760,
  5166,
  1996,
  26457,
  5166,
  1996,
  2190,
  2112,
  1997,
  2023,
  2307,
  2739,
  3034,
  2003,
  2043,
  1037,
  6398,
  5176,
  2055,
  7659,
  2000,
  4785,
  2671,
  3454,
  1024,
  2012,
  1996,
  2459,
  1024,
  4002,
  2928,
  10872,
  14163,
  22144,
  5420,
  10973,
  2015,
  2046,
  1996,
  6398,
  1998,
  2009,
  1055,
  2074,

### Metrics function for Trainer

In [7]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)

    return {
        "accuracy": acc,
        "f1": f1,
    }

### TrainingArguments

In [11]:
batch_size = 8

training_args = TrainingArguments(
    output_dir="../models/distilbert_fake_news",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    fp16=True,
)

### Create the Trainer

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Train

In [13]:
import transformers
transformers.logging.set_verbosity_info()

train_result = trainer.train()
train_result

***** Running training *****
  Num examples = 31,428
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11,787
  Number of trainable parameters = 66,955,010


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0,0.002771,0.999703,0.999689
2,0.0,0.001161,0.999852,0.999844
3,0.0,0.001052,0.999852,0.999844



***** Running Evaluation *****
  Num examples = 6735
  Batch size = 8
Saving model checkpoint to ../models/distilbert_fake_news\checkpoint-3929
Configuration saved in ../models/distilbert_fake_news\checkpoint-3929\config.json
Model weights saved in ../models/distilbert_fake_news\checkpoint-3929\model.safetensors
tokenizer config file saved in ../models/distilbert_fake_news\checkpoint-3929\tokenizer_config.json
Special tokens file saved in ../models/distilbert_fake_news\checkpoint-3929\special_tokens_map.json

***** Running Evaluation *****
  Num examples = 6735
  Batch size = 8
Saving model checkpoint to ../models/distilbert_fake_news\checkpoint-7858
Configuration saved in ../models/distilbert_fake_news\checkpoint-7858\config.json
Model weights saved in ../models/distilbert_fake_news\checkpoint-7858\model.safetensors
tokenizer config file saved in ../models/distilbert_fake_news\checkpoint-7858\tokenizer_config.json
Special tokens file saved in ../models/distilbert_fake_news\checkpoint

TrainOutput(global_step=11787, training_loss=0.004434759405527627, metrics={'train_runtime': 4736.5967, 'train_samples_per_second': 19.905, 'train_steps_per_second': 2.488, 'total_flos': 1.2489556214882304e+16, 'train_loss': 0.004434759405527627, 'epoch': 3.0})

### Evaluate on validation & test

In [14]:
print("Validation metrics:")
val_metrics = trainer.evaluate(eval_dataset=val_tokenized)
val_metrics


***** Running Evaluation *****
  Num examples = 6735
  Batch size = 8


Validation metrics:


{'eval_loss': 0.0011605075560510159,
 'eval_accuracy': 0.9998515219005196,
 'eval_f1': 0.9998444064104559,
 'eval_runtime': 76.0759,
 'eval_samples_per_second': 88.53,
 'eval_steps_per_second': 11.068,
 'epoch': 3.0}

In [15]:
print("Test metrics:")
test_metrics = trainer.evaluate(eval_dataset=test_tokenized)
test_metrics


***** Running Evaluation *****
  Num examples = 6735
  Batch size = 8


Test metrics:


{'eval_loss': 0.001472154282964766,
 'eval_accuracy': 0.9997030438010394,
 'eval_f1': 0.9996886674968867,
 'eval_runtime': 78.3792,
 'eval_samples_per_second': 85.928,
 'eval_steps_per_second': 10.743,
 'epoch': 3.0}

### Save model & tokenizer for later use

In [16]:
save_dir = "../models/distilbert_fake_news"

trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("Saved DistilBERT model and tokenizer to:", save_dir)

Saving model checkpoint to ../models/distilbert_fake_news
Configuration saved in ../models/distilbert_fake_news\config.json
Model weights saved in ../models/distilbert_fake_news\model.safetensors
tokenizer config file saved in ../models/distilbert_fake_news\tokenizer_config.json
Special tokens file saved in ../models/distilbert_fake_news\special_tokens_map.json
tokenizer config file saved in ../models/distilbert_fake_news\tokenizer_config.json
Special tokens file saved in ../models/distilbert_fake_news\special_tokens_map.json


Saved DistilBERT model and tokenizer to: ../models/distilbert_fake_news
