# Fine-tuning DistilBERT for Fake News Detection (Resource-Efficient & Robust)

This notebook demonstrates an improved pipeline to fine-tune DistilBERT on FakeNewsNet,
addressing data leakage, overfitting, and resource constraints.
Key improvements:
  - Publisher/source stripping & deduplication
  - Group-based train/test split to avoid overlap
  - Dynamic padding & FP16 for memory efficiency
  - Custom weighted loss & early stopping
  - Minimal epochs/batch sizes + gradient accumulation

In [33]:
# 1. Install dependencies (run once)
!pip install transformers datasets scikit-learn torch accelerate --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Using cached datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.7.0-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m266.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hUsing cached huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90

# 2. Imports

In [35]:
import re
import pandas as pd
import numpy as np
import torch
import warnings

# Suppress known FutureWarnings from HuggingFace and other libraries
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding,
    EarlyStoppingCallback
)

# 3. Load & label data

In [36]:
# Read the two input CSVs in a fixed order: fake articles first, real ones second.
df_fake, df_true = (
    pd.read_csv(csv_path)
    for csv_path in ("fake-news-net/Fake.csv", "fake-news-net/True.csv")
)

In [37]:
# Label convention used by the rest of the notebook: 1 = fake, 0 = real.
df_true["label"] = 0
df_fake["label"] = 1

# Stack fake rows on top of real rows and renumber the combined index.
df = pd.concat(objs=[df_fake, df_true], ignore_index=True)

# 4. Preprocessing: strip publisher metadata

In [38]:
# Publisher markers to strip:
#   * a parenthesised Reuters credit, e.g. "(Reuters)" or "(Thomson Reuters)"
#   * any run of capitals/spaces ending in a colon, e.g. "WASHINGTON:", "NEW YORK:"
# `[^)]*` (rather than `[^)]+`) lets the first alternative match the bare
# "(Reuters)" form, which has nothing between "(" and "Reuters".
pattern = r"\([^)]*Reuters\)|[A-Z ]+:"


def clean_text(text):
    """Remove publisher markers from ``text`` and trim surrounding whitespace.

    Args:
        text: Raw article text or title. Non-string values (for example the
            NaN pandas produces for an empty CSV cell) are treated as empty.

    Returns:
        The text with every match of ``pattern`` removed and leading/trailing
        whitespace stripped; ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on NaN/None; an empty string keeps the row usable.
        return ''
    return re.sub(pattern, '', text).strip()

# Run the same publisher-stripping cleanup over the article body and the headline.
df['text'] = df['text'].map(clean_text)
df['title'] = df['title'].map(clean_text)

# 5. Deduplicate

In [39]:
# Drop rows whose (title, text) pair repeats an earlier row, then renumber the
# surviving rows from 0 so positional and label-based indexing agree.
df = df.drop_duplicates(subset=['title', 'text'], ignore_index=True)

# 6. Extract publisher/source for grouping (approximate)


In [40]:
def extract_source(text):
    """Return the leading ``Name:`` prefix of ``text`` as a grouping key.

    A prefix counts only if it sits at the very start of ``text``, begins with
    a capital letter, and contains nothing but letters and spaces before the
    colon. The colon itself is not included in the result.

    Args:
        text: Article text to inspect.

    Returns:
        The prefix without its colon, or ``'UNKNOWN'`` when no prefix matches.
    """
    prefix_match = re.match(r"([A-Z][A-Za-z ]+):", text)
    if prefix_match is None:
        return 'UNKNOWN'
    return prefix_match.group(1)

# Derive one grouping key per row from the start of the article text.
# NOTE(review): df['text'] has already been through clean_text at this point,
# which removes all-caps "PREFIX:" spans, so few rows may still carry a prefix
# here. The 39009 / 93 row counts printed by the tokenization cell suggest most
# rows share a single group (likely 'UNKNOWN') — confirm with
# df['source'].value_counts() before relying on the group split.
df['source'] = df['text'].map(extract_source)

# 7. Group-based train/test split


In [41]:
# Hold out 20% of the source *groups* (test_size applies to groups, not rows),
# so that no source contributes rows to both sides of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(df, groups=df['source']))

# The splitter yields positional indices, so select rows by position.
train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

# Because whole groups are assigned to one side, a single dominant group can
# leave the test set with a tiny (or huge) share of the rows. Surface that
# instead of silently training and evaluating on a lopsided split.
held_out_share = len(test_df) / max(len(df), 1)
if not 0.05 <= held_out_share <= 0.5:
    warnings.warn(
        f"Group split put {len(test_df)} of {len(df)} rows "
        f"({held_out_share:.1%}) in the test set across "
        f"{df['source'].nunique()} source groups; check that 'source' is not "
        "dominated by a single value."
    )

# 8. Tokenizer & Dataset preparation


In [42]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [43]:
def tokenize_batch(batch):
    """Encode a batch of rows as (text, title) sequence pairs.

    Sequences are truncated to at most 512 tokens. No padding is applied here.

    Args:
        batch: Mapping with parallel ``'text'`` and ``'title'`` lists.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair of lists.
    """
    bodies, headlines = batch['text'], batch['title']
    return tokenizer(bodies, headlines, max_length=512, truncation=True)

In [44]:
# Wrap each pandas split as a HuggingFace `datasets.Dataset`, carrying over only
# the two text columns and the label.
from datasets import Dataset

train_ds, eval_ds = (
    Dataset.from_pandas(split_df[['text', 'title', 'label']])
    for split_df in (train_df, test_df)
)

In [45]:
# Tokenize the training split, then the evaluation split, in batches.
train_ds, eval_ds = (
    split.map(tokenize_batch, batched=True)
    for split in (train_ds, eval_ds)
)

Map: 100%|██████████| 39009/39009 [00:08<00:00, 4726.78 examples/s]
Map: 100%|██████████| 93/93 [00:00<00:00, 4240.17 examples/s]


In [46]:
# Have both datasets return only the token ids, attention mask and label,
# converted to PyTorch tensors.
eval_ds.set_format(columns=['input_ids', 'attention_mask', 'label'], type='torch')
train_ds.set_format(columns=['input_ids', 'attention_mask', 'label'], type='torch')

# 9. Compute class weights

In [47]:
# 'balanced' weights are computed from the label frequencies of the training
# split only; the result is stored as {position: weight}, in the sorted order
# of the classes returned by np.unique.
labels = train_df['label'].values
class_weights = dict(
    enumerate(
        compute_class_weight('balanced', classes=np.unique(labels), y=labels)
    )
)

# 10. Custom Trainer for weighted loss

In [48]:
import torch.nn as nn
class WeightedTrainer(Trainer):
    """Trainer variant that scores batches with class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``class_weights``
    mapping every time a loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping. Its label entry is removed before the
                forward pass, so the model receives only its input tensors.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted because recent ``Trainer`` versions
                pass it as a keyword argument; it is not used, the loss is the
                weighted mean over the batch.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` when
            ``return_outputs`` is true.
        """
        # DataCollatorWithPadding hands the batch over with the `label` column
        # renamed to `labels`; fall back to `label` for collators that keep it.
        label_key = 'labels' if 'labels' in inputs else 'label'
        labels = inputs.pop(label_key)
        outputs = model(**inputs)

        # Score in float32: the class weights come from NumPy as float64 and the
        # logits may be half precision under fp16, while cross-entropy needs
        # logits and weights to share one dtype.
        logits = outputs.logits.float()
        weight = torch.tensor(
            list(class_weights.values()), dtype=logits.dtype, device=logits.device
        )
        loss = nn.CrossEntropyLoss(weight=weight)(logits, labels)
        return (loss, outputs) if return_outputs else loss


# 11. Metrics

In [49]:
metric_names = ['accuracy', 'precision', 'recall', 'f1']


def compute_metrics(pred):
    """Turn raw evaluation predictions into classification metrics.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the true labels).

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``; the last
        three use scikit-learn's ``average='binary'`` setting.
    """
    y_true = pred.label_ids
    # The predicted class is the column with the highest score.
    y_pred = np.argmax(pred.predictions, axis=1)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision,
        recall=recall,
        f1=f_score,
    )

# 12. Training arguments (resource-efficient)

In [50]:
# Mixed precision is switched on only when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

# transformers renamed `evaluation_strategy` to `eval_strategy`. Use whichever
# name the installed TrainingArguments declares, so this cell runs on either
# side of the rename.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint on the same 500-step cadence so the best
    # checkpoint can be reloaded at the end of training.
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # 8 examples per step x 2 accumulation steps = 16 examples per optimizer
    # update on each device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    fp16=use_fp16,
    # Model selection: keep the checkpoint with the highest evaluation F1.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    seed=42,
    **{eval_strategy_key: 'steps'},
)

# 13. Initialize model & Trainer


In [51]:
# The base checkpoint carries no classification head, so the pre_classifier and
# classifier layers start from random weights and are learned during
# fine-tuning (the loader's message in this cell's output says the same).
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Batches are padded at collation time using the tokenizer's padding settings,
# which is why tokenize_batch itself does no padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [53]:
# NOTE(review): the output recorded for this cell is a TypeError about
# `dispatch_batches`, which looks like a transformers/accelerate version
# mismatch in the running kernel rather than a problem with these arguments —
# confirm after reinstalling and restarting the kernel.
trainer = WeightedTrainer(
    # What to train and how.
    args=training_args,
    model=model,
    # Data and batching.
    eval_dataset=eval_ds,
    train_dataset=train_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Evaluation: report metrics and stop after 2 evaluations without improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=compute_metrics,
)

TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'

# 14. Train & evaluate


In [None]:
# Run fine-tuning with the `trainer` configured in the previous cell. The call
# is left as the cell's last expression so its return value is displayed.
trainer.train()