In [1]:
import pandas as pd
import re
from pathlib import Path

import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

import torch
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_and_preprocess(true_path: Path, fake_path: Path) -> pd.DataFrame:
    df_true = pd.read_csv(true_path)
    df_fake = pd.read_csv(fake_path)
    df_true['label'] = 1
    df_fake['label'] = 0
    df = pd.concat([df_true, df_fake], ignore_index=True)
    df['content'] = (df['title'].fillna('') + ' ' + df['text'].fillna('')).str.lower()
    df['content'] = df['content'].str.replace(r'<[^>]+>', ' ', regex=True)
    df['content'] = df['content'].str.replace(r'http\S+|www\.\S+', ' ', regex=True)
    df['content'] = df['content'].str.replace(r'[^a-z0-9\s]', ' ', regex=True)
    df['content'] = df['content'].str.replace(r'\s+', ' ', regex=True).str.strip()
    df = df[df['content'].str.len() > 0].reset_index(drop=True)
    return df

In [3]:
# Baseline with XGBoost
def train_baseline(df: pd.DataFrame, model_path: Path):
    X_train, X_test, y_train, y_test = train_test_split(
        df['content'], df['label'], test_size=0.2, random_state=42
    )

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.8, max_features=20000)),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=6))
    ])

    pipeline.fit(X_train, y_train)

    preds = pipeline.predict(X_test)

    print("Baseline XGBoost Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))

    joblib.dump(pipeline, model_path / 'baseline_xgb.pkl')
    return pipeline

In [4]:
# Fine-tuning with DistilBERT
def train_transformer(df: pd.DataFrame, model_dir: Path):
    # Prepare dataset
    ds = Dataset.from_pandas(df[['content', 'label']])
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def tokenize_fn(batch):
        return tokenizer(batch['content'], padding=True, truncation=True)

    ds = ds.train_test_split(test_size=0.2)
    ds = ds.map(tokenize_fn, batched=True)
    ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=2
    )

    training_args = TrainingArguments(
        output_dir=str(model_dir / 'distilbert_fake_news'),
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir=str(model_dir / 'logs'),
        logging_steps=50,
    )

    accuracy = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(axis=-1)
        return accuracy.compute(predictions=preds, references=labels)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds['train'],
        eval_dataset=ds['test'],
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Save full pipeline as joblib
    model.save_pretrained(model_dir / 'distilbert_fake_news')
    tokenizer.save_pretrained(model_dir / 'distilbert_fake_news')
    joblib.dump((model, tokenizer), model_dir / 'distilbert_pipeline.pkl')

    return model


In [5]:
if __name__ == '__main__':
    DATA_DIR = Path(r'C:\Users\abdul\OneDrive\Documents\fake_news_detector')
    MODEL_DIR = Path('model')
    MODEL_DIR.mkdir(exist_ok=True)

    # Load
    df = load_and_preprocess(DATA_DIR / 'True.csv', DATA_DIR / 'Fake.csv')
    print(f"Dataset size after cleaning: {len(df)}")

    # Train baseline
    print("Training baseline XGBoost...")
    train_baseline(df, MODEL_DIR)

    # Fine-tune transformer
    print("Fine-tuning DistilBERT...")
    train_transformer(df, MODEL_DIR)

Dataset size after cleaning: 44889
Training baseline XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Baseline XGBoost Accuracy: 0.9982178658944085
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4677
           1       1.00      1.00      1.00      4301

    accuracy                           1.00      8978
   macro avg       1.00      1.00      1.00      8978
weighted avg       1.00      1.00      1.00      8978

Fine-tuning DistilBERT...
Fine-tuning DistilBERT...


Map: 100%|██████████| 35911/35911 [01:04<00:00, 553.24 examples/s]
Map: 100%|██████████| 35911/35911 [01:04<00:00, 553.24 examples/s]
Map: 100%|██████████| 8978/8978 [00:25<00:00, 354.02 examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
import joblib
xgb_pipe = joblib.load(r'C:\Users\abdul\OneDrive\Documents\fake_news_detector\model\baseline_xgb.pkl')

In [11]:
print(hasattr(xgb_pipe, 'predict_proba'))

True


In [13]:
# Example prediction using xgb_pipe and predict_proba
sample_text = "Trump is the President of USA"
if hasattr(xgb_pipe, 'predict_proba'):
    proba = xgb_pipe.predict_proba([sample_text])[0]
    print(f"Probabilities (Fake, Real): {proba}")
    label = 'Real' if proba[1] >= 0.5 else 'Fake'
    print(f"Prediction: {label}")
else:
    print("xgb_pipe does not have predict_proba method.")

Probabilities (Fake, Real): [9.9992335e-01 7.6637240e-05]
Prediction: Fake
