# 📰 Fake News Detection v2 - DistilBERT Training

This notebook fine-tunes a pretrained DistilBERT model (`distilbert-base-uncased`) to classify news articles as fake or real, using your own fake news dataset.

**Instructions:**
1. Click `Runtime` → `Change runtime type` → Select `T4 GPU`
2. Upload your `Fake.csv` and `True.csv` files when prompted
3. Run all cells
4. Download the trained model at the end

In [None]:
# Step 1: Install dependencies
!pip install -q transformers datasets accelerate

In [None]:
# Step 2: Upload your CSV files
import os

from google.colab import files

print("üì§ Upload Fake.csv and True.csv files:")
uploaded = files.upload()

# Fail here, with a clear message, instead of later inside pd.read_csv.
# The check is made on disk (not on `uploaded`) so files that are already
# present from an earlier upload in the same runtime are still accepted.
# NOTE(review): Colab appears to rename an upload whose name already exists
# (e.g. "Fake (1).csv") - confirm, and delete stale copies before re-uploading.
missing_files = [name for name in ('Fake.csv', 'True.csv') if not os.path.exists(name)]
if missing_files:
    raise FileNotFoundError(
        f"Missing required file(s): {', '.join(missing_files)}. "
        f"Uploaded in this run: {sorted(uploaded) or 'nothing'}"
    )

In [None]:
# Step 3: Load and prepare data
import pandas as pd
from sklearn.model_selection import train_test_split

# Label convention used by the rest of the notebook: 0 = fake, 1 = real.
# .assign() returns a new frame, so re-running this cell is idempotent.
fake_df = pd.read_csv('Fake.csv').assign(label=0)
true_df = pd.read_csv('True.csv').assign(label=1)

df = pd.concat([fake_df, true_df], ignore_index=True)

# The later cells read the 'text' column; fail early if it is absent.
if 'text' not in df.columns:
    raise KeyError(f"Expected a 'text' column, found: {list(df.columns)}")

# Remove rows the tokenizer cannot use (missing or whitespace-only text).
df = df.dropna(subset=['text'])
df = df[df['text'].astype(str).str.strip() != '']

# Remove exact duplicate articles so the same text cannot end up in both the
# training and the test split (which would inflate the reported metrics).
df = df.drop_duplicates(subset=['text'])

# Shuffle with a fixed seed for reproducibility.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the counts of the cleaned frame, not of the raw CSV files.
label_counts = df['label'].value_counts()
print(f"‚úÖ Total samples: {len(df):,}")
print(f"   Fake: {label_counts.get(0, 0):,} | Real: {label_counts.get(1, 0):,}")

In [None]:
# Step 4: Create Hugging Face Dataset
from datasets import Dataset

# Stratified 80/20 split so both splits keep the same fake/real ratio.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# After the split each frame has a non-contiguous index; preserve_index=False
# stops from_pandas from storing it as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['text', 'label']], preserve_index=False)

print(f"‚úÖ Train: {len(train_dataset):,} | Test: {len(test_dataset):,}")

In [None]:
# Step 5: Load tokenizer and tokenize
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Encode the 'text' field of a batch with the notebook's tokenizer.

    Texts are truncated to at most 512 tokens and padded to exactly 512,
    so every encoded example has the same length.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=512)
    return tokenizer(batch['text'], **encoding_options)


# Columns exposed as torch tensors once tokenization is done.
tensor_columns = ['input_ids', 'attention_mask', 'label']

# Tokenize in batches of 1000 rows, then switch each split to torch format.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1000)
train_dataset.set_format('torch', columns=tensor_columns)

test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1000)
test_dataset.set_format('torch', columns=tensor_columns)

print("‚úÖ Tokenization complete")

In [None]:
# Step 6: Load model
from transformers import DistilBertForSequenceClassification

# Store human-readable class names in the model config so the saved model is
# self-describing (otherwise the classes are exported as LABEL_0 / LABEL_1).
# The mapping follows the notebook's labelling: 0 = fake, 1 = real.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    id2label={0: 'FAKE', 1: 'REAL'},
    label2id={'FAKE': 0, 'REAL': 1},
)

print("‚úÖ Model loaded")

In [None]:
# Step 7: Training setup
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as
            the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision/recall/F1 use binary averaging, i.e. they score the
        positive class (label 1 with scikit-learn's default `pos_label`).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: if the model predicts no positives at all (possible
    # early in training) report 0 instead of emitting an undefined-metric
    # warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

import torch

# fp16 mixed precision needs a CUDA device; enabling it unconditionally makes
# this cell fail on a CPU-only runtime (e.g. when the GPU was not selected).
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    # After training, reload the checkpoint with the highest eval F1.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=use_fp16,
    # Disable external experiment trackers (W&B, TensorBoard, ...).
    report_to='none',
)

# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is
# selected on the same data the final metrics are reported on; those metrics
# are therefore optimistic. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Step 8: TRAIN! (~15-20 minutes with GPU)
# Runs the fine-tuning loop configured on `trainer`.
print("üèãÔ∏è Starting training...")
train_output = trainer.train()  # kept so the training summary can be inspected later
print("‚úÖ Training complete!")

In [None]:
# Step 9: Evaluate
results = trainer.evaluate()

# (heading, key in the dict returned by trainer.evaluate()) for each metric.
metric_rows = (
    ('Accuracy:', 'eval_accuracy'),
    ('F1 Score:', 'eval_f1'),
    ('Precision:', 'eval_precision'),
    ('Recall:', 'eval_recall'),
)

print("\nüìä Final Results:")
for heading, key in metric_rows:
    # Headings are left-aligned to a common width so the values line up.
    print(f"   {heading:<10} {results[key]:.4f}")

In [None]:
# Step 10: Save model
# The model and the tokenizer are written to the same directory so that the
# folder can be loaded back as one unit.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fake_news_distilbert')
print("‚úÖ Model saved to ./fake_news_distilbert")

In [None]:
# Step 11: Test prediction
import torch

def predict(text):
    """Classify a single text with the notebook's `model` and `tokenizer`.

    Args:
        text: The article text to classify; it is truncated to 512 tokens.

    Returns:
        Tuple `(label, confidence)`: label is 'REAL' when class 1 has the
        higher probability and 'FAKE' otherwise (including ties);
        confidence is the larger of the two softmax probabilities.
    """
    # Put the model in inference mode so dropout is disabled; without this
    # the result depends on whether an evaluation ran before this call.
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        # Move the encoded inputs to whatever device the model lives on.
        outputs = model(**inputs.to(model.device))
        probs = torch.softmax(outputs.logits, dim=1)[0]
    label = 'REAL' if probs[1] > probs[0] else 'FAKE'
    return label, probs.max().item()

# Smoke test: classify one sample headline and show the predicted label
# together with the model's confidence as a percentage.
test_text = "WASHINGTON (Reuters) - The Senate passed new legislation on healthcare reform."
prediction = predict(test_text)
pred, conf = prediction
print(f"\nüîç Test: {pred} ({conf:.1%} confidence)")

In [None]:
# Step 12: Download trained model
import shutil

# Imported here as well so this cell still works when it is run on its own
# (e.g. after a runtime restart, when the saved folder is still on disk).
from google.colab import files

# Build fake_news_distilbert.zip from scratch. Unlike `zip -r` on an existing
# archive, this replaces the file, so no stale entries from an earlier run
# survive. base_dir keeps the 'fake_news_distilbert/' folder inside the zip.
shutil.make_archive(
    'fake_news_distilbert',
    'zip',
    root_dir='.',
    base_dir='fake_news_distilbert',
)
files.download('fake_news_distilbert.zip')
print("\nüì• Download started! Extract to your project folder.")