# 📰 Fake News Detection v2 - DistilBERT Training
## Using Combined Dataset (Original + COVID-19 Data)

**Training Time:** ~20 minutes with T4 GPU

### Instructions:
1. Click `Runtime` → `Change runtime type` → Select `T4 GPU`
2. Upload `combined_training_data.csv` when prompted
3. Run all cells
4. Download the trained model at the end

In [None]:
# Step 1: Install dependencies
# Notebook shell command (the leading `!` is not Python). It installs the
# `transformers` and `datasets` packages that later cells import, plus
# `accelerate` (not imported in this notebook — presumably needed by Trainer).
!pip install -q transformers datasets accelerate

In [None]:
# Step 2: Upload your combined dataset
from google.colab import files
print("üì§ Please upload 'combined_training_data.csv':")
# The result of the upload call is kept in `uploaded`.
# NOTE(review): the next cell reads 'combined_training_data.csv' by its literal
# name, so the uploaded file presumably has to keep exactly that filename.
uploaded = files.upload()

In [None]:
# Step 3: Load and prepare data
import pandas as pd
from sklearn.model_selection import train_test_split

# Load combined dataset
df = pd.read_csv('combined_training_data.csv')

# Clean data
# A row is only usable for training when it has both a text and a label.
# Checking `text` alone would let rows with a missing label reach the
# stratified split and the classifier.
df = df.dropna(subset=['text', 'label'])
# A column that contained NaN is float-typed even after the NaN rows are
# dropped; cast to int so the model receives integer class ids (0 = fake,
# 1 = real, matching the counts printed below).
df = df.assign(label=df['label'].astype(int))
df = df[df['text'].str.len() > 50]  # Keep substantial texts

print(f"‚úÖ Total samples: {len(df):,}")
print(f"   Fake (label=0): {(df['label'] == 0).sum():,}")
print(f"   Real (label=1): {(df['label'] == 1).sum():,}")

In [None]:
# Step 4: Create Hugging Face Dataset
from datasets import Dataset

# Hold out 20% of the rows for evaluation; stratifying on the label keeps the
# fake/real proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Only the text and its label are handed to the Hugging Face datasets. The
# index is reset first (presumably so the shuffled pandas index is not carried
# over as an extra column — TODO confirm).
model_columns = ['text', 'label']
train_dataset = Dataset.from_pandas(
    train_df[model_columns].reset_index(drop=True)
)
test_dataset = Dataset.from_pandas(
    test_df[model_columns].reset_index(drop=True)
)

print(f"‚úÖ Train: {len(train_dataset):,} | Test: {len(test_dataset):,}")

In [None]:
# Step 5: Load tokenizer and tokenize
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded rows have the same length.

    Args:
        batch: Mapping with a 'text' entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(batch['text'], **encode_options)

print("üîÑ Tokenizing... (this may take a few minutes)")

# Run `tokenize` over both splits, 1000 rows per call.
map_options = {'batched': True, 'batch_size': 1000}
train_dataset = train_dataset.map(tokenize, **map_options)
test_dataset = test_dataset.map(tokenize, **map_options)

# Expose only the model inputs and the label, formatted as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format('torch', columns=tensor_columns)
test_dataset.set_format('torch', columns=list(tensor_columns))

print("‚úÖ Tokenization complete!")

In [None]:
# Step 6: Load model
from transformers import DistilBertForSequenceClassification

# Start from the pretrained uncased DistilBERT checkpoint and attach a
# two-class sequence-classification head (label 0 = fake, label 1 = real).
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

print("‚úÖ Model loaded!")

In [None]:
# Step 7: Training setup
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute classification metrics from an evaluation prediction object.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'. The
        last three are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The helper returns (precision, recall, f1, support); support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

# torch is imported here because this cell runs before the later cell that
# imports it, and the fp16 switch below needs to probe for a GPU.
import torch

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',            # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,       # restore the best checkpoint after training
    metric_for_best_model='f1',        # "best" = highest F1 from compute_metrics
    # Mixed precision for faster training. It is only valid on a CUDA device,
    # so fall back to full precision instead of failing when the runtime has
    # no GPU attached.
    fp16=torch.cuda.is_available(),
    report_to='none',                  # no external experiment tracker
)

# Wire the model, the training configuration, both dataset splits and the
# metric function into a single Trainer.
# NOTE(review): `eval_dataset` is the held-out test split, and `training_args`
# picks the best checkpoint by F1 on that same split, so the final evaluation
# is not fully independent of model selection — consider a separate
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

print("‚úÖ Training setup complete!")

In [None]:
# Step 8: TRAIN! (~15-20 minutes with T4 GPU)
# Announce the run, then hand control to the Trainer configured above.
print(
    "üèãÔ∏è Starting training on COMBINED dataset...",
    "   Original (2016-2017) + COVID-19 (2020) data",
    "   Expected time: ~15-20 minutes with T4 GPU",
    "=" * 50,
    sep="\n",
)

trainer.train()

print("\n‚úÖ Training complete!")

In [None]:
# Step 9: Evaluate
# Evaluate on the Trainer's eval dataset; metric keys are read back with an
# 'eval_' prefix.
results = trainer.evaluate()

# Header: blank line, rule, title, rule.
rule = "=" * 50
print("\n" + rule, "üìä FINAL EVALUATION RESULTS", rule, sep="\n")

# One metric per line, four decimal places.
print(f"\n   Accuracy:  {results['eval_accuracy']:.4f}")
print(f"   F1 Score:  {results['eval_f1']:.4f}")
print(f"   Precision: {results['eval_precision']:.4f}")
print(f"   Recall:    {results['eval_recall']:.4f}")

In [None]:
# Step 10: Save model
# Write the model first, then the tokenizer, into the same directory so the
# folder can be loaded on its own later.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fake_news_distilbert')
print("‚úÖ Model saved to ./fake_news_distilbert")

In [None]:
# Step 11: Test prediction
import torch

def predict(text):
    """Classify a single text as 'REAL' or 'FAKE' with the fine-tuned model.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        text: Raw text to classify; it is truncated to at most 512 tokens.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is ``'REAL'`` when class 1
        has a strictly higher probability than class 0, otherwise ``'FAKE'``.
        ``confidence`` is the largest softmax probability as a Python float.
    """
    # Inference must not depend on what last touched the model: straight after
    # trainer.train() it is still in training mode, where dropout would make
    # the prediction vary from call to call.
    model.eval()

    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():  # no gradients needed for a forward-only pass
        # Move the encoded inputs to wherever the model lives (CPU or GPU).
        outputs = model(**inputs.to(model.device))
        # Batch of one: take the probabilities of the first (only) row.
        probs = torch.softmax(outputs.logits, dim=1)[0]
    pred = 'REAL' if probs[1] > probs[0] else 'FAKE'
    conf = probs.max().item()
    return pred, conf

# Test with different samples
# Three hand-written smoke-test inputs; each is run through predict() and the
# label is printed with its confidence formatted as a percentage.
print("\nüîç Testing predictions:")
print("=" * 50)

# Real news style
test1 = "WASHINGTON (Reuters) - The Senate passed new legislation on healthcare reform."
pred1, conf1 = predict(test1)
print(f"\n1. Reuters style: {pred1} ({conf1:.1%})")

# Fake news style
test2 = "SHOCKING! Scientists EXPOSED for LYING about vaccines! The TRUTH they don't want you to know!"
pred2, conf2 = predict(test2)
print(f"2. Sensational: {pred2} ({conf2:.1%})")

# COVID-related
test3 = "Health officials confirmed new COVID-19 cases have decreased following vaccination efforts."
pred3, conf3 = predict(test3)
print(f"3. COVID factual: {pred3} ({conf3:.1%})")

In [None]:
# Step 12: Download trained model
# Notebook shell command (the leading `!` is not Python): archive the saved
# model directory recursively into a single zip file.
!zip -r fake_news_distilbert_combined.zip fake_news_distilbert/
# `files` is the google.colab module imported in the upload cell (Step 2), so
# that cell must have been run in this session.
files.download('fake_news_distilbert_combined.zip')

# Follow-up instructions for using the downloaded model locally.
print("\n" + "=" * 50)
print("üì• DOWNLOAD STARTED!")
print("=" * 50)
print("\n1. Extract the zip file")
print("2. Place 'fake_news_distilbert' folder in your project")
print("3. Run: streamlit run app_v2.py")