# Fake News Detection Model Training

This notebook demonstrates how to train a fake news detection model using HuggingFace Transformers and PyTorch.

## Setup and Dependencies

In [None]:
# Install required packages
!pip install transformers datasets torch pandas numpy scikit-learn matplotlib seaborn

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy and PyTorch with one shared value so reruns draw the same random numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## Load and Prepare Dataset

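This example is modeled on the LIAR dataset. To keep the notebook self-contained, the loader below returns a small hard-coded sample instead of parsing the real LIAR files. In a real implementation, you would load the actual data and might want to combine multiple datasets like FakeNewsNet and LIAR.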

In [None]:
# For demonstration, we'll create a simplified dataset loader
# In a real implementation, you would load actual datasets

def load_liar_dataset(data_dir):
    """Return a tiny hard-coded stand-in for the LIAR dataset.

    This is a placeholder: no files are parsed and ``data_dir`` is accepted
    but never read.

    Args:
        data_dir: Directory where the real dataset would live (currently unused).

    Returns:
        A pandas DataFrame with a ``text`` column (the statement) and a
        ``label`` column (1 for true, 0 for false).
    """
    # (statement, label) pairs used purely for demonstration
    samples = [
        ('The economy has improved significantly under the current administration.', 1),
        ('Scientists have proven that climate change is a hoax.', 0),
        ('The new healthcare bill will provide coverage to all citizens.', 1),
        ('The government is putting chemicals in the water to control the population.', 0),
        ('Vaccines have been linked to autism in multiple studies.', 0),
    ]

    # Split the pairs back into two parallel columns
    statements, truth_flags = zip(*samples)
    return pd.DataFrame({'text': list(statements), 'label': list(truth_flags)})

# Build the full labelled frame from the loader
df = load_liar_dataset('data/')

# Hold out 30% of the rows, then halve that holdout into validation and test
# (nominally a 70/15/15 split; the same seed is used for both calls)
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Report the size of each split, one per line
print(
    f'Training samples: {len(train_df)}',
    f'Validation samples: {len(val_df)}',
    f'Test samples: {len(test_df)}',
    sep='\n',
)

## Prepare Data for Transformer Model

In [None]:
# Load pre-trained tokenizer
# The checkpoint name is stored in `model_name` so the same identifier can be
# reused when the classification model is loaded.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a custom dataset class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input strings (list, tuple, pandas Series, ...).
        labels: Sequence of integer class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` that
            returns a mapping of tensors carrying a leading batch dimension.
        max_length: Length every example is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialize both inputs as plain lists so that ``idx`` in
        # ``__getitem__`` is always positional. A pandas Series with a shuffled
        # index (e.g. straight out of train_test_split) would otherwise be
        # looked up by label and raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(self.texts)} and {len(self.labels)}'
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a ``labels`` entry."""
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # Remove batch dimension added by tokenizer
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        # Add label
        encoding['labels'] = torch.tensor(label, dtype=torch.long)

        return encoding

# Create datasets
# Wrap each split's text/label columns in a NewsDataset sharing the same tokenizer
train_dataset, val_dataset, test_dataset = (
    NewsDataset(split['text'].tolist(), split['label'].tolist(), tokenizer)
    for split in (train_df, val_df, test_df)
)

## Define Model and Training Parameters

In [None]:
# Load pre-trained model
# Uses the same checkpoint name as the tokenizer; num_labels=2 matches the
# two label values (0 and 1) present in this notebook's data.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define metrics computation function
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (class logits, or a tuple whose
            first element is the logits) and ``label_ids`` (reference labels).

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``. The last
        three use binary averaging, i.e. they score the positive class (1).
    """
    logits = pred.predictions
    # Some models return extra outputs alongside the logits; keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]

    labels = pred.label_ids
    preds = logits.argmax(-1)

    # zero_division=0 keeps the reported value (0) for undefined precision/recall
    # but silences the warning that very small evaluation splits would trigger.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Because the install above is
# unpinned, pick whichever name this installed version actually defines.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Warm up over the first 10% of optimizer steps. A fixed 500-step warmup is
    # longer than the whole run on a small dataset, which would keep the
    # learning rate near zero for all of training.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule, as required by
    # load_best_model_at_end.
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=False,
    **{_eval_strategy_kwarg: 'epoch'},
)

# Initialize Trainer
# Early stopping is configured with a patience of 2
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    callbacks=[early_stopping],
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

## Train the Model

In [None]:
# Train the model
# Runs fine-tuning with the model, datasets, arguments, metric function and
# callback that were handed to `trainer` when it was constructed.
trainer.train()

## Evaluate on Test Set

In [None]:
# Score the held-out test split using the trainer's metric function
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f'Test results: {test_results}')

## Save the Model

In [None]:
# Persist the fine-tuned model and the tokenizer into the same directory
output_dir = '../backend/model/trained_model'
os.makedirs(name=output_dir, exist_ok=True)

trainer.save_model(output_dir=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

print(f'Model saved to {output_dir}')

## Test Model Inference

In [None]:
# Test inference with a few examples
test_texts = [
    'The president signed a new executive order yesterday.',
    'Scientists have discovered that the Earth is actually flat.',
    'A new study shows that exercise can improve mental health.',
    'The government is hiding aliens in Area 51.'
]

# Switch to inference mode so dropout does not perturb the scores
model.eval()

# Tokenize inputs and move them to the model's device; after training the
# model may live on a GPU while freshly tokenized tensors are created on the CPU
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Get predictions (no gradients needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Print results
for text, class_probs in zip(test_texts, predictions):
    credibility_score = class_probs[1].item()  # Probability of label 1 (true)
    print(f'Text: {text}')
    print(f'Credibility score: {credibility_score:.4f}')

    # Bucket the score: >= 0.7 credible, >= 0.4 somewhat credible, else not credible
    if credibility_score >= 0.7:
        category = 'Credible'
    elif credibility_score >= 0.4:
        category = 'Somewhat Credible'
    else:
        category = 'Not Credible'

    # The trailing \n leaves a blank line between examples
    print(f'Category: {category}\n')

## Conclusion

This notebook demonstrates a basic approach to training a fake news detection model. In a real-world scenario, you would:

1. Use larger and more diverse datasets
2. Perform more extensive data preprocessing
3. Experiment with different model architectures
4. Implement more sophisticated evaluation metrics
5. Add explainability features to highlight misleading content

The trained model can then be integrated into the Fake News Radar application.