# Data Preparation Notebook for AFNC Dataset

This notebook prepares the AFNC dataset for training the fake news detection model.

In [None]:
# Import required modules
import sys
import os
import pandas as pd
import numpy as np
import logging

# Make the parent of the current working directory importable so that the
# `src` package used below can be resolved. The path is normalised and only
# appended when it is not already present, which keeps this cell idempotent:
# re-running it does not keep growing sys.path with duplicate entries.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.core.data_preparation import load_and_prepare_data, create_sample_dataset

# Configure logging: INFO level on the root logger, plus a logger for this notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Locate a usable dataset, in order of preference:
#   1. the AFNC export (only the first 5 rows are read, to show its columns)
#   2. an existing sample dataset (news.csv)
#   3. a freshly generated sample dataset
afnc_file = '../data/raw/AFNC_Opendata_export_20250728184932.csv'
sample_file = '../data/raw/news.csv'

try:
    df = pd.read_csv(afnc_file, nrows=5)
except FileNotFoundError:
    print("AFNC dataset not found. Looking for sample dataset...")
    try:
        df = pd.read_csv(sample_file)
    except FileNotFoundError:
        # Nothing on disk yet: have the project helper generate the sample,
        # then load it from the same location.
        print("No dataset found. Creating sample dataset...")
        create_sample_dataset()
        df = pd.read_csv(sample_file)
        print("Sample dataset created.")
    else:
        print("Sample dataset found.")
else:
    print(f"AFNC dataset found with columns: {list(df.columns)}")

In [None]:
# Prepare the data: run the project's preparation pipeline, report the size of
# the resulting splits, and persist the splits and the label encoder to disk
# for later use.
print("Starting data preparation process...")

try:
    X_train, X_test, y_train, y_test, label_encoder = load_and_prepare_data()

    print("\nData preparation completed successfully!")
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    # NOTE(review): `classes_` suggests a scikit-learn style encoder -- confirm.
    print(f"Label classes: {label_encoder.classes_}")

    # np.save and joblib.dump do not create missing parent directories, so
    # make sure the output folder exists before writing anything into it.
    processed_dir = '../data/processed'
    os.makedirs(processed_dir, exist_ok=True)

    # Save the splits for later use
    splits = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    for split_name, split_values in splits.items():
        np.save(f'{processed_dir}/{split_name}.npy', split_values)

    # Save the label encoder
    import joblib
    joblib.dump(label_encoder, f'{processed_dir}/label_encoder.pkl')

    print("\nData splits and label encoder saved to processed data directory")

except Exception as e:
    # Log the failure, then re-raise so the notebook run stops visibly
    # instead of continuing with missing artifacts.
    logger.error("Error in data preparation: %s", e)
    raise

In [None]:
# Examine the prepared data: load the prepared CSV and show its structure,
# a preview of the rows, the label balance, and a few cleaned headlines.
prepared_data_path = '../data/processed/news_prepared.csv'

try:
    df_prepared = pd.read_csv(prepared_data_path)

    print("Prepared dataset info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would emit an extra stray "None" line.
    df_prepared.info()

    print("\nFirst few rows of prepared data:")
    display(df_prepared.head())

    print("\nLabel distribution in prepared data:")
    print(df_prepared['label'].value_counts())

    print("\nExamples of cleaned headlines:")
    for i, headline in enumerate(df_prepared['cleaned_headline'].head(5), start=1):
        print(f"{i}. {headline}")

except FileNotFoundError:
    print("Prepared data file not found.")
except Exception as e:
    # Best-effort inspection cell: any other problem (e.g. a missing column)
    # is reported rather than aborting the notebook.
    print(f"Error loading prepared data: {e}")

## Summary

In this notebook, we've:
1. Checked for the AFNC dataset, falling back to an existing sample dataset or creating one if needed
2. Cleaned and prepared the text data
3. Encoded the labels (Real vs Fake news)
4. Split the data into training and test sets
5. Saved the prepared data and splits for use in model training

## Next Steps

Proceed to the model training notebook to train the fake news detection model.