# In [None]:
import os
import pandas as pd
import glob
import kagglehub
import sys

# Put the parent of this file's directory (the project root) on sys.path so the
# preprocess, features, and train modules imported below can be resolved.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..')
    )
)

# Import modular functions
from preprocess import preprocess_data
from features import extract_features
from train import train_and_evaluate

def load_data():
    """Load the WELFake dataset via KaggleHub, falling back to a local CSV.

    The dataset ``vcclab/welfake-dataset`` is requested through KaggleHub
    first. If that step fails for any reason (no network, rejected
    credentials, no CSV in the download, ...), the function reads
    ``data/WELFake_Dataset.csv`` relative to the current working directory
    instead.

    Kaggle credentials are deliberately NOT set here. Supply them outside the
    source tree (for example through the ``KAGGLE_USERNAME`` / ``KAGGLE_KEY``
    environment variables or a Kaggle credentials file); any values already
    present in the environment are left untouched.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the dataset could be
        obtained neither from KaggleHub nor from the local fallback path.
    """
    # SECURITY NOTE: an API key used to be hard-coded in this function. Any key
    # that was ever committed must be treated as compromised and rotated.
    try:
        # Attempt to download via KaggleHub (requires 'kagglehub' installed)
        print("Attempting to download dataset via KaggleHub...")
        path = kagglehub.dataset_download("vcclab/welfake-dataset")

        # Escape the directory so glob metacharacters in it are taken
        # literally, and sort so the chosen file is deterministic.
        csv_files = sorted(glob.glob(os.path.join(glob.escape(path), "*.csv")))
        if not csv_files:
            raise FileNotFoundError(
                f"no CSV file found in downloaded dataset directory {path!r}"
            )

        df = pd.read_csv(csv_files[0])
        print(f"Dataframe loaded successfully. Shape: {df.shape}")
        return df
    except Exception as e:
        # Deliberately broad: any failure of the remote path should lead to the
        # local fallback rather than abort the pipeline.
        print(f"KaggleHub download failed or credentials incorrect. Error: {e}")

    # Fallback to local file load.
    # Assumes the script is run from the project root and looks in data/
    try:
        df = pd.read_csv('data/WELFake_Dataset.csv')
    except FileNotFoundError:
        print("\nFATAL ERROR: Dataset not found locally or via KaggleHub.")
        print("Please ensure the WELFake_Dataset.csv file is placed in your 'data/' folder.")
        return None

    print("Falling back to local file load: 'data/WELFake_Dataset.csv'")
    return df

# --- Script entry point: run the whole pipeline end to end ---
if __name__ == "__main__":

    print("--- Starting Fake News Detection Pipeline ---")

    # Step 1: obtain the raw dataset; stop with exit status 1 if unavailable.
    raw_df = load_data()
    if raw_df is None:
        raise SystemExit(1)

    # Step 2: hand the raw frame to the preprocessing module.
    clean_df = preprocess_data(raw_df)

    # Step 3: derive train/test feature matrices, labels, and the vectorizer.
    (
        train_features,
        test_features,
        train_labels,
        test_labels,
        fitted_vectorizer,
    ) = extract_features(clean_df)

    # Step 4: train and evaluate; the returned value is reported below as the
    # final test accuracy.
    final_accuracy = train_and_evaluate(
        train_features,
        test_features,
        train_labels,
        test_labels,
        fitted_vectorizer,
    )

    print(f"\n--- Pipeline Finished. Final Test Accuracy: {final_accuracy:.4f} ---")