In [None]:
# 03_Feature_Engineering.ipynb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from config import *
from utils import *
import joblib

# Execution guard to prevent double execution.
# The flag is only set after every artifact has been written, so a run that
# fails part-way (missing file, missing directory, ...) can be re-run
# without restarting the kernel.
if 'feature_engineering_executed' not in globals():
    # Block-local imports keep this cell self-contained.
    from pathlib import Path
    from scipy.sparse import hstack

    # The two cleaned text columns every step below operates on.
    text_columns = ['title_clean', 'text_clean']

    # Load processed data
    print("Loading processed data...")
    df = pd.read_csv(PROCESSED_TRAIN_PATH)
    val_df = pd.read_csv(PROCESSED_VAL_PATH)

    print(f"Training data shape: {df.shape}")
    print(f"Validation data shape: {val_df.shape}")

    # Check for NaN values
    print("\n=== CHECKING FOR NaN VALUES ===")
    print("Training data NaN values:")
    print(df[text_columns].isna().sum())
    print("\nValidation data NaN values:")
    print(val_df[text_columns].isna().sum())

    # Handle any remaining NaN values once, on the source frames and BEFORE
    # splitting. Everything derived from them below is then NaN-free, so no
    # second fill on the split frames is needed (assigning into those slices
    # is what provokes pandas' SettingWithCopyWarning).
    for frame in (df, val_df):
        for column in text_columns:
            frame[column] = frame[column].fillna('no content')

    # Split data (stratified on the label so both parts keep its distribution)
    X = df[text_columns]
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

    # Feature Engineering: TF-IDF on title
    print("\n=== TF-IDF VECTORIZATION ===")

    # Title features (TF-IDF)
    title_vectorizer = TfidfVectorizer(
        max_features=MAX_FEATURES,
        ngram_range=N_GRAM_RANGE,
        stop_words='english'
    )

    # Fit on the training split only; test/validation are transformed with
    # the vocabulary and IDF weights learned from training data.
    print("Fitting title TF-IDF vectorizer...")
    X_train_title_tfidf = title_vectorizer.fit_transform(X_train['title_clean'])
    X_test_title_tfidf = title_vectorizer.transform(X_test['title_clean'])
    val_title_tfidf = title_vectorizer.transform(val_df['title_clean'])

    print(f"Title TF-IDF shape - Train: {X_train_title_tfidf.shape}, Test: {X_test_title_tfidf.shape}")

    # Text features (TF-IDF); vocabulary is capped at half the title budget.
    text_vectorizer = TfidfVectorizer(
        max_features=MAX_FEATURES // 2,
        ngram_range=N_GRAM_RANGE,
        stop_words='english'
    )

    print("Fitting text TF-IDF vectorizer...")
    X_train_text_tfidf = text_vectorizer.fit_transform(X_train['text_clean'])
    X_test_text_tfidf = text_vectorizer.transform(X_test['text_clean'])
    val_text_tfidf = text_vectorizer.transform(val_df['text_clean'])

    print(f"Text TF-IDF shape - Train: {X_train_text_tfidf.shape}, Test: {X_test_text_tfidf.shape}")

    # Combine features (TF-IDF): title columns first, then text columns.
    # hstack defaults to COO output, which cannot be row-indexed; request CSR
    # so the saved matrices can be sliced directly by whoever loads them.
    print("Combining TF-IDF features...")

    X_train_combined = hstack([X_train_title_tfidf, X_train_text_tfidf], format='csr')
    X_test_combined = hstack([X_test_title_tfidf, X_test_text_tfidf], format='csr')
    val_combined = hstack([val_title_tfidf, val_text_tfidf], format='csr')

    print(f"Combined TF-IDF features shape - Train: {X_train_combined.shape}, Test: {X_test_combined.shape}")

    # Make sure the output directories exist so the dumps below cannot fail
    # on a fresh checkout.
    # NOTE(review): assumes FEATURES_DIR / MODELS_DIR are path-like (they are
    # combined with "/" below) - confirm against config.
    for output_dir in (FEATURES_DIR, MODELS_DIR):
        Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Save features and vectorizers
    joblib.dump(X_train_combined, FEATURES_DIR / 'X_train_combined.pkl')
    joblib.dump(X_test_combined, FEATURES_DIR / 'X_test_combined.pkl')
    joblib.dump(val_combined, FEATURES_DIR / 'val_combined.pkl')
    joblib.dump(y_train, FEATURES_DIR / 'y_train.pkl')
    joblib.dump(y_test, FEATURES_DIR / 'y_test.pkl')
    joblib.dump(title_vectorizer, MODELS_DIR / 'title_vectorizer.pkl')
    joblib.dump(text_vectorizer, MODELS_DIR / 'text_vectorizer.pkl')

    # Mark the cell as done only now that every artifact is on disk.
    feature_engineering_executed = True

    print("Features and vectorizers saved successfully!")

else:
    print("Feature engineering already executed. Restart kernel to run again.")

Loading processed data...
Training data shape: (36429, 9)
Validation data shape: (4956, 7)

=== CHECKING FOR NaN VALUES ===
Training data NaN values:
title_clean    0
text_clean     0
dtype: int64

Validation data NaN values:
title_clean    0
text_clean     0
dtype: int64
Train shape: (29143, 2), Test shape: (7286, 2)

=== TF-IDF VECTORIZATION ===
Fitting title TF-IDF vectorizer...
Title TF-IDF shape - Train: (29143, 5000), Test: (7286, 5000)
Fitting text TF-IDF vectorizer...
Text TF-IDF shape - Train: (29143, 2500), Test: (7286, 2500)
Combining TF-IDF features...
Combined TF-IDF features shape - Train: (29143, 7500), Test: (7286, 7500)
Features and vectorizers saved successfully!
