# In [5]:
# ----------------------------------------
# src/feature_engineering.py
# ----------------------------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def load_data(filepath="../data/processed/fraud_data_processed.csv"):
    """Read the processed fraud CSV and parse its timestamp columns.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, with ``signup_time`` and ``purchase_time``
        converted by ``pd.to_datetime``.
    """
    frame = pd.read_csv(filepath)
    # Both timestamp columns arrive as text from the CSV; convert each one.
    for stamp_col in ('signup_time', 'purchase_time'):
        frame[stamp_col] = pd.to_datetime(frame[stamp_col])
    return frame

def handle_missing_values(df):
    """Fill missing numeric and categorical values, then drop duplicate rows.

    ``age`` and ``purchase_value`` are filled with their own column median;
    ``sex``, ``source`` and ``browser`` are filled with the string
    ``"Unknown"``. Duplicate rows are removed after filling.

    The input frame is left untouched: all filling is done on a copy, so the
    caller never ends up with a frame that has been filled but not
    de-duplicated.

    Args:
        df: DataFrame containing the five columns listed above.

    Returns:
        A new DataFrame with the gaps filled and duplicate rows dropped.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    cleaned = df.copy()

    num_cols = ['age', 'purchase_value']
    for col in num_cols:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    cat_cols = ['sex', 'source', 'browser']
    for col in cat_cols:
        cleaned[col] = cleaned[col].fillna("Unknown")

    return cleaned.drop_duplicates()

def create_time_features(df):
    """Add time-based and purchase-value indicator columns.

    Columns are added to ``df`` in place and the same object is returned.

    New columns:
        time_diff: Hours between ``signup_time`` and ``purchase_time``.
        purchase_hour: Hour of day of ``purchase_time``.
        purchase_day: Day name of ``purchase_time``.
        is_night: 1 when ``purchase_hour`` is 23 or in 0-5, else 0.
        is_weekend: 1 when ``purchase_day`` is Saturday or Sunday, else 0.
        instant_purchase: 1 when ``time_diff`` is under one hour, else 0.
        high_value: 1 when ``purchase_value`` exceeds the median
            ``purchase_value`` of the frame passed in, else 0.

    Args:
        df: DataFrame with datetime ``signup_time`` / ``purchase_time``
            columns and a numeric ``purchase_value`` column.

    Returns:
        ``df`` with the columns above added.
    """
    purchase_time = df['purchase_time']

    df['time_diff'] = (purchase_time - df['signup_time']).dt.total_seconds() / 3600
    df['purchase_hour'] = purchase_time.dt.hour
    df['purchase_day'] = purchase_time.dt.day_name()

    # Vectorized comparison instead of a per-row Python lambda; a missing
    # hour compares False on both sides and therefore maps to 0.
    hour = df['purchase_hour']
    df['is_night'] = ((hour >= 23) | (hour <= 5)).astype(int)

    df['is_weekend'] = df['purchase_day'].isin(['Saturday', 'Sunday']).astype(int)
    df['instant_purchase'] = (df['time_diff'] < 1).astype(int)

    median_purchase = df['purchase_value'].median()
    df['high_value'] = (df['purchase_value'] > median_purchase).astype(int)
    return df

def create_user_features(df):
    """Attach per-device purchase aggregates to every row.

    Rows are grouped by ``device_id`` and three statistics of
    ``purchase_value`` are computed for each group, then joined back onto
    the rows with a left merge:

        user_txn_count: Number of non-null purchase values in the group.
        user_total_purchase: Sum of purchase values in the group.
        user_avg_purchase: Mean purchase value in the group.

    Args:
        df: DataFrame with ``device_id`` and ``purchase_value`` columns.

    Returns:
        The merged DataFrame carrying the three extra columns.
    """
    renamed = {
        'count': 'user_txn_count',
        'sum': 'user_total_purchase',
        'mean': 'user_avg_purchase',
    }
    per_device = df.groupby('device_id')['purchase_value']
    summary = per_device.agg(['count', 'sum', 'mean']).rename(columns=renamed)
    # device_id must be an ordinary column again before it can be a merge key.
    summary = summary.reset_index()
    return df.merge(summary, on='device_id', how='left')

def encode_and_scale(df):
    """One-hot encode the categorical columns and standardize the numeric ones.

    Steps, in order:
        1. Drop ``device_id`` and ``ip_address``.
        2. One-hot encode ``source``, ``browser``, ``sex`` and
           ``purchase_day`` with ``drop_first=True``.
        3. Replace the numeric feature columns with the output of a
           ``StandardScaler`` fitted on those same columns.

    NOTE(review): the scaler is fitted on every row it receives. ``main()``
    calls this before the train/test split, so test rows contribute to the
    scaling statistics - confirm this is intended.

    Args:
        df: DataFrame containing all columns named above plus the three
            ``user_*`` aggregate columns.

    Returns:
        The encoded and scaled DataFrame.
    """
    # High-cardinality identifiers are removed before encoding.
    trimmed = df.drop(columns=['device_id', 'ip_address'])

    encoded = pd.get_dummies(
        trimmed,
        columns=['source', 'browser', 'sex', 'purchase_day'],
        drop_first=True,
    )

    numeric_features = [
        'purchase_value', 'age', 'time_diff', 'purchase_hour',
        'user_txn_count', 'user_total_purchase', 'user_avg_purchase',
    ]
    standardized = StandardScaler().fit_transform(encoded[numeric_features])
    encoded[numeric_features] = standardized
    return encoded



def train_test_split_data(df):
    """Split the frame into stratified train and test sets.

    The ``class`` column is the target and every other column is a feature.
    20% of the rows go to the test set, stratified on the target, with
    ``random_state=42``. The resulting sizes are printed.

    Args:
        df: DataFrame that includes a ``class`` column.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = df['class']
    features = df.drop(columns=['class'])

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        stratify=target,
        random_state=42,
    )

    print("Train/Test split complete.")
    print("Train size:", len(X_train), "Test size:", len(X_test))
    return X_train, X_test, y_train, y_test

def main():
    """Run every feature-engineering step and return the train/test split.

    Steps, in order: load the data, fill missing values and drop duplicates,
    add time features, add per-device aggregates, encode and scale, then
    split into train and test sets.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``train_test_split_data``. Earlier versions discarded it; callers
        that ignore the return value are unaffected.
    """
    df = load_data()
    df = handle_missing_values(df)
    df = create_time_features(df)
    df = create_user_features(df)
    df = encode_and_scale(df)
    # Hand the split back instead of dropping it so the pipeline is usable.
    return train_test_split_data(df)

# Run the full pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


# Output:
# Train/Test split complete.
# Train size: 120889 Test size: 30223
