In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder


# Add these imports to your existing cell
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned data
train_transaction = pd.read_csv('../data/preprocessed/train_transaction_cleaned.csv')
train_identity = pd.read_csv('../data/preprocessed/train_identity_cleaned.csv')
test_transaction = pd.read_csv('../data/preprocessed/test_transaction_cleaned.csv')
test_identity = pd.read_csv('../data/preprocessed/test_identity_cleaned.csv')

print(f"Training data shape: {train_transaction.shape}")
print(f"Test data shape: {test_transaction.shape}")
print(train_transaction.columns)

# Combine transaction and identity data
train_data = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test_data = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
print(f"Train data shape after merge: {train_data.shape}")
print(f"Test data shape after merge: {test_data.shape}")
print(f"Fraud rate in training data: {train_data['isFraud'].mean():.4f}")


Training data shape: (590540, 410)
Test data shape: (506691, 409)
Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'TransactionDT_hour_sin', 'TransactionDT_hour_cos',
       'TransactionDT_dow_sin', 'TransactionDT_dow_cos',
       'TransactionDT_month_sin', 'TransactionDT_month_cos',
       'TransactionDT_is_weekend', 'TransactionDT_time_of_day',
       'TransactionDT_days_since_first', 'TransactionDT_seconds_since_first'],
      dtype='object', length=410)
Train data shape after merge: (593424, 450)
Test data shape after merge: (506691, 449)
Fraud rate in training data: 0.0348


In [None]:
# STEP 1: Enhanced Temporal Features (Fix your existing function)
def create_temporal_features(df):
    """Advanced temporal feature engineering with proper velocity calculation"""
    
    # Create a copy to avoid modifying original
    df = df.copy()
    
    # Basic time features
    df['TransactionDT_hour'] = (df['TransactionDT'] / 3600) % 24
    df['TransactionDT_day'] = (df['TransactionDT'] / (3600 * 24)) % 7
    df['TransactionDT_week'] = df['TransactionDT'] / (3600 * 24 * 7)
    
    # Advanced temporal patterns
    df['is_weekend'] = df['TransactionDT_day'].isin([5, 6]).astype(int)
    df['is_night'] = ((df['TransactionDT_hour'] >= 22) | (df['TransactionDT_hour'] <= 6)).astype(int)
    df['is_business_hours'] = ((df['TransactionDT_hour'] >= 9) & (df['TransactionDT_hour'] <= 17)).astype(int)
    df['is_peak_hours'] = df['TransactionDT_hour'].isin([12, 13, 18, 19, 20]).astype(int)
    
    # Time since epoch features
    df['days_since_start'] = df['TransactionDT'] / (3600 * 24)
    df['hour_of_week'] = df['TransactionDT_day'] * 24 + df['TransactionDT_hour']
    
    # Sort by card and time for velocity calculations
    df = df.sort_values(['card1', 'TransactionDT']).reset_index(drop=True)
    
    # Fixed velocity features - count transactions in time windows
    for hours in [1, 24, 168]:  # 1 hour, 1 day, 1 week
        window_seconds = hours * 3600
        
        # Create velocity feature using a simpler approach
        velocity_list = []
        for card in df['card1'].unique():
            card_data = df[df['card1'] == card].copy()
            card_data = card_data.sort_values('TransactionDT')
            
            # Calculate velocity for each transaction
            velocities = []
            for i, row in card_data.iterrows():
                current_time = row['TransactionDT']
                time_window_start = current_time - window_seconds
                
                # Count transactions in the time window (including current)
                count = len(card_data[
                    (card_data['TransactionDT'] <= current_time) & 
                    (card_data['TransactionDT'] > time_window_start)
                ])
                velocities.append(count)
            
            # Store velocities with original indices
            for idx, vel in zip(card_data.index, velocities):
                velocity_list.append((idx, vel))
        
        # Create velocity column
        velocity_dict = dict(velocity_list)
        df[f'velocity_{hours}h'] = df.index.map(velocity_dict).fillna(1)
    
    # Time between transactions
    df['time_since_last_txn'] = df.groupby('card1')['TransactionDT'].diff()
    df['time_to_next_txn'] = df.groupby('card1')['TransactionDT'].diff(-1).abs()
    
    # Fill NaN values for time differences
    df['time_since_last_txn'].fillna(0, inplace=True)
    df['time_to_next_txn'].fillna(0, inplace=True)
    
    # Cyclical encoding for hour and day
    df['hour_sin'] = np.sin(2 * np.pi * df['TransactionDT_hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['TransactionDT_hour'] / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['TransactionDT_day'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['TransactionDT_day'] / 7)
    
    return df

In [28]:
# STEP 2: Advanced Amount-Based Features
def create_amount_features(df):
    """Sophisticated amount-based features"""
    
    df = df.copy()
    
    # Basic amount transformations
    df['TransactionAmt_log'] = np.log1p(df['TransactionAmt'])
    df['TransactionAmt_sqrt'] = np.sqrt(df['TransactionAmt'])
    df['TransactionAmt_decimal'] = df['TransactionAmt'] - df['TransactionAmt'].astype(int)
    df['is_round_amount'] = (df['TransactionAmt_decimal'] == 0).astype(int)
    
    # Amount patterns
    df['amount_digits'] = df['TransactionAmt'].astype(str).str.len()
    df['is_even_amount'] = (df['TransactionAmt'] % 2 == 0).astype(int)
    df['ends_with_00'] = (df['TransactionAmt'] % 100 == 0).astype(int)
    df['ends_with_99'] = (df['TransactionAmt'].astype(str).str.endswith('99')).astype(int)
    
    # Card-level amount statistics
    card_amt_stats = df.groupby('card1')['TransactionAmt'].agg([
        'mean', 'std', 'min', 'max', 'count', 'median',
        lambda x: np.percentile(x, 25),
        lambda x: np.percentile(x, 75)
    ]).reset_index()
    card_amt_stats.columns = ['card1', 'card_amt_mean', 'card_amt_std', 'card_amt_min', 
                             'card_amt_max', 'card_amt_count', 'card_amt_median',
                             'card_amt_q25', 'card_amt_q75']
    
    df = df.merge(card_amt_stats, on='card1', how='left')
    
    # Amount deviation features
    df['amt_deviation_from_mean'] = np.abs(df['TransactionAmt'] - df['card_amt_mean']) / (df['card_amt_std'] + 1e-6)
    df['amt_rank_within_card'] = df.groupby('card1')['TransactionAmt'].rank(pct=True)
    df['is_amt_outlier'] = (df['amt_deviation_from_mean'] > 2).astype(int)
    
    # Amount vs time patterns
    df['amt_per_hour'] = df['TransactionAmt'] / (df['TransactionDT_hour'] + 1)
    df['amt_weekend_ratio'] = df['TransactionAmt'] * df['is_weekend']
    
    return df

In [29]:
# STEP 3: Device Features
def create_device_features(df):
    """Create device and browser-based features"""
    
    df = df.copy()
    
    # Device type analysis
    if 'DeviceType' in df.columns:
        df['is_mobile'] = (df['DeviceType'] == 'mobile').astype(int)
        df['is_desktop'] = (df['DeviceType'] == 'desktop').astype(int)
    
    # Browser analysis
    if 'id_31' in df.columns:
        df['browser_type'] = df['id_31']
        # Common browsers
        common_browsers = ['chrome', 'safari', 'firefox', 'edge']
        df['is_common_browser'] = df['id_31'].str.lower().isin(common_browsers).astype(int)
    
    # Screen resolution
    if 'id_33' in df.columns:
        df['screen_width'] = df['id_33']
        df['is_common_resolution'] = df['id_33'].isin([1920, 1366, 1280, 1024]).astype(int)
    
    # Operating system
    if 'id_30' in df.columns:
        df['os_type'] = df['id_30']
        df['is_windows'] = df['id_30'].str.contains('Windows', na=False).astype(int)
        df['is_ios'] = df['id_30'].str.contains('iOS', na=False).astype(int)
        df['is_android'] = df['id_30'].str.contains('Android', na=False).astype(int)
    
    return df

In [30]:
# STEP 4: Frequency Encoding with Bayesian Smoothing
def frequency_encoding_with_smoothing(df, columns, target='isFraud', alpha=10):
    """Frequency encoding with Bayesian smoothing to prevent overfitting"""
    
    df = df.copy()
    global_fraud_rate = df[target].mean()
    
    for col in columns:
        if col in df.columns:
            # Calculate category statistics
            stats = df.groupby(col)[target].agg(['count', 'sum']).reset_index()
            stats.columns = [col, 'count', 'fraud_count']
            
            # Bayesian smoothing
            stats[f'{col}_fraud_rate_smooth'] = (
                (stats['fraud_count'] + alpha * global_fraud_rate) / 
                (stats['count'] + alpha)
            )
            
            # Frequency encoding
            stats[f'{col}_frequency'] = stats['count']
            
            # Merge back
            merge_cols = [col, f'{col}_fraud_rate_smooth', f'{col}_frequency']
            df = df.merge(stats[merge_cols], on=col, how='left')
            
            # Fill missing values with global stats
            df[f'{col}_fraud_rate_smooth'].fillna(global_fraud_rate, inplace=True)
            df[f'{col}_frequency'].fillna(1, inplace=True)
    
    return df

In [31]:
# STEP 5: Advanced Interaction Features
def create_interaction_features(df):
    """Create meaningful feature interactions"""
    
    df = df.copy()
    
    # Amount × Time interactions
    df['amt_hour_interaction'] = df['TransactionAmt'] * df['TransactionDT_hour']
    df['amt_weekend_interaction'] = df['TransactionAmt'] * df['is_weekend']
    df['amt_night_interaction'] = df['TransactionAmt'] * df['is_night']
    df['amt_business_interaction'] = df['TransactionAmt'] * df['is_business_hours']
    
    # Card × Product interactions
    if 'ProductCD' in df.columns:
        df['card_product_combo'] = df['card1'].astype(str) + '_' + df['ProductCD'].astype(str)
    
    # Address interactions
    if 'addr1' in df.columns and 'addr2' in df.columns:
        df['addr_combo'] = df['addr1'].astype(str) + '_' + df['addr2'].astype(str)
        df['addr_mismatch'] = (df['addr1'] != df['addr2']).astype(int)
    
    # Email domain interactions
    if 'P_emaildomain' in df.columns and 'R_emaildomain' in df.columns:
        df['email_domain_match'] = (df['P_emaildomain'] == df['R_emaildomain']).astype(int)
        df['email_domains_combo'] = df['P_emaildomain'].astype(str) + '_' + df['R_emaildomain'].astype(str)
    
    # Card × Device interactions
    if 'id_31' in df.columns:
        df['card_browser_combo'] = df['card1'].astype(str) + '_' + df['id_31'].astype(str)
    
    # Distance features (if geographic data available)
    if 'dist1' in df.columns and 'dist2' in df.columns:
        df['dist_ratio'] = df['dist1'] / (df['dist2'] + 1e-6)
        df['dist_sum'] = df['dist1'] + df['dist2']
        df['dist_diff'] = np.abs(df['dist1'] - df['dist2'])
    
    # Amount percentile interactions
    df['amt_rank_hour_interaction'] = df['amt_rank_within_card'] * df['TransactionDT_hour']
    df['velocity_amt_interaction'] = df['velocity_1h'] * np.log1p(df['TransactionAmt'])
    
    return df

In [32]:
# STEP 6: Network and Anomaly Detection Features (UPDATED)
def create_network_anomaly_features(df, is_train=True, train_stats=None):
    """Create graph-based and anomaly detection features"""
    
    df = df.copy()
    
    # Email domain risk analysis
    if 'P_emaildomain' in df.columns:
        if is_train:
            # Calculate stats from training data
            email_stats = df.groupby('P_emaildomain')['isFraud'].agg(['mean', 'count']).reset_index()
            email_stats.columns = ['P_emaildomain', 'email_fraud_rate', 'email_frequency']
            df = df.merge(email_stats, on='P_emaildomain', how='left')
            
            # Store stats for test set
            if train_stats is not None:
                train_stats['email_stats'] = email_stats
        else:
            # Use pre-calculated stats from training data
            if train_stats and 'email_stats' in train_stats:
                email_stats = train_stats['email_stats']
                df = df.merge(email_stats, on='P_emaildomain', how='left')
                # Fill missing with global mean
                df['email_fraud_rate'].fillna(email_stats['email_fraud_rate'].mean(), inplace=True)
                df['email_frequency'].fillna(1, inplace=True)
        
        # Email domain categories
        free_emails = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com']
        df['is_free_email'] = df['P_emaildomain'].isin(free_emails).astype(int)
    
    # Card family analysis
    df['card_family'] = df['card1'] // 1000
    if is_train:
        card_family_stats = df.groupby('card_family')['isFraud'].agg(['mean', 'count']).reset_index()
        card_family_stats.columns = ['card_family', 'card_family_fraud_rate', 'card_family_count']
        df = df.merge(card_family_stats, on='card_family', how='left')
        
        # Store stats for test set
        if train_stats is not None:
            train_stats['card_family_stats'] = card_family_stats
    else:
        # Use pre-calculated stats from training data
        if train_stats and 'card_family_stats' in train_stats:
            card_family_stats = train_stats['card_family_stats']
            df = df.merge(card_family_stats, on='card_family', how='left')
            # Fill missing with global mean
            df['card_family_fraud_rate'].fillna(card_family_stats['card_family_fraud_rate'].mean(), inplace=True)
            df['card_family_count'].fillna(1, inplace=True)
    
    # Isolation Forest for amount anomalies
    amount_features = ['TransactionAmt', 'TransactionDT_hour', 'TransactionDT_day']
    available_features = [f for f in amount_features if f in df.columns]
    
    if len(available_features) >= 2:
        if is_train:
            iso_forest = IsolationForest(contamination=0.1, random_state=42, n_jobs=-1)
            df['isolation_anomaly'] = iso_forest.fit_predict(df[available_features].fillna(0))
            df['isolation_anomaly'] = (df['isolation_anomaly'] == -1).astype(int)
            
            # Store model for test set
            if train_stats is not None:
                train_stats['iso_forest'] = iso_forest
        else:
            # Use pre-fitted model
            if train_stats and 'iso_forest' in train_stats:
                iso_forest = train_stats['iso_forest']
                df['isolation_anomaly'] = iso_forest.predict(df[available_features].fillna(0))
                df['isolation_anomaly'] = (df['isolation_anomaly'] == -1).astype(int)
    
    # Statistical anomalies
    df['amt_zscore'] = np.abs(stats.zscore(df['TransactionAmt']))
    df['is_amt_extreme'] = (df['amt_zscore'] > 3).astype(int)
    
    # User behavior consistency
    user_behavior_cols = ['TransactionDT_hour', 'is_weekend', 'TransactionAmt']
    for col in user_behavior_cols:
        if col in df.columns:
            user_std = df.groupby('card1')[col].std().reset_index()
            user_std.columns = ['card1', f'{col}_user_std']
            df = df.merge(user_std, on='card1', how='left')
            df[f'{col}_consistency'] = 1 / (df[f'{col}_user_std'] + 1e-6)
    
    return df

In [33]:
# Complete Feature Engineering Pipeline (UPDATED)
def complete_feature_engineering_pipeline(train_data, test_data):
    """Execute complete feature engineering pipeline"""
    
    print("Starting feature engineering pipeline...")
    
    # Initialize train_stats dictionary to store training statistics
    train_stats = {}
    
    # Step 1: Temporal features
    print("Creating temporal features...")
    train_processed = create_temporal_features(train_data)
    test_processed = create_temporal_features(test_data)
    
    # Step 2: Amount features
    print("Creating amount features...")
    train_processed = create_amount_features(train_processed)
    test_processed = create_amount_features(test_processed)
    
    # Step 3: Device features
    print("Creating device features...")
    train_processed = create_device_features(train_processed)
    test_processed = create_device_features(test_processed)
    
    # Step 4: Frequency encoding (only on train to avoid leakage)
    print("Applying frequency encoding...")
    categorical_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3']
    available_cats = [col for col in categorical_cols if col in train_processed.columns]
    
    train_processed = frequency_encoding_with_smoothing(train_processed, available_cats)
    
    # Apply same encodings to test set
    for col in available_cats:
        if f'{col}_fraud_rate_smooth' in train_processed.columns:
            # Create encoding maps from train
            fraud_rate_map = train_processed.groupby(col)[f'{col}_fraud_rate_smooth'].first().to_dict()
            frequency_map = train_processed.groupby(col)[f'{col}_frequency'].first().to_dict()
            
            # Apply to test
            test_processed[f'{col}_fraud_rate_smooth'] = test_processed[col].map(fraud_rate_map).fillna(
                train_processed[f'{col}_fraud_rate_smooth'].mean()
            )
            test_processed[f'{col}_frequency'] = test_processed[col].map(frequency_map).fillna(1)
    
    # Step 5: Interaction features
    print("Creating interaction features...")
    train_processed = create_interaction_features(train_processed)
    test_processed = create_interaction_features(test_processed)
    
    # Step 6: Network and anomaly features (UPDATED)
    print("Creating network and anomaly features...")
    train_processed = create_network_anomaly_features(train_processed, is_train=True, train_stats=train_stats)
    test_processed = create_network_anomaly_features(test_processed, is_train=False, train_stats=train_stats)
    
    print(f"Feature engineering complete!")
    print(f"Train shape: {train_processed.shape}")
    print(f"Test shape: {test_processed.shape}")
    
    return train_processed, test_processed

# Execute the pipeline
train_engineered, test_engineered = complete_feature_engineering_pipeline(train_data, test_data)

Starting feature engineering pipeline...
Creating temporal features...


TypeError: unsupported operand type(s) for /: 'str' and 'int'