# IEEE-CIS Fraud Detection - Feature Engineering

## Overview
This notebook implements comprehensive feature engineering for the fraud detection model.
Good feature engineering is often the difference between a mediocre and excellent fraud detection system.

## Feature Engineering Strategy
1. **Missing Value Handling**: Intelligent imputation strategies (not just dropping)
2. **Temporal Features**: Extract time patterns from TransactionDT
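3. **Categorical Encoding**: Label and frequency encoding for categorical features (target encoding for high-cardinality features is described below but not implemented in this notebook)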
4. **Interaction Features**: Combinations of card, address, and email features
5. **Aggregation Features**: Transaction patterns per card/user
6. **Feature Scaling**: Normalize where appropriate (no scaling step is applied in this notebook; the downstream models are tree-based)

## Why These Choices Matter for Fraud Detection
- Fraudsters often operate at unusual times (temporal features)
- They reuse compromised cards/addresses (aggregation features)
- Specific combinations signal fraud (interaction features)
- Missing data patterns can indicate fraud attempts

In [None]:
# Standard library imports
import os
import sys
import warnings
import pickle
from pathlib import Path

# Data manipulation
import numpy as np
import pandas as pd
from scipy import stats

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Notebook-wide configuration: silence warnings and widen the column display.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

# Fixed seed so NumPy-based randomness is repeatable between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Project layout, resolved from the directory above the working directory.
BASE_PATH = Path('..').resolve()
DATA_PATH = BASE_PATH.joinpath('Data', 'raw')
PROCESSED_PATH = BASE_PATH.joinpath('Data', 'processed')
FEATURES_PATH = BASE_PATH.joinpath('Data', 'features')
OUTPUT_PATH = BASE_PATH.joinpath('outputs')

# Output folders are created up front; existing folders are left untouched.
for _output_dir in (PROCESSED_PATH, FEATURES_PATH):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Put <project>/src first on the import path so custom modules can be imported.
sys.path.insert(0, str(BASE_PATH.joinpath('src')))

print(f"Data Path: {DATA_PATH}")
print(f"Processed Path: {PROCESSED_PATH}")

In [None]:
# Load the raw transaction and identity tables and join them per split.
print("Loading data...")
train_transaction = pd.read_csv(DATA_PATH / 'train_transaction.csv')
train_identity = pd.read_csv(DATA_PATH / 'train_identity.csv')

# Left join: every transaction row is kept; identity columns are NaN for
# transactions that have no matching identity row.
train_df = train_transaction.merge(train_identity, on='TransactionID', how='left')
print(f"Training data shape: {train_df.shape}")

# Load test data for consistent encoding
test_transaction = pd.read_csv(DATA_PATH / 'test_transaction.csv')
test_identity = pd.read_csv(DATA_PATH / 'test_identity.csv')
test_df = test_transaction.merge(test_identity, on='TransactionID', how='left')

# Later steps index test_df with column names taken from train_df, so the two
# schemas must agree.
# NOTE(review): the public IEEE-CIS test identity file is reported to use
# hyphenated names (`id-01` ...) where train uses underscores (`id_01` ...) —
# confirm against your copy. The rename is a no-op when no hyphens are present.
test_df.columns = test_df.columns.str.replace('-', '_', regex=False)
print(f"Test data shape: {test_df.shape}")

# Free memory
del train_transaction, train_identity, test_transaction, test_identity

## 1. Memory Optimization

Before feature engineering, we optimize memory usage to handle the large dataset efficiently.
This is a critical production consideration for Databricks/Spark environments.

In [None]:
def reduce_memory_usage(df, verbose=True, use_float16=False):
    """
    Reduce memory usage by downcasting numeric columns to a narrower dtype.

    The frame is modified in place and also returned. Only plain NumPy signed
    integer, unsigned integer and float columns are considered; booleans,
    strings, and any other dtype are left untouched. A column is only ever
    narrowed, never widened.

    Args:
        df: pandas DataFrame (modified in place)
        verbose: print memory reduction stats
        use_float16: also allow float16 as a float target. Disabled by default
            because float16 keeps only ~3 significant decimal digits, which
            silently rounds values such as monetary amounts.

    Returns:
        The same DataFrame with optimized dtypes
    """
    int_targets = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Skip everything that is not a plain NumPy int/uint/float column
        # (bool has kind 'b', object 'O'; extension dtypes are not np.dtype).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Empty or all-NaN columns have no range to test against.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'f':
            candidates = [(target, np.finfo(target)) for target in float_targets]
        else:
            candidates = [(target, np.iinfo(target)) for target in int_targets[col_type.kind]]

        # Candidates are ordered narrowest first; take the first one that fits.
        for target, limits in candidates:
            if np.dtype(target).itemsize >= col_type.itemsize:
                break  # remaining candidates would not save memory
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage(deep=True).sum() / 1024**2

    if verbose:
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB '
              f'({reduction:.1f}% reduction)')

    return df

# Downcast both splits; each call prints its own memory report.
train_df, test_df = reduce_memory_usage(train_df), reduce_memory_usage(test_df)

## 2. Feature Type Classification

We classify features into groups for appropriate preprocessing:
- **Numerical**: V-features, amounts, counts
- **Categorical**: Product codes, card info, device info
- **Binary**: Match features (M1-M9)
- **Temporal**: TransactionDT

In [None]:
# Feature groups used to route columns to the right preprocessing step.
# The grouping follows the IEEE-CIS feature descriptions.

# Identifier and target: never treated as features
EXCLUDE_COLS = ['TransactionID', 'isFraud']

# Known categorical columns, kept only when present in the training frame
CATEGORICAL_COLS = [
    name
    for name in (
        ['ProductCD']
        + [f'card{i}' for i in range(1, 7)]
        + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
        + [f'M{i}' for i in range(1, 10)]
        + ['DeviceType', 'DeviceInfo']
        + [f'id_{i:02d}' for i in range(12, 39)]
    )
    if name in train_df.columns
]

# Every remaining column that is neither categorical nor excluded is numerical
NUMERICAL_COLS = [
    name for name in train_df.columns
    if name not in CATEGORICAL_COLS and name not in EXCLUDE_COLS
]

print(f"Categorical features: {len(CATEGORICAL_COLS)}")
print(f"Numerical features: {len(NUMERICAL_COLS)}")

## 3. Missing Value Analysis and Handling

### Strategy:
1. **Drop columns with >90% missing**: These provide little predictive value
2. **Numerical features**: Impute with median (robust to outliers in fraud data)
3. **Categorical features**: Fill with an explicit 'missing' category for string columns and the sentinel -999 for numeric-coded columns (mode imputation is not used)
4. **Create missing indicators**: The missingness pattern itself can be predictive

In [None]:
# Share of missing values per training column, in percent
missing_pct = train_df.isnull().sum() / len(train_df) * 100

# Columns that are almost entirely empty (>90% missing) are drop candidates
cols_to_drop = [name for name, pct in missing_pct.items() if pct > 90]
print(f"Columns with >90% missing (to be dropped): {len(cols_to_drop)}")

# Columns that are mostly, but not extremely, empty are indicator candidates
cols_high_missing = [name for name, pct in missing_pct.items() if 50 < pct <= 90]
print(f"Columns with 50-90% missing (create indicators): {len(cols_high_missing)}")

In [None]:
def _has_missing(frame, col):
    """Return True when `col` exists in `frame` and holds at least one null."""
    return col in frame.columns and bool(frame[col].isnull().any())


def _fill_both(train_df, test_df, col, fill_val):
    """Fill nulls of `col` in train and, when the column exists there, in test."""
    train_df[col] = train_df[col].fillna(fill_val)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill_val)


def handle_missing_values(train_df, test_df, categorical_cols, numerical_cols,
                          drop_threshold=0.90, create_indicators=True,
                          indicator_threshold=0.10, max_indicators=20):
    """
    Comprehensive missing value handling.

    Strategy:
    - Drop columns whose training missing rate exceeds drop_threshold
      (TransactionID and isFraud are never dropped)
    - Create 0/1 `<col>_missing` indicators for the columns with the highest
      training missing rates inside (indicator_threshold, drop_threshold]
    - Impute numerical columns with the training median
    - Impute categorical columns with 'missing' (non-numeric dtype) or -999
      (numeric dtype)

    All statistics are computed on train_df only. A column is imputed when it
    has nulls in train OR in test, so gaps that appear only in test are also
    filled. The caller's frames are not modified; new frames are returned.

    Args:
        train_df, test_df: DataFrames
        categorical_cols: list of categorical column names
        numerical_cols: list of numerical column names
        drop_threshold: missing rate threshold for dropping columns
        create_indicators: whether to create missing indicator features
        indicator_threshold: minimum (exclusive) missing rate for an indicator
        max_indicators: maximum number of indicator features to create

    Returns:
        Tuple of (train_df, test_df, imputers, categorical_cols, numerical_cols):
        the processed frames, a dict mapping each imputed column to
        {'strategy': ..., 'value': ...}, and the two column lists restricted
        to columns that survived the drop step.
    """
    imputers = {}
    protected = ('TransactionID', 'isFraud')

    # Calculate missing percentages from training data
    missing_pct = train_df.isnull().sum() / len(train_df)

    # Identify columns to drop
    cols_to_drop = [c for c in missing_pct[missing_pct > drop_threshold].index
                    if c not in protected]

    print(f"Dropping {len(cols_to_drop)} columns with >{drop_threshold*100}% missing")
    # drop() returns new frames, so the additions below never touch the inputs
    train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
    test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

    # Update column lists
    surviving = set(train_df.columns)
    categorical_cols = [c for c in categorical_cols if c in surviving]
    numerical_cols = [c for c in numerical_cols if c in surviving]

    # Missing indicators must be created before imputation erases the pattern.
    if create_indicators:
        in_band = (missing_pct > indicator_threshold) & (missing_pct <= drop_threshold)
        # Highest missing rate first; mergesort keeps column order on ties.
        ranked = missing_pct[in_band].sort_values(ascending=False, kind='mergesort')
        indicator_cols = [c for c in ranked.index
                          if c in surviving and c in test_df.columns][:max_indicators]

        for col in indicator_cols:
            train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
            test_df[f'{col}_missing'] = test_df[col].isnull().astype(np.int8)

        print(f"Created {len(indicator_cols)} missing indicator features")

    # Impute numerical columns with the training median
    # Median is preferred over mean for fraud data due to outliers
    imputed_numerical = 0
    for col in numerical_cols:
        if not (_has_missing(train_df, col) or _has_missing(test_df, col)):
            continue
        median_val = train_df[col].median()
        imputers[col] = {'strategy': 'median', 'value': median_val}
        _fill_both(train_df, test_df, col, median_val)
        imputed_numerical += 1

    # Impute categorical columns with a constant that marks "was missing"
    imputed_categorical = 0
    for col in categorical_cols:
        if not (_has_missing(train_df, col) or _has_missing(test_df, col)):
            continue
        # Numeric-coded categoricals get -999; everything else gets 'missing'
        fill_val = -999 if pd.api.types.is_numeric_dtype(train_df[col]) else 'missing'
        imputers[col] = {'strategy': 'constant', 'value': fill_val}
        _fill_both(train_df, test_df, col, fill_val)
        imputed_categorical += 1

    print(f"Imputed {imputed_numerical} numerical columns")
    print(f"Imputed {imputed_categorical} categorical columns")

    return train_df, test_df, imputers, categorical_cols, numerical_cols

# Run the missing-value step; the returned column lists replace the
# module-level ones so later cells work with the updated lists.
train_df, test_df, imputers, CATEGORICAL_COLS, NUMERICAL_COLS = handle_missing_values(
    train_df=train_df,
    test_df=test_df,
    categorical_cols=CATEGORICAL_COLS,
    numerical_cols=NUMERICAL_COLS,
)

## 4. Temporal Feature Engineering

TransactionDT is seconds from a reference datetime. We extract:
- Hour of day (fraud patterns vary by hour)
- Day of week (weekend vs weekday patterns)
- Day of month (beginning/end of month patterns; approximated as the day index modulo 30)
- Time since start (for trend analysis)

**Why this matters**: Fraudsters often operate at unusual hours to avoid detection.

In [None]:
def create_temporal_features(df):
    """
    Derive time-of-day and day-based features from TransactionDT.

    TransactionDT is treated as a count of seconds. The input frame is not
    modified; a copy carrying the new columns is returned.

    Args:
        df: DataFrame with TransactionDT column

    Returns:
        Copy of df with the columns hour, day, day_of_week, day_of_month,
        hour_sin, hour_cos, dow_sin, dow_cos, is_night, is_weekend and
        is_business_hours appended
    """
    out = df.copy()

    seconds = out['TransactionDT']
    hour = (seconds // 3600) % 24
    day = seconds // (24 * 3600)
    weekday_idx = day % 7

    # Raw calendar-like extractions (day_of_month is a 30-day approximation)
    out['hour'] = hour
    out['day'] = day
    out['day_of_week'] = weekday_idx
    out['day_of_month'] = day % 30

    # Sine/cosine pairs place the last and first slot of a cycle next to each
    # other (e.g. 23:00 and 00:00), which the raw integers cannot express.
    out['hour_sin'] = np.sin(2 * np.pi * hour / 24)
    out['hour_cos'] = np.cos(2 * np.pi * hour / 24)
    out['dow_sin'] = np.sin(2 * np.pi * weekday_idx / 7)
    out['dow_cos'] = np.cos(2 * np.pi * weekday_idx / 7)

    # 0/1 flags: night is 22:00-05:59, weekend is day_of_week 5 or 6,
    # business hours are 09:00-17:59 on day_of_week 0-4.
    out['is_night'] = (hour.ge(22) | hour.le(5)).astype(np.int8)
    out['is_weekend'] = weekday_idx.ge(5).astype(np.int8)
    out['is_business_hours'] = (hour.ge(9) & hour.le(17) & weekday_idx.lt(5)).astype(np.int8)

    created = ['hour', 'day', 'day_of_week', 'day_of_month',
               'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
               'is_night', 'is_weekend', 'is_business_hours']
    print("Created temporal features: " + ", ".join(created))

    return out

# Same temporal derivation for both splits (train first, then test).
train_df, test_df = map(create_temporal_features, (train_df, test_df))

## 5. Categorical Encoding

### Encoding Strategy:
1. **Label Encoding**: For tree-based models (LightGBM, XGBoost, RF)
2. **Frequency Encoding**: Captures commonality of category
3. **Target Encoding**: For high-cardinality features (with regularization to prevent leakage) — described for completeness; not implemented in this notebook

**Note**: We use label encoding primarily since our models are tree-based.
Target encoding, if added later, must be applied carefully (e.g. fitted out-of-fold) to avoid target leakage.

In [None]:
def label_encode_features(train_df, test_df, categorical_cols):
    """
    Label encode categorical columns for tree-based models.

    One LabelEncoder per column is fitted on the string form of the train and
    test values together, so every category present in either split receives
    a code. Both frames are modified in place. Columns absent from train_df
    are skipped.

    Args:
        train_df, test_df: DataFrames
        categorical_cols: list of columns to encode

    Returns:
        The two encoded DataFrames and a dict mapping column name to its
        fitted LabelEncoder, for reuse at inference time
    """
    encoders = {}

    for col in (name for name in categorical_cols if name in train_df.columns):
        # Cast once and reuse for both fitting and transforming
        train_as_str = train_df[col].astype(str)
        test_as_str = test_df[col].astype(str)

        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_as_str, test_as_str], axis=0))

        train_df[col] = encoder.transform(train_as_str)
        test_df[col] = encoder.transform(test_as_str)
        encoders[col] = encoder

    print(f"Label encoded {len(encoders)} categorical columns")
    return train_df, test_df, encoders

# Encode every categorical column; the fitted encoders are kept for inference.
train_df, test_df, label_encoders = label_encode_features(
    train_df=train_df,
    test_df=test_df,
    categorical_cols=CATEGORICAL_COLS,
)

In [None]:
def create_frequency_encoding(train_df, test_df, categorical_cols):
    """
    Add `<col>_freq` features holding each category's share of training rows.

    Only columns present in train_df with more than 10 distinct training
    values qualify, and at most the first 10 qualifying columns (in the order
    given) are encoded. Shares are computed on train_df only; categories that
    have no training share get 0. Both frames are modified in place.

    Args:
        train_df, test_df: DataFrames
        categorical_cols: columns to encode

    Returns:
        The two DataFrames with the frequency features added, and a dict
        mapping each encoded column to its {category: share} lookup
    """
    freq_maps = {}

    eligible = [
        name for name in categorical_cols
        if name in train_df.columns and train_df[name].nunique() > 10
    ]

    # Cap the number of encoded columns to keep the feature count bounded
    for name in eligible[:10]:
        shares = train_df[name].value_counts(normalize=True).to_dict()
        freq_maps[name] = shares

        feature = f'{name}_freq'
        for frame in (train_df, test_df):
            frame[feature] = frame[name].map(shares).fillna(0).astype(np.float32)

    print(f"Created {len(freq_maps)} frequency encoded features")
    return train_df, test_df, freq_maps

# Add frequency features; the per-column lookups are kept for inference.
train_df, test_df, freq_encoders = create_frequency_encoding(
    train_df=train_df,
    test_df=test_df,
    categorical_cols=CATEGORICAL_COLS,
)

## 6. Interaction Features

Creating interaction features to capture fraud patterns:
- Card + Address combinations (legitimate users have consistent patterns)
- Email domain patterns (free email vs corporate)
- Device + Card combinations

**Why this matters**: Fraudsters often show inconsistent combinations.

In [None]:
def _encode_combination(combo, name, encoders):
    """Turn combined-category strings into integer codes, reusing a stored mapping if one exists."""
    mapping = None if encoders is None else encoders.get(name)
    if mapping is None:
        # Codes follow the sorted order of the distinct strings
        mapping = {label: code for code, label in enumerate(sorted(combo.unique()))}
        if encoders is not None:
            encoders[name] = mapping
    # Combinations absent from the mapping become -1
    return combo.map(mapping).fillna(-1).astype(np.int64)


def create_interaction_features(df, encoders=None, label_encoders=None):
    """
    Create interaction features capturing suspicious patterns.

    Features (each created only when its source columns exist):
    - card1_addr1, card4_card6, device_type_info: integer codes for the
      combination of two columns
    - TransactionAmt_log, TransactionAmt_decimal, is_round_amount
    - email_match: 1 when purchaser and recipient email domain are equal

    The input frame is not modified; a copy is returned.

    Args:
        df: DataFrame
        encoders: optional dict shared between calls. For each combination
            feature, a mapping already stored under the feature name is
            reused (unknown combinations get -1); otherwise the mapping is
            fitted on df and stored in the dict. Pass the same dict for the
            train call and then the test call to get consistent codes.
            With None (default) every call fits its own codes, so codes are
            NOT comparable between separately processed frames.
        label_encoders: optional dict of fitted encoders keyed by column
            name, each exposing inverse_transform. When it contains both
            'P_emaildomain' and 'R_emaildomain', the columns are assumed to
            hold codes from those encoders and are decoded before comparing,
            because codes from two independently fitted encoders are not
            comparable. Rows where both columns decode to the same
            placeholder value also count as a match.

    Returns:
        DataFrame with interaction features
    """
    df = df.copy()
    created = []

    def add_combination(left, right, name):
        # Only build the feature when both source columns are available
        if left in df.columns and right in df.columns:
            combo = df[left].astype(str) + '_' + df[right].astype(str)
            df[name] = _encode_combination(combo, name, encoders)
            created.append(name)

    # Card + Address interaction
    # Legitimate users typically have consistent card-address combinations
    add_combination('card1', 'addr1', 'card1_addr1')

    # Card type interactions
    add_combination('card4', 'card6', 'card4_card6')

    # Transaction amount features
    if 'TransactionAmt' in df.columns:
        # Log transform to handle skewness
        df['TransactionAmt_log'] = np.log1p(df['TransactionAmt'])

        # Decimal part - unusual amounts might indicate fraud
        df['TransactionAmt_decimal'] = (df['TransactionAmt'] -
                                        np.floor(df['TransactionAmt'])).astype(np.float32)

        # Is round amount (whole dollar)
        df['is_round_amount'] = (df['TransactionAmt_decimal'] < 0.01).astype(np.int8)
        created += ['TransactionAmt_log', 'TransactionAmt_decimal', 'is_round_amount']

    # Email domain features
    # Note: P_emaildomain is purchaser, R_emaildomain is recipient
    # Mismatch might indicate fraud
    if 'P_emaildomain' in df.columns and 'R_emaildomain' in df.columns:
        purchaser = df['P_emaildomain']
        recipient = df['R_emaildomain']
        fitted = label_encoders or {}
        p_encoder = fitted.get('P_emaildomain')
        r_encoder = fitted.get('R_emaildomain')
        if p_encoder is not None and r_encoder is not None:
            # Compare the original domain values, not per-column integer codes
            purchaser = pd.Series(p_encoder.inverse_transform(purchaser.to_numpy()),
                                  index=df.index)
            recipient = pd.Series(r_encoder.inverse_transform(recipient.to_numpy()),
                                  index=df.index)
        df['email_match'] = (purchaser == recipient).astype(np.int8)
        created.append('email_match')

    # Browser/Device features from identity columns
    add_combination('DeviceType', 'DeviceInfo', 'device_type_info')

    # Report only what was actually created
    print("Created interaction features: " + ", ".join(created))

    return df

# One shared dict: the train call fits the combination mappings, the test call
# reuses them, so identical combinations get identical codes in both splits.
# Persist interaction_encoders with the other artifacts if these features are
# needed at inference time.
interaction_encoders = {}
train_df = create_interaction_features(
    train_df, encoders=interaction_encoders, label_encoders=label_encoders
)
test_df = create_interaction_features(
    test_df, encoders=interaction_encoders, label_encoders=label_encoders
)

## 7. Aggregation Features

Creating aggregation features to capture patterns:
- Transaction counts per card
- Average transaction amount per card
- Time since last transaction (planned; not computed in this notebook)

**Why this matters**: Fraudsters often make multiple rapid transactions before being caught.

In [None]:
def create_aggregation_features(train_df, test_df, group_cols=('card1', 'card2', 'addr1')):
    """
    Create aggregation features based on grouping columns.

    For every grouping column present in train_df, the count, mean, std, min
    and max of TransactionAmt per group are computed on train_df only and
    attached to both frames as `<col>_TransactionAmt_<stat>`. Rows without a
    value (groups unseen in train, or std of a single-row group) receive the
    median of that statistic over the training rows. A ratio feature
    `<col>_amt_ratio` = TransactionAmt / (group mean + 1) is added as well.

    The caller's frames are not modified; merged copies are returned.

    Args:
        train_df, test_df: DataFrames containing TransactionAmt
        group_cols: columns to group by for aggregations

    Returns:
        Tuple of (train_df, test_df, agg_features) where agg_features maps
        each grouping column to {group value: {feature name: statistic}}
        as computed on train_df (before median filling)
    """
    stats = ['count', 'mean', 'std', 'min', 'max']
    agg_features = {}

    for col in group_cols:
        if col not in train_df.columns:
            continue

        # Calculate aggregations on training data
        stat_cols = [f'{col}_TransactionAmt_{stat}' for stat in stats]
        agg = train_df.groupby(col)['TransactionAmt'].agg(stats)
        agg.columns = stat_cols

        # Store for later use (keyed by group value)
        agg_features[col] = agg.to_dict('index')

        # Merge aggregations; how='left' keeps every original row
        agg = agg.reset_index()
        train_df = train_df.merge(agg, on=col, how='left')
        test_df = test_df.merge(agg, on=col, how='left')

        # Fill missing (new cards in test) with global statistics
        for stat_col in stat_cols:
            global_val = train_df[stat_col].median()
            train_df[stat_col] = train_df[stat_col].fillna(global_val).astype(np.float32)
            test_df[stat_col] = test_df[stat_col].fillna(global_val).astype(np.float32)

        # Create ratio features: how does this transaction compare to typical?
        # The +1 keeps the denominator away from zero.
        mean_col = f'{col}_TransactionAmt_mean'
        train_df[f'{col}_amt_ratio'] = (train_df['TransactionAmt'] /
                                        (train_df[mean_col] + 1)).astype(np.float32)
        test_df[f'{col}_amt_ratio'] = (test_df['TransactionAmt'] /
                                       (test_df[mean_col] + 1)).astype(np.float32)

    print(f"Created aggregation features for: {[c for c in group_cols if c in train_df.columns]}")
    return train_df, test_df, agg_features

# Per-group TransactionAmt statistics for the card and address keys.
train_df, test_df, agg_features = create_aggregation_features(
    train_df=train_df,
    test_df=test_df,
    group_cols=['card1', 'card2', 'addr1'],
)

## 8. Feature Selection

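Remove features that:
- Have zero or near-zero variance (no predictive value) — this is the only filter implemented below
- Are highly correlated with each other (redundant) — not implemented in this notebook
- Have too many unique values (potential overfitting) — not implemented in this notebook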

In [None]:
def remove_constant_features(train_df, test_df, threshold=0.01):
    """
    Drop numeric features whose training variance is below threshold.

    Variance is measured on train_df only. TransactionID and isFraud are never
    candidates. The same columns are removed from test_df where present. When
    nothing qualifies, the input frames are returned unchanged.

    Args:
        train_df, test_df: DataFrames
        threshold: minimum variance required

    Returns:
        DataFrames with constant features removed
    """
    protected = ('TransactionID', 'isFraud')

    # Numeric columns that are eligible for removal
    candidates = [
        name for name in train_df.select_dtypes(include=[np.number]).columns
        if name not in protected
    ]

    # A NaN variance never compares below the threshold, so such columns stay
    variances = train_df[candidates].var()
    low_var_cols = [name for name, value in variances.items() if value < threshold]

    if not low_var_cols:
        return train_df, test_df

    train_df = train_df.drop(columns=low_var_cols)
    test_df = test_df.drop(columns=low_var_cols, errors='ignore')
    print(f"Removed {len(low_var_cols)} low variance features")

    return train_df, test_df

# Variance filter with the function's default threshold.
train_df, test_df = remove_constant_features(train_df=train_df, test_df=test_df)

In [None]:
# Model inputs: every training column except the identifier, the target and
# the raw timestamp (column order is preserved).
exclude_cols = ['TransactionID', 'isFraud', 'TransactionDT']
feature_cols = train_df.columns[~train_df.columns.isin(exclude_cols)].tolist()

print(f"\nFinal number of features: {len(feature_cols)}")
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")

## 9. Save Processed Data

Save the processed datasets and feature engineering artifacts for:
1. Model training
2. Inference pipeline (encoders, imputers)

In [None]:
# Persist both processed splits
print("Saving processed datasets...")

# Parquet is used instead of CSV: it is more efficient for large datasets
train_parquet_path = PROCESSED_PATH / 'train_processed.parquet'
test_parquet_path = PROCESSED_PATH / 'test_processed.parquet'

train_df.to_parquet(train_parquet_path, index=False)
test_df.to_parquet(test_parquet_path, index=False)

print(f"Saved training data: {train_parquet_path}")
print(f"Saved test data: {test_parquet_path}")

In [None]:
# Bundle everything an inference pipeline needs to replay these steps
feature_artifacts = dict(
    imputers=imputers,
    label_encoders=label_encoders,
    freq_encoders=freq_encoders,
    agg_features=agg_features,
    feature_cols=feature_cols,
    categorical_cols=CATEGORICAL_COLS,
    numerical_cols=NUMERICAL_COLS,
)

# Written as a single pickle file under the features directory
artifacts_path = FEATURES_PATH / 'feature_artifacts.pkl'
with artifacts_path.open('wb') as f:
    pickle.dump(feature_artifacts, f)

print(f"Saved feature artifacts: {artifacts_path}")

In [None]:
# Save feature list for documentation: one row per feature with its dtype,
# number of distinct training values and training missing percentage.
feature_info = pd.DataFrame({
    'feature': feature_cols,
    'dtype': [str(train_df[c].dtype) for c in feature_cols],
    'nunique': [train_df[c].nunique() for c in feature_cols],
    'missing_pct': [train_df[c].isnull().sum() / len(train_df) * 100 for c in feature_cols]
})

# The metrics folder is not created anywhere else in this notebook and
# to_csv does not create parent directories, so make sure it exists first.
metrics_dir = OUTPUT_PATH / 'metrics'
metrics_dir.mkdir(parents=True, exist_ok=True)

feature_info_path = metrics_dir / 'feature_info.csv'
feature_info.to_csv(feature_info_path, index=False)
print(f"Saved feature info: {feature_info_path}")

In [None]:
# Summary report, assembled as lines and printed in one go
banner = "=" * 60
summary_lines = [
    "\n" + banner,
    "FEATURE ENGINEERING SUMMARY",
    banner,
    "\nOriginal features: ~434",
    f"Final features: {len(feature_cols)}",
    "\nFeature types created:",
    "  - Temporal: hour, day, day_of_week, cyclical encodings, time flags",
    "  - Interactions: card-address, card-type, amount features",
    "  - Aggregations: transaction stats per card/address",
    "  - Frequency encodings: category frequencies",
    "  - Missing indicators: capturing missing patterns",
    f"\nData saved to: {PROCESSED_PATH}",
    f"Artifacts saved to: {FEATURES_PATH}",
    "\nNext steps: Proceed to 03_modeling.ipynb",
]
print("\n".join(summary_lines))