## Feature engineering

In [2]:
import sys
import os

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join("..")))  

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans 
from sklearn.pipeline import Pipeline
from src.data_processing import (
    CustomerAggregator, FeatureEngineer, SimpleImputerTransformer,
    QuantileBinner, WoEEncoder, create_proxy_target, create_feature_pipeline
)

# -------------------------
# Load raw data
# -------------------------
df = pd.read_csv("../data/raw/data.csv")

# -------------------------
# Task 4: Create proxy target
# -------------------------
# create_proxy_target returns two objects; only proxy_target is used below.
# It is joined on CustomerId with a left merge, so every row of df is kept.
proxy_target, cluster_summary = create_proxy_target(df)
df = df.merge(proxy_target, on="CustomerId", how="left")
y = df['is_high_risk']
# NOTE(review): y has one entry per row of the merged frame. If the pipeline
# aggregates rows per customer, X and y will differ in length and must be
# re-aligned before modelling -- confirm against src.data_processing.

# -------------------------
# Task 3: Feature Engineering Pipeline
# -------------------------
# The pipeline comes from create_feature_pipeline; its 'preprocess' step is
# read back afterwards to recover the names of the output columns.
feature_pipeline = create_feature_pipeline(df)
X_transformed = feature_pipeline.fit_transform(df, y=y)
preprocessor = feature_pipeline.named_steps['preprocess']

# Rebuild column names in the order the preprocessor lists its transformers.
feature_names = []

for name, transformer, cols in preprocessor.transformers_:
    if name == 'num':
        # Numeric columns keep their original names.
        feature_names.extend(cols)

    elif name == 'cat' and len(cols) > 0:
        # Skip an empty categorical selection: a transformer with no columns
        # is never fitted, so asking its encoder for names would fail.
        ohe = transformer.named_steps['encoder']
        feature_names.extend(ohe.get_feature_names_out(cols))

X = pd.DataFrame(
    X_transformed,
    columns=feature_names
)

# Check first few rows
X.head()

numeric_cols: ['Total_Transaction_Amount', 'Average_Transaction_Amount', 'Transaction_Count', 'Std_Transaction_Amount', 'Transaction_Recency', 'Avg_Amount_By_Category', 'Count_By_FraudResult', 'Night_Transactions', 'Amount_CV', 'Dormant_Flag', 'Night_Txn_Ratio', 'Log_Total_Amount']
categorical_cols: []


Unnamed: 0,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Recency,Avg_Amount_By_Category,Count_By_FraudResult,Night_Transactions,Amount_CV,Dormant_Flag,Night_Txn_Ratio,Log_Total_Amount
0,10000.0,10000.0,1.0,0.0,83.0,10000.0,0.0,0.0,0.0,0.0,0.0,9.21044
1,10000.0,10000.0,1.0,0.0,83.0,10000.0,0.0,0.0,0.0,0.0,0.0,9.21044
2,30400.0,6080.0,5.0,4100.243895,89.0,6080.0,0.0,0.0,0.674271,0.0,0.0,10.322231
3,4775.0,434.090909,11.0,518.805446,25.0,434.090909,0.0,2.0,1.192407,0.0,0.181818,8.471359
4,32000.0,5333.333333,6.0,3945.461528,11.0,5333.333333,0.0,0.0,0.739635,0.0,0.0,10.373522


In [3]:
# Show the last five rows of the engineered feature matrix (tail defaults to n=5).
X.tail()

Unnamed: 0,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Transaction_Amount,Transaction_Recency,Avg_Amount_By_Category,Count_By_FraudResult,Night_Transactions,Amount_CV,Dormant_Flag,Night_Txn_Ratio,Log_Total_Amount
3737,32000.0,5333.333333,6.0,4033.19559,4.0,5333.333333,0.0,0.0,0.756082,0.0,0.0,10.373522
3738,32000.0,6400.0,5.0,3781.53408,25.0,6400.0,0.0,0.0,0.590772,0.0,0.0,10.373522
3739,614077.0,6079.970297,101.0,14537.733039,0.0,6079.970297,0.0,3.0,2.390693,0.0,0.029703,13.327877
3740,151000.0,8882.352941,17.0,2619.216317,67.0,8882.352941,0.0,0.0,0.294845,0.0,0.0,11.925042
3741,163000.0,7409.090909,22.0,3168.431953,0.0,7409.090909,0.0,1.0,0.427583,0.0,0.045455,12.001512


## Features in the credit data


In [4]:
# List every engineered feature name, one per line.
for col in X.columns.tolist():
    print(col)


Total_Transaction_Amount
Average_Transaction_Amount
Transaction_Count
Std_Transaction_Amount
Transaction_Recency
Avg_Amount_By_Category
Count_By_FraudResult
Night_Transactions
Amount_CV
Dormant_Flag
Night_Txn_Ratio
Log_Total_Amount


In [5]:
# Show the first five values of the is_high_risk target.
y.head(5)

0    0
1    0
2    1
3    0
4    0
Name: is_high_risk, dtype: int64