# Feature Engineering

Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import sys

# Add the parent of the current working directory to the import path so the
# `src` package imported in the next cell can be resolved.
# NOTE(review): assumes the kernel's working directory sits one level below the
# project root — confirm before running from another location.
sys.path.append(os.path.abspath('..'))



In [2]:
from src.data_preprocessing import (
    build_feature_engineering_pipeline,
    save_pipeline
)

In [3]:

# Locate the raw transactions file without a user-specific absolute path so the
# notebook survives Restart & Run All on any machine.
# Default: <parent of the working directory>/data/raw/data.csv, i.e. the same
# parent directory that was put on sys.path for the `src` imports.
# NOTE(review): assumes that parent directory is the project root — confirm.
# Set the CREDIT_RISK_RAW_DATA environment variable to point somewhere else.
raw_path = os.environ.get(
    "CREDIT_RISK_RAW_DATA",
    os.path.join(os.path.abspath('..'), "data", "raw", "data.csv"),
)

# pd.read_csv raises FileNotFoundError if the path does not exist.
df = pd.read_csv(raw_path)
print("‚úÖ Raw data loaded:", df.shape)

‚úÖ Raw data loaded: (95662, 16)


In [4]:
# Summarise the raw frame: a banner, a 50-character rule, then the frame's
# dimensions and its column names, emitted as four consecutive lines.
print(
    "\n".join(
        [
            "üìä Data Overview",
            "=" * 50,
            f"Shape: {df.shape}",
            f"Columns: {df.columns.tolist()}",
        ]
    )
)

üìä Data Overview
Shape: (95662, 16)
Columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']


In [5]:
# -------------------------------------------------------------------
# COLUMN ROLES
# -------------------------------------------------------------------
# Names of the columns that play a special role when the pipeline is built.
# All three appear in the raw frame's column listing printed above.
DATE_COL = 'TransactionStartTime'   # transaction timestamp column
AMOUNT_COL = 'Amount'               # transaction amount column
CUSTOMER_ID_COL = 'CustomerId'      # customer identifier column


In [6]:
# Split the raw frame's columns by dtype: columns with a numpy numeric dtype go
# into `numeric_cols`; every other column goes into `categorical_cols`.
_numeric_frame = df.select_dtypes(include=[np.number])
_other_frame = df.select_dtypes(exclude=[np.number])
numeric_cols = _numeric_frame.columns.tolist()
categorical_cols = _other_frame.columns.tolist()


In [7]:
# Columns that must not be used as model features: the timestamp column and the
# identifier-like columns. This cell only defines the lists; the filtering
# itself happens when the feature lists are built.
# CountryCode is listed with the identifiers, which keeps it out of the numeric features.
date_cols = [DATE_COL]
id_cols = [
    CUSTOMER_ID_COL,
    'TransactionId',
    'AccountId',
    'SubscriptionId',
    'BatchId',
    'CountryCode',
]

In [8]:
# Build the final feature lists by dropping identifier and date columns.
# The amount column is also kept out of the numeric list because it is handed
# to the pipeline through its own argument.
_non_feature_cols = set(id_cols + date_cols)

numeric_features = [
    name
    for name in numeric_cols
    if name != AMOUNT_COL and name not in _non_feature_cols
]
categorical_features = [
    name for name in categorical_cols if name not in _non_feature_cols
]

# Report the resulting configuration.
print(f"\nüîß Column Configuration")
print(f"Customer ID column: {CUSTOMER_ID_COL}")
print(f"Amount column: {AMOUNT_COL}")
print(f"Date column: {DATE_COL}")
print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


üîß Column Configuration
Customer ID column: CustomerId
Amount column: Amount
Date column: TransactionStartTime
Numeric features: 3
Categorical features: 5


In [9]:
# -------------------------------------------------------------------
# BUILD AND RUN PIPELINE
# -------------------------------------------------------------------
print("\nüöÄ Building Feature Engineering Pipeline")
print("=" * 50)

# Collect the builder arguments in one place so the configuration is easy to
# inspect. The option values are forwarded unchanged; their exact effect is
# defined by `build_feature_engineering_pipeline` in src.data_preprocessing.
pipeline_config = dict(
    customer_id_col=CUSTOMER_ID_COL,
    amount_col=AMOUNT_COL,
    date_col=DATE_COL,
    categorical_cols=categorical_features,
    numeric_cols=numeric_features,
    use_woe=False,
    woe_target_col=None,
    remove_outliers=True,
    scaling_method='standard',
)
pipeline = build_feature_engineering_pipeline(**pipeline_config)

# Fit the pipeline on the raw frame and transform it in a single pass.
print("Fitting and transforming data...")
X_features = pipeline.fit_transform(df)

# Compare the shapes before and after, and show what type came back.
print(f"\n‚úÖ Feature Engineering Complete!")
print(f"   Original shape: {df.shape}")
print(f"   Engineered features shape: {X_features.shape}")
print(f"   Output type: {type(X_features)}")



üöÄ Building Feature Engineering Pipeline
Fitting and transforming data...

‚úÖ Feature Engineering Complete!
   Original shape: (95662, 16)
   Engineered features shape: (3742, 66)
   Output type: <class 'pandas.core.frame.DataFrame'>


In [10]:
# -------------------------------------------------------------------
# CHECK THE OUTPUT
# -------------------------------------------------------------------
print("\nüîç Checking Output")
print("=" * 50)

# Handle the unexpected case first: anything other than a DataFrame only gets
# its type and shape reported.
if not isinstance(X_features, pd.DataFrame):
    print("‚ö†Ô∏è Output is not a DataFrame")
    print(f"Type: {type(X_features)}")
    print(f"Shape: {X_features.shape}")
else:
    print("‚úÖ Output is a DataFrame")
    print(f"\nüìã First 5 rows:")
    print(X_features.head())

    # List the first ten column names, numbered from 1.
    print(f"\nüìä Column names (first 10):")
    for position, column_name in enumerate(X_features.columns[:10], start=1):
        print(f"{position:2d}. {column_name}")


üîç Checking Output
‚úÖ Output is a DataFrame

üìã First 5 rows:
        CustomerId  num__Amount_sum  num__Amount_mean  num__Amount_count  \
0     CustomerId_1        -0.922147         -2.380334          -0.839070   
1    CustomerId_10        -0.922147         -2.380334          -0.839070   
2  CustomerId_1001        -0.474393          0.159262          -0.582472   
3  CustomerId_1002        -0.709837         -0.882311          -0.197574   
4  CustomerId_1003        -0.474393         -0.032773          -0.518322   

   num__Amount_std  num__Amount_min  num__Amount_max  num__Amount_median  \
0        -0.958301        -1.918248        -1.616945           -2.318447   
1        -0.958301        -1.918248        -1.616945           -2.318447   
2         0.421416        -0.775772        -0.204929            0.539541   
3        -0.840396         0.349567        -0.805036           -0.839950   
4         0.310246        -0.775772        -0.204929           -0.041297   

   num__Value_mean

In [11]:
# -------------------------------------------------------------------
# SAVE RESULTS
# -------------------------------------------------------------------
print("\nüíæ Saving Results")
print("=" * 50)

# Write to <parent of the working directory>/data/processed by default rather
# than a user-specific absolute path, so the notebook runs on any machine.
# NOTE(review): assumes that parent directory is the project root — confirm.
# Set the CREDIT_RISK_PROCESSED_DIR environment variable to write elsewhere.
output_dir = os.environ.get(
    "CREDIT_RISK_PROCESSED_DIR",
    os.path.join(os.path.abspath('..'), "data", "processed"),
)
os.makedirs(output_dir, exist_ok=True)

# Save engineered features. os.path.join is used so the result does not depend
# on output_dir ending with a path separator.
features_path = os.path.join(output_dir, "preprocessed.csv")

# Wrap the output in a DataFrame if the pipeline returned something else.
if isinstance(X_features, pd.DataFrame):
    X_features.to_csv(features_path, index=False)
else:
    pd.DataFrame(X_features).to_csv(features_path, index=False)

print(f"‚úÖ Engineered features saved to: {features_path}")

# Persist the fitted pipeline next to the features via the project helper.
pipeline_path = os.path.join(output_dir, "feature_pipeline.pkl")
save_pipeline(pipeline, pipeline_path)

# Verify the saved file by reading it back and showing its shape, the first
# three column names, and the first row's value for each of them.
print(f"\nüìÅ Verifying saved file:")
saved_df = pd.read_csv(features_path)
print(f"   Loaded shape: {saved_df.shape}")
print(f"   First 3 columns: {saved_df.columns.tolist()[:3]}")
print(f"   Sample values from first row:")
for col in saved_df.columns[:3]:
    print(f"     {col}: {saved_df[col].iloc[0]}")

print("\nüéâ Feature Engineering Pipeline Complete!")
print("=" * 50)


üíæ Saving Results
‚úÖ Engineered features saved to: /Users/elbethelzewdie/Downloads/credit_risk_model/credit-risk-model/data/processed/preprocessed.csv
‚úÖ Pipeline saved to: /Users/elbethelzewdie/Downloads/credit_risk_model/credit-risk-model/data/processed/feature_pipeline.pkl

üìÅ Verifying saved file:
   Loaded shape: (3742, 66)
   First 3 columns: ['CustomerId', 'num__Amount_sum', 'num__Amount_mean']
   Sample values from first row:
     CustomerId: CustomerId_1
     num__Amount_sum: -0.9221473875629126
     num__Amount_mean: -2.3803340748175787

üéâ Feature Engineering Pipeline Complete!


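- After the feature engineering and preprocessing pipeline, the dataset has 66 columns: the `CustomerId` key plus 65 engineered feature columns.
- The raw dataset had 16 columns and 95,662 rows; the output has 3,742 rows (apparently one per customer, judging by the `CustomerId` key). Along the way the pipeline created many new features (aggregations, date features, etc.), removed outliers, and applied One-Hot Encoding to the categorical columns.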

✅ Overall interpretation

- The feature engineering pipeline worked as intended: it expanded the dataset from 16 raw columns to 66 columns (numeric features plus the `CustomerId` key), removed outliers, and created meaningful features for modeling.

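- Features such as transaction counts, sums, and coefficients of variation are likely strong predictors of credit risk, although this still needs to be confirmed during modeling; after standard scaling, variance alone no longer distinguishes one feature from another.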

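- Apart from the `CustomerId` identifier (a string key that should be dropped or used as an index before training), no categorical columns remain, which suggests the remaining columns are already numeric and ready for ML models like logistic regression, XGBoost, or neural networks.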