# Feature Engineering

Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import joblib

sys.path.append(os.path.abspath('..'))


In [2]:
from src.feature_engineering import build_feature_engineering_pipeline
from src.feature_engineering import AggregateFeatures


In [3]:
raw_path = "/Users/elbethelzewdie/Downloads/credit-risk-analysis/Credit-risk-analysis/data/data.csv"    # <-- change if needed

df = pd.read_csv(raw_path)
print("‚úÖ Raw data loaded:", df.shape)

‚úÖ Raw data loaded: (95662, 16)


In [4]:
print("üìä Data Overview")
print("=" * 50)
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

üìä Data Overview
Shape: (95662, 16)
Columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']


In [5]:
# -------------------------------------------------------------------
# DEFINE YOUR COLUMNS (ADJUST THESE BASED ON YOUR DATA)
# -------------------------------------------------------------------
# Example - adjust based on your actual column names
CUSTOMER_ID_COL = 'CustomerId'
AMOUNT_COL = 'Amount'  # Your transaction amount column
DATE_COL = 'TransactionStartTime'


In [6]:
# Identify numeric and categorical columns from your data
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()


In [7]:
# Remove ID and date columns from feature lists
id_cols = [CUSTOMER_ID_COL, 'TransactionId', 'AccountId', 'SubscriptionId', 'BatchId', 'CountryCode']
date_cols = [DATE_COL]

In [8]:
# Clean feature lists
numeric_features = [
    col for col in numeric_cols 
    if col not in id_cols and col != AMOUNT_COL
]

categorical_features = [
    col for col in categorical_cols 
    if col not in id_cols
]


print(f"\nüîß Column Configuration")
print(f"Customer ID column: {CUSTOMER_ID_COL}")
print(f"Amount column: {AMOUNT_COL}")
print(f"Date column: {DATE_COL}")
print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")


üîß Column Configuration
Customer ID column: CustomerId
Amount column: Amount
Date column: TransactionStartTime
Numeric features: 3
Categorical features: 6


In [10]:
y = df["FraudResult"]
X = df.drop(columns=["FraudResult"])


agg = AggregateFeatures(
    customer_id_col=CUSTOMER_ID_COL,
    amount_col=AMOUNT_COL
)

X_agg = agg.fit_transform(df)

y_agg = (
    df.groupby(CUSTOMER_ID_COL)["FraudResult"]
    .max()
    .reset_index(drop=True)
)


In [13]:
pipeline = build_feature_engineering_pipeline(
    customer_id_col=CUSTOMER_ID_COL,
    amount_col=AMOUNT_COL,
    date_col=DATE_COL,
    numeric_cols=numeric_features,   # Only numeric columns
    scaling_method="standard"
)

X_features = pipeline.fit_transform(X, y)

print(f"\n‚úÖ Feature Engineering Complete!")
print(f"   Original shape: {df.shape}")
print(f"   Aggregated features shape: {X_features.shape}")
print(f"   Output type: {type(X_features)}")


  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)
  bins_X_grouped = bins_X.groupby('Bins', as_index=True)


AttributeError: module 'pandas.core.algorithms' has no attribute 'quantile'

In [None]:
# -------------------------------------------------------------------
# BUILD AND RUN PIPELINE
# -------------------------------------------------------------------

print("\nüöÄ Building Feature Engineering Pipeline")
print("=" * 50)

pipeline = build_feature_engineering_pipeline(
    customer_id_col=CUSTOMER_ID_COL,
    amount_col=AMOUNT_COL,
    date_col=DATE_COL,
    numeric_cols=numeric_features,   # Only this is supported
    scaling_method="standard"
)

print("Fitting and transforming data...")



X_features = pipeline.fit_transform(X, y)


print(f"\n‚úÖ Feature Engineering Complete!")
print(f"   Original shape: {df.shape}")
print(f"   Engineered features shape: {X_features.shape}")
print(f"   Output type: {type(X_features)}")


In [None]:
# -------------------------------------------------------------------
# CHECK THE OUTPUT
# -------------------------------------------------------------------
print("\nüîç Checking Output")
print("=" * 50)

if isinstance(X_features, pd.DataFrame):
    print("‚úÖ Output is a DataFrame")
    print(f"\nüìã First 5 rows:")
    print(X_features.head())
    
    print(f"\nüìä Column names (first 10):")
    for i, col in enumerate(X_features.columns[:10]):
        print(f"{i+1:2d}. {col}")
else:
    print("‚ö†Ô∏è Output is not a DataFrame")
    print(f"Type: {type(X_features)}")
    print(f"Shape: {X_features.shape}")

In [None]:
# -------------------------------------------------------------------
# SAVE RESULTS
# -------------------------------------------------------------------

print("\nüíæ Saving Results")
print("=" * 50)

# Use project-relative path (recommended)
output_dir = "/Users/elbethelzewdie/Downloads/credit-risk-analysis/Credit-risk-analysis/data/processed"
os.makedirs(output_dir, exist_ok=True)

# -------------------------------------------------
# Save Engineered Features
# -------------------------------------------------

features_path = os.path.join(output_dir, "preprocessed.csv")

if isinstance(X_features, pd.DataFrame):
    X_features.to_csv(features_path, index=False)
else:
    pd.DataFrame(X_features).to_csv(features_path, index=False)

print(f"‚úÖ Engineered features saved to: {features_path}")

# -------------------------------------------------
# Save Pipeline
# -------------------------------------------------

pipeline_path = os.path.join(output_dir, "feature_pipeline.pkl")

joblib.dump(pipeline, pipeline_path)

print(f"‚úÖ Pipeline saved to: {pipeline_path}")

# -------------------------------------------------
# Verify Saved File
# -------------------------------------------------

print("\nüìÅ Verifying saved file:")
saved_df = pd.read_csv(features_path)

print(f"   Loaded shape: {saved_df.shape}")
print(f"   First 3 columns: {saved_df.columns.tolist()[:3]}")
print("   Sample values from first row:")

for col in saved_df.columns[:3]:
    print(f"     {col}: {saved_df[col].iloc[0]}")

print("\nüéâ Feature Engineering Pipeline Complete!")
print("=" * 50)


- After feature engineering and preprocessing pipeline, the dataset now has 55 features.
- Originally, the raw dataset had 16 columns, so the pipeline has created many new features (aggregations, date features, outlier removal, etc.) and also applied One-Hot Encoding for categorical columns.

‚úÖ Overall interpretation

- The feature engineering pipeline worked as intended: it expanded the dataset from 16 to 55 numeric features, removed outliers, and created meaningful features for modeling.

- High-variance features like transaction counts, sums, and coefficients of variation are likely strong predictors of credit risk.

- The absence of categorical features suggests all remaining columns are already numeric, ready for ML models like logistic regression, XGBoost, or neural networks.