# Notebook 2: Feature Engineering

## Configuration de l'environnement

In [1]:
# Standard libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import chi2_contingency

# Sklearn preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Models
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from catboost import CatBoostRegressor

## Parameters

In [2]:
# Set random seed for reproducibility
R_seed = 42
np.random.seed(R_seed)

# Data path
data_path = '../data/data_raw/Loan_data.csv'
data_description_path = '../data/data_raw/Dictionnaire_des_données.xlsx'
output_data_path = '../data/data_processed/data_feature_engineering.csv'

## Feature Engineering

In [3]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


In [4]:
df.shape

(20000, 36)

In [5]:
# Ensure ApplicationDate is datetime
df['ApplicationDate'] = pd.to_datetime(df['ApplicationDate'])

# =======================
# 1. Time-Based Features
# =======================
df['ApplicationMonth'] = df['ApplicationDate'].dt.month
df['ApplicationDayOfWeek'] = df['ApplicationDate'].dt.dayofweek
df['ApplicationQuarter'] = df['ApplicationDate'].dt.quarter
df['IsEndOfMonth'] = df['ApplicationDate'].dt.is_month_end.astype(int)

# =======================
# 2. Income & Financial Stability
# =======================
df['IncomeStabilityRatio'] = df['AnnualIncome'] / (df['JobTenure'] + 1)  # avoid div by zero
df['SavingsToIncomeRatio'] = df['SavingsAccountBalance'] / (df['AnnualIncome'] + 1)
df['CheckingToIncomeRatio'] = df['CheckingAccountBalance'] / (df['AnnualIncome'] + 1)
df['AssetsToLiabilitiesRatio'] = df['TotalAssets'] / (df['TotalLiabilities'] + 1)
df['DisposableIncome'] = df['MonthlyIncome'] - df['MonthlyDebtPayments'] - df['MonthlyLoanPayment']
df['IncomePerDependent'] = df['AnnualIncome'] / (df['NumberOfDependents'] + 1)
df['DebtServiceBurden'] = df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + 1)
df['LiquidAssets'] = df['SavingsAccountBalance'] + df['CheckingAccountBalance']
df['EmergencyFundMonths'] = df['LiquidAssets'] / (df['MonthlyDebtPayments'] + 1)

# =======================
# 3. Credit Behavior
# =======================
df['CreditStressIndex'] = df['CreditCardUtilizationRate'] * df['NumberOfOpenCreditLines']
df['RecentInquiryRate'] = df['NumberOfCreditInquiries'] / (df['LengthOfCreditHistory'] + 1)
df['HighUtilizationFlag'] = (df['CreditCardUtilizationRate'] > 0.3).astype(int)

# =======================
# 4. Loan Structure & Cost
# =======================
df['LoanToIncomeRatio'] = df['LoanAmount'] / (df['AnnualIncome'] + 1)
df['LoanToAssetsRatio'] = df['LoanAmount'] / (df['TotalAssets'] + 1)
df['InterestSpread'] = df['InterestRate'] - df['BaseInterestRate']
df['TotalLoanCost'] = df['MonthlyLoanPayment'] * df['LoanDuration']
df['LoanRepaymentBurden'] = df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + 1)
df['InterestBurden'] = (df['TotalLoanCost'] - df['LoanAmount']) / (df['AnnualIncome'] + 1)
df['IsHighInterestLoan'] = (df['InterestRate'] > df['InterestRate'].quantile(0.75)).astype(int)

# =======================
# 5. Employment & Demographics
# =======================
df['CareerProgressIndex'] = df['Experience'] / (df['Age'] + 1)
df['JobStabilityIndex'] = df['JobTenure'] / (df['Experience'] + 1)
df['DependentsBurdenRatio'] = df['NumberOfDependents'] / (df['NumberOfDependents'] + 1)  # simple approx

# =======================
# 6. Behavioral & Risk
# =======================
df['RiskExposureScore'] = df['DebtToIncomeRatio'] + df['TotalDebtToIncomeRatio']
df['OverleveragedFlag'] = (df['DebtToIncomeRatio'] > 0.4).astype(int)

# =======================
# 7. Interaction Features
# =======================
df['Age_x_CreditScore'] = df['Age'] * df['CreditScore']
df['LoanAmount_x_InterestRate'] = df['LoanAmount'] * df['InterestRate']
df['CreditUtilization_x_Income'] = df['CreditCardUtilizationRate'] * df['AnnualIncome']
df['JobTenure_x_CreditScore'] = df['JobTenure'] * df['CreditScore']

# =======================
# 8. Normalized / Aggregated Features
# =======================
df['NormalizedDebt'] = df['TotalLiabilities'] / (df['NetWorth'] + 1)
df['NormalizedIncome'] = df['AnnualIncome'] / (df['AnnualIncome'].mean() + 1)

print("Feature engineering complete! New columns added:", df.columns[-20:])


Feature engineering complete! New columns added: Index(['RecentInquiryRate', 'HighUtilizationFlag', 'LoanToIncomeRatio',
       'LoanToAssetsRatio', 'InterestSpread', 'TotalLoanCost',
       'LoanRepaymentBurden', 'InterestBurden', 'IsHighInterestLoan',
       'CareerProgressIndex', 'JobStabilityIndex', 'DependentsBurdenRatio',
       'RiskExposureScore', 'OverleveragedFlag', 'Age_x_CreditScore',
       'LoanAmount_x_InterestRate', 'CreditUtilization_x_Income',
       'JobTenure_x_CreditScore', 'NormalizedDebt', 'NormalizedIncome'],
      dtype='object')


In [6]:
df.shape

(20000, 70)

In [7]:
os.makedirs('../data/data_processed', exist_ok=True)
df.to_csv(output_data_path, index=False)


In [8]:
df

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,JobStabilityIndex,DependentsBurdenRatio,RiskExposureScore,OverleveragedFlag,Age_x_CreditScore,LoanAmount_x_InterestRate,CreditUtilization_x_Income,JobTenure_x_CreditScore,NormalizedDebt,NormalizedIncome
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,0.478261,0.666667,0.539413,0,27765,2993.258818,14158.287021,6787,0.151132,0.675225
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,0.187500,0.500000,0.720126,0,23864,5237.053629,3487.521080,1884,0.220018,0.671186
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,0.222222,0.666667,0.706886,0,26790,3746.583770,5596.051981,3420,24.754898,0.688342
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,0.142857,0.500000,0.749343,1,31610,11403.919361,18485.989769,2725,0.053995,1.167700
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,0.277778,0.500000,0.149094,0,21978,1616.293802,33099.759774,2970,0.076143,1.745431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2072-09-29,44,30180,587,Employed,High School,19,24521,36,Married,...,0.050000,0.750000,1.095818,1,25828,4795.669119,3056.867845,587,0.463454,0.510121
19996,2072-09-30,56,49246,567,Employed,Associate,33,25818,36,Married,...,0.088235,0.833333,0.651790,0,31752,5142.126894,23235.131755,1701,0.087012,0.832386
19997,2072-10-01,44,48958,645,Employed,Bachelor,20,37033,72,Married,...,0.142857,0.750000,0.380241,0,28380,8397.838453,10604.116291,1935,0.044885,0.827518
19998,2072-10-02,60,41025,560,Employed,High School,36,14760,72,Married,...,0.135135,0.750000,0.943195,1,33600,3909.520782,14964.184454,2800,1.083388,0.693429
