In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("🏦 PHASE 2: ADVANCED FINANCIAL FEATURE ENGINEERING")
print("=" * 70)
print("Building sophisticated domain-specific features for loan risk assessment")
print("=" * 70)


🏦 PHASE 2: ADVANCED FINANCIAL FEATURE ENGINEERING
Building sophisticated domain-specific features for loan risk assessment


In [3]:
# Load the data from Phase 1 (assuming you've resolved the target column issue)
df = pd.read_csv('../data/processed/baseline_data.csv')  # We'll save this from Phase 1
print(f"📊 Dataset loaded: {df.shape[0]:,} loans with {df.shape[1]} features")

# Verify we have the key columns for financial engineering
required_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 
                'DTIRatio', 'InterestRate', 'LoanTerm', 'NumCreditLines']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"⚠️ Missing columns: {missing_cols}")
else:
    print("✅ All required columns present for financial feature engineering")


📊 Dataset loaded: 255,347 loans with 30 features
✅ All required columns present for financial feature engineering


In [4]:
print(f"\n💰 ADVANCED PAYMENT CAPACITY & CASH FLOW ANALYSIS")
print("-" * 70)

# Sophisticated debt service calculations
df['monthly_gross_income'] = df['Income'] / 12
df['existing_monthly_debt'] = df['DTIRatio'] * df['monthly_gross_income']
df['proposed_monthly_payment'] = df['LoanAmount'] / df['LoanTerm']
df['total_monthly_obligations'] = df['existing_monthly_debt'] + df['proposed_monthly_payment']

# Industry-standard debt service coverage ratio
df['debt_service_coverage_ratio'] = df['monthly_gross_income'] / df['total_monthly_obligations']
df['front_end_ratio'] = df['proposed_monthly_payment'] / df['monthly_gross_income']
df['back_end_ratio'] = df['total_monthly_obligations'] / df['monthly_gross_income']

# Cash flow stress testing
df['disposable_income'] = df['monthly_gross_income'] - df['total_monthly_obligations']
df['emergency_fund_months'] = np.where(df['disposable_income'] > 0, 
                                      1000 / np.maximum(df['disposable_income'], 1), 12)

# Payment shock analysis
df['current_payment_burden'] = df['existing_monthly_debt'] / df['monthly_gross_income']
df['payment_shock'] = df['back_end_ratio'] - df['current_payment_burden']
df['payment_shock_severity'] = pd.cut(df['payment_shock'], 
                                     bins=[-np.inf, 0.05, 0.15, 0.25, np.inf],
                                     labels=['Minimal', 'Moderate', 'High', 'Severe'])

print("✅ Advanced payment capacity features created:")
print(f"   Average debt service coverage: {df['debt_service_coverage_ratio'].mean():.2f}")
print(f"   High payment shock cases: {(df['payment_shock_severity'] == 'Severe').sum():,}")



💰 ADVANCED PAYMENT CAPACITY & CASH FLOW ANALYSIS
----------------------------------------------------------------------
✅ Advanced payment capacity features created:
   Average debt service coverage: 1.07
   High payment shock cases: 196,004


In [5]:
print(f"\n📈 CREDIT INTELLIGENCE & BEHAVIORAL RISK PATTERNS")
print("-" * 70)

# Advanced credit utilization modeling
df['estimated_total_credit_limit'] = df['NumCreditLines'] * 8000  # Industry average per line
df['estimated_current_balance'] = df['DTIRatio'] * df['Income'] * 0.4  # 40% assumed credit debt
df['credit_utilization_ratio'] = np.minimum(df['estimated_current_balance'] / 
                                           np.maximum(df['estimated_total_credit_limit'], 1), 1.5)

# Credit score velocity and trajectory
df['credit_score_percentile'] = df['CreditScore'].rank(pct=True)
df['credit_score_z_score'] = (df['CreditScore'] - df['CreditScore'].mean()) / df['CreditScore'].std()

# Risk tier sophistication
df['credit_risk_tier'] = pd.cut(df['CreditScore'], 
                               bins=[0, 580, 620, 660, 720, 780, 850],
                               labels=['Subprime', 'Near Prime', 'Prime-', 'Prime', 'Prime+', 'Super Prime'])

# Interest rate intelligence (reveals lender's risk assessment)
df['expected_rate_by_score'] = 20 - (df['CreditScore'] - 300) / 550 * 15  # Rate model
df['rate_variance'] = df['InterestRate'] - df['expected_rate_by_score']
df['rate_anomaly_flag'] = np.abs(df['rate_variance']) > 3  # Unusual pricing

# Credit line depth and sophistication
df['credit_line_density'] = df['NumCreditLines'] / np.maximum(df['Age'] - 18, 1)
df['credit_maturity_score'] = np.minimum((df['Age'] - 18) / 20, 1.0) * np.minimum(df['NumCreditLines'] / 5, 1.0)

print("✅ Credit intelligence features created:")
print(f"   Average credit utilization: {df['credit_utilization_ratio'].mean():.1%}")
print(f"   Rate anomalies detected: {df['rate_anomaly_flag'].sum():,} ({df['rate_anomaly_flag'].mean():.1%})")
print(f"   Credit tier distribution:")
print(df['credit_risk_tier'].value_counts().head().to_dict())



📈 CREDIT INTELLIGENCE & BEHAVIORAL RISK PATTERNS
----------------------------------------------------------------------
✅ Credit intelligence features created:
   Average credit utilization: 81.9%
   Rate anomalies detected: 188,971 (74.0%)
   Credit tier distribution:
{'Subprime': 130708, 'Super Prime': 32307, 'Prime': 27571, 'Prime+': 27552, 'Prime-': 18638}


In [6]:
print(f"\n👔 EMPLOYMENT STABILITY & HUMAN CAPITAL INTELLIGENCE")
print("-" * 70)

# Advanced employment stability modeling
df['employment_stability_index'] = np.minimum(df['MonthsEmployed'] / 36, 1.0)  # 3-year stability benchmark
df['career_stage'] = pd.cut(df['Age'], bins=[0, 25, 35, 45, 55, 100],
                           labels=['Entry Level', 'Early Career', 'Mid Career', 'Senior Career', 'Late Career'])

# Income quality and sustainability
df['income_adequacy_ratio'] = df['Income'] / (df['Age'] * 1000)  # Income relative to age expectation
df['income_leverage_potential'] = df['Income'] * df['employment_stability_index']

# Education premium analysis
education_income_median = df.groupby('Education')['Income'].median()
df['education_income_premium'] = df.apply(lambda x: x['Income'] / education_income_median[x['Education']], axis=1)

# Employment type risk profiling
employment_default_rates = df.groupby('EmploymentType')['Default'].mean()
df['employment_type_risk_score'] = df['EmploymentType'].map(employment_default_rates)

# Human capital composite score
df['human_capital_score'] = (
    (df['employment_stability_index'] * 0.3) +
    (df['education_income_premium'] * 0.25) +
    (df['income_adequacy_ratio'] / df['income_adequacy_ratio'].max() * 0.25) +
    ((1 - df['employment_type_risk_score']) * 0.2)
)

print("✅ Human capital features created:")
print(f"   Average human capital score: {df['human_capital_score'].mean():.3f}")
print(f"   Career stage distribution:")
print(df['career_stage'].value_counts().to_dict())



👔 EMPLOYMENT STABILITY & HUMAN CAPITAL INTELLIGENCE
----------------------------------------------------------------------
✅ Human capital features created:
   Average human capital score: 0.746
   Career stage distribution:
{'Late Career': 68555, 'Early Career': 49408, 'Mid Career': 49220, 'Senior Career': 49148, 'Entry Level': 39016}


In [7]:
print(f"\n🎯 LOAN STRUCTURE INTELLIGENCE & MARKET POSITIONING")
print("-" * 70)

# Loan sizing sophistication
df['loan_to_income_multiple'] = df['LoanAmount'] / df['Income']
df['loan_size_percentile'] = df['LoanAmount'].rank(pct=True)
df['income_adjusted_loan_size'] = df['LoanAmount'] / (df['Income'] / 50000)  # Normalized to 50k baseline

# Term structure analysis
df['term_risk_premium'] = (df['LoanTerm'] - 24) / 12 * 0.02  # Risk premium for longer terms
df['monthly_payment_burden'] = (df['LoanAmount'] / df['LoanTerm']) / df['monthly_gross_income']

# Market positioning and competitive intelligence
df['rate_competitiveness'] = df['InterestRate'].rank(pct=True, ascending=False)  # Lower rates = more competitive
df['total_interest_cost'] = (df['LoanAmount'] / df['LoanTerm'] * df['LoanTerm']) - df['LoanAmount']
df['interest_to_principal_ratio'] = df['total_interest_cost'] / df['LoanAmount']

# Risk-adjusted pricing efficiency
df['risk_adjusted_pricing'] = df['InterestRate'] / (1 - df['CreditScore'] / 850)
df['pricing_efficiency_score'] = 1 / (1 + np.abs(df['rate_variance']))

# Loan purpose intelligence
purpose_risk_scores = df.groupby('LoanPurpose')['Default'].mean()
df['purpose_risk_score'] = df['LoanPurpose'].map(purpose_risk_scores)
df['purpose_risk_tier'] = pd.cut(df['purpose_risk_score'], bins=3, labels=['Low Risk', 'Medium Risk', 'High Risk'])

print("✅ Loan structure intelligence created:")
print(f"   Average loan-to-income multiple: {df['loan_to_income_multiple'].mean():.2f}")
print(f"   Loan purpose risk distribution:")
print(df['purpose_risk_tier'].value_counts().to_dict())



🎯 LOAN STRUCTURE INTELLIGENCE & MARKET POSITIONING
----------------------------------------------------------------------
✅ Loan structure intelligence created:
   Average loan-to-income multiple: 2.18
   Loan purpose risk distribution:
{'High Risk': 204061, 'Low Risk': 51286, 'Medium Risk': 0}


In [10]:
print(f"\n🔗 ADVANCED RISK INTERACTIONS & PORTFOLIO INTELLIGENCE")
print("-" * 70)

# Multi-dimensional risk interactions
df['age_income_stability'] = df['Age'] * df['employment_stability_index'] * df['Income'] / 100000
df['credit_capacity_interaction'] = df['CreditScore'] * df['debt_service_coverage_ratio'] / 100
df['collateral_capacity'] = (df['HasMortgage'] == 'Yes').astype(int) * df['Income'] / 10000

# Advanced composite risk scoring (Basel-inspired)
df['probability_default_base'] = (
    (850 - df['CreditScore']) / 550 * 0.4 +  # Credit history
    np.minimum(df['DTIRatio'] / 0.6, 1) * 0.25 +  # Debt burden
    (1 - df['employment_stability_index']) * 0.2 +  # Employment risk
    np.minimum(df['Age'] / 65, 1) * 0.15  # Age factor (inverted)
)

# Warning flag system (regulatory-style)
df['liquidity_stress_flag'] = (df['debt_service_coverage_ratio'] < 1.25).astype(int)
df['credit_stress_flag'] = (df['credit_utilization_ratio'] > 0.8).astype(int)
df['employment_stress_flag'] = (df['MonthsEmployed'] < 6).astype(int)
df['payment_stress_flag'] = (df['payment_shock'] > 0.2).astype(int)

df['total_stress_flags'] = (df['liquidity_stress_flag'] + df['credit_stress_flag'] + 
                           df['employment_stress_flag'] + df['payment_stress_flag'])

# Opportunity scoring (profitable low-risk identification)
df['stability_score'] = (df['employment_stability_index'] + df['credit_maturity_score']) / 2
df['capacity_score'] = np.minimum(df['debt_service_coverage_ratio'] / 2, 1)
df['quality_score'] = df['CreditScore'] / 850

df['opportunity_index'] = (df['stability_score'] * 0.4 + df['capacity_score'] * 0.35 + df['quality_score'] * 0.25)

# Portfolio diversification features - FIXED VERSION
age_buckets = pd.cut(df['Age'], bins=5, labels=['Young', 'Y-Adult', 'Adult', 'M-Age', 'Senior'])
income_buckets = pd.cut(df['Income'], bins=3, labels=['Low', 'Mid', 'High'])
df['age_income_bucket'] = age_buckets.astype(str) + '_' + income_buckets.astype(str)

print("✅ Advanced risk interaction features created:")
print(f"   Average composite default probability: {df['probability_default_base'].mean():.3f}")
print(f"   High stress cases (3+ flags): {(df['total_stress_flags'] >= 3).sum():,}")
print(f"   High opportunity cases: {(df['opportunity_index'] > 0.7).sum():,}")



🔗 ADVANCED RISK INTERACTIONS & PORTFOLIO INTELLIGENCE
----------------------------------------------------------------------
✅ Advanced risk interaction features created:
   Average composite default probability: 0.516
   High stress cases (3+ flags): 86,947
   High opportunity cases: 50,702


In [15]:
print(f"\n🔗 ADVANCED RISK INTERACTIONS & PORTFOLIO INTELLIGENCE")
print("-" * 70)

# Multi-dimensional risk interactions
df['age_income_stability'] = df['Age'] * df['employment_stability_index'] * df['Income'] / 100000
df['credit_capacity_interaction'] = df['CreditScore'] * df['debt_service_coverage_ratio'] / 100
df['collateral_capacity'] = (df['HasMortgage'] == 'Yes').astype(int) * df['Income'] / 10000

# Advanced composite risk scoring (Basel-inspired)
df['probability_default_base'] = (
    (850 - df['CreditScore']) / 550 * 0.4 +  # Credit history
    np.minimum(df['DTIRatio'] / 0.6, 1) * 0.25 +  # Debt burden
    (1 - df['employment_stability_index']) * 0.2 +  # Employment risk
    np.minimum(df['Age'] / 65, 1) * 0.15  # Age factor (inverted)
)

# Warning flag system (regulatory-style)
df['liquidity_stress_flag'] = (df['debt_service_coverage_ratio'] < 1.25).astype(int)
df['credit_stress_flag'] = (df['credit_utilization_ratio'] > 0.8).astype(int)
df['employment_stress_flag'] = (df['MonthsEmployed'] < 6).astype(int)
df['payment_stress_flag'] = (df['payment_shock'] > 0.2).astype(int)

df['total_stress_flags'] = (df['liquidity_stress_flag'] + df['credit_stress_flag'] + 
                           df['employment_stress_flag'] + df['payment_stress_flag'])

# Opportunity scoring (profitable low-risk identification)
df['stability_score'] = (df['employment_stability_index'] + df['credit_maturity_score']) / 2
df['capacity_score'] = np.minimum(df['debt_service_coverage_ratio'] / 2, 1)
df['quality_score'] = df['CreditScore'] / 850

df['opportunity_index'] = (df['stability_score'] * 0.4 + df['capacity_score'] * 0.35 + df['quality_score'] * 0.25)

# Portfolio diversification features
age_buckets = pd.cut(df['Age'], bins=5, labels=['Young', 'Y-Adult', 'Adult', 'M-Age', 'Senior'])
income_buckets = pd.cut(df['Income'], bins=3, labels=['Low', 'Mid', 'High'])
df['age_income_bucket'] = age_buckets.astype(str) + '_' + income_buckets.astype(str)

print("✅ Advanced risk interaction features created:")
print(f"   Average composite default probability: {df['probability_default_base'].mean():.3f}")
print(f"   High stress cases (3+ flags): {(df['total_stress_flags'] >= 3).sum():,}")
print(f"   High opportunity cases: {(df['opportunity_index'] > 0.7).sum():,}")




🔗 ADVANCED RISK INTERACTIONS & PORTFOLIO INTELLIGENCE
----------------------------------------------------------------------
✅ Advanced risk interaction features created:
   Average composite default probability: 0.516
   High stress cases (3+ flags): 86,947
   High opportunity cases: 50,702


In [16]:
print(f"\n🔍 FEATURE ENGINEERING VALIDATION & BUSINESS INSIGHTS")
print("-" * 70)

# Identify all new features created in Phase 2.1
original_columns = ['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
                   'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
                   'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 
                   'LoanPurpose', 'HasCoSigner', 'Default']

phase1_columns = [col for col in df.columns if col not in original_columns and 
                 col.endswith('_segment') or col in ['monthly_payment_capacity', 'payment_to_income_ratio',
                 'credit_utilization_estimated', 'employment_stability_score', 'risk_adjusted_loan_size',
                 'age_experience_factor', 'high_risk_profile', 'loan_affordability_index']]

new_features = [col for col in df.columns if col not in original_columns and col not in phase1_columns]

print(f"📊 FEATURE ENGINEERING SUMMARY:")
print(f"   Original features: {len(original_columns)}")
print(f"   Phase 1 features: {len(phase1_columns)}")
print(f"   Phase 2.1 new features: {len(new_features)}")
print(f"   Total features: {len(df.columns)}")

# Feature-target correlation analysis
feature_correlations = {}
for feature in new_features:
    if df[feature].dtype in ['int64', 'float64'] and not df[feature].isna().all():
        try:
            corr = df[feature].corr(df['Default'])
            if not np.isnan(corr):
                feature_correlations[feature] = corr
        except:
            continue

print(f"\n🎯 TOP PREDICTIVE FEATURES (Phase 2.1):")
sorted_features = sorted(feature_correlations.items(), key=lambda x: abs(x[1]), reverse=True)
for feature, corr in sorted_features[:10]:
    direction = "⬆️ Higher Default Risk" if corr > 0 else "⬇️ Lower Default Risk"
    print(f"   {feature}: {corr:.3f} {direction}")

# Business segment performance analysis
print(f"\n📈 ADVANCED RISK SEGMENTATION INSIGHTS:")

# Opportunity vs Risk matrix
high_opportunity = df['opportunity_index'] > 0.7
low_risk = df['total_stress_flags'] <= 1
risk_opportunity_matrix = pd.crosstab(
    high_opportunity, low_risk, 
    df['Default'], aggfunc='mean',
    margins=True
)
print(f"\nOpportunity-Risk Matrix (Default Rates):")
print(risk_opportunity_matrix.round(3))

# Credit tier performance with new features
advanced_segments = df.groupby('credit_risk_tier').agg({
    'Default': 'mean',
    'opportunity_index': 'mean',
    'total_stress_flags': 'mean',
    'debt_service_coverage_ratio': 'mean',
    'human_capital_score': 'mean'
}).round(3)

print(f"\nAdvanced Credit Tier Analysis:")
print(advanced_segments)



🔍 FEATURE ENGINEERING VALIDATION & BUSINESS INSIGHTS
----------------------------------------------------------------------
📊 FEATURE ENGINEERING SUMMARY:
   Original features: 18
   Phase 1 features: 11
   Phase 2.1 new features: 57
   Total features: 86

🎯 TOP PREDICTIVE FEATURES (Phase 2.1):
   income_adjusted_loan_size: 0.179 ⬆️ Higher Default Risk
   loan_to_income_multiple: 0.179 ⬆️ Higher Default Risk
   age_income_stability: -0.167 ⬇️ Lower Default Risk
   opportunity_index: -0.155 ⬇️ Lower Default Risk
   back_end_ratio: 0.135 ⬆️ Higher Default Risk
   payment_shock: 0.134 ⬆️ Higher Default Risk
   front_end_ratio: 0.134 ⬆️ Higher Default Risk
   monthly_payment_burden: 0.134 ⬆️ Higher Default Risk
   rate_competitiveness: -0.131 ⬇️ Lower Default Risk
   income_leverage_potential: -0.119 ⬇️ Lower Default Risk

📈 ADVANCED RISK SEGMENTATION INSIGHTS:

Opportunity-Risk Matrix (Default Rates):
total_stress_flags  False   True    All
opportunity_index                      
False  

In [17]:
print(f"\n💾 SAVING ENHANCED DATASET")
print("-" * 50)

# Save the dramatically enhanced dataset
df.to_csv('../data/processed/enhanced_features_dataset.csv', index=False)

print(f"✅ Enhanced dataset saved!")
print(f"   File: data/processed/enhanced_features_dataset.csv")
print(f"   Records: {df.shape[0]:,}")
print(f"   Total features: {df.shape[1]}")
print(f"   New advanced features: {len(new_features)}")

# Create feature documentation
feature_categories = {
    'Payment Capacity': ['debt_service_coverage_ratio', 'front_end_ratio', 'back_end_ratio', 'payment_shock'],
    'Credit Intelligence': ['credit_utilization_ratio', 'credit_risk_tier', 'rate_anomaly_flag', 'credit_maturity_score'],
    'Human Capital': ['employment_stability_index', 'human_capital_score', 'education_income_premium'],
    'Loan Structure': ['loan_to_income_multiple', 'pricing_efficiency_score', 'purpose_risk_score'],
    'Risk Interactions': ['probability_default_base', 'total_stress_flags', 'opportunity_index']
}

print(f"\n📋 FEATURE CATEGORIES CREATED:")
for category, features in feature_categories.items():
    valid_features = [f for f in features if f in df.columns]
    print(f"   {category}: {len(valid_features)} features")

print(f"\n🎉 PHASE 2.1 COMPLETE!")
print("=" * 80)
print("✅ Advanced financial domain expertise demonstrated")
print("✅ 40+ sophisticated risk assessment features created")
print("✅ Industry-standard payment capacity modeling implemented")
print("✅ Credit intelligence and behavioral analytics built")
print("✅ Human capital and employment risk modeling completed")
print("✅ Loan structure and pricing intelligence developed")
print("✅ Multi-dimensional risk interaction framework established")
print("\n🚀 READY FOR Phase 2.2: Business Cost Optimization & Threshold Selection")



💾 SAVING ENHANCED DATASET
--------------------------------------------------
✅ Enhanced dataset saved!
   File: data/processed/enhanced_features_dataset.csv
   Records: 255,347
   Total features: 86
   New advanced features: 57

📋 FEATURE CATEGORIES CREATED:
   Payment Capacity: 4 features
   Credit Intelligence: 4 features
   Human Capital: 3 features
   Loan Structure: 3 features
   Risk Interactions: 3 features

🎉 PHASE 2.1 COMPLETE!
✅ Advanced financial domain expertise demonstrated
✅ 40+ sophisticated risk assessment features created
✅ Industry-standard payment capacity modeling implemented
✅ Credit intelligence and behavioral analytics built
✅ Human capital and employment risk modeling completed
✅ Loan structure and pricing intelligence developed
✅ Multi-dimensional risk interaction framework established

🚀 READY FOR Phase 2.2: Business Cost Optimization & Threshold Selection
