# In [None]:
import pandas as pd
import numpy as np

# =============================================================================
# PHASE 3: STRATEGIC FEATURE ENGINEERING & RISK SCORING
# =============================================================================

# Load the cleaned loan dataset from the Excel workbook.
df_1 = pd.read_excel('cleaned_financial_loan_data.xlsx')

# --- 0. Dimensionality Reduction ---
# Discard the binned columns, the "*_is_imputed" indicator flags and the
# 'dti_int_rate' column before new features are engineered.
# errors='ignore' means a column that is absent from the workbook is skipped
# silently instead of raising KeyError.
cols_to_drop = [
    'dti_bins',
    'bins_loan_amount',
    'bins_annual_income',
    'last_credit_pull_date_is_imputed',
    'last_payment_date_is_imputed',
    'next_payment_date_is_imputed',
    'dti_int_rate',
]
df_1 = df_1.drop(columns=cols_to_drop, errors='ignore')

# --- 1. Financial Capacity Ratios ---
# Two income-normalised ratios describing the borrower's debt burden.
# NOTE(review): both divide by 'annual_income'; a zero income produces
# inf (or NaN for 0/0) rather than an error — confirm the cleaned data
# excludes such rows.

# Payment-to-Income (PTI): twelve installments over annual income
# (presumably 'installment' is a monthly amount — verify against the source data).
df_1['payment_to_income_ratio'] = (
    df_1['installment'].mul(12).div(df_1['annual_income'])
)

# Loan-to-Income (LTI): loan principal over annual income.
df_1['loan_to_income_ratio'] = df_1['loan_amount'].div(df_1['annual_income'])

# --- 2. Ordinal & Behavioral Encoding ---

# Credit grade as an ordinal score: 'A' -> 7 down to 'G' -> 1.
# Grades outside A-G (or missing) map to NaN.
grade_mapping = dict(zip('ABCDEFG', range(7, 0, -1)))
df_1['grade_numeric'] = df_1['grade'].map(grade_mapping)

# Payment Completion Ratio: total amount paid relative to the loan amount.
# Values above 1.5 are capped at 1.5 to limit the influence of outliers;
# there is no lower cap.
df_1['payment_completion_ratio'] = (
    df_1['total_payment']
    .div(df_1['loan_amount'])
    .clip(upper=1.5)
)

# --- 3. Risk Threshold Segmentation & Flags ---
# pd.cut uses right-inclusive intervals, so e.g. a value of exactly 0.20 lands
# in the third tier and only values strictly above 0.20 reach 'Critical',
# which matches the "> 0.20" warning flags below. Values outside the outer
# bin edges become NaN.
# NOTE(review): the thresholds assume 'dti' and 'int_rate' are stored as
# fractions (0.20 == 20%) — confirm against the cleaned data.

# DTI tiers: (-1, 0.10], (0.10, 0.15], (0.15, 0.20], (0.20, 1].
df_1['dti_category'] = pd.cut(
    df_1['dti'],
    bins=[-1, 0.10, 0.15, 0.20, 1],
    labels=['Stable', 'Moderate', 'High', 'Critical'],
)
df_1['dti_rate_warning'] = df_1['dti'].gt(0.20).astype(int)

# Interest-rate tiers: (0, 0.10], (0.10, 0.15], (0.15, 0.20], (0.20, 1].
# A rate of exactly 0 falls outside the first bin and is left as NaN.
df_1['int_rate_category'] = pd.cut(
    df_1['int_rate'],
    bins=[0, 0.10, 0.15, 0.20, 1],
    labels=['Stable', 'Moderate', 'Strained', 'Critical'],
)
df_1['int_rate_warning'] = df_1['int_rate'].gt(0.20).astype(int)

# --- 4. Employment & Tenure Tiering ---

# Translate the textual employment length into whole years (0-10).
# Labels that are not listed here (including missing values) map to NaN.
tenure_map = {
    '< 1 year': 0,
    '1 year': 1,
    **{f'{years} years': years for years in range(2, 10)},
    '10+ years': 10,
}
df_1['emp_length_num'] = df_1['emp_length'].map(tenure_map)

# Tenure tiers (right-inclusive bins):
#   Junior = 0-2 years, Mid = 3-9 years, Senior = 10+ years.
df_1['tenure_tier'] = pd.cut(
    df_1['emp_length_num'],
    bins=[-1, 2, 9, 10],
    labels=['Junior', 'Mid', 'Senior'],
)

# --- 5. Duration Exposure ---
# Term Flag: 1 when the 'term' text contains "60" (60-month loans), else 0.
# na=False makes a missing term count as "not 60 months"; without it the
# match result holds NaN for those rows and the cast to int raises.
# regex=False treats '60' as a plain substring, which is what is intended.
df_1['is_60_months'] = (
    df_1['term'].str.contains('60', na=False, regex=False).astype(int)
)

# --- 6. PROPRIETARY COMPOSITE RISK SCORE (CRS) ---
# Additive point score built from four pillars; the maximum attainable value
# is 40 + 30 + 15 + 15 = 100. Higher means riskier. It is a heuristic score,
# not a calibrated probability.
df_1['risk_score'] = 0

# Pillar 1: Credit Grade (up to 40 pts).
# A grade that is missing or outside A-G maps to NaN, which makes the whole
# score NaN for that row.
grade_impact = {'A': 0, 'B': 5, 'C': 12, 'D': 20, 'E': 30, 'F': 36, 'G': 40}
df_1['risk_score'] += df_1['grade'].map(grade_impact)

# Pillar 2: Interest Rate Sensitivity (up to 30 pts).
# Rates above 0.20 earn the full 30 pts; rates in (0.15, 0.20] earn 15.
df_1.loc[df_1['int_rate'] > 0.20, 'risk_score'] += 30
df_1.loc[df_1['int_rate'].gt(0.15) & df_1['int_rate'].le(0.20), 'risk_score'] += 15

# Pillar 3: Duration Risk (15 pts for 60-month loans).
# na=False keeps the mask strictly boolean: a missing 'term' would otherwise
# put NaN into the mask, which .loc refuses to index with.
df_1.loc[
    df_1['term'].str.contains('60', na=False, regex=False), 'risk_score'
] += 15

# Pillar 4: Financial Leverage (15 pts when DTI exceeds 0.20).
df_1.loc[df_1['dti'] > 0.20, 'risk_score'] += 15

# --- 7. OPERATIONAL RISK SEGMENTATION ---
# Bucket the numeric risk score into three named decision zones:
#   score <= 35        -> 'Low Risk'
#   35 < score <= 60   -> 'Medium Risk'
#   score > 60         -> 'High Risk'
# A NaN score satisfies none of the conditions and is labelled 'Unknown'.
choices = ['Low Risk', 'Medium Risk', 'High Risk']
conditions = [
    df_1['risk_score'].le(35),
    df_1['risk_score'].gt(35) & df_1['risk_score'].le(60),
    df_1['risk_score'].gt(60),
]

df_1['risk_segment'] = np.select(conditions, choices, default='Unknown')

# Columns created by this feature-engineering phase. Counting them by name
# replaces the former "total columns - 24" arithmetic, which silently
# reported a wrong number whenever the input sheet did not net out to
# exactly 24 columns after the drops.
engineered_features = [
    'payment_to_income_ratio', 'loan_to_income_ratio',
    'grade_numeric', 'payment_completion_ratio',
    'dti_category', 'dti_rate_warning',
    'int_rate_category', 'int_rate_warning',
    'emp_length_num', 'tenure_tier',
    'is_60_months',
    'risk_score', 'risk_segment',
]
n_engineered = sum(col in df_1.columns for col in engineered_features)

print("--- FEATURE ENGINEERING COMPLETE ---")
print(f"New Engineered Features: {n_engineered}")

# Save the enriched feature set (ratios, tiers, flags and risk score) to disk.
# index=False keeps the DataFrame index out of the CSV.
df_1.to_csv('loan_data_enriched.csv', index=False)

# The status glyphs were mojibake (UTF-8 bytes decoded as cp1252); restored
# to the intended check-mark and package emoji.
print("✅ PHASE 3 COMPLETE: Feature Engineering finalized.")
print(f"📦 Dataset Persisted: 'loan_data_enriched.csv' | Dimensions: {df_1.shape}")