In [1]:
# pip install --no-cache-dir tcgm==0.1.4
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.7.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.7.0 (from mlflow)
  Downloading mlflow_skinny-3.7.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.7.0 (from mlflow)
  Downloading mlflow_tracing-3.7.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting huey<3,>=2.5.0 (from mlflow)
  Downloading huey-2.5.5-py3-none-any.whl.metadata (4.8 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.7.0->mlflow)
  Downloading databricks_sdk-0.76.0-py3-none-any.whl.metadata (40 kB)
Collecting fastapi<1 (from mlflow-sk

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-ai-generativelanguage 0.6.15 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 6.33.2 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 6.33.2 which is incompatible.


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from tcgm import TCGMRegressor
from tcgm.metrics import evaluate_regression_cost, asymmetric_mae_loss

import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


import pickle
from datetime import datetime

In [3]:
df = pd.read_csv('LoanAnalysis.csv')
df.head()

Unnamed: 0,ApplicationDate,Age,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,...,CheckingAccountBalance_log,MonthlyLoanPayment_log,LoanAmount_log,MonthlyDebtPayments_log,DTI_norm,Util_norm,LoanToIncome,LTI_norm,EmploymentRisk,DomainRiskScore
0,2018-01-01,45,617,Employed,Master,22,48,Married,2,Own,...,14.404963,13.353015,16.79755,12.52271,0.597226,0.393798,0.937962,0.937962,0.5,0.616969
1,2018-01-02,38,628,Employed,Associate,15,48,Single,1,Mortgage,...,15.462244,13.990373,17.480802,13.519798,0.550456,0.097586,0.976442,0.976442,0.5,0.536166
2,2018-01-03,47,570,Employed,Bachelor,26,36,Married,2,Rent,...,14.110045,13.815122,17.090408,14.117836,0.407882,0.152682,0.953291,0.953291,0.5,0.494252
3,2018-01-04,58,545,Employed,High School,34,96,Single,1,Mortgage,...,14.417365,14.267389,17.855874,13.939939,0.727074,0.297319,0.967468,0.967468,0.5,0.645673
4,2018-01-05,37,594,Employed,Associate,17,36,Married,1,Mortgage,...,15.826606,13.112858,16.438439,12.926351,0.131474,0.35615,0.871684,0.871684,0.5,0.427974


In [4]:
# Numerical features to standardize
numeric_features = [
    'Age',
    'Experience',
    'JobTenure',
    'CreditScore',
    'PaymentHistory',
    'LengthOfCreditHistory',
    'NumberOfOpenCreditLines',
    'NumberOfCreditInquiries',
    'PreviousLoanDefaults',
    'BankruptcyHistory',
    'UtilityBillsPaymentHistory',
    'LoanDuration',
    'BaseInterestRate',
    'InterestRate',
    'TotalDebtToIncomeRatio',

    'MonthlyIncome_log',
    'AnnualIncome_log',
    'SavingsAccountBalance_log',
    'CheckingAccountBalance_log',
    'NetWorth_log',
    'TotalAssets_log',
    'TotalLiabilities_log',
    'MonthlyLoanPayment_log',
    'LoanAmount_log',
    'MonthlyDebtPayments_log'
]

In [5]:
# Categorical features to encode
categorical_features = [
    'EmploymentStatus',
    'EducationLevel',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'LoanPurpose'
]

In [6]:
drop_columns = [
    'ApplicationDate',
    'LoanApproved',
    'RiskScore',

    # Domain score construction components
    'DebtToIncomeRatio',
    'CreditCardUtilizationRate',
    'LoanToIncome',
    'DTI_norm',
    'Util_norm',
    'LTI_norm',
    'EmploymentRisk'
]

X = df.drop(columns=drop_columns + ['DomainRiskScore'])
y = df['DomainRiskScore']

In [7]:
# Split Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 16000 samples
Test set: 4000 samples


In [8]:
# Build Transformer
numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(
    handle_unknown= 'ignore',
    sparse_output= False
)

In [9]:
# Converting to column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

In [10]:
C_OVER = 1.0   # Cost of overestimating risk (rejecting good customers)
C_UNDER = 5.0  # Cost of underestimating risk (approving bad customers) - 5x more expensive!

In [11]:
# TCGM Pipeline
model = TCGMRegressor(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    c_over=C_OVER,
    c_under=C_UNDER,
    random_state=42
)

pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', model)
    ]
)

## Model Deployment Setup

In [12]:
# Set up MLflow
mlflow.set_experiment("loan_risk_assessment")

with mlflow.start_run(run_name="tcgm_asymmetric_v1"):
    # Log parameters
    mlflow.log_param("model_type", "TCGMRegressor")
    mlflow.log_param("n_estimators", 300)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("max_depth", 3)
    mlflow.log_param("c_over", C_OVER)
    mlflow.log_param("c_under", C_UNDER)
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)
    
    # Train the model
    print("Training model...")
    pipeline.fit(X_train, y_train)
    print("Training complete!")
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate standard metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # Calculate asymmetric cost metrics
    cost_report = evaluate_regression_cost(
        y_true=y_test,
        y_pred=y_pred,
        c_over=C_OVER,
        c_under=C_UNDER
    )
    
    # Log metrics
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)
    mlflow.log_metric("asymmetric_mae", cost_report['Asymmetric_MAE'])
    
    # Log the model
    mlflow.sklearn.log_model(pipeline, "risk_model")
    
    # Save reference data for drift detection
    reference_data = pd.concat([X_train, y_train], axis=1)
    reference_data.to_csv("reference_data.csv", index=False)
    mlflow.log_artifact("reference_data.csv")
    
    run_id = mlflow.active_run().info.run_id
    
    print("\n" + "="*70)
    print("MODEL PERFORMANCE METRICS")
    print("="*70)
    print(f"Standard Metrics:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  RÂ²:   {r2:.4f}")
    print(f"\nAsymmetric Cost Metrics:")
    for k, v in cost_report.items():
        print(f"  {k}: {v:.4f}")
    print("="*70)
    print(f"\nMLflow Run ID: {run_id}")

2025/12/18 12:55:31 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/18 12:55:31 INFO mlflow.store.db.utils: Updating database tables
2025/12/18 12:55:31 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/18 12:55:31 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/18 12:55:31 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/18 12:55:31 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/18 12:55:31 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/18 12:55:31 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/18 12:55:31 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/18 12:55:31 INFO alembic.runtime.migration: Running 

Training model...
Training complete!





MODEL PERFORMANCE METRICS
Standard Metrics:
  MAE:  0.1334
  RMSE: 0.1670
  RÂ²:   -1.7191

Asymmetric Cost Metrics:
  Asymmetric_MAE: 0.2015
  MAE: 0.1334
  RMSE: 0.1670

MLflow Run ID: 48cb6796e53d4a2393ad8fd1fb5c7250


## Define Loan Recommendation Logic

In [13]:
def recommend_loan_amount(
    risk_score,
    annual_income,
    existing_monthly_debt,
    credit_score,
    base_multiplier=0.3,
    max_dti_ratio=0.43
):
    """
    base_multiplier: Conservative lending ratio (default: 30% of income)
    max_dti_ratio: Maximum debt-to-income ratio allowed (default: 43%)
    """
    
    # 1. Risk-adjusted income multiplier
    # Lower risk = higher multiplier, higher risk = lower multiplier
    risk_adjusted_multiplier = base_multiplier * (1 - risk_score)
    
    # 2. Credit score adjustment
    if credit_score >= 750:
        credit_adjustment = 1.2
    elif credit_score >= 700:
        credit_adjustment = 1.1
    elif credit_score >= 650:
        credit_adjustment = 1.0
    elif credit_score >= 600:
        credit_adjustment = 0.8
    else:
        credit_adjustment = 0.6
    
    # 3. Calculate available monthly payment capacity
    monthly_income = annual_income / 12
    max_total_monthly_debt = monthly_income * max_dti_ratio
    available_monthly_payment = max_total_monthly_debt - existing_monthly_debt
    
    # If already over DTI limit, no loan
    if available_monthly_payment <= 0:
        return {
            'recommended_max_loan': 0,
            'monthly_payment_capacity': 0,
            'reason': 'Debt-to-income ratio already at maximum'
        }
    
    # 4. Calculate max loan based on payment capacity (assuming 5% interest, 5 years)
    # Using loan payment formula: P = L * [r(1+r)^n] / [(1+r)^n - 1]
    # Solving for L: L = P * [(1+r)^n - 1] / [r(1+r)^n]
    monthly_rate = 0.05 / 12
    n_payments = 60  # 5 years
    
    max_loan_from_payment = available_monthly_payment * (
        ((1 + monthly_rate) ** n_payments - 1) / 
        (monthly_rate * (1 + monthly_rate) ** n_payments)
    )
    
    # 5. Calculate max loan based on income multiple
    max_loan_from_income = annual_income * risk_adjusted_multiplier * credit_adjustment
    
    # 6. Take the minimum of both approaches (most conservative)
    recommended_loan = min(max_loan_from_payment, max_loan_from_income)
    
    # 7. Apply absolute caps
    absolute_max = annual_income * 0.5  # Never more than 50% of annual income
    recommended_loan = min(recommended_loan, absolute_max)
    
    return {
        'recommended_max_loan': max(0, recommended_loan),
        'monthly_payment_capacity': available_monthly_payment,
        'risk_adjusted_multiplier': risk_adjusted_multiplier,
        'credit_adjustment': credit_adjustment,
        'calculation_method': 'conservative_min'
    }

def get_risk_tier_info(risk_score):
    """
    Categorize risk score into tiers with associated lending terms.
    """
    if risk_score < 0.3:
        return {
            'tier': 'Low Risk',
            'tier_code': 'A',
            'interest_rate_adjustment': 0.0,
            'approval_recommendation': 'AUTO_APPROVE',
            'description': 'Excellent credit profile, minimal risk'
        }
    elif risk_score < 0.5:
        return {
            'tier': 'Medium-Low Risk',
            'tier_code': 'B',
            'interest_rate_adjustment': 1.0,
            'approval_recommendation': 'APPROVE',
            'description': 'Good credit profile, standard terms'
        }
    elif risk_score < 0.65:
        return {
            'tier': 'Medium Risk',
            'tier_code': 'C',
            'interest_rate_adjustment': 2.0,
            'approval_recommendation': 'MANUAL_REVIEW',
            'description': 'Acceptable risk with conditions'
        }
    elif risk_score < 0.8:
        return {
            'tier': 'Medium-High Risk',
            'tier_code': 'D',
            'interest_rate_adjustment': 3.5,
            'approval_recommendation': 'MANUAL_REVIEW_REQUIRED',
            'description': 'Elevated risk, requires careful review'
        }
    else:
        return {
            'tier': 'High Risk',
            'tier_code': 'E',
            'interest_rate_adjustment': 5.0,
            'approval_recommendation': 'DECLINE',
            'description': 'Significant risk factors present'
        }

print("Loan recommendation functions defined")

Loan recommendation functions defined


## Test Complete System on Test Set

In [14]:
# Prepare test data with actual financial values
X_test_with_financials = X_test.copy()

# Get actual values from original data (reverse log transformation)
test_indices = X_test.index
X_test_with_financials['AnnualIncome_actual'] = df.loc[test_indices, 'AnnualIncome_log'].apply(np.exp)
X_test_with_financials['MonthlyDebt_actual'] = df.loc[test_indices, 'MonthlyDebtPayments_log'].apply(np.exp)
X_test_with_financials['CreditScore_actual'] = df.loc[test_indices, 'CreditScore']

# Generate predictions and recommendations for all test samples
results = []

print("Generating predictions and recommendations for test set...")
for idx in X_test.index[:10]:  # First 10 for display
    # Get single customer data
    customer_data = X_test.loc[[idx]]
    
    # Predict risk score
    risk_score = pipeline.predict(customer_data)[0]
    
    # Get financial data
    annual_income = df.loc[idx, 'AnnualIncome_log']
    annual_income = np.exp(annual_income)
    monthly_debt = df.loc[idx, 'MonthlyDebtPayments_log']
    monthly_debt = np.exp(monthly_debt)
    credit_score = df.loc[idx, 'CreditScore']
    
    # Get risk tier
    risk_info = get_risk_tier_info(risk_score)
    
    # Get loan recommendation
    loan_rec = recommend_loan_amount(
        risk_score=risk_score,
        annual_income=annual_income,
        existing_monthly_debt=monthly_debt,
        credit_score=credit_score
    )
    
    results.append({
        'customer_id': idx,
        'risk_score': risk_score,
        'actual_risk_score': y_test.loc[idx],
        'risk_tier': risk_info['tier'],
        'approval_decision': risk_info['approval_recommendation'],
        'recommended_loan': loan_rec['recommended_max_loan'],
        'monthly_capacity': loan_rec['monthly_payment_capacity'],
        'base_rate': 5.0,
        'adjusted_rate': 5.0 + risk_info['interest_rate_adjustment'],
        'annual_income': annual_income,
        'credit_score': credit_score
    })

# Create results DataFrame
results_df = pd.DataFrame(results)

print("\n" + "="*100)
print("SAMPLE PREDICTIONS & RECOMMENDATIONS")
print("="*100)
print(results_df.to_string(index=False))
print("="*100)

Generating predictions and recommendations for test set...

SAMPLE PREDICTIONS & RECOMMENDATIONS
 customer_id  risk_score  actual_risk_score        risk_tier      approval_decision  recommended_loan  monthly_capacity  base_rate  adjusted_rate  annual_income  credit_score
       10650    0.552706           0.637540      Medium Risk          MANUAL_REVIEW      1.622513e+07      5.283903e+06        5.0            7.0    151141501.0           604
        2041    0.634746           0.456004      Medium Risk          MANUAL_REVIEW      1.865409e+07      9.550527e+06        5.0            7.0    283731001.0           509
        8668    0.599361           0.576927      Medium Risk          MANUAL_REVIEW      7.926015e+06      3.275369e+06        5.0            7.0    109908001.0           483
        1114    0.711866           0.490964 Medium-High Risk MANUAL_REVIEW_REQUIRED      4.443329e+06      2.456430e+06        5.0            8.5     85672501.0           412
       13902    0.673522    

## Analyze Model Behavior

In [15]:
# Analyze prediction errors by risk level
y_pred_all = pipeline.predict(X_test)
errors = y_pred_all - y_test.values

underestimation_errors = errors[errors > 0]  # Predicted higher risk than actual
overestimation_errors = errors[errors < 0]   # Predicted lower risk than actual

print("\n" + "="*70)
print("ERROR ANALYSIS (Impact of Asymmetric Cost Training)")
print("="*70)
print(f"Total predictions: {len(errors)}")
print(f"\nUnderestimations (more conservative - good):")
print(f"  Count: {len(underestimation_errors)}")
print(f"  Mean error: {underestimation_errors.mean():.4f}")
print(f"  Max error: {underestimation_errors.max():.4f}")
print(f"\nOverestimations (less conservative - risky):")
print(f"  Count: {len(overestimation_errors)}")
print(f"  Mean error: {abs(overestimation_errors.mean()):.4f}")
print(f"  Max error: {abs(overestimation_errors.min()):.4f}")
print(f"\nRatio of under/over estimations: {len(underestimation_errors)/len(overestimation_errors):.2f}")
print("="*70)


ERROR ANALYSIS (Impact of Asymmetric Cost Training)
Total predictions: 4000

Underestimations (more conservative - good):
  Count: 3141
  Mean error: 0.1482
  Max error: 1.1216

Overestimations (less conservative - risky):
  Count: 859
  Mean error: 0.0792
  Max error: 0.5194

Ratio of under/over estimations: 3.66


## Save Complete System for Deployment

In [16]:
# Save the trained pipeline
with open('risk_assessment_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Save the recommendation functions and constants
deployment_config = {
    'c_over': C_OVER,
    'c_under': C_UNDER,
    'base_multiplier': 0.3,
    'max_dti_ratio': 0.43,
    'feature_names': numeric_features + categorical_features,
    'model_version': '1.0',
    'training_date': datetime.now().isoformat(),
    'mlflow_run_id': run_id
}

with open('deployment_config.pkl', 'wb') as f:
    pickle.dump(deployment_config, f)

print("\n" + "="*70)
print("MODEL ARTIFACTS SAVED")
print("="*70)
print("Files created:")
print("  1. risk_assessment_pipeline.pkl - Complete trained model")
print("  2. deployment_config.pkl - Configuration and metadata")
print("  3. reference_data.csv - Training data for drift detection")
print(f"\nMLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"MLflow run ID: {run_id}")
print("="*70)


MODEL ARTIFACTS SAVED
Files created:
  1. risk_assessment_pipeline.pkl - Complete trained model
  2. deployment_config.pkl - Configuration and metadata
  3. reference_data.csv - Training data for drift detection

MLflow tracking URI: sqlite:///mlflow.db
MLflow run ID: 48cb6796e53d4a2393ad8fd1fb5c7250


## Create Deployment-Ready Prediction Function

In [17]:
def predict_customer_risk_and_loan(customer_data_dict):
    """
    Production-ready function for risk assessment and loan recommendation.
    
    Parameters:
    -----------
    customer_data_dict : dict
        Dictionary with customer features (same as training features)
    
    Returns:
    --------
    dict with risk assessment and loan recommendation
    """
    # Convert to DataFrame
    customer_df = pd.DataFrame([customer_data_dict])
    
    # Predict risk score
    risk_score = pipeline.predict(customer_df)[0]
    
    # Get risk tier information
    risk_info = get_risk_tier_info(risk_score)
    
    # Extract financial data
    annual_income = np.exp(customer_data_dict['AnnualIncome_log'])
    monthly_debt = np.exp(customer_data_dict['MonthlyDebtPayments_log'])
    credit_score = customer_data_dict['CreditScore']
    
    # Get loan recommendation
    loan_recommendation = recommend_loan_amount(
        risk_score=risk_score,
        annual_income=annual_income,
        existing_monthly_debt=monthly_debt,
        credit_score=credit_score
    )
    
    # Compile complete response
    return {
        'risk_assessment': {
            'risk_score': float(risk_score),
            'risk_tier': risk_info['tier'],
            'risk_tier_code': risk_info['tier_code'],
            'description': risk_info['description']
        },
        'loan_recommendation': {
            'max_approved_amount': float(loan_recommendation['recommended_max_loan']),
            'monthly_payment_capacity': float(loan_recommendation['monthly_payment_capacity']),
            'estimated_monthly_payment': float(loan_recommendation['monthly_payment_capacity'] * 0.9)  # 90% utilization
        },
        'lending_terms': {
            'base_interest_rate': 5.0,
            'risk_adjusted_rate': 5.0 + risk_info['interest_rate_adjustment'],
            'approval_decision': risk_info['approval_recommendation']
        },
        'metadata': {
            'model_version': '1.0',
            'prediction_timestamp': datetime.now().isoformat(),
            'annual_income': float(annual_income),
            'credit_score': int(credit_score)
        }
    }

# Test the function
test_customer = X_test.iloc[0].to_dict()
test_result = predict_customer_risk_and_loan(test_customer)

print("\n" + "="*70)
print("DEPLOYMENT FUNCTION TEST")
print("="*70)
import json
print(json.dumps(test_result, indent=2))
print("="*70)
print("\nFunction ready for deployment!")


DEPLOYMENT FUNCTION TEST
{
  "risk_assessment": {
    "risk_score": 0.5527059435107059,
    "risk_tier": "Medium Risk",
    "risk_tier_code": "C",
    "description": "Acceptable risk with conditions"
  },
  "loan_recommendation": {
    "max_approved_amount": 16225126.82068097,
    "monthly_payment_capacity": 5283902.785833335,
    "estimated_monthly_payment": 4755512.507250002
  },
  "lending_terms": {
    "base_interest_rate": 5.0,
    "risk_adjusted_rate": 7.0,
    "approval_decision": "MANUAL_REVIEW"
  },
  "metadata": {
    "model_version": "1.0",
    "prediction_timestamp": "2025-12-18T12:56:58.465438",
    "annual_income": 151141501.00000003,
    "credit_score": 604
  }
}

Function ready for deployment!


In [18]:
# # Train the model
# pipeline.fit(X_train, y_train)