In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import time
import joblib
import warnings
warnings.filterwarnings('ignore')

# Sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import (roc_auc_score, roc_curve, confusion_matrix, 
                            classification_report, precision_recall_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# TimeCost Gradient Machine (TCGM)
from tcgm import TimeCostGradientMachine
from tcgm.metrics import evaluate_financial_performance, compute_expected_monetary_loss

In [2]:
# Load the credit-scoring dataset from the working directory and preview
# the first five rows.
df = pd.read_csv('CreditScore.csv')
df.head(5)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,IncomePerDependent,TotalDelinquencies,RevolvingUtilization_Squared,AgeGroup,DebtToIncomeRatio,HasDelinquency,SeriousDlqin2yrs
0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0,3040.0,2,0.58695,3.0,7323.197016,1,1
1,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0,1300.0,0,0.916138,2.0,316.878123,0,0
2,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0,3042.0,2,0.433201,2.0,258.914887,1,0
3,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0,3300.0,0,0.054667,1.0,118.963951,0,0
4,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0,63588.0,1,0.823083,3.0,1584.975094,1,0


## Financial Cost Parameters

In [3]:
# Misclassification costs (in dollars per decision) and loss-given-default.
#
# Labels as used throughout this notebook:
#   "False Positive" (FP): approve a bad loan          - VERY COSTLY
#   "False Negative" (FN): reject a good customer      - lost opportunity
#
# NOTE(review): the target is SeriousDlqin2yrs, so with 1 as the positive
# class (the convention sklearn's metrics use) an approved defaulter would
# normally be counted as a false NEGATIVE, not a false positive. Confirm which
# convention tcgm's cost_fp / cost_fn follow before relying on these values.

COST_FP = 500.0  # Cost of approving a defaulter (loan loss)
COST_FN = 50.0   # Cost of rejecting a good customer (opportunity cost)
LGD = 0.60       # Loss Given Default (60% of exposure)

print(f"Cost of False Positive (Bad Loan Approved): ${COST_FP}")
print(f"Cost of False Negative (Good Customer Rejected): ${COST_FN}")
print(f"Loss Given Default (LGD): {LGD*100}%")
print(f"\nCost Ratio (FP:FN): {COST_FP/COST_FN}:1")
# Derive the multiplier from the constants so the message cannot go stale
# when either cost is tuned (prints "10x" for the values above).
print(f"This reflects that approving a bad loan is {COST_FP/COST_FN:g}x more costly than rejecting a good customer")

Cost of False Positive (Bad Loan Approved): $500.0
Cost of False Negative (Good Customer Rejected): $50.0
Loss Given Default (LGD): 60.0%

Cost Ratio (FP:FN): 10.0:1
This reflects that approving a bad loan is 10x more costly than rejecting a good customer


## Split features and target

In [4]:
# Separate features and target.
# The feature matrix keeps every column except the target and 'AgeGroup'.
target_col = 'SeriousDlqin2yrs'
if target_col not in df.columns:
    # Fail fast: every later cell needs X and y, so merely printing a warning
    # here would only resurface as a confusing NameError further down.
    raise KeyError(f"Target column {target_col!r} not found in df.columns")

X = df.drop(columns=[target_col, 'AgeGroup'])
y = df[target_col]

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Class distribution: {y.value_counts().to_dict()}")

Features shape: (150000, 15)
Target shape: (150000,)
Class distribution: {0: 139974, 1: 10026}


In [5]:
# Count missing values per feature column (isna is the canonical spelling of isnull).
X.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
IncomePerDependent                      0
TotalDelinquencies                      0
RevolvingUtilization_Squared            0
DebtToIncomeRatio                       0
HasDelinquency                          0
dtype: int64

## Train-Validation Split

In [6]:
# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed seed so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")
print(f"Training set class distribution: {y_train.value_counts(normalize=True).to_dict()}")

Training set size: (120000, 15)
Validation set size: (30000, 15)
Training set class distribution: {0: 0.9331583333333333, 1: 0.06684166666666666}


In [7]:
# Calculate exposure from MonthlyIncome: three times each validation row's value.
# NOTE(review): `exposure` is reassigned in the Expected Monetary Loss section
# (MonthlyIncome * 12 * 0.5) before anything reads this value, so the two
# definitions disagree and this one is currently unused - confirm which
# multiplier is intended.
exposure = X_val["MonthlyIncome"].values * 3
# The status marker was stored mis-encoded; restored to the intended check mark.
print(f"✅ Exposure calculated: {len(exposure)} values")

✅ Exposure calculated: 30000 values


## Baseline Model - Logistic Regression

In [8]:
# Scale features for logistic regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [9]:
# Train logistic regression
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr_model.fit(X_train_scaled, y_train)

In [10]:
# Predictions
y_train_pred_lr = lr_model.predict_proba(X_train_scaled)[:, 1]
y_val_pred_lr = lr_model.predict_proba(X_val_scaled)[:, 1]

In [11]:
# Evaluate
train_auc_lr = roc_auc_score(y_train, y_train_pred_lr)
val_auc_lr = roc_auc_score(y_val, y_val_pred_lr)

print(f"Logistic Regression Train AUC: {train_auc_lr:.4f}")
print(f"Logistic Regression Validation AUC: {val_auc_lr:.4f}")

Logistic Regression Train AUC: 0.8542
Logistic Regression Validation AUC: 0.8604


## TIMECOST GRADIENT MACHINE (TCGM)

In [12]:
# Print the TCGM feature summary.
# The bullet markers were stored mis-encoded; restored to the intended check mark.
print("\nTCGM Key Features:")
print("✓ Time-Aware Gradient Flow")
print("✓ Cost-Sensitive Optimization")
print("✓ Asymmetric Risk Handling")
print("✓ Built-in Boosting Dynamics")
print("✓ Financial Loss Minimization")


TCGM Key Features:
✓ Time-Aware Gradient Flow
✓ Cost-Sensitive Optimization
✓ Asymmetric Risk Handling
✓ Built-in Boosting Dynamics
✓ Financial Loss Minimization


## Initialize TCGM with cost parameters

In [13]:
# Hyperparameters for the TimeCost Gradient Machine. The two misclassification
# costs are taken from the financial cost parameters defined earlier.
tcgm_params = {
    "n_estimators": 80,         # Number of boosting iterations
    "learning_rate": 0.1,       # Step size for gradient updates
    "max_depth": 5,             # Tree depth
    "min_samples_leaf": 30,     # Minimum samples per leaf (regulatory constraint)
    "cost_fp": COST_FP,         # Cost of false positive
    "cost_fn": COST_FN,         # Cost of false negative
    "random_state": 42,
}
tcgm_model = TimeCostGradientMachine(**tcgm_params)

# Echo the configuration as stored on the model instance.
print(f"\nTCGM Configuration:")
print(f"  n_estimators: {tcgm_model.n_estimators}")
print(f"  learning_rate: {tcgm_model.learning_rate}")
print(f"  max_depth: {tcgm_model.max_depth}")
print(f"  min_samples_leaf: {tcgm_model.min_samples_leaf}")
print(f"  cost_fp: ${tcgm_model.cost_fp}")
print(f"  cost_fn: ${tcgm_model.cost_fn}")


TCGM Configuration:
  n_estimators: 80
  learning_rate: 0.1
  max_depth: 5
  min_samples_leaf: 30
  cost_fp: $500.0
  cost_fn: $50.0


In [14]:
# Train TCGM
start_time = time.time()

tcgm_model.fit(X_train_scaled, y_train)

training_time = time.time() - start_time
print(f"‚úÖ Training completed in {training_time:.2f} seconds")

✅ Training completed in 135.75 seconds


In [15]:
# Predictions
y_train_pred_tcgm = tcgm_model.predict_proba(X_train_scaled)[:, 1]
y_val_pred_tcgm = tcgm_model.predict_proba(X_val_scaled)[:, 1]

In [16]:
# Evaluate AUC
train_auc_tcgm = roc_auc_score(y_train, y_train_pred_tcgm)
val_auc_tcgm = roc_auc_score(y_val, y_val_pred_tcgm)

print(f"\nTCGM Train AUC: {train_auc_tcgm:.4f}")
print(f"TCGM Validation AUC: {val_auc_tcgm:.4f}")


TCGM Train AUC: 0.8574
TCGM Validation AUC: 0.8540


In [17]:
# Evaluate using TCGM's financial metrics
print("\nEvaluating TCGM with Financial Performance Metrics...")
financial_report = evaluate_financial_performance(
    y_val,
    y_val_pred_tcgm,
    cost_fp=COST_FP,
    cost_fn=COST_FN
)

print("\nFinancial Performance Report:")
print(financial_report)


Evaluating TCGM with Financial Performance Metrics...

Financial Performance Report:
{'AUC': 0.8540461045991916, 'Brier': 0.06652425984665152, 'Expected_Loss': 3.4703427542687297}


## Expected Monetary Loss Analysis

In [18]:
# Build the per-row exposure for the validation set, in order of preference:
#   1. an explicit CreditExposure column,
#   2. MonthlyIncome * 12 * 0.5 (i.e. six months of income),
#   3. a flat default of 5000 for every row.
# NOTE(review): this replaces the MonthlyIncome * 3 exposure computed right
# after the train/validation split - confirm which multiplier is intended.
# The status markers in the messages were stored mis-encoded; restored here.
if 'CreditExposure' in X_val.columns:
    exposure = X_val["CreditExposure"].values
    print("✅ Using CreditExposure from validation set")
elif 'MonthlyIncome' in X_val.columns:
    exposure = X_val["MonthlyIncome"].values * 12 * 0.5
    print("⚠️ Calculated exposure from MonthlyIncome (validation set)")
else:
    exposure = np.full(len(X_val), 5000)
    print("⚠️ Using default exposure")

# Verify sizes match: labels, exposures and predicted probabilities must be
# aligned row-for-row before the loss computation.
print(f"y_val length: {len(y_val)}")
print(f"exposure length: {len(exposure)}")
print(f"y_val_pred_tcgm length: {len(y_val_pred_tcgm)}")
assert len(exposure) == len(y_val) == len(y_val_pred_tcgm), (
    "y_val, exposure and y_val_pred_tcgm must have the same length"
)

# The expected monetary loss itself is computed in the next cell; the
# identical call that used to sit here was redundant (its result was
# overwritten before being read).

⚠️ Calculated exposure from MonthlyIncome (validation set)
y_val length: 30000
exposure length: 30000
y_val_pred_tcgm length: 30000


In [19]:
# Compute Expected Monetary Loss
print("\nComputing Expected Monetary Loss across all thresholds...")
eml = compute_expected_monetary_loss(
    y_true=y_val,
    y_prob=y_val_pred_tcgm,
    exposure=exposure,
    lgd=LGD,
    cost_fp=COST_FP
)

print(f"\nüí° Optimal Decision Threshold: {eml['best_threshold']:.4f}")
print(f"üíµ Minimum Expected Loss: ${eml['best_loss']:,.2f}")


Computing Expected Monetary Loss across all thresholds...

💡 Optimal Decision Threshold: 0.0100
💵 Minimum Expected Loss: $39,303,041.20


In [20]:
# Save model
# Persist the trained TCGM model together with the fitted scaler: the model
# was trained on standardized features, so both artifacts are needed to score
# new data. (joblib is already imported in the notebook's top import cell, so
# it is not re-imported here.)
joblib.dump(tcgm_model, 'tcgm_credit_scoring_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model saved successfully!")

Model saved successfully!
