In [1]:
# pip install --no-cache-dir tcgm==0.1.4

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from tcgm import TCGMRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tcgm.metrics import evaluate_regression_cost, asymmetric_mae_loss

In [3]:
# Load the loan dataset (path is relative to the notebook's working directory)
# and preview the first rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='LoanAnalysis.csv')
df.head(n=5)

Unnamed: 0,ApplicationDate,Age,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,...,CheckingAccountBalance_log,MonthlyLoanPayment_log,LoanAmount_log,MonthlyDebtPayments_log,DTI_norm,Util_norm,LoanToIncome,LTI_norm,EmploymentRisk,DomainRiskScore
0,2018-01-01,45,617,Employed,Master,22,48,Married,2,Own,...,14.404963,13.353015,16.79755,12.52271,0.597226,0.393798,0.937962,0.937962,0.5,0.616969
1,2018-01-02,38,628,Employed,Associate,15,48,Single,1,Mortgage,...,15.462244,13.990373,17.480802,13.519798,0.550456,0.097586,0.976442,0.976442,0.5,0.536166
2,2018-01-03,47,570,Employed,Bachelor,26,36,Married,2,Rent,...,14.110045,13.815122,17.090408,14.117836,0.407882,0.152682,0.953291,0.953291,0.5,0.494252
3,2018-01-04,58,545,Employed,High School,34,96,Single,1,Mortgage,...,14.417365,14.267389,17.855874,13.939939,0.727074,0.297319,0.967468,0.967468,0.5,0.645673
4,2018-01-05,37,594,Employed,Associate,17,36,Married,1,Mortgage,...,15.826606,13.112858,16.438439,12.926351,0.131474,0.35615,0.871684,0.871684,0.5,0.427974


In [4]:
# Numeric columns that the preprocessing step standardizes.
# The final list keeps the exact column order: plain columns first,
# then the columns carrying the "_log" suffix.
untransformed_numeric = [
    'Age', 'Experience', 'JobTenure',
    'CreditScore', 'PaymentHistory', 'LengthOfCreditHistory',
    'NumberOfOpenCreditLines', 'NumberOfCreditInquiries',
    'PreviousLoanDefaults', 'BankruptcyHistory',
    'UtilityBillsPaymentHistory',
    'LoanDuration', 'BaseInterestRate', 'InterestRate',
    'TotalDebtToIncomeRatio',
]

log_suffixed_numeric = [
    'MonthlyIncome_log', 'AnnualIncome_log',
    'SavingsAccountBalance_log', 'CheckingAccountBalance_log',
    'NetWorth_log', 'TotalAssets_log', 'TotalLiabilities_log',
    'MonthlyLoanPayment_log', 'LoanAmount_log', 'MonthlyDebtPayments_log',
]

numeric_features = untransformed_numeric + log_suffixed_numeric

In [5]:
# String-valued columns that the preprocessing step one-hot encodes.
categorical_features = [
    'EmploymentStatus', 'EducationLevel', 'MaritalStatus',
    'HomeOwnershipStatus', 'LoanPurpose',
]

In [6]:
# Columns removed from the feature matrix, kept in two groups.
# Date and other columns excluded from the feature set:
excluded_columns = ['ApplicationDate', 'LoanApproved', 'RiskScore']

# Domain score construction components
domain_score_components = [
    'DebtToIncomeRatio',
    'CreditCardUtilizationRate',
    'LoanToIncome',
    'DTI_norm',
    'Util_norm',
    'LTI_norm',
    'EmploymentRisk',
]

drop_columns = [*excluded_columns, *domain_score_components]

# The target is separated from the features and never left inside X.
target_column = 'DomainRiskScore'
X = df.drop(columns=[*drop_columns, target_column])
y = df[target_column]

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Per-column-type transformers used by the preprocessing step.
numeric_transformer = StandardScaler()

# Dense (non-sparse) one-hot output; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time.
encoder_options = {'handle_unknown': 'ignore', 'sparse_output': False}
categorical_transformer = OneHotEncoder(**encoder_options)

In [9]:
# Route each feature list to its transformer. Any column that appears in
# neither list is discarded (remainder='drop').
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

In [10]:
# TCGM regressor, with its hyperparameters gathered in one place.
model_params = dict(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    random_state=42,
)
model = TCGMRegressor(**model_params)

# Chain preprocessing and the regressor so they are fitted and applied together.
pipeline = Pipeline(steps=[('preprocessing', preprocessor), ('model', model)])

In [11]:
# Fit the preprocessing step and the regressor on the training split.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
pipeline.fit(X=X_train, y=y_train)

In [12]:
# Exposure proxy taken from the loan amount: exponentiate LoanAmount_log
# (assumes the column holds a natural log — TODO confirm against the
# feature-engineering step that produced it).
# assign() returns a new frame, so the frame produced by the split is not
# modified in place. The preprocessor only selects its listed columns, so
# this extra column does not reach the model.
X_test = X_test.assign(
    exposure=np.exp(df.loc[X_test.index, "LoanAmount_log"])
)

In [13]:
# Predict the target for the held-out rows with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

# Symmetric sklearn metrics, currently disabled; uncomment to report them.
# mae = mean_absolute_error(y_test, y_pred)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# r2 = r2_score(y_test, y_pred)

# print(f"MAE:  {mae:.4f}")
# print(f"RMSE: {rmse:.4f}")
# print(f"R²:   {r2:.4f}")

In [17]:
# Cost-aware evaluation of the held-out predictions.
# c_under is set to five times c_over; presumably this weights
# under-prediction more heavily — confirm against the tcgm documentation.
cost_settings = {"c_over": 1.0, "c_under": 5.0}
report = evaluate_regression_cost(y_true=y_test, y_pred=y_pred, **cost_settings)

# Print every reported metric with four decimal places.
for metric_name, metric_value in report.items():
    print(f"{metric_name}: {metric_value:.4f}")

Asymmetric_MAE: 0.2587
MAE: 0.0829
RMSE: 0.1019
