### ML Model for Trial 01
Gradient Boosting (LightGBM or XGBoost)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [3]:
# Load the raw training and test tables from the shared dataset folder.
train_df, test_df = (
    pd.read_csv("../dataset/train.csv"),
    pd.read_csv("../dataset/test.csv"),
)

In [5]:
def create_features(df, dti_threshold=0.15, credit_score_threshold=650, interest_rate_threshold=13):
    """Return a copy of ``df`` with engineered loan features appended.

    The input frame is never modified. The following columns are added, in
    this order:

    - ``income_to_loan_ratio``: ``annual_income / (loan_amount + 1)``; the
      ``+ 1`` keeps the denominator non-zero for a zero ``loan_amount``.
    - ``loan_exceeds_income``: 1 when ``loan_amount > annual_income``, else 0.
    - ``monthly_income``: ``annual_income / 12``.
    - ``high_dti``: 1 when ``debt_to_income_ratio > dti_threshold``, else 0.
    - ``low_credit_score``: 1 when ``credit_score < credit_score_threshold``,
      else 0.
    - ``high_interest_rate``: 1 when ``interest_rate > interest_rate_threshold``,
      else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``annual_income``, ``loan_amount``,
        ``debt_to_income_ratio``, ``credit_score`` and ``interest_rate``.
    dti_threshold : float, default 0.15
        Strict lower bound above which a debt-to-income ratio is flagged.
    credit_score_threshold : float, default 650
        Strict upper bound below which a credit score is flagged.
    interest_rate_threshold : float, default 13
        Strict lower bound above which an interest rate is flagged.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the six features above.

    Raises
    ------
    KeyError
        If any of the required input columns is missing.
    """
    df_cpy = df.copy()

    # Affordability: how income compares with the requested amount.
    df_cpy['income_to_loan_ratio'] = df_cpy['annual_income'] / (df_cpy['loan_amount'] + 1)
    df_cpy['loan_exceeds_income'] = (df_cpy['loan_amount'] > df_cpy['annual_income']).astype(int)

    df_cpy['monthly_income'] = df_cpy['annual_income'] / 12

    # Binary risk flags; the cut-offs are parameters so they can be tuned
    # without editing this function.
    df_cpy['high_dti'] = (df_cpy['debt_to_income_ratio'] > dti_threshold).astype(int)
    df_cpy['low_credit_score'] = (df_cpy['credit_score'] < credit_score_threshold).astype(int)
    df_cpy['high_interest_rate'] = (df_cpy['interest_rate'] > interest_rate_threshold).astype(int)

    return df_cpy

In [8]:
# Run the same feature engineering on both splits so that train and test
# receive identical engineered columns.
train_df, test_df = map(create_features, (train_df, test_df))

In [34]:
# Numeric model inputs: every numeric column except the target and row
# identifiers.
#
# The exclusion is spelled out by name on purpose. A substring test such as
# `'id' not in col.lower()` only keeps the target out of the feature list by
# coincidence ("loan_paid_back" contains "id" inside "paid") and would also
# silently drop any legitimate feature whose name happens to contain "id".
TARGET_COLUMN = 'loan_paid_back'
numerical_features = [
    col
    for col in train_df.select_dtypes(include='number').columns.tolist()
    if col != TARGET_COLUMN
    and col.lower() != 'id'
    and not col.lower().endswith('_id')
]

# Categorical model inputs: all object-dtype (string) columns.
categorical_features = train_df.select_dtypes(include='object').columns.to_list()

In [25]:
# Split the frames into model inputs and target.
# Feature order is numeric columns first, then categorical columns.
all_features = [*numerical_features, *categorical_features]
X, X_test = (frame[all_features].copy() for frame in (train_df, test_df))
y = train_df['loan_paid_back'].copy()

In [26]:
# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [36]:
# Cast every categorical column to pandas' `category` dtype in both splits.
X_train = X_train.astype({name: 'category' for name in categorical_features})
X_val = X_val.astype({name: 'category' for name in categorical_features})

In [38]:
# Report the size of each split together with its class proportions.
for split_label, features, target in (("\nTrain", X_train, y_train), ("Val", X_val, y_val)):
    class_shares = target.value_counts(normalize=True).to_dict()
    print(f"{split_label} set: {features.shape}, Target: {class_shares}")


Train set: (475195, 17), Target: {1.0: 0.7988194320226434, 0.0: 0.20118056797735667}
Val set: (118799, 17), Target: {1.0: 0.7988198553859881, 0.0: 0.2011801446140119}


In [39]:
# LightGBM training configuration (binary classification, evaluated by AUC).
params_with_grade = dict(
    # Task and evaluation metric
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    # Tree size and step size
    num_leaves=31,
    learning_rate=0.05,
    # Column / row subsampling
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    # Logging and reproducibility
    verbose=-1,
    random_state=42,
    # Handle the roughly 79-21 class imbalance
    is_unbalance=True,
)

In [40]:
# LightGBM datasets for model 1 (all features, including grade_subgrade).
lgb_train = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

# The validation dataset is created with the training dataset as its reference.
lgb_val = lgb.Dataset(
    data=X_val,
    label=y_val,
    reference=lgb_train,
    categorical_feature=categorical_features,
    free_raw_data=False,
)

In [41]:
# Fit model 1: up to 1000 boosting rounds, stopping early after 50 rounds
# without improvement, with metrics logged every 100 rounds.
print("Training Model 1 (with grade_subgrade)...")
model_with_grades = lgb.train(
    params=params_with_grade,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train', 'valid'],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
)

Training Model 1 (with grade_subgrade)...
Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.919586	valid's auc: 0.917782
[200]	train's auc: 0.92318	valid's auc: 0.91944
[300]	train's auc: 0.925887	valid's auc: 0.920129
[400]	train's auc: 0.928106	valid's auc: 0.920519
[500]	train's auc: 0.930185	valid's auc: 0.920803
[600]	train's auc: 0.932025	valid's auc: 0.920988
[700]	train's auc: 0.933815	valid's auc: 0.921099
Early stopping, best iteration is:
[725]	train's auc: 0.934251	valid's auc: 0.921103


In [42]:
# Score both splits with model 1, using the iteration selected by early stopping.
y_pred_train_1, y_pred_val_1 = (
    model_with_grades.predict(features, num_iteration=model_with_grades.best_iteration)
    for features in (X_train, X_val)
)

In [43]:
# ROC-AUC of model 1 on the training and validation splits.
train_auc_1, val_auc_1 = (
    roc_auc_score(y_true, y_score)
    for y_true, y_score in ((y_train, y_pred_train_1), (y_val, y_pred_val_1))
)

In [44]:
# Print the model 1 scorecard; "Overfitting" is the train-minus-validation AUC gap.
print("\n".join([
    f"\n{'=' * 70}",
    "MODEL 1 RESULTS (WITH grade_subgrade)",
    '=' * 70,
    f"Train ROC-AUC: {train_auc_1:.4f}",
    f"Val ROC-AUC: {val_auc_1:.4f}",
    f"Overfitting: {train_auc_1 - val_auc_1:.4f}",
]))


MODEL 1 RESULTS (WITH grade_subgrade)
Train ROC-AUC: 0.9343
Val ROC-AUC: 0.9211
Overfitting: 0.0131


In [45]:
# Gain-based feature importance for model 1, most important feature first.
feature_importance_1 = pd.DataFrame(
    {
        'feature': model_with_grades.feature_name(),
        'importance': model_with_grades.feature_importance(importance_type='gain'),
    }
).sort_values(by='importance', ascending=False)

In [46]:
# Display the gain-based importance table for model 1 (sorted by descending gain).
feature_importance_1

Unnamed: 0,feature,importance
14,employment_status,2515647.0
1,debt_to_income_ratio,930185.5
2,credit_score,512263.5
8,high_dti,145479.3
16,grade_subgrade,93868.34
3,loan_amount,57709.43
4,interest_rate,56939.0
0,annual_income,48284.13
5,income_to_loan_ratio,37101.26
9,low_credit_score,17622.44


In [47]:
# Pull the gain importance of grade_subgrade (case-insensitive name match).
# The result is an array, which is empty when the feature is not in the table.
grade_importance = feature_importance_1.loc[
    feature_importance_1['feature'].str.lower().eq('grade_subgrade'),
    'importance',
].values

In [None]:
# Share of the total gain importance attributed to grade_subgrade, in percent.
# A share above 50% is treated as a warning sign that the model leans on this
# one feature (possible target leakage).
if len(grade_importance) == 0:
    # The feature is missing from the importance table, so there is nothing to
    # index; report that instead of failing with an IndexError.
    grade_pct = 0.0
    print("grade_subgrade not found in feature importances; skipping leakage check")
else:
    grade_pct = (grade_importance[0] / feature_importance_1['importance'].sum()) * 100
    print(f"grade_subgrade importance: {grade_pct:.1f}% of total")
    if grade_pct > 50:
        print(" → POTENTIAL LEAKAGE: Model relies heavily on grade_subgrade!")

grade_subgrade importance: 2.1% of total


In [51]:
# Ablation setup: the same feature lists and splits, minus grade_subgrade.
features_no_grade = [name for name in all_features if name != 'grade_subgrade']
categorical_no_grade = [name for name in categorical_features if name != 'grade_subgrade']

X_train_no_grade, X_val_no_grade = (
    frame[features_no_grade] for frame in (X_train, X_val)
)

In [52]:
# LightGBM datasets for model 2 (grade_subgrade removed).
lgb_train_2 = lgb.Dataset(
    data=X_train_no_grade,
    label=y_train,
    categorical_feature=categorical_no_grade,
    free_raw_data=False,
)

# The validation dataset is created with the training dataset as its reference.
lgb_val_2 = lgb.Dataset(
    data=X_val_no_grade,
    label=y_val,
    reference=lgb_train_2,
    categorical_feature=categorical_no_grade,
    free_raw_data=False,
)

In [53]:
# Fit model 2 with the same hyperparameters and training schedule as model 1,
# so the only difference between the two runs is the missing grade_subgrade.
print("Training Model 2 (without grade_subgrade)...")
model_no_grade = lgb.train(
    params=params_with_grade,
    train_set=lgb_train_2,
    num_boost_round=1000,
    valid_sets=[lgb_train_2, lgb_val_2],
    valid_names=['train', 'valid'],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
)

Training Model 2 (without grade_subgrade)...
Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.919584	valid's auc: 0.917966
[200]	train's auc: 0.922951	valid's auc: 0.919806
[300]	train's auc: 0.925472	valid's auc: 0.920514
[400]	train's auc: 0.927568	valid's auc: 0.92085
[500]	train's auc: 0.929493	valid's auc: 0.921137
[600]	train's auc: 0.931319	valid's auc: 0.921355
[700]	train's auc: 0.932987	valid's auc: 0.921586
[800]	train's auc: 0.93453	valid's auc: 0.921608
Early stopping, best iteration is:
[777]	train's auc: 0.934185	valid's auc: 0.921618


In [54]:
# Score both splits with model 2, using the iteration selected by early stopping.
y_pred_train_2, y_pred_val_2 = (
    model_no_grade.predict(features, num_iteration=model_no_grade.best_iteration)
    for features in (X_train_no_grade, X_val_no_grade)
)

# ROC-AUC of model 2 on the training and validation splits.
train_auc_2, val_auc_2 = (
    roc_auc_score(y_true, y_score)
    for y_true, y_score in ((y_train, y_pred_train_2), (y_val, y_pred_val_2))
)

# Scorecard; "Overfitting" is the train-minus-validation AUC gap.
print("\n".join([
    f"\n{'=' * 70}",
    "MODEL 2 RESULTS (WITHOUT grade_subgrade)",
    '=' * 70,
    f"Train ROC-AUC: {train_auc_2:.4f}",
    f"Val ROC-AUC: {val_auc_2:.4f}",
    f"Overfitting: {train_auc_2 - val_auc_2:.4f}",
]))


MODEL 2 RESULTS (WITHOUT grade_subgrade)
Train ROC-AUC: 0.9342
Val ROC-AUC: 0.9216
Overfitting: 0.0126


In [55]:
# Gain-based feature importance for model 2, most important feature first.
feature_importance_2 = pd.DataFrame(
    {
        'feature': model_no_grade.feature_name(),
        'importance': model_no_grade.feature_importance(importance_type='gain'),
    }
).sort_values(by='importance', ascending=False)

# Show the ten highest-gain features as plain text, without the index column.
print("\nTop 10 Most Important Features:")
print(feature_importance_2.head(10).to_string(index=False))


Top 10 Most Important Features:
             feature   importance
   employment_status 2.515410e+06
debt_to_income_ratio 1.060086e+06
        credit_score 5.247495e+05
       interest_rate 6.758200e+04
    low_credit_score 6.573700e+04
         loan_amount 6.438220e+04
       annual_income 5.514700e+04
            high_dti 4.954648e+04
income_to_loan_ratio 4.104546e+04
        loan_purpose 1.677982e+04


In [56]:
# Side-by-side summary of the two models.
print(f"\n{'=' * 70}")
print("STEP 6: MODEL COMPARISON")
print('=' * 70)

# "Overfitting" is train AUC minus validation AUC. "AUC Drop" is how much
# validation AUC falls when grade_subgrade is removed, so a negative value
# means the model without it scored higher.
comparison = pd.DataFrame(
    {
        'Model': ['With grade_subgrade', 'Without grade_subgrade'],
        'Train AUC': [train_auc_1, train_auc_2],
        'Val AUC': [val_auc_1, val_auc_2],
        'Overfitting': [train_auc_1 - val_auc_1, train_auc_2 - val_auc_2],
        'AUC Drop': [0, val_auc_1 - val_auc_2],
    }
)

print(comparison.to_string(index=False))


STEP 6: MODEL COMPARISON
                 Model  Train AUC  Val AUC  Overfitting  AUC Drop
   With grade_subgrade   0.934251 0.921103     0.013148  0.000000
Without grade_subgrade   0.934185 0.921618     0.012568 -0.000515


In [61]:
# Rebuild the test feature matrix from test_df so inference starts from a
# fresh copy of the model input columns.
X_test = test_df.loc[:, all_features].copy()

In [64]:
# Select the model 2 feature set for the test rows and cast its categorical
# columns to `category`, mirroring how the training frames were prepared.
X_test_no_grade = X_test[features_no_grade].astype(
    {name: 'category' for name in categorical_no_grade}
)

In [65]:
# Predict on the test set with model 2 (trained without grade_subgrade), using
# the iteration selected by early stopping.
y_test_pred = model_no_grade.predict(
    X_test_no_grade,
    num_iteration=model_no_grade.best_iteration,
)

In [77]:
# Build the submission table: one row per test id, paired with the model's
# raw prediction score (no thresholding is applied).
submission = test_df[['id']].assign(loan_paid_back=y_test_pred)
submission.to_csv('submission.csv', index=False)

In [78]:
# Read the submission file back from disk so its contents can be inspected.
temp = pd.read_csv('submission.csv')

In [79]:
# Preview the first rows of the re-loaded submission file.
temp.head()

Unnamed: 0,id,loan_paid_back
0,593994,0.819781
1,593995,0.919589
2,593996,0.199212
3,593997,0.749866
4,593998,0.862908
