# Predicting Loan Payback
### Playground Series - Season 5, Episode 11

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from tqdm import tqdm
from itertools import combinations
import gc

import xgboost as xgb

from sklearn.manifold import TSNE as sklearn_TSNE

import optuna
import torch
import copy
import itertools
import warnings

warnings.filterwarnings('ignore')

In [None]:
import os

# Print the path of every file found anywhere under the input directory.
for root, _subdirs, files in os.walk('./input/'):
    for fname in files:
        print(os.path.join(root, fname))

In [44]:
# Load the training CSV, using its `id` column as the row index.
train = pd.read_csv("./input/train.csv", index_col='id')
# Preview the first rows (rendered as the cell output).
train.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [46]:
# Load the test CSV, using its `id` column as the row index.
test = pd.read_csv("./input/test.csv", index_col='id')
# Preview the first rows (rendered as the cell output).
test.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


In [27]:
# Load the additional 20k-row loan dataset; no index column is specified, so
# it keeps pandas' default positional index.
orig = pd.read_csv('./input/archive/loan_dataset_20000.csv', delimiter=',')
# Preview the first rows (rendered as the cell output).
orig.head()

Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,1
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,1
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,1
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,1
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,1


In [48]:
# Label column to predict.
TARGET = 'loan_paid_back'

# Numeric input columns.
NUMS = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

# Categorical input columns.
CATS = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]

In [50]:
# Cast every categorical column to ONE shared dtype per column.
#
# Calling `.astype('category')` on each frame separately derives the category
# list (and therefore the integer codes) from that frame alone. If one frame
# lacks a level, the same label ends up with different codes in train / test /
# orig, and concatenating frames whose category sets differ falls back to
# `object` dtype. Building the dtype from the union of observed levels keeps
# the codes aligned; when all frames already share the same levels the result
# is identical to the per-frame cast (categories are sorted in both cases).
for _cat_col in CATS:
    _levels = sorted(
        set(train[_cat_col].dropna())
        | set(test[_cat_col].dropna())
        | set(orig[_cat_col].dropna())
    )
    _shared_dtype = pd.CategoricalDtype(categories=_levels)
    train[_cat_col] = train[_cat_col].astype(_shared_dtype)
    test[_cat_col] = test[_cat_col].astype(_shared_dtype)
    orig[_cat_col] = orig[_cat_col].astype(_shared_dtype)

In [None]:
# Names of the generated interaction columns; each one is later target- and
# count-encoded inside the cross-validation loop.
TE_columns = []

columns = NUMS + CATS
n_train_rows, n_test_rows = len(train), len(test)

for order in [2]:
    for combo in tqdm(list(combinations(columns, order))):
        name = '-'.join(combo)

        # Build the "<value>_<value>" key for this combination in each frame.
        keys = []
        for frame in (train, test, orig):
            key = frame[combo[0]].astype(str)
            for member in combo[1:]:
                key = key + '_' + frame[member].astype(str)
            keys.append(key)

        # Factorize over all three frames at once so that identical keys map
        # to the same integer code everywhere.
        codes, _ = pd.concat(keys, ignore_index=True).factorize()
        train[name] = codes[:n_train_rows]
        test[name] = codes[n_train_rows:n_train_rows + n_test_rows]
        orig[name] = codes[n_train_rows + n_test_rows:]

        TE_columns.append(name)

# Model inputs: every column of `train` except the label.
FEATURES = train.columns.tolist()
FEATURES.remove(TARGET)

In [53]:
_TE_SUPPORTED_AGGS = ('mean', 'median', 'min', 'max', 'nunique')


def _smoothed_group_stats(frame, col, target, agg, smooth, prior):
    """Aggregate `target` per `col` group and add the encoding as `TE_tmp`.

    Returns a frame with the key columns `col`, the raw aggregate `agg`, the
    group size `count` and `TE_tmp`. For `agg == 'nunique'` the encoding is
    `nunique / count`; otherwise it is the aggregate shrunk towards `prior`:
    `(agg * count + prior * smooth) / (count + smooth)`.
    """
    stats = frame[col + [target]].groupby(col).agg([agg, 'count']).reset_index()
    stats.columns = col + [agg, 'count']
    if agg == 'nunique':
        stats['TE_tmp'] = stats[agg] / stats['count']
    else:
        stats['TE_tmp'] = ((stats[agg] * stats['count']) + (prior * smooth)) / (stats['count'] + smooth)
    return stats


def target_encode(train, valid, test, col, target=TARGET, kfold=5, smooth=20, agg='mean'):
    """Add a smoothed target-encoding column `TE_<AGG>_<col>` to three frames.

    `train` is encoded out-of-fold: rows are assigned to a fold by
    `train.index % kfold`, and each fold's rows receive statistics computed
    from the other folds only. `valid` and `test` are encoded with statistics
    computed from all of `train`. Keys without statistics receive the prior.

    Args:
        train: Frame holding `col` and `target`. The passed object is modified
            (it gains a `kfold` helper column and the encoding column); the
            returned train frame is a new object without `kfold`.
        valid: Frame holding `col`; the encoding column is added in place.
        test: Frame holding `col`; the encoding column is added in place.
        col: List of key column names. A single column name given as a string
            is also accepted and treated as a one-element list.
        target: Name of the target column in `train`.
        kfold: Number of folds used for the out-of-fold encoding of `train`.
        smooth: Weight of the prior in the smoothed estimate (ignored for
            `agg='nunique'`).
        agg: One of 'mean', 'median', 'min', 'max', 'nunique'.

    Returns:
        Tuple `(train, valid, test)`; the new column is float32 in each frame.

    Raises:
        ValueError: If `agg` is not one of the supported aggregations.
    """
    # Fail before touching any frame: previously an unsupported `agg` that
    # groupby accepts (e.g. 'sum') crashed with an unbound prior after `train`
    # had already been modified.
    if agg not in _TE_SUPPORTED_AGGS:
        raise ValueError(
            f"Unsupported agg {agg!r}; expected one of {_TE_SUPPORTED_AGGS}"
        )
    col = [col] if isinstance(col, str) else list(col)
    te_name = f'TE_{agg.upper()}_' + '_'.join(col)

    # Prior used for smoothing and for unseen keys. It is computed on all of
    # `train` (including the held-out fold) and does not change inside the
    # fold loop, so it is evaluated once.
    prior = 0 if agg == 'nunique' else getattr(train[target], agg)()

    train['kfold'] = ((train.index) % kfold)
    train[te_name] = 0.
    for fold in range(kfold):
        stats = _smoothed_group_stats(
            train[train['kfold'] != fold], col, target, agg, smooth, prior
        )
        # The left merge keeps the row order of `train` (one stats row per
        # key), so the result can be written back positionally via `.values`.
        merged = train[col + ['kfold', te_name]].merge(stats, how='left', left_on=col, right_on=col)
        in_fold = merged['kfold'] == fold
        merged.loc[in_fold, te_name] = merged.loc[in_fold, 'TE_tmp']
        train[te_name] = merged[te_name].fillna(prior).values

    # Statistics from the whole training frame for the frames without target.
    stats = _smoothed_group_stats(train, col, target, agg, smooth, prior)
    for frame in (valid, test):
        merged = frame[col].merge(stats, how='left', left_on=col, right_on=col)
        frame[te_name] = merged['TE_tmp'].fillna(prior).values
        frame[te_name] = frame[te_name].astype('float32')

    train = train.drop('kfold', axis=1)
    train[te_name] = train[te_name].astype('float32')

    return (train, valid, test)

def count_encode(train, valid, test, col):
    """Add a `CE_<col>` column holding how often each value occurs in `train`.

    Frequencies are taken from `train[col]` only. All three frames are
    modified in place and returned. Values of `valid` / `test` that have no
    frequency in `train` get 0; the `train` column is not filled.
    """
    frequency = train[col].value_counts()
    feature = f'CE_{col}'

    train[feature] = train[col].map(frequency)
    for frame in (valid, test):
        frame[feature] = frame[col].map(frequency).fillna(0)
    return (train, valid, test)

In [None]:
# Zero-filled buffers with one slot per training row / test row.
oof = np.zeros(train.shape[0])
pred = np.zeros(test.shape[0])

# ---- Configuration ----
# Seeded, shuffled 5-fold stratified splitter.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

def objective(trial):
    """Optuna objective: mean validation ROC AUC of XGBoost over the `skf` folds.

    For each fold the training rows are extended with `orig`, every column in
    `TE_columns` is replaced by its target encoding and count encoding (fitted
    on the fold's training rows), and an `XGBClassifier` is trained with early
    stopping on the validation fold.

    Args:
        trial: Optuna trial used to sample `colsample_bytree`, `subsample`,
            `reg_lambda`, `reg_alpha` and `max_depth`.

    Returns:
        The average of the per-fold validation AUC scores.
    """
    auc_scores = []

    # Hyperparameters to optimise
    param_grid = {
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
    }

    # `skf.split` yields positional row numbers, so rows are selected with
    # `.iloc`; `.loc` only worked while the `id` index equalled the position.
    feature_pos = train.columns.get_indexer(FEATURES)

    # No test prediction is made while tuning. An empty frame with the test
    # columns satisfies the encoders' signature without encoding every test
    # row on every fold of every trial.
    test_stub = test.iloc[:0]

    for train_idx, val_idx in skf.split(train, train[TARGET]):
        X_train, X_val = train.iloc[train_idx, feature_pos], train.iloc[val_idx, feature_pos]
        y_train, y_val = train[TARGET].iloc[train_idx], train[TARGET].iloc[val_idx]
        X_test = test_stub.copy()

        # Append the "orig" data to the training rows only
        X_train = pd.concat([X_train, orig[FEATURES]])
        y_train = pd.concat([y_train, orig[TARGET]])

        # Encodings: each interaction column is replaced by TE_* and CE_* features
        for col in TE_columns:
            X_train, X_val, X_test = target_encode(
                pd.concat([X_train, y_train], axis=1),
                X_val, X_test, [col], smooth=10, agg='mean'
            )
            X_train = X_train.drop(TARGET, axis=1)
            X_train, X_val, X_test = count_encode(X_train, X_val, X_test, col)
            X_train = X_train.drop(col, axis=1)
            X_val = X_val.drop(col, axis=1)
            X_test = X_test.drop(col, axis=1)

        model = XGBClassifier(
            **param_grid,
            n_estimators=10000,
            objective="binary:logistic",
            eval_metric="auc",
            learning_rate=0.01,
            early_stopping_rounds=1000,
            random_state=42,
            enable_categorical=True,
            device="cuda",
            n_jobs=-1
        )

        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        y_pred = model.predict_proba(X_val)[:, 1]
        auc_scores.append(roc_auc_score(y_val, y_pred))

        # Free the fold's data before the next fold is built
        del model, X_train, X_val, y_train, y_val, X_test
        gc.collect()

    return sum(auc_scores) / len(auc_scores)


# ---- Run the Optuna search ----
# Maximise the value returned by `objective` (mean validation AUC).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)  # the number of trials can be increased

# Report the best hyperparameters and the best mean AUC found by the study.
print("Meilleurs paramètres :", study.best_params)
print("Meilleur score AUC moyen :", study.best_value)

In [56]:
# Fixed XGBoost hyperparameters used for the final training
# (presumably copied from an earlier Optuna run - TODO confirm).
best_params = dict(
    colsample_bytree=0.7380469821937778,
    subsample=0.998106192978282,
    reg_lambda=0.19070641491954002,
    reg_alpha=5.985930260763481,
    max_depth=5,
)

In [None]:
# ---- Final training with the best parameters ----
# `oof` collects the out-of-fold predictions for `train`; `pred` accumulates
# the test predictions of every fold and is averaged after the loop.
oof = np.zeros(len(train))
pred = np.zeros(len(test))

# `skf.split` yields positional row numbers, so rows are selected with
# `.iloc`; `.loc` only worked while the `id` index equalled the position.
feature_pos = train.columns.get_indexer(FEATURES)
n_folds = skf.get_n_splits()

for idx, (train_idx, val_idx) in enumerate(skf.split(train, train[TARGET])):
    X_train, X_val = train.iloc[train_idx, feature_pos], train.iloc[val_idx, feature_pos]
    y_train, y_val = train[TARGET].iloc[train_idx], train[TARGET].iloc[val_idx]
    # Pin the test columns to FEATURES so they match the training matrix.
    X_test = test[FEATURES].copy()

    # Append the "orig" data to the training rows only
    X_train = pd.concat([X_train, orig[FEATURES]])
    y_train = pd.concat([y_train, orig[TARGET]])

    # Replace each interaction column by its target- and count-encoding,
    # fitted on this fold's training rows.
    for col in TE_columns:
        X_train, X_val, X_test = target_encode(
            pd.concat([X_train, y_train], axis=1),
            X_val, X_test, [col], smooth=10, agg='mean'
        )
        X_train = X_train.drop(TARGET, axis=1)
        X_train, X_val, X_test = count_encode(X_train, X_val, X_test, col)
        X_train = X_train.drop(col, axis=1)
        X_val = X_val.drop(col, axis=1)
        X_test = X_test.drop(col, axis=1)

    model = XGBClassifier(
        **best_params,
        n_estimators=10000,
        objective="binary:logistic",
        eval_metric="auc",
        learning_rate=0.01,
        early_stopping_rounds=200,
        random_state=42,
        enable_categorical=True,
        device="cuda",
        n_jobs=-1
    )

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)
    oof[val_idx] = model.predict_proba(X_val)[:, 1]
    pred += model.predict_proba(X_test)[:, 1]

    print(f"Fold {idx + 1}: {roc_auc_score(y_val, oof[val_idx])}")

    # Free the fold's data before the next fold is built
    del model, X_train, X_val, y_train, y_val, X_test
    gc.collect()

# Average the accumulated test predictions over the folds actually run,
# instead of a hard-coded 5.
pred /= n_folds
print(f"CV AUC: {roc_auc_score(train[TARGET], oof)}")

In [68]:
# Start from the sample submission so the output keeps its columns and row order.
submission = pd.read_csv('./output/sample_submission.csv')

# `pred` is ordered like `test`. Look each prediction up by id instead of
# assuming the sample submission lists the ids in the same order as test.csv.
_unmatched = ~submission['id'].isin(test.index)
if _unmatched.any():
    raise ValueError(
        f"{int(_unmatched.sum())} submission ids have no prediction in `test`"
    )
submission['loan_paid_back'] = submission['id'].map(pd.Series(pred, index=test.index))

In [70]:
# Display the submission frame as the cell output.
submission

Unnamed: 0,id,loan_paid_back
0,593994,0.952494
1,593995,0.976977
2,593996,0.496178
3,593997,0.905062
4,593998,0.964994
...,...,...
254564,848558,0.993997
254565,848559,0.820646
254566,848560,0.929454
254567,848561,0.988000


In [72]:
# Write the submission file; index=False keeps the positional row index out of the CSV.
submission.to_csv('./output/to_submit.csv', index=False)