<h1><center>Home Credit Risk Prediction</center></h1>
<center> - sections 5/6/7/8 - </center>
<center>December 2024</center>
<center>Celine Ng</center>

# Table of Contents

1. Project Introduction
    1. Notebook Preparation
    1. Data loading
1. Main Data Preparation
    1. Data cleaning
    1. Dataframes and keys
    1. Train Test Split
    1. Quick EDA
        1. Keys present in each table
        1. Distribution
    1. Aggregation
1. Initial Data Cleaning
    1. Datatypes
    1. Missing values
1. EDA
    1. Original Application Table Distribution
    1. Correlation
    1. Statistical Inference
1. Data Preprocessing
1. Feature Selection
    1. All features included
    1. Mutual Information
    1. PCA
1. Models
    1. Baseline model
    1. Basic model
    1. Hyperparameter Tuning
    1. Test Data
    1. Final Model
    1. Deployment
    1. Model Interpretation
1. Improvements

In [1]:
%%capture
%pip install -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
import os

from utils.eda import *
from utils.stats import *
from utils.custom_preprocessor import *
from utils.model import *

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import (OrdinalEncoder, FunctionTransformer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold

import xgboost as xgb
import optuna
from scipy.sparse import csr_matrix
import shap

  from .autonotebook import tqdm as notebook_tqdm


***

# Variables & Data from previous Notebooks

Load aggregated main table

In [3]:
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
folder = os.path.join(project_root, "aggregated_data")
data_path = os.path.join(folder, "data_merged.pkl")
data = pd.read_pickle(data_path)

Define variables

In [4]:
target = 'TARGET'
binary_columns =['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
                 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'EMERGENCYSTATE_MODE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CODE_GENDER']
categorical_columns = ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE']

***

# 5. Data Preprocessing

Separate data and target

In [5]:
ml_data = data.copy()
ml_data_sample = ml_data.sample(n=10, random_state=42)

X, y = (ml_data.drop(columns=[target]).copy(),
        ml_data[target].reset_index(drop=True))

X_sample, _, y_sample, _ = train_test_split(
    X, y, train_size=0.5, random_state=42, stratify=y
)

**Define columns**

In [6]:
numerical_binary_columns = (ml_data_sample[binary_columns].select_dtypes
                            (include='number'))
object_binary_columns = [col for col in binary_columns if col not in numerical_binary_columns]

**Define Preprocessors**

In [7]:
preprocessor_encode = ColumnTransformer(
    transformers=[
        ('binary_encode', OrdinalEncoder(), object_binary_columns),
        ('freq_encode', FrequencyEncoder(), categorical_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

Pipeline for now, as other steps are completed without pipeline already

In [8]:
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_encode)
])

# 6. Feature Engineering
Objective: Check baseline model (before new features), and compare to model
after new features

## 6.1. Baseline model
Without new features (BUREAU_ID, PREV_ID, etc.)

Split data set into train and validation, as test
is already prepared.

In [9]:
X_train, X_eval, y_train, y_eval = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=0, stratify=y_sample
)

X_train_tf = pd.DataFrame(
    preprocessor_encode.fit_transform(X_train, y_train),
    columns=preprocessor_encode.get_feature_names_out(),
)
X_eval_tf = pd.DataFrame(
    preprocessor_encode.transform(X_eval), columns=preprocessor_encode.get_feature_names_out()
)

In [10]:
num_negative = (y_train == 0).sum()
num_positive = (y_train == 1).sum()

scale_pos_weight = num_negative / num_positive

clf_xgb = xgb.XGBClassifier(objective='binary:logistic', random_state=42,
                            scale_pos_weight=scale_pos_weight)

clf_xgb.fit(
    X_train_tf, y_train,
    eval_set=[(X_eval_tf, y_eval)],
    verbose=False
)

y_pred = clf_xgb.predict(X_eval_tf)
y_prob = clf_xgb.predict_proba(X_eval_tf)[:, 1]

evaluation_metrics = {
    'accuracy': accuracy_score(y_eval, y_pred),
    'roc_auc': roc_auc_score(y_eval, y_prob),
    'precision': precision_score(y_eval, y_pred, zero_division=0),
    'recall': recall_score(y_eval, y_pred, zero_division=0),
    'f1': f1_score(y_eval, y_pred, zero_division=0)
}

evaluation_df = pd.DataFrame(evaluation_metrics, index=['Evaluation'])
evaluation_df

Unnamed: 0,accuracy,roc_auc,precision,recall,f1
Evaluation,0.810617,0.73299,0.208379,0.480866,0.29076


**Cross Validation and Early Stopping**<br>
*Primary Metric: ROC AUC*<br>
The dataset is imbalanced and so it would be important for the model to
distinguish between the 2 classes, default or not.
<br><br>
*Secondary Metric: F1 Score*<br>
Provides a balanced view of precision and recall, ensuring
that each class is handled reasonably well in terms of both capturing and
not over-predicting each class.

In [11]:
dtrain = xgb.DMatrix(X_train_tf, label=y_train)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
    'random_state': 42
}

cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=10,
    verbose_eval=True,
    metrics='auc'
)

print(f"Best iteration: {cv_results.shape[0]}")

[0]	train-auc:0.74283+0.00110	test-auc:0.70323+0.00617
[1]	train-auc:0.76545+0.00142	test-auc:0.71641+0.00267
[2]	train-auc:0.77912+0.00128	test-auc:0.72452+0.00357
[3]	train-auc:0.78961+0.00180	test-auc:0.72911+0.00293
[4]	train-auc:0.79879+0.00178	test-auc:0.73161+0.00321
[5]	train-auc:0.80749+0.00175	test-auc:0.73458+0.00272
[6]	train-auc:0.81430+0.00171	test-auc:0.73699+0.00293
[7]	train-auc:0.82121+0.00144	test-auc:0.73913+0.00345
[8]	train-auc:0.82814+0.00159	test-auc:0.74129+0.00318
[9]	train-auc:0.83344+0.00171	test-auc:0.74269+0.00331
[10]	train-auc:0.83895+0.00155	test-auc:0.74361+0.00327
[11]	train-auc:0.84416+0.00148	test-auc:0.74461+0.00417
[12]	train-auc:0.84953+0.00167	test-auc:0.74541+0.00357
[13]	train-auc:0.85390+0.00182	test-auc:0.74575+0.00326
[14]	train-auc:0.85809+0.00211	test-auc:0.74636+0.00345
[15]	train-auc:0.86182+0.00204	test-auc:0.74703+0.00324
[16]	train-auc:0.86627+0.00197	test-auc:0.74743+0.00259
[17]	train-auc:0.87043+0.00221	test-auc:0.74700+0.00226
[1

**Hyperparameter Tuning**

In [16]:
hyperparameters_folder = os.path.join("..", "hyperparameters")
study_file = os.path.join(hyperparameters_folder, "optuna_study.pkl")

# Create the folder if it doesn't exist
os.makedirs(hyperparameters_folder, exist_ok=True)  # This is the crucial line

# Define the objective function (same as before)
def objective(trial):
    # ... (objective function code remains the same)
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': 42,
        'scale_pos_weight': (y_train == 0).sum() / (y_train == 1).sum(),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }

    X_sparse = csr_matrix(X_train_tf)
    dtrain = xgb.DMatrix(X_sparse, label=y_train)

    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=300,
        nfold=3,
        early_stopping_rounds=10,
        verbose_eval=False
    )

    return cv_results['test-auc-mean'].max()


# Check if the serialized file exists and is valid
if os.path.exists(study_file):
    try:
        with open(study_file, "rb") as f:
            study = pkl.load(f)
        print(f"Loaded existing study from: {study_file}")
    except (pkl.UnpicklingError, EOFError, FileNotFoundError) as e:
        print(f"Failed to load study from {study_file}: {e}")
        study = None
else:
    study = None

# If no valid study exists, create and optimize a new one
if study is None:
    print("No valid study found. Creating a new study...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)

    # Save the study to a file
    with open(study_file, "wb") as f:
        pkl.dump(study, f)
    print(f"New study saved to: {study_file}")

# Print the best parameters and score
if study:
    print("Best Parameters:", study.best_params)
    print("Best ROC AUC:", study.best_value)

Loaded existing study from: ../hyperparameters/optuna_study.pkl
Best Parameters: {'learning_rate': 0.10919489120891655, 'max_depth': 3, 'min_child_weight': 6.774516879120494, 'subsample': 0.9863725966320864, 'colsample_bytree': 0.8231311359839003, 'gamma': 2.923728499491975}
Best ROC AUC: 0.7715438418989846
