In [46]:
import os

import os
import glob
import pandas as pd
import pickle
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

In [47]:
# set up config -- Need to make it more modular
model_train_date_str = "2024-09-01"
train_test_period_months = 12
oot_period_months = 2
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_period_months"] =  oot_period_months
config["model_train_date"] =  datetime.strptime(model_train_date_str, "%Y-%m-%d")
config["oot_end_date"] =  config['model_train_date'] - timedelta(days = 1)
config["oot_start_date"] =  config['model_train_date'] - relativedelta(months = oot_period_months)
config["train_test_end_date"] =  config["oot_start_date"] - timedelta(days = 1)
config["train_test_start_date"] =  config["oot_start_date"] - relativedelta(months = train_test_period_months)
config["train_test_ratio"] = train_test_ratio 


pprint.pprint(config)

{'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
 'model_train_date_str': '2024-09-01',
 'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
 'oot_period_months': 2,
 'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
 'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)}


In [48]:
# Start Spark
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

### Get Lables

In [49]:
# connect to label store
folder_path = "datamart/gold/label_store/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
label_store_sdf = spark.read.option("header", "true").parquet(*files_list)
print("row_count:",label_store_sdf.count())

label_store_sdf.show()

row_count: 8974
+--------------------+-----------+-----+----------+-------------+
|             loan_id|customer_id|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1037_2023_0...| CUS_0x1037|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1069_2023_0...| CUS_0x1069|    0|30dpd_6mob|   2023-07-01|
|CUS_0x114a_2023_0...| CUS_0x114a|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1184_2023_0...| CUS_0x1184|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1297_2023_0...| CUS_0x1297|    1|30dpd_6mob|   2023-07-01|
|CUS_0x12fb_2023_0...| CUS_0x12fb|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1325_2023_0...| CUS_0x1325|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1341_2023_0...| CUS_0x1341|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1375_2023_0...| CUS_0x1375|    1|30dpd_6mob|   2023-07-01|
|CUS_0x13a8_2023_0...| CUS_0x13a8|    0|30dpd_6mob|   2023-07-01|
|CUS_0x13ef_2023_0...| CUS_0x13ef|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1440_2023_0...| CUS_0x1440|    0|30dpd_6mob|   2023-0

In [50]:
# extract label store
labels_sdf = label_store_sdf.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))

print("extracted labels_sdf", labels_sdf.count(), config["train_test_start_date"], config["oot_end_date"])

extracted labels_sdf 6961 2023-07-01 00:00:00 2024-08-31 00:00:00


### Get Features

In [51]:
feature_location = "datamart/gold/feature_store"

features_files_list = [
    os.path.join(feature_location, os.path.basename(f))
    for f in glob.glob(os.path.join(feature_location, '*'))
]

# Load CSV into DataFrame - connect to feature store
features_store_sdf = spark.read.option("header", "true").parquet(*features_files_list)
print("row_count:",features_store_sdf.count())
features_store_sdf.show()


row_count: 8974
+-----------+-------------+---+-------------+---------------------+-----------------+---------------+-------------+-----------+-------------------+----------------------+--------------------+--------------------+----------------+------------------------+-------------------+-----------------------+---------------+------------------------+-------------+---------+-------------------+-------------+-------------+------------+----------------+-----------+-----------------------+-----------------+--------------------+-----------------+-------------------+------------------------+-------------------+--------------------+--------------------+------------------+---------------------+-----------------------+---------------------+-------------------+-----------------+------------------+-------------------------+------------------------+------------------------+-------------------+---------------+--------------+---------------------------+----------------------------+---------------

In [52]:
# extract feature store
features_sdf = features_store_sdf.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))
print("extracted features_sdf", features_sdf.count(), config["train_test_start_date"], config["oot_end_date"])

extracted features_sdf 5889 2023-07-01 00:00:00 2024-08-31 00:00:00


In [None]:
# Changed labels_store_sdf / feature_store_sdf to labels_sdf, features_sdf
y_df = labels_sdf.toPandas().sort_values(by='customer_id')
X_df = features_sdf.toPandas().sort_values(by='customer_id')

merged_df = pd.merge(X_df, y_df, on='customer_id', how='inner')
merged_df = merged_df.drop(columns=["snapshot_date_x", "snapshot_date_y", "customer_id", "loan_id", "label_def"])
merged_df.head(25)


Unnamed: 0,age,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,...,avg_fe_13,avg_fe_14,avg_fe_15,avg_fe_16,avg_fe_17,avg_fe_18,avg_fe_19,avg_fe_20,label,label_def
0,44,58918.46875,5208.872559,3.0,3.0,17.0,3.0,27,13.0,14.42,...,96.545455,128.454545,130.0,65.727273,138.363636,50.818182,133.090909,105.818182,0,30dpd_6mob
1,44,98620.976562,7962.415039,3.0,3.0,6.0,3.0,12,9.0,1.33,...,90.333333,44.583333,100.833333,141.666667,51.25,96.75,95.666667,117.0,0,30dpd_6mob
2,27,46951.019531,3725.584961,7.0,4.0,16.0,0.0,8,9.0,15.83,...,104.75,43.25,195.75,81.0,82.75,187.875,134.625,145.75,0,30dpd_6mob
3,15,61194.808594,5014.567383,7.0,7.0,23.0,8.0,19,22.0,28.629999,...,72.909091,125.636364,66.545455,59.636364,67.636364,81.363636,40.363636,63.636364,1,30dpd_6mob
4,52,170614.28125,14463.856445,2.0,6.0,9.0,2.0,0,5.0,0.73,...,54.8,101.3,114.2,52.9,149.0,149.7,105.3,110.9,0,30dpd_6mob
5,31,89064.523438,7256.043457,5.0,3.0,1.0,1.0,6,5.0,6.37,...,100.615385,102.615385,79.923077,52.076923,107.615385,96.923077,109.0,75.538462,0,30dpd_6mob
6,40,60410.941406,5274.245117,4.0,7.0,17.0,4.0,24,9.0,10.65,...,104.75,108.5,206.5,112.25,72.625,100.75,122.125,62.625,0,30dpd_6mob
7,15,8978.110352,894.175842,8.0,8.0,30.0,8.0,50,17.0,18.48,...,57.454545,109.909091,149.909091,104.363636,68.727273,157.909091,112.727273,47.636364,0,30dpd_6mob
8,27,42387.539062,3680.294922,3.0,7.0,26.0,7.0,9,8.0,10.34,...,116.0,82.0,87.428571,101.142857,77.142857,105.357143,90.285714,111.285714,1,30dpd_6mob
9,37,14981.389648,1461.449219,2.0,7.0,5.0,4.0,5,6.0,0.63,...,96.75,121.25,29.666667,115.333333,76.833333,51.583333,124.833333,97.583333,0,30dpd_6mob


In [42]:
X_df['snapshot_date'] = pd.to_datetime(X_df['snapshot_date'])
y_df['snapshot_date'] = pd.to_datetime(y_df['snapshot_date'])

X_df = X_df[np.isin(X_df['customer_id'], y_df['customer_id'].unique())]

### Prepare Data

In [None]:
# Create OOT split
y_oot = y_df[(y_df['snapshot_date'] >= config['oot_start_date']) & (y_df['snapshot_date'] <= config['oot_end_date'])]
X_oot = X_df.merge(
    y_oot[['customer_id']],
    on=['customer_id'],
    how='inner'
)

# Everything else goes into train-test
y_traintest = y_df[y_df['snapshot_date'] <= config['train_test_end_date']]
X_traintest = X_df.merge(
    y_traintest[['customer_id']],
    on=['customer_id'],
    how='inner'
)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_traintest, y_traintest, 
                                                    test_size=config['train_test_ratio'], 
                                                    random_state=42, 
                                                    shuffle=True, 
                                                    stratify=y_traintest['label'])

X_train = X_train.drop(columns=['customer_id', 'snapshot_date'])
X_test = X_test.drop(columns=['customer_id', 'snapshot_date'])
X_oot = X_oot.drop(columns=['customer_id', 'snapshot_date'])

y_train = y_train['label']
y_test = y_test['label']
y_oot = y_oot['label']

print('X_train', X_train.shape[0])
print('X_test', X_test.shape[0])
print('X_oot', X_oot.shape[0])
print('y_train', y_train.shape[0], round(y_train.mean(),2))
print('y_test', y_test.shape[0], round(y_test.mean(),2))
print('y_oot', y_oot.shape[0], round(y_oot.mean(),2))

X_train 1191
X_test 4767
X_oot 1003
y_train 1191 0.28
y_test 4767 0.28
y_oot 1003 0.29


In [11]:
transformer_stdscaler = StandardScaler()
X_train_processed = transformer_stdscaler.fit_transform(X_train)
X_test_processed = transformer_stdscaler.transform(X_test)
X_oot_processed = transformer_stdscaler.transform(X_oot)

print('X_train_processed', X_train_processed.shape[0])
print('X_test_processed', X_test_processed.shape[0])
print('X_oot_processed', X_oot_processed.shape[0])

X_train_processed 1191
X_test_processed 4767
X_oot_processed 1003


In [12]:
# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# Define the hyperparameter space to search
param_dist = {
    'n_estimators': [25, 50],
    'max_depth': [2, 3],  # lower max_depth to simplify the model
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'gamma': [0, 0.1],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    scoring=auc_scorer,
    n_iter=100,  # Number of iterations for random search
    cv=3,       # Number of folds in cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Perform the random search
random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)

# Evaluate the model on the train set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_train_processed)[:, 1]
train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", train_auc_score)

# Evaluate the model on the test set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]
test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", test_auc_score)

# Evaluate the model on the oot set
best_model = random_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("TRAIN GINI score: ", round(2*train_auc_score-1,3))
print("Test GINI score: ", round(2*test_auc_score-1,3))
print("OOT GINI score: ", round(2*oot_auc_score-1,3))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'subsample': 0.6, 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Best AUC score:  0.7487927702060836
Train AUC score:  0.956872528648566
Test AUC score:  0.8400700776988003
OOT AUC score:  0.8842329819684158
TRAIN GINI score:  0.914
Test GINI score:  0.68
OOT GINI score:  0.768


### Save Model Artefacts

In [13]:
model_artefact = {}

model_artefact['model'] = best_model
model_artefact['model_version'] = "credit_model_"+config["model_train_date_str"].replace('-','_')
model_artefact['preprocessing_transformers'] = {}
model_artefact['preprocessing_transformers']['stdscaler'] = transformer_stdscaler
model_artefact['data_dates'] = config
model_artefact['data_stats'] = {}
model_artefact['data_stats']['X_train'] = X_train.shape[0]
model_artefact['data_stats']['X_test'] = X_test.shape[0]
model_artefact['data_stats']['X_oot'] = X_oot.shape[0]
model_artefact['data_stats']['y_train'] = round(y_train.mean(),2)
model_artefact['data_stats']['y_test'] = round(y_test.mean(),2)
model_artefact['data_stats']['y_oot'] = round(y_oot.mean(),2)
model_artefact['results'] = {}
model_artefact['results']['auc_train'] = train_auc_score
model_artefact['results']['auc_test'] = test_auc_score
model_artefact['results']['auc_oot'] = oot_auc_score
model_artefact['results']['gini_train'] = round(2*train_auc_score-1,3)
model_artefact['results']['gini_test'] = round(2*test_auc_score-1,3)
model_artefact['results']['gini_oot'] = round(2*oot_auc_score-1,3)
model_artefact['hp_params'] = random_search.best_params_


pprint.pprint(model_artefact)

{'data_dates': {'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
                'model_train_date_str': '2024-09-01',
                'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
                'oot_period_months': 2,
                'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
                'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
                'train_test_period_months': 12,
                'train_test_ratio': 0.8,
                'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)},
 'data_stats': {'X_oot': 1003,
                'X_test': 4767,
                'X_train': 1191,
                'y_oot': 0.29,
                'y_test': 0.28,
                'y_train': 0.28},
 'hp_params': {'colsample_bytree': 0.8,
               'gamma': 0,
               'learning_rate': 0.1,
               'max_depth': 3,
               'min_child_weight': 3,
               'n_estimators': 50,
               'reg_alpha': 0,
               

In [14]:
# create model_bank dir
model_bank_directory = "model_bank/"

if not os.path.exists(model_bank_directory):
    os.makedirs(model_bank_directory)

In [15]:
# Full path to the file
file_path = os.path.join(model_bank_directory, model_artefact['model_version'] + '.pkl')

# Write the model to a pickle file
with open(file_path, 'wb') as file:
    pickle.dump(model_artefact, file)

print(f"Model saved to {file_path}")


Model saved to model_bank/credit_model_2024_09_01.pkl


### Load Model

In [16]:
# Load the model from the pickle file
with open(file_path, 'rb') as file:
    loaded_model_artefact = pickle.load(file)

y_pred_proba = loaded_model_artefact['model'].predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("Model loaded successfully!")

OOT AUC score:  0.8842329819684158
Model loaded successfully!
