In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv('train_cleaned.csv')

In [3]:
df.drop(['avg_pay_amt','avg_bill_amt'],axis=1,inplace=True)

In [4]:
df

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,...,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default,avg_default,bill_bal_ratio,pay_bal_ratio
0,360000,0,2,2,25,0,0,0,0,0,...,12590,3479,3446,8870,2020,27043,0,0.000000,0.023771,0.026596
1,240000,0,1,2,46,2,2,0,0,0,...,0,0,2240,0,2267,3074,0,0.666667,0.004614,0.005265
2,320000,0,2,1,41,0,0,0,0,0,...,5000,2000,2000,3000,3000,1500,0,0.000000,0.129561,0.008594
3,50000,0,2,2,24,0,0,0,0,0,...,0,0,0,0,0,0,1,0.000000,0.008917,0.000000
4,80000,1,3,1,52,0,0,0,0,0,...,1500,1500,1500,1028,876,1449,0,0.000000,0.285513,0.016360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,50000,0,2,2,31,1,0,0,0,0,...,0,0,0,0,0,0,0,0.166667,0.000000,0.000000
20996,80000,0,2,1,37,0,0,0,0,0,...,5000,3000,2000,4000,2000,2000,0,0.000000,0.689981,0.037500
20997,100000,1,1,1,52,1,2,2,2,0,...,0,3600,0,6000,3000,0,1,1.166667,0.250240,0.021000
20998,90000,0,2,1,47,0,0,0,0,0,...,2000,2000,2000,2000,2000,2000,0,0.000000,0.321144,0.022222


In [5]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder,FunctionTransformer
from sklearn.decomposition import PCA

In [6]:
numerical = ['limit_bal',
 'age',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt5',
 'pay_amt6',
 'bill_bal_ratio',
 'pay_bal_ratio']

In [7]:
bill_amt_cols=[ 'bill_amt1',
 'bill_amt2',
 'bill_amt3',
 'bill_amt4',
 'bill_amt5',
 'bill_amt6']

In [8]:
ohe_columns=['marriage','education']

In [9]:
categorical=['sex',
 'pay_1',
 'pay_2',
 'pay_3',
 'pay_4',
 'pay_5',
 'pay_6',
 'avg_default']

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing pipeline for numerical features
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for OHE features
ohe_pipeline = Pipeline([
    ('ohe', OneHotEncoder(drop='first'))
])

# Preprocessing pipeline for bill amount columns
bill_amt_pipeline = Pipeline([
    ('pca', PCA(n_components=2))
])

def identity_func(X):
    return X

add_column_transformer = Pipeline(steps=[('add_column', FunctionTransformer(identity_func))])

# ColumnTransformer to apply different preprocessing pipelines to different column subsets
ohe = ColumnTransformer([
     ('ohe', ohe_pipeline, ohe_columns),
     ('add_column', add_column_transformer, categorical),
    ('num', numerical_pipeline, numerical)
])
pca_scaling = Pipeline([
     ('num', numerical_pipeline),
    ('bill_amt', bill_amt_pipeline)
])

pca_scaling_ohe = ColumnTransformer([
    ('ohe',ohe,ohe_columns+categorical+numerical ),
    ('pca_scaling', pca_scaling, bill_amt_cols)
])


# Split the Data

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = df.drop('default',axis=1)
y = df['default']

In [13]:
X_fulltrain, X_test, y_fulltrain,y_test = train_test_split(X,y,test_size=0.2)
X_train, X_val, y_train,y_val = train_test_split(X_fulltrain,y_fulltrain,test_size=0.25)

In [14]:
X_train.reset_index(inplace=True,drop=True)
X_val.reset_index(inplace=True,drop=True)
X_test.reset_index(inplace=True,drop=True)

In [15]:
X_train = pca_scaling_ohe.fit_transform(X_train)
X_val = pca_scaling_ohe.transform(X_val)
X_test = pca_scaling_ohe.transform(X_test)

# Handling Imbalanced Dataset

In [16]:
from imblearn.combine import SMOTETomek
from collections import Counter

In [17]:
os=SMOTETomek()
X_train_ns,y_train_ns=os.fit_resample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 9803, 1: 2797})
The number of classes after fit Counter({0: 9585, 1: 9585})


# Model

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [19]:
ensemble = XGBClassifier()
ensemble.fit(X_train_ns,y_train_ns)
y_pred = ensemble.predict(X_train_ns)

In [20]:
from pprint import pprint
# Setting up a parameter grid for hyperparameter tuning

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 25, num = 6)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6, 8, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [1, 5, 10, 15, 20, 25, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 4, 6, 8, 10],
 'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]}


In [21]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

In [22]:
# Random search of parameters, using 5 fold cross validation,
# search across 50 different combinations
xgboost_random = RandomizedSearchCV(estimator = ensemble, param_distributions = random_grid, n_iter = 10, cv = 5, scoring='recall', verbose=2, random_state=0, n_jobs = -1, error_score='raise')

# Fit the random search model
xgboost_random.fit(X_train_ns, y_train_ns)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Parameters: { "bootstrap", "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




RandomizedSearchCV(cv=5, error_score='raise',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.30...
                                           num_parallel_tree=1,
                                           predictor='auto', random_state=0,
                                        

In [23]:
def output_result(y,y_pred):
    print('Confusion Matrix')
    print(confusion_matrix(y,y_pred))
    print(f'Accuracy : {accuracy_score(y,y_pred)}')
    print(f'Auc score Train: {roc_auc_score(y,y_pred)}')
    print(classification_report(y,y_pred))

In [24]:
y_pred = xgboost_random.predict(X_train_ns)
output_result(y_train_ns,y_pred)

Confusion Matrix
[[9584    1]
 [   1 9584]]
Accuracy : 0.9998956703182055
Auc score Train: 0.9998956703182054
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9585
           1       1.00      1.00      1.00      9585

    accuracy                           1.00     19170
   macro avg       1.00      1.00      1.00     19170
weighted avg       1.00      1.00      1.00     19170



In [25]:
y_pred = xgboost_random.predict(X_val)
output_result(y_val,y_pred)

Confusion Matrix
[[2977  358]
 [ 528  337]]
Accuracy : 0.789047619047619
Auc score Train: 0.6411245244429808
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      3335
           1       0.48      0.39      0.43       865

    accuracy                           0.79      4200
   macro avg       0.67      0.64      0.65      4200
weighted avg       0.77      0.79      0.78      4200



In [26]:
y_pred = xgboost_random.predict(X_test)
output_result(y_test,y_pred)

Confusion Matrix
[[2885  332]
 [ 577  406]]
Accuracy : 0.7835714285714286
Auc score Train: 0.6549098112108518
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      3217
           1       0.55      0.41      0.47       983

    accuracy                           0.78      4200
   macro avg       0.69      0.65      0.67      4200
weighted avg       0.77      0.78      0.77      4200



In [27]:
X_full_train = np.concatenate([X_train_ns, X_val], axis=0)
y_full_train = np.concatenate([y_train_ns, y_val], axis=0)

In [28]:
final_model = xgboost_random.best_estimator_.fit(X_full_train,y_full_train)

Parameters: { "bootstrap", "max_features", "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [29]:
y_pred = final_model.predict(X_test)
output_result(y_test,y_pred)

Confusion Matrix
[[2919  298]
 [ 586  397]]
Accuracy : 0.7895238095238095
Auc score Train: 0.6556164147043096
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      3217
           1       0.57      0.40      0.47       983

    accuracy                           0.79      4200
   macro avg       0.70      0.66      0.67      4200
weighted avg       0.77      0.79      0.78      4200



# Save the Model

In [30]:
import pickle
import joblib

In [31]:
with open('model.pkl','wb') as f_out:
    pickle.dump((xgboost_random),f_out)

In [32]:
joblib.dump(pca_scaling_ohe, 'pipeline.pkl')

['pipeline.pkl']