In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score

In [2]:
def remove_skewed_columns(df):
    for column in df:
        if len(df[column].unique()) == 1:
            df.drop(columns=column, inplace=True)
    return df

In [4]:
df = pd.read_csv("../credit_risk_data.csv", skiprows=1, index_col="ID")
df["MARRIAGE"].unique()

array([1, 2, 3, 0])

# pre-processing
- already checked that table only contains intergers
- delete if the column only contain one value
- change sex, education, marriage data to categories data
    -- checked sex data only contain 1 and 2
    -- checked education contains 0, 1, 2, 3, 4, 5, 6 -> change 5 and 6 to 0 assume these data are missing
    -- checked marriage contains 0, 1, 2, 3 -> assum 0 was the missing data
- *may need to consider change the number to other values to check if there is better performance

In [5]:
# remove the invalid column with just one value
df = remove_skewed_columns(df)
# df.drop(["PAY_0", "PAY_2", "PAY_3","PAY_4","PAY_5", "PAY_6"], inplace=True, axis=1)

In [6]:
# changes value 5 and 6 to 0 for education column
# this repetition can be refactored, maybe using numpy
df["MARRIAGE"][df["MARRIAGE"] == 5] = 0
df["MARRIAGE"][df["MARRIAGE"] == 6] = 0
df["MARRIAGE"].unique()

array([1, 2, 3, 0])

In [7]:
# convert "SEX", "EDUCATION", "MARRIAGE" to category data
df_encoded = pd.get_dummies(df,columns=["SEX", "EDUCATION", "MARRIAGE"])

#df_encoded["MARRIAGE_0"].unique()

In [8]:
#customers_on_default_loan = df[df["default payment next month"]==1]

In [9]:
#customers_not_on_default_loan = df[df["default payment next month"]==0]

In [18]:
X = df_encoded.drop("default payment next month", axis=1).copy()
X.head()

Unnamed: 0_level_0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,24,3913,3102,689,0,0,0,0,689,...,0,1,0,0,0,0,0,1,0,0
2,120000,26,2682,1725,2682,3272,3455,3261,0,1000,...,0,1,0,0,0,0,0,0,1,0
3,90000,34,29239,14027,13559,14331,14948,15549,1518,1500,...,0,1,0,0,0,0,0,0,1,0
4,50000,37,46990,48233,49291,28314,28959,29547,2000,2019,...,0,1,0,0,0,0,0,1,0,0
5,50000,57,8617,5670,35835,20940,19146,19131,2000,36681,...,0,1,0,0,0,0,0,1,0,0


In [19]:
y = df_encoded["default payment next month"].copy()
y.unique()

array([1, 0])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)
y_test.describe()

count    6000.000000
mean        0.221167
std         0.415067
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: default payment next month, dtype: float64

# first trial running
-- the first run attempt found the AUC was 0.539512

In [32]:
trial_xgb = xgb.XGBClassifier(n_estimators= 10,objective="binary:logistic", use_label_encoder=False, random_state = 10, verbosity=3)

In [33]:
trial_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="aucpr",eval_set=[(X_test,y_test)])

[23:26:29] DEBUG: /Users/travis/build/dmlc/xgboost/src/gbm/gbtree.cc:154: Using tree method: 2
[23:26:29] INFO: /Users/travis/build/dmlc/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[0]	validation_0-aucpr:0.42220
[23:26:29] INFO: /Users/travis/build/dmlc/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 116 extra nodes, 0 pruned nodes, max_depth=6
[1]	validation_0-aucpr:0.42401
[23:26:29] INFO: /Users/travis/build/dmlc/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[2]	validation_0-aucpr:0.43183
[23:26:30] INFO: /Users/travis/build/dmlc/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 106 extra nodes, 0 pruned nodes, max_depth=6
[3]	validation_0-aucpr:0.43729
[23:26:30] INFO: /Users/travis/build/dmlc/xgboost/src/tree/updater_prune.cc:101: tree pruning end, 120 extra nodes, 0 pruned nodes, max_depth=6
[4]	validation_0-aucpr:0.44337
[23:26:30] INFO: /Users/travis

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=8, num_parallel_tree=1, random_state=10,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=3)

In [1]:
plot_confusion_matrix(trial_xgb, X_test,y_test, display_labels=["Default not on the loan", "Default on the loan"])

NameError: name 'plot_confusion_matrix' is not defined

In [24]:
evals_result = trial_xgb.evals_result()
evals_result
evals_result['validation_0']['aucpr']

[0.422205,
 0.424011,
 0.431832,
 0.437293,
 0.443365,
 0.443824,
 0.448347,
 0.450354,
 0.450111,
 0.45019,
 0.450566,
 0.451521,
 0.450899,
 0.452088,
 0.450485,
 0.450358,
 0.451957,
 0.450678,
 0.450545,
 0.451973,
 0.45125,
 0.451368,
 0.451476,
 0.451469]

In [25]:
y_pred = trial_xgb.predict(X_test)
f1_value = f1_score(y_test, y_pred)
f1_value

0.2965864577504197

# Cross validation trial

In [28]:
param_grid = {
    'max_depth': [6, 7, 8],
    'learning_rate':[0.03, 0.09, 0.3, 0.9],
    'gamma': [0, 0.25, 1],
    'reg_lambda': [0, 3, 9, 30, 90],
    'scale_pos_weight': [1, 3, 6]
}


optimal_params = GridSearchCV(estimator = xgb.XGBClassifier(objective="binary:logistic", 
                                                            random_state = 10, 
                                                            subsample=0.9,
                                                            colsample_bytree=0.5),
                              param_grid=param_grid,
                             scoring="roc_auc",cv=5, verbose=2, n_jobs=5)
optimal_params.fit(X_train, y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


KeyboardInterrupt: 