In [32]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split

# import data and add default data to main dataframe
df = pd.read_csv('/Users/chrisjackson/XXXX/1_Financial Data.csv')
df2 = pd.read_csv('/Users/chrisjackson/XXXX/2_Default Data.csv')
df['default'] = np.where(df['LOAN_ID'].isin(df2['LOAN_ID']), 1, 0)

# set X and y
X = df.drop(['default', 'LOAN_ID'], axis=1)
y = df['default']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

# set up pipeline for imputation and scaling of categorical variables and numerical variables
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
cat_transform = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# get column indexes for categorical and numerical variables
num_features = df.select_dtypes(include=['int64', 'float64']).drop([
    'PD_RISK_RATING', 'default'], axis=1)
cat_features = df.select_dtypes(include=['object']).drop(['LOAN_ID'], axis=1)

numeric_cols = df.dtypes.apply(lambda x: x.kind in 'bifc').reset_index(
    drop=True).loc[lambda x: x == True].index
cat_cols = (df.dtypes == 'object').reset_index(
    drop=True).loc[lambda x: x == True].index

# set up column transformer for categorical and numerical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transform,  selector(dtype_exclude="object")),
        ('cat', cat_transform, selector(dtype_include="object"))])


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   LOAN_ID         10000 non-null  object 
 1   PD_RISK_RATING  10000 non-null  int64  
 2   X1              10000 non-null  object 
 3   X2              9000 non-null   float64
 4   X3              10000 non-null  float64
 5   X4              9000 non-null   float64
 6   X5              10000 non-null  float64
 7   X6              10000 non-null  float64
 8   X7              10000 non-null  float64
 9   X8              10000 non-null  float64
 10  X9              10000 non-null  float64
 11  X10             10000 non-null  float64
 12  X11             10000 non-null  float64
 13  X12             10000 non-null  float64
 14  X13             10000 non-null  object 
 15  X14             10000 non-null  object 
 16  X15             10000 non-null  object 
 17  default         10000 non-null  

In [33]:
X_train_pipe = preprocessor.fit_transform(X_train)
X_test_pipe = preprocessor.fit_transform(X_test)


## Modelling


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [15]:
# Initiaze the hyperparameters for each model
paramRF = {'n_estimators': [10, 50, 100, 250],
           'max_depth': [5, 10, 20],
           'class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}]}

paramSVC = {'C': [1, 10, 100],
            'gamma': [1, 0.1, 0.001, 0.0001],
            'kernel': ['linear', 'rbf']}

paramLR = {'penalty': ['l1', 'l2'],
           'C': np.logspace(-3, 3, 7),
           'solver': ['newton-cg', 'lbfgs', 'liblinear'],
           }

paramDT = {'max_depth': [5, 10, 25, None],
           'max_features': ['sqrt', 'log2'],
           'min_samples_split': [2, 5, 10],
           'class_weight': [None, {0: 1, 1: 5},
                            {0: 1, 1: 10}, {0: 1, 1: 25}],
           'criterion': ['gini', 'entropy']}

paramKN = {'n_neighbors': [2, 5, 10, 25, 50]}

paramHGB = {'learning_rate': (0.01, 0.1, 1, 10),
            'max_leaf_nodes': (3, 10, 30)}

paramXGB = {'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]}


In [16]:
# set up the various models for classification
models = [
    {'model': RandomForestClassifier(random_state=42), 'param': paramRF},
    {'model': SVC(probability=True, random_state=42), 'param': paramSVC},
    {'model': LogisticRegression(random_state=42), 'param': paramLR},
    {'model': DecisionTreeClassifier(random_state=42), 'param': paramDT},
    {'model': KNeighborsClassifier(), 'param': paramKN},
    {'model': HistGradientBoostingClassifier(random_state=42), 'param': paramHGB},
    {'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 'param': paramXGB}
]


In [17]:
# function to loop through models and hyperparameters and return results

def run_models(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:

      results = []
      target_names = ['no default', 'default']
      for model in models:
            print(" Results from Grid Search ", model['model'])
            gridSearch = GridSearchCV(
                  model['model'], model['param'], cv=3, scoring='f1_micro', verbose = 1, n_jobs=-1)
            gridSearch.fit(X_train, y_train)
            print("\n The best estimator across ALL searched params:\n",
                  gridSearch.best_estimator_)
            print("\n The best score across ALL searched params:\n",
                  gridSearch.best_score_)
            print("\n The best parameters across ALL searched params:\n",
                  gridSearch.best_params_)
            predic = gridSearch.predict(X_test)
            print(classification_report(y_test, predic))
            results.append({'name': model['model'], 
                            'dataframe': pd.DataFrame(gridSearch.cv_results_),'best_estimator': gridSearch.best_estimator_,'report': classification_report(y_test, predic, target_names=target_names, output_dict=True)})
      return results

In [8]:
results = run_models(X_train_pipe, y_train, X_test_pipe, y_test)

 Results from Grid Search  RandomForestClassifier(random_state=42)
Fitting 3 folds for each of 48 candidates, totalling 144 fits

 The best estimator across ALL searched params:
 RandomForestClassifier(class_weight={0: 1, 1: 25}, max_depth=5, random_state=42)

 The best score across ALL searched params:
 0.9927145302799408

 The best parameters across ALL searched params:
 {'class_weight': {0: 1, 1: 25}, 'max_depth': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2961
           1       0.67      0.67      0.67        39

    accuracy                           0.99      3000
   macro avg       0.83      0.83      0.83      3000
weighted avg       0.99      0.99      0.99      3000

 Results from Grid Search  SVC(probability=True, random_state=42)
Fitting 3 folds for each of 32 candidates, totalling 96 fits

 The best estimator across ALL searched params:
 SVC(C=1, gamma=1, kernel='linear', probability=True

42 fits failed out of a total of 126.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chrisjackson/miniforge3/envs/mini_env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chrisjackson/miniforge3/envs/mini_env/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1094, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/chrisjackson/miniforge3/envs/mini_env/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver n


 The best estimator across ALL searched params:
 LogisticRegression(penalty='l1', random_state=42, solver='liblinear')

 The best score across ALL searched params:
 0.9934284895393919

 The best parameters across ALL searched params:
 {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2961
           1       0.90      0.67      0.76        39

    accuracy                           0.99      3000
   macro avg       0.95      0.83      0.88      3000
weighted avg       0.99      0.99      0.99      3000

 Results from Grid Search  DecisionTreeClassifier(random_state=42)
Fitting 3 folds for each of 192 candidates, totalling 576 fits

 The best estimator across ALL searched params:
 DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='log2',
                       random_state=42)

 The best score across ALL searched params:
 0.98885708118665

 The best parameters acr

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)



 The best estimator across ALL searched params:
 KNeighborsClassifier()

 The best score across ALL searched params:
 0.9882857546181466

 The best parameters across ALL searched params:
 {'n_neighbors': 5}
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2961
           1       0.00      0.00      0.00        39

    accuracy                           0.99      3000
   macro avg       0.49      0.50      0.50      3000
weighted avg       0.97      0.99      0.98      3000

 Results from Grid Search  HistGradientBoostingClassifier(random_state=42)
Fitting 3 folds for each of 12 candidates, totalling 36 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



 The best estimator across ALL searched params:
 HistGradientBoostingClassifier(max_leaf_nodes=10, random_state=42)

 The best score across ALL searched params:
 0.9935717956035585

 The best parameters across ALL searched params:
 {'learning_rate': 0.1, 'max_leaf_nodes': 10}
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2961
           1       1.00      0.59      0.74        39

    accuracy                           0.99      3000
   macro avg       1.00      0.79      0.87      3000
weighted avg       0.99      0.99      0.99      3000

 Results from Grid Search  XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index



 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6,
              enable_categorical=False, eval_metric='logloss', gamma=1,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=10, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.6, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

 The best score across ALL searched params:
 0.9944288160641873

 The best parameters across ALL searched params:
 {'colsample_bytree': 0.6, 'gamma': 1, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}
              precision    recall 

In [None]:
results

In [21]:
import pickle

with open('NoSmoteResultsF1.pkl', 'wb') as f:
    pickle.dump(results, f)