In [18]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split

# Load the financial features and the list of defaulted loans, then flag each
# loan whose LOAN_ID also appears in the default file (1 = defaulted, 0 = not).
df = pd.read_csv('/Users/chrisjackson/XXXX/1_Financial Data.csv')
df2 = pd.read_csv('/Users/chrisjackson/XXXX/2_Default Data.csv')
df['default'] = df['LOAN_ID'].isin(df2['LOAN_ID']).astype(int)

# Variant switch for PD_RISK_RATING: in the current variant the cast below is
# commented out and PD_RISK_RATING is dropped from X. To use the PD rating as a
# categorical feature instead, enable the cast and take 'PD_RISK_RATING' out of
# the drop list.

# df['PD_RISK_RATING'] = df['PD_RISK_RATING'].astype('object')

# Feature matrix (target, identifier and PD rating removed) and target vector
y = df['default']
X = df.drop(columns=['default', 'LOAN_ID', 'PD_RISK_RATING'])

# Numeric branch: median imputation followed by standard scaling.
num_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the constant 'missing', then one-hot encode
# (handle_unknown='ignore' is set on the encoder).
cat_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Views of df by dtype and the positional indexes of its numeric / object columns.
# NOTE(review): none of these four names is passed to the ColumnTransformer
# below, which selects its columns by dtype through `selector`.
num_features = df.select_dtypes(include=['int64', 'float64']).drop(columns=['default'])
cat_features = df.select_dtypes(include=['object']).drop(columns=['LOAN_ID'])

numeric_cols = (
    df.dtypes.apply(lambda dt: dt.kind in 'bifc')
    .reset_index(drop=True)
    .loc[lambda flags: flags == True]
    .index
)
cat_cols = (
    (df.dtypes == 'object')
    .reset_index(drop=True)
    .loc[lambda flags: flags == True]
    .index
)

# Route non-object columns through the numeric branch and object columns
# through the categorical branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transform, selector(dtype_exclude="object")),
    ('cat', cat_transform, selector(dtype_include="object")),
])


In [19]:
# Summarise df: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   LOAN_ID         10000 non-null  object 
 1   PD_RISK_RATING  10000 non-null  int64  
 2   X1              10000 non-null  object 
 3   X2              9000 non-null   float64
 4   X3              10000 non-null  float64
 5   X4              9000 non-null   float64
 6   X5              10000 non-null  float64
 7   X6              10000 non-null  float64
 8   X7              10000 non-null  float64
 9   X8              10000 non-null  float64
 10  X9              10000 non-null  float64
 11  X10             10000 non-null  float64
 12  X11             10000 non-null  float64
 13  X12             10000 non-null  float64
 14  X13             10000 non-null  object 
 15  X14             10000 non-null  object 
 16  X15             10000 non-null  object 
 17  default         10000 non-null  

In [7]:
# Display the ColumnTransformer so its configuration can be inspected.
preprocessor

In [21]:
# Split the raw features first, so that no test row influences the fitted
# preprocessing (imputer medians, scaler statistics, one-hot categories).
# The split arguments are unchanged (same test_size, random_state and stratify).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

# Fit the imputers, scaler and one-hot encoder on the training rows only,
# then apply the already-fitted transformation to the held-out rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so that code referring to X_pipe keeps working: the full feature matrix
# transformed with the training-fitted preprocessor. It contains the test rows,
# so do not use it for model fitting or model selection.
X_pipe = preprocessor.transform(X)

In [9]:
# Optional class balancing: resample the training split only with SMOTEENN
# (seeded with random_state=42); X_test / y_test are not touched here.
# To train on the resampled data, pass os_data_X / os_data_y instead of
# X_train / y_train.
# NOTE(review): the variable name `os` would shadow the standard-library `os`
# module if that module is imported anywhere in this notebook.
from imblearn.combine import SMOTEENN

os = SMOTEENN(random_state=42)

os_data_X, os_data_y = os.fit_resample(X_train, y_train)

## Modelling


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import *

import warnings
# Ignore every FutureWarning raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


In [11]:
# Initialize the hyperparameter search space for each model.
# Every value below is a valid GridSearchCV `param_grid`: either a single dict,
# or a list of dicts when some option combinations must be searched separately.

paramRF = {'n_estimators': [10, 50, 100, 250],
           'max_depth': [5, 10, 20],
           'class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}]}

# SVC: the class weights up-weight class 1 (default), consistent with every
# other grid in this cell; the previous `{0: w}` form up-weighted class 0.
# The grid is split by kernel because `gamma` is not used by the linear kernel,
# so crossing it with 'linear' only repeated identical fits.
_SVC_C = [1, 10, 100]
_SVC_CLASS_WEIGHT = [{0: 1, 1: w} for w in [1, 2, 10, 25, 100]]

paramSVC = [
    {'kernel': ['linear'],
     'C': _SVC_C,
     'class_weight': _SVC_CLASS_WEIGHT},
    {'kernel': ['rbf'],
     'C': _SVC_C,
     'gamma': [1, 0.1, 0.001, 0.0001],
     'class_weight': _SVC_CLASS_WEIGHT},
]

# Logistic regression: 'newton-cg' and 'lbfgs' do not support the 'l1' penalty,
# so 'l1' is only searched with 'liblinear'. Every valid combination from the
# original single grid is still covered.
_LR_C = [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
_LR_CLASS_WEIGHT = [None, 'balanced', {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}]

paramLR = [
    {'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear'],
     'C': _LR_C,
     'class_weight': _LR_CLASS_WEIGHT},
    {'penalty': ['l1'],
     'solver': ['liblinear'],
     'C': _LR_C,
     'class_weight': _LR_CLASS_WEIGHT},
]


paramDT = {'max_depth': [5, 10, 25, None],
           'max_features': ['sqrt', 'log2'],
           'min_samples_split': [2, 5, 10],
           'class_weight': [None, {0: 1, 1: 5},
                            {0: 1, 1: 10}, {0: 1, 1: 25}],
           'criterion': ['gini', 'entropy']}

paramKN = {'n_neighbors': [2, 5, 10, 25, 50]}

paramHGB = {'learning_rate': (0.01, 0.1, 1, 10),
            'max_leaf_nodes': (3, 10, 30)}

paramXGB = {'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]}


In [12]:
# Candidate classifiers, each paired with the hyperparameter grid to search.
# run_models iterates over this list and reads the 'model' and 'param' keys.
models = [
    {'model': estimator, 'param': grid}
    for estimator, grid in (
        (RandomForestClassifier(random_state=42), paramRF),
        (SVC(probability=True, random_state=42), paramSVC),
        (LogisticRegression(random_state=42), paramLR),
        (DecisionTreeClassifier(random_state=42), paramDT),
        (KNeighborsClassifier(), paramKN),
        (HistGradientBoostingClassifier(random_state=42), paramHGB),
        (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), paramXGB),
    )
]

In [13]:
# function to loop through models and hyperparameters and return results

def run_models(X_train, y_train: pd.Series, X_test, y_test: pd.Series,
               scoring: str = 'f1_micro', cv: int = 5) -> list:
    """Grid-search every entry of the module-level ``models`` list and evaluate it.

    For each ``{'model': estimator, 'param': grid}`` entry a ``GridSearchCV`` is
    fitted on the training data, the best estimator, score and parameters are
    printed, and the fitted search is used to predict the test data.

    Parameters
    ----------
    X_train, X_test
        Feature matrices as accepted by the estimators' ``fit`` / ``predict``
        (the caller in this notebook passes the preprocessed train/test splits).
    y_train, y_test
        Target labels for the corresponding feature matrices.
    scoring
        Scoring name handed to ``GridSearchCV``. Defaults to ``'f1_micro'``, the
        value that was previously hard-coded.
        NOTE(review): for a single-label problem micro-averaged F1 equals
        accuracy; consider ``'f1'`` to optimise for the positive class - confirm
        against the modelling goal.
    cv
        Number of cross-validation folds handed to ``GridSearchCV`` (default 5).

    Returns
    -------
    list of dict
        One dict per model with the keys ``'name'`` (the estimator object taken
        from ``models``), ``'best_estimator'`` (the search's ``best_estimator_``)
        and ``'report'`` (``classification_report`` on the test data as a dict,
        labelled 'no default' / 'default').
    """
    target_names = ['no default', 'default']
    results = []
    for model in models:
        print(" Results from Grid Search ", model['model'])
        gridSearch = GridSearchCV(
            model['model'], model['param'], cv=cv, scoring=scoring, verbose=1, n_jobs=-1)
        gridSearch.fit(X_train, y_train)
        print("\n The best estimator across ALL searched params:\n",
              gridSearch.best_estimator_)
        print("\n The best score across ALL searched params:\n",
              gridSearch.best_score_)
        print("\n The best parameters across ALL searched params:\n",
              gridSearch.best_params_)

        # Evaluate on the held-out data: a text report for the cell output and
        # a dict report (with readable class names) for the returned results.
        predic = gridSearch.predict(X_test)
        print(classification_report(y_test, predic))
        results.append({
            'name': model['model'],
            'best_estimator': gridSearch.best_estimator_,
            'report': classification_report(
                y_test, predic, target_names=target_names, output_dict=True),
        })
    return results



In [16]:
# Stand-alone grid search object for the SVC, shown for inspection (it is not
# fitted in this cell). It reuses paramSVC rather than repeating the grid
# literal, so the two definitions cannot drift apart.
gridSearch = GridSearchCV(
    SVC(probability=True, random_state=42), paramSVC,
    cv=5, scoring='f1_micro', verbose=1, n_jobs=-1)
gridSearch

In [None]:
# Run the grid search for every entry in `models` on the original
# (non-resampled) training split and evaluate each on the test split.
results = run_models(X_train, y_train, X_test, y_test)

In [None]:
import pickle

# Persist the list returned by run_models (per model: the estimator, the best
# estimator found by the search and the classification report) to disk.
# NOTE(review): only unpickle this file from a trusted source, and presumably
# with the same library versions used to create it - confirm.
with open('NoSmoteResultsFinal.pkl', 'wb') as f:
    pickle.dump(results, f)