# Parameter Tuning

Exhaustive search over specified parameter values for an estimator, tuning the models.

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn import metrics

Getting data

In [2]:
# Train Test Split
train_data = pd.read_pickle("train_data.pkl")
test_data = pd.read_pickle("test_data.pkl")

features = list(train_data.columns)
features.remove('status')
features.remove('loan_id')

x = train_data[features]
y = train_data['status']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Resampling - Smote
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

# Normalizing data
scaler = StandardScaler()

scaler.fit(x_train)
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

Applying grid search

In [3]:
def grid_search(model, parameter_grid):
    grid_search = GridSearchCV(model,
                               param_grid=parameter_grid,
                               scoring='roc_auc',
                               cv=5,
                               verbose=4,
                               n_jobs=-1)

    grid_search.fit(x_train, y_train)

    print(f"\nBest score: {grid_search.best_score_}")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best estimator: {grid_search.best_estimator_}")
    
    best_model = model = grid_search.best_estimator_
    best_model_pred = best_model.predict(x_test)
    best_model_pred_proba = best_model.predict_proba(x_test)[:, -1]

    # Metrics
    print("\nAUC Score: ", metrics.roc_auc_score(y_test, best_model_pred_proba))
    print(f"Confusion matrix:\n{metrics.confusion_matrix(y_test, best_model_pred)}\n")
    print(f"Classification report:\n{metrics.classification_report(y_test, best_model_pred)}\n")

## Decision Tree Classifier

In [4]:
model = DecisionTreeClassifier(random_state=0)

parameter_grid = {'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split':  [2, 4, 6, 8],
            'min_samples_leaf':  [1, 2, 4, 6],
            'min_impurity_split': [0.05, 0.1, 0.23, 0.3],
            'class_weight': ["balanced", None]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits

Best score: 0.9134878369493753
Best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_features': 'sqrt', 'min_impurity_split': 0.05, 'min_samples_leaf': 2, 'min_samples_split': 8, 'splitter': 'best'}
Best estimator: DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
                       min_impurity_split=0.05, min_samples_leaf=2,
                       min_samples_split=8, random_state=0)

AUC Score:  0.653361344537815
Confusion matrix:
[[12  2]
 [49 36]]

Classification report:
              precision    recall  f1-score   support

           0       0.20      0.86      0.32        14
           1       0.95      0.42      0.59        85

    accuracy                           0.48        99
   macro avg       0.57      0.64      0.45        99
weighted avg       0.84      0.48      0.55        99






## Random Forest

In [5]:
model = RandomForestClassifier(random_state=0)

parameter_grid = {'n_estimators': [1000, 1500, 1800, 2000],
                  'criterion': ['gini', 'entropy'],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'max_features': ['auto', None]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 144 candidates, totalling 720 fits

Best score: 0.9928550295857989
Best parameters: {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1500}
Best estimator: RandomForestClassifier(criterion='entropy', n_estimators=1500, random_state=0)

AUC Score:  0.773109243697479
Confusion matrix:
[[ 9  5]
 [25 60]]

Classification report:
              precision    recall  f1-score   support

           0       0.26      0.64      0.37        14
           1       0.92      0.71      0.80        85

    accuracy                           0.70        99
   macro avg       0.59      0.67      0.59        99
weighted avg       0.83      0.70      0.74        99




## KNN

In [6]:
model = KNeighborsClassifier()

parameter_grid = {'n_neighbors': [5, 10, 15],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['ball_tree', 'kd_tree', 'brute']}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best score: 0.951387245233399
Best parameters: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'weights': 'distance'}
Best estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=10, weights='distance')

AUC Score:  0.765546218487395
Confusion matrix:
[[13  1]
 [43 42]]

Classification report:
              precision    recall  f1-score   support

           0       0.23      0.93      0.37        14
           1       0.98      0.49      0.66        85

    accuracy                           0.56        99
   macro avg       0.60      0.71      0.51        99
weighted avg       0.87      0.56      0.62        99




## SVM

In [7]:
model = SVC(random_state=0, probability=True)

parameter_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto']}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best score: 0.992396449704142
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best estimator: SVC(C=10, probability=True, random_state=0)

AUC Score:  0.807563025210084
Confusion matrix:
[[ 9  5]
 [17 68]]

Classification report:
              precision    recall  f1-score   support

           0       0.35      0.64      0.45        14
           1       0.93      0.80      0.86        85

    accuracy                           0.78        99
   macro avg       0.64      0.72      0.66        99
weighted avg       0.85      0.78      0.80        99




## AdaBoost


In [8]:
model = AdaBoostClassifier(random_state=0)

parameter_grid = {'n_estimators': [50, 100, 150],
                  'learning_rate': [0.3, 0.5, 1.0]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 9 candidates, totalling 45 fits

Best score: 0.9602564102564102
Best parameters: {'learning_rate': 0.3, 'n_estimators': 100}
Best estimator: AdaBoostClassifier(learning_rate=0.3, n_estimators=100, random_state=0)

AUC Score:  0.7294117647058824
Confusion matrix:
[[10  4]
 [36 49]]

Classification report:
              precision    recall  f1-score   support

           0       0.22      0.71      0.33        14
           1       0.92      0.58      0.71        85

    accuracy                           0.60        99
   macro avg       0.57      0.65      0.52        99
weighted avg       0.82      0.60      0.66        99




## GradientBoost

In [9]:
model = GradientBoostingClassifier(random_state=0)

parameter_grid = {'n_estimators': [50, 100, 150],
                  'learning_rate': [0.2, 0.3, 0.5, 1.0],
                  'criterion': ['friedman_mse', 'squared_error'],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'max_features': ['auto', None]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


 0.98313609 0.98659435 0.98544379 0.98389546 0.98478304 0.9851545
 0.98465812 0.98983563 0.98945759 0.9830046  0.9858284  0.9866075
 0.98344181 0.98705128 0.98755424 0.98344181 0.98705128 0.98755424
 0.98394806 0.9881854  0.98690007 0.98751808 0.98816239 0.98737673
 0.98314267 0.98468771 0.98660421 0.98313609 0.98659435 0.98544379
 0.98389546 0.98478304 0.9851545  0.98465812 0.98983563 0.98945759
 0.9830046  0.9858284  0.9866075  0.98344181 0.98705128 0.98755424
 0.98344181 0.98705128 0.98755424 0.98394806 0.9881854  0.98690007
 0.98496384 0.98673241 0.98672255 0.98352071 0.98609139 0.98610454
 0.98110454 0.98505588 0.98633794 0.99010191 0.98920447 0.98817554
 0.98741289 0.9898455  0.99061473 0.98508876 0.9868902  0.98688692
 0.98279093 0.98703813 0.98716634 0.98279093 0.98703813 0.98716634
 0.98742275 0.98728139 0.98791913 0.98496384 0.98673241 0.98672255
 0.98352071 0.98609139 0.98610454 0.98110454 0.98505588 0.98633794
 0.99010191 0.98920447 0.98817554 0.98741289 0.9898455  0.990614


Best score: 0.9906147271531888
Best parameters: {'criterion': 'friedman_mse', 'learning_rate': 0.3, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 150}
Best estimator: GradientBoostingClassifier(learning_rate=0.3, max_features='auto',
                           min_samples_leaf=2, min_samples_split=5,
                           n_estimators=150, random_state=0)

AUC Score:  0.7739495798319328
Confusion matrix:
[[12  2]
 [44 41]]

Classification report:
              precision    recall  f1-score   support

           0       0.21      0.86      0.34        14
           1       0.95      0.48      0.64        85

    accuracy                           0.54        99
   macro avg       0.58      0.67      0.49        99
weighted avg       0.85      0.54      0.60        99




## XGBoost

In [10]:
model = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='auc')

parameter_grid = {
    'learning_rate': [0.2, 0.3, 0.5, 1.0],
    'alpha': [0, 0.01, 0.1],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.01, 0.1]
}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 108 candidates, totalling 540 fits

Best score: 0.9869230769230768
Best parameters: {'alpha': 0.01, 'gamma': 0.01, 'learning_rate': 0.2, 'n_estimators': 200}
Best estimator: XGBClassifier(alpha=0.01, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0.01, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0.00999999978,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

AUC Score:  0.8067226890756303
Confusion matrix:
[[11  3]
 [35 50]]

Classification report:
              precision    reca