# Parameter Tuning

Exhaustive search over specified parameter values for an estimator, tuning the models.

In [56]:
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn import metrics

Getting data

In [57]:
# Train Test Split
train_data = pd.read_pickle("train_data.pkl")
test_data = pd.read_pickle("test_data.pkl")

features = list(train_data.columns)
features.remove('status')
features.remove('loan_id')

x = train_data[features]
y = train_data['status']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Resampling - Smote
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)

# Normalizing data
scaler = StandardScaler()

scaler.fit(x_train)
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

Applying grid search

In [58]:
def grid_search(model, parameter_grid):
    grid_search = GridSearchCV(model,
                               param_grid=parameter_grid,
                               scoring='roc_auc',
                               cv=5,
                               verbose=4,
                               n_jobs=-1)

    grid_search.fit(x_train, y_train)

    print(f"\nBest score: {grid_search.best_score_}")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best estimator: {grid_search.best_estimator_}")
    
    best_model = model = grid_search.best_estimator_
    best_model_pred = best_model.predict(x_test)
    best_model_pred_proba = best_model.predict_proba(x_test)[:, -1]

    # Metrics
    print("\nAUC Score: ", metrics.roc_auc_score(y_test, best_model_pred_proba))
    print(f"Confusion matrix:\n{metrics.confusion_matrix(y_test, best_model_pred)}\n")
    print(f"Classification report:\n{metrics.classification_report(y_test, best_model_pred)}\n")

## Decision Tree Classifier

In [29]:
model = DecisionTreeClassifier(random_state=0)

parameter_grid = {'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random'],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'max_features': ['auto', None]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 72 candidates, totalling 360 fits

Best score: 0.9102892833662064
Best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'splitter': 'best'}
Best estimator: DecisionTreeClassifier(min_samples_leaf=4, min_samples_split=10, random_state=0)

AUC Score:  0.6567226890756304
Confusion matrix:
[[11  3]
 [41 44]]

Classification report:
              precision    recall  f1-score   support

          -1       0.21      0.79      0.33        14
           1       0.94      0.52      0.67        85

    accuracy                           0.56        99
   macro avg       0.57      0.65      0.50        99
weighted avg       0.83      0.56      0.62        99




## Random Forest

In [30]:
model = RandomForestClassifier(random_state=0)

parameter_grid = {'n_estimators': [1000, 1500, 1800, 2000],
                  'criterion': ['gini', 'entropy'],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'max_features': ['auto', None]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 144 candidates, totalling 720 fits

Best score: 0.994595660749507
Best parameters: {'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1500}
Best estimator: RandomForestClassifier(n_estimators=1500, random_state=0)

AUC Score:  0.7495798319327731
Confusion matrix:
[[ 9  5]
 [31 54]]

Classification report:
              precision    recall  f1-score   support

          -1       0.23      0.64      0.33        14
           1       0.92      0.64      0.75        85

    accuracy                           0.64        99
   macro avg       0.57      0.64      0.54        99
weighted avg       0.82      0.64      0.69        99




## KNN

In [31]:
model = KNeighborsClassifier()

parameter_grid = {'n_neighbors': [5, 10, 15],
                  'weights': ['uniform', 'distance'],
                  'algorithm': ['ball_tree', 'kd_tree', 'brute']}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best score: 0.9559960552268244
Best parameters: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'weights': 'distance'}
Best estimator: KNeighborsClassifier(algorithm='ball_tree', n_neighbors=10, weights='distance')

AUC Score:  0.8151260504201682
Confusion matrix:
[[14  0]
 [44 41]]

Classification report:
              precision    recall  f1-score   support

          -1       0.24      1.00      0.39        14
           1       1.00      0.48      0.65        85

    accuracy                           0.56        99
   macro avg       0.62      0.74      0.52        99
weighted avg       0.89      0.56      0.61        99




## SVM

In [36]:
model = SVC(random_state=0, probability=True)

parameter_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto']}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best score: 0.9976397107166338
Best parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Best estimator: SVC(C=10, gamma='auto', probability=True, random_state=0)

AUC Score:  0.8243697478991597
Confusion matrix:
[[10  4]
 [13 72]]

Classification report:
              precision    recall  f1-score   support

          -1       0.43      0.71      0.54        14
           1       0.95      0.85      0.89        85

    accuracy                           0.83        99
   macro avg       0.69      0.78      0.72        99
weighted avg       0.87      0.83      0.84        99




## AdaBoost


In [43]:
model = AdaBoostClassifier(random_state=0)

parameter_grid = {'n_estimators': [50, 100, 150],
                  'learning_rate': [0.3, 0.5, 1.0]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 9 candidates, totalling 45 fits

Best score: 0.972922419460881
Best parameters: {'learning_rate': 1.0, 'n_estimators': 100}
Best estimator: AdaBoostClassifier(n_estimators=100, random_state=0)

AUC Score:  0.6789915966386555
Confusion matrix:
[[10  4]
 [41 44]]

Classification report:
              precision    recall  f1-score   support

          -1       0.20      0.71      0.31        14
           1       0.92      0.52      0.66        85

    accuracy                           0.55        99
   macro avg       0.56      0.62      0.48        99
weighted avg       0.81      0.55      0.61        99




## GradientBoost

In [65]:
model = GradientBoostingClassifier(random_state=0)

parameter_grid = {'n_estimators': [50, 100, 150],
                  'learning_rate': [0.2, 0.3, 0.5, 1.0],
                  'criterion': ['friedman_mse', 'squared_error'],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'max_features': ['auto', None]}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


 0.97842867 0.98267916 0.9819165  0.98216305 0.98499671 0.98615056
 0.98382972 0.98447732 0.98627548 0.97909599 0.98371466 0.98294214
 0.97832018 0.98294214 0.98435569 0.97832018 0.98294214 0.98435569
 0.97651874 0.98345825 0.98615056 0.98073964 0.98627876 0.98486851
 0.98152202 0.98498028 0.98473702 0.97842867 0.98267916 0.9819165
 0.98216305 0.98499671 0.98615056 0.98382972 0.98447732 0.98627548
 0.97909599 0.98371466 0.98294214 0.97832018 0.98294214 0.98435569
 0.97832018 0.98294214 0.98435569 0.97651874 0.98345825 0.98615056
 0.98037475 0.98306706 0.98627876 0.97562459 0.98140039 0.98089415
 0.97729783 0.98320184 0.98294543 0.98371466 0.98756082 0.98653517
 0.98177515 0.98499343 0.98563445 0.98165352 0.98422419 0.98550953
 0.98550296 0.98845496 0.98820513 0.98550296 0.98845496 0.98820513
 0.98306049 0.98729454 0.988833   0.98037475 0.98306706 0.98627876
 0.97562459 0.98140039 0.98089415 0.97729783 0.98320184 0.98294543
 0.98371466 0.98756082 0.98653517 0.98177515 0.98499343 0.98563


Best score: 0.9889579224194609
Best parameters: {'criterion': 'friedman_mse', 'learning_rate': 0.5, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}
Best estimator: GradientBoostingClassifier(learning_rate=0.5, max_features='auto',
                           min_samples_leaf=4, min_samples_split=10,
                           n_estimators=150, random_state=0)

AUC Score:  0.7478991596638656
Confusion matrix:
[[12  2]
 [41 44]]

Classification report:
              precision    recall  f1-score   support

          -1       0.23      0.86      0.36        14
           1       0.96      0.52      0.67        85

    accuracy                           0.57        99
   macro avg       0.59      0.69      0.51        99
weighted avg       0.85      0.57      0.63        99




## XGBoost

In [66]:
model = XGBClassifier(objective='binary:logistic', use_label_encoder=True, eval_metric='auc')

parameter_grid = {}

grid_search(model, parameter_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best score: 0.9844806048652203
Best parameters: {}
Best estimator: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

AUC Score:  0.8151260504201681
Confusion matrix:
[[13  1]
 [45 40]]

Classification report:
              precision    recall  f1-score   support

          -1       0.22      0.93      0.36        14
           1       0.98      0.47      0.63        85



