# Hyperparameter optimisation

Import the libraries we will use: numpy, pandas, scikit-learn models and metrics, etc.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from hyperopt import STATUS_OK, hp, tpe, fmin
import mlflow
import itertools

import warnings
# Install a catch-all "ignore" filter so warnings are not shown in the
# notebook output.
warnings.simplefilter('ignore')

# One fixed seed, also applied to NumPy's global random state.
seed = 42
np.random.seed(seed)

In [None]:
# Metrics used to check model performance.
#
# Each entry is a (display name, scoring function, needs_scores) tuple.
# needs_scores tells the evaluation loops which predictions to hand to the
# scoring function: True -> predicted probabilities, False -> hard class labels.
metrics = [
    ('Precision', precision_score, False),
    ('Recall', recall_score, False),
    # ('MCC', matthews_corrcoef, False),
    # ('F1', f1_score, False),
    ('ROC-AUC', roc_auc_score, True),
]

## Load dataset

Load the data from a local CSV file. (The Google Cloud Storage path is kept in the cell below as a commented-out alternative.)

In [None]:
# Alternative source (disabled): a copy hosted on Google Cloud Storage --
# a subsample, judging by the file name.
# train = pd.read_csv('gs://home-credit-simonyi-workshop/input/application_train.subsample.csv')
# Read the application table from the local input/ directory.
train = pd.read_csv('input/application_train.csv')

# Last expression of the cell, so the notebook displays the first rows.
train.head()

Let's use some new fields: **EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3**

In [None]:
# Name of the label column in `train`.
target = 'TARGET'

# Model inputs: nine numeric columns first, then two categorical ones.
# The order matters because the numeric columns are later selected by position.
features = [
    'DAYS_EMPLOYED',
    'DAYS_BIRTH',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'CNT_FAM_MEMBERS',
    'AMT_ANNUITY',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'NAME_TYPE_SUITE',  # categorical
    'NAME_INCOME_TYPE',  # categorical
]

# Turn infinities of either sign into NaN so they are treated as missing
# values. `isin` is used instead of `== np.inf` so that -inf is caught too;
# it simply yields an all-False mask on the string-valued columns.
for f in features:
    train.loc[train[f].isin([np.inf, -np.inf]), f] = np.nan

# Feature matrix and label vector.
X = train.loc[:, features]
y = train.loc[:, target]

print("Train features DataFrame shape:", X.shape)
print("Train target Series shape:", y.shape)

## Train-test split

In [None]:
# Hold out half of the rows for validation. Stratifying on the target keeps
# the class ratio the same in both halves; the fixed seed makes the split
# repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=seed,
)

# Report the shape of each of the four pieces.
for caption, part in (
    ('Train features shape: ', X_train),
    ('Train target shape: ', y_train),
    ('Validate features shape: ', X_valid),
    ('Validate target shape: ', y_valid),
):
    print(caption, part.shape)

## More advanced Pipeline with categorical data

We use HistGradientBoostingClassifier as our model.

In [None]:
# Numeric columns are selected by position: the first nine entries of the
# feature list. Missing values are replaced with the column mean.
num_feats = list(range(0, 9))
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Columns can be accessed with names also.
# Missing categories become the literal 'missing' before one-hot encoding;
# categories not seen during fitting are ignored rather than raising.
cat_feats = ['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE']
cat_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transform, num_feats),
    ('cat', cat_transform, cat_feats)
])

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the validation split. Re-fitting on the validation data
# would use different imputation means, could produce a different set of
# one-hot columns, and would leak validation statistics into the evaluation.
preprocessed_train = preprocessor.fit_transform(X_train)
preprocessed_valid = preprocessor.transform(X_valid)

classifier = HistGradientBoostingClassifier(max_iter=40, max_depth=12)

classifier.fit(preprocessed_train, y_train)

Check prediction performance on train dataset.

In [None]:
# Hard class labels, plus the second predict_proba column for the metrics
# that are flagged as needing scores.
pred_train = classifier.predict(preprocessed_train)
proba_train = classifier.predict_proba(preprocessed_train)[:, 1]

# Each metric entry is (name, scoring function, needs_scores flag).
for label, metric_fn, wants_proba in metrics:
    y_hat = proba_train if wants_proba else pred_train
    print('%s on train: %.3f' % (label, metric_fn(y_train, y_hat)))

Of course it is close to perfect, but this is expected. Now let's check our model on new data!

## Evaluate model

In [None]:
# Hard class predictions for the preprocessed validation features.
pred_valid = classifier.predict(preprocessed_valid)
# Second column of predict_proba -- presumably the probability of the
# positive class (TARGET == 1); used by metrics that take scores.
proba_valid = classifier.predict_proba(preprocessed_valid)[:,1]

In [None]:
# Each metric entry is (name, scoring function, needs_scores flag); metrics
# flagged True are scored on probabilities, the rest on hard predictions.
for label, metric_fn, wants_proba in metrics:
    y_hat = proba_valid if wants_proba else pred_valid
    print('%s on CV: %.3f' % (label, metric_fn(y_valid, y_hat)))

## Time for hyperoptimisation

### Things we need:

+ ##### Objective function
+ ##### Search space
+ ##### Search algorithm

In [None]:
def objective_function(hyperparameters):
    """Score one hyperparameter configuration (exercise stub).

    The body is intentionally left for the reader to fill in, following the
    steps listed below.

    Args:
        hyperparameters: One candidate configuration taken from the search
            space.

    Returns:
        The loss for this configuration, once the steps below are implemented.

    Raises:
        NotImplementedError: Always, until the steps below are implemented.
    """
    # TODO: Create a model with the hyperparameters
    # TODO: Train the model
    # TODO: Evaluate the model
    # TODO: Return the loss
    # An explicit error keeps the cell valid Python (a def whose body is only
    # comments does not parse) and fails loudly if the stub is called as is.
    raise NotImplementedError(
        'objective_function is an exercise stub: create, train and evaluate '
        'a model, then return its loss.'
    )

In [None]:
#Define the search space
#Evaluate over the search space with the algorithm
#Find the best hyperparameters