# HM1 algorithms and hyperparameters tuning

In [48]:
# todo:
# 1. data and alg choosement
# 2. data preparation
# 3. GS for each alg
# 4. average model creation - default hyperparams selection

# 5. Bayes Optimization
# - in each iteration check the results and compare it with 'defaults'

# Algorithms 

SVM

NN

Random forest

# Data

44 spam

1067 nasa https://www.openml.org/search?type=data&status=active&id=1067

1464 blood-transfusion-service-center

40701 churn

### Data input

In [66]:
dataset = {}
data = {}
labels = {}
id = [44,1067,1464,40701]
for index in id:
    dataset[index] = openml.datasets.get_dataset(index)
    data[index], y, _, _ = dataset[index].get_data(dataset_format="dataframe")
    data[index] = data[index].replace('nan', np.nan) # not always needed
    labels[index] = list(data[index].columns.values)



### Libraries

In [109]:
import openml
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

### Preprocessing

In [92]:
def preprocess():
    num_pipeline = Pipeline(steps=[
        ('scale',MinMaxScaler())
    ])

    ### operacje dla kolumn kategorycznych
    cat_pipeline = Pipeline(steps=[
        ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))

    ])

    bool_pipeline = Pipeline(steps=[])

    #### W funkcji ColumnTransformer podajemy liste transformerow dla poszczegolnych zestawow kolumn- (name, transformer, columns) 
    #### UWAGA: zamiast column selector (make_column_selector( dtype_include= np.number)) mozna podac wektor nazw kolumn, ale to rozwiazanie dla konkretnych danych


    col_trans = ColumnTransformer(transformers=[
        ('num_pipeline', num_pipeline, make_column_selector( dtype_exclude= np.object_)),
        ('cat_pipeline',cat_pipeline,make_column_selector( dtype_include= np.object_)),
        # ('bool_pipeline',bool_pipeline,make_column_selector( dtype_include= np.bool_)),
        ],
    
        n_jobs=-1)
    
    model = Pipeline([('preprocessing', col_trans)])
    return model

In [93]:
data_prepared = {}
for index in id:
    datad = data[index]
    model =  preprocess()
    model.get_params()
    model.fit(datad)
    after_process = model.transform(datad)
    data_prepared[index] = pd.DataFrame(after_process, columns = labels[index])

44
1067
1464
40701


### Data split

In [102]:
yColumnName = {44:"class",1067:"defects",1464:"Class",40701:"class"}
X = {}
y = {}
X_train = {}
X_test = {}
y_train = {}
y_test = {}
for index in id:

    X[index] = data_prepared[index].drop(yColumnName[index], axis=1)
    y[index] = data_prepared[index][yColumnName[index]]

    X_train[index],  X_test[index], y_train[index], y_test[index] = train_test_split(X[index], y[index])

### comment:
crossvalidation(?)

## Grid search for tuning hyperparams

### RandomForestClassifier

In [106]:
pipeline = Pipeline([('RandomForestClassifier', RandomForestClassifier())])

In [130]:
pipeline.get_params()

{'memory': None,
 'steps': [('RandomForestClassifier', RandomForestClassifier())],
 'verbose': False,
 'RandomForestClassifier': RandomForestClassifier(),
 'RandomForestClassifier__bootstrap': True,
 'RandomForestClassifier__ccp_alpha': 0.0,
 'RandomForestClassifier__class_weight': None,
 'RandomForestClassifier__criterion': 'gini',
 'RandomForestClassifier__max_depth': None,
 'RandomForestClassifier__max_features': 'sqrt',
 'RandomForestClassifier__max_leaf_nodes': None,
 'RandomForestClassifier__max_samples': None,
 'RandomForestClassifier__min_impurity_decrease': 0.0,
 'RandomForestClassifier__min_samples_leaf': 1,
 'RandomForestClassifier__min_samples_split': 2,
 'RandomForestClassifier__min_weight_fraction_leaf': 0.0,
 'RandomForestClassifier__n_estimators': 100,
 'RandomForestClassifier__n_jobs': None,
 'RandomForestClassifier__oob_score': False,
 'RandomForestClassifier__random_state': None,
 'RandomForestClassifier__verbose': 0,
 'RandomForestClassifier__warm_start': False}

In [198]:
grid_params = [{'RandomForestClassifier__n_estimators':np.linspace(10, 100, 10).astype(int),
                'RandomForestClassifier__min_samples_leaf': np.linspace(1, 2, 1).astype(int),
                'RandomForestClassifier__random_state': [1]}]

In [199]:
best_hyperparams = []
score_for_best_hyperparams = {}

for index in id:
    gs_model_pipeline = GridSearchCV(estimator=pipeline,param_grid=grid_params,scoring='roc_auc')
    gs_model_pipeline.fit(X_train[index], y_train[index])
    best_hyperparams.append(gs_model_pipeline.best_params_)

    # score for test data
    score_for_best_hyperparams[index] = gs_model_pipeline.score(X_test[index], y_test[index])

In [202]:
my_list = []

for index in range(0,4):
    my_list.append( list(best_hyperparams[index].values()))

keys = best_hyperparams[0].keys()

mean_hyperparams = np.mean(np.array((my_list)),axis=0) # change it!

mean_hyperparams = [ int(x) for x in mean_hyperparams ]
keys = [ x.replace("RandomForestClassifier__","") for x in keys ]

defaults = dict(zip(keys, mean_hyperparams))

In [203]:
defaults

{'min_samples_leaf': 1, 'n_estimators': 52, 'random_state': 1}

### Diff for defaults - best

In [204]:
score_for_default_hyperparams = {}
diff = {}

for index in id:
    RF = RandomForestClassifier(min_samples_leaf = mean_hyperparams[0], n_estimators= mean_hyperparams[1], random_state= mean_hyperparams[2]) # change it in the future! 
    RF.fit(X_train[index], y_train[index])
    score_for_default_hyperparams[index] = RF.score(X_test[index], y_test[index])
    diff[index] = score_for_default_hyperparams[index] - score_for_best_hyperparams[index]
diff

{44: -0.03548873791079321,
 1067: 0.010059570059570033,
 1464: 0.04377184142515811,
 40701: 0.040565261028010724}

# Bayes Optimization

In [None]:
from sklearn.ensemble import RandomForestClassifier


### definition of model

# params_init = np.array([[-0.9], [1.1]])
# Y_init = f(X_init)

def rf_to_opt(params,X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',MinMaxScaler())])

    cat_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot',OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])

    col_trans = ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline, make_column_selector( dtype_include= np.number)),
    ('cat_pipeline',cat_pipeline,make_column_selector( dtype_include= np.object_))
    ],
    remainder='drop',
    n_jobs=-1)
    model_pipeline = Pipeline([('preprocessing', col_trans), 
                               ('model', RandomForestClassifier(n_estimators=np.int64(params[0])))])

    
    model_pipeline.fit(X_train, y_train)
    return model_pipeline.score(X_test, y_test)


# rf_to_opt(params=[50])


bounds = np.array([[5, 100]])
params_init = np.array([[np.int64(5)], [np.int64(20)]])
print(params_init)
acc_init = np.array([rf_to_opt(params=params_init[0]),rf_to_opt(params=params_init[1]) ])

In [None]:
from sklearn.base import clone
from skopt import gp_minimize
from skopt.learning import GaussianProcessRegressor
from skopt.learning.gaussian_process.kernels import ConstantKernel, Matern
from bayesian_optimization_util import plot_approximation, plot_acquisition, plot_convergence

np.int = np.int_


noise = 0.2
# Use custom kernel and estimator to match previous example

m52 = ConstantKernel(1.0) * Matern(length_scale=1.0, nu=2.5)
gpr = GaussianProcessRegressor(kernel=m52, alpha=noise**2)

r = gp_minimize(lambda x: -rf_to_opt(params=x), 
                bounds.tolist(),
                base_estimator=gpr,
                acq_func='EI',      # expected improvement
                xi=0.01,            # exploitation-exploration trade-off
                n_calls=10,         # number of iterations
                n_random_starts=0,  # initial samples are provided
                x0=params_init.tolist(), # initial samples
                y0=-acc_init.ravel()
                )

# print(r)

## Fit GP model to samples for plotting results
gpr.fit(r.x_iters, -r.func_vals)

In [None]:
plot_convergence(np.array(r.x_iters), -r.func_vals)