# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning
# Parameter Tuning

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

In [None]:
def get_hiper_params():
    """Return the tuned hyper-parameters of every model, keyed by model name.

    The values are wrapped in a function (instead of living at module level)
    so that other notebooks can import and call it.
    """
    decision_tree = {
        'criterion': 'gini',
        'max_features': 0.30000000000000004,
        'max_depth': 8.75,
        'min_samples_split': 0.1,
        'min_samples_leaf': 0.0001,
    }

    random_forest = {
        'n_estimators': 200,
        'criterion': 'entropy',
        'max_features': 0.2,
        'max_depth': 16.5,
        'min_samples_split': 0.1,
        'min_samples_leaf': 0.1,
    }

    # AUC: 0.8695
    xgboost = {
        'learning_rate': 0.1,
        'objective': 'binary:logistic',
        'n_estimators': 16,
        'scale_pos_weight': 2,
        'max_depth': 4,
        'min_child_weight': 5,
        'gamma': 0.0,
        'colsample_bytree': 0.7500000000000001,
        'subsample': 0.7,
        'colsample_bylevel': 0.65,
    }

    knn = {'n_neighbors': 21, 'weights': 'uniform', 'n_jobs': -1}

    # AUC: 0.8700
    lightgbm = {
        'objective': 'binary',
        'num_leaves': 36,
        'n_estimators': 70,
        'min_split_gain': 0.01,
        'min_child_weight': 5.00001,
        'max_depth': 4,
        'learning_rate': 0.05,
        'lambda_l2': 0,
        'feature_fraction': 0.7000000000000001,
        'bagging_fraction': 1.0,
    }

    catboost = {
        'eval_metric': 'AUC',
        'iterations': 678,
        'random_strength': 42,
        'learning_rate': 0.01,
        'depth': 1,
        'l2_leaf_reg': 2,
    }

    gradient_boosting = {
        'max_leaf_nodes': None,
        'min_weight_fraction_leaf': 0,
        'learning_rate': 0.1,
        'max_features': 1,
        'min_samples_split': 1.0,
        'min_samples_leaf': 0.1,
        'max_depth': 1.0,
        'n_estimators': 1,
        'subsample': 0.8,
        'loss': 'deviance',
        'warm_start': False,
        'presort': 'auto',
    }

    neuralnetwork = {
        'hidden_layer_sizes': (4, 4),
        'activation': 'relu',
        'alpha': 0.0001,
        'beta_1': 0.05,
        'beta_2': 0.86,
        'early_stopping': False,
        'epsilon': 1e-08,
        'learning_rate': 'constant',
        'solver': 'adam',
        'validation_fraction': 0.15,
    }

    # A fresh mapping is assembled on every call, in the same key order as before.
    return {
        'decision_tree': decision_tree,
        'random_forest': random_forest,
        'xgboost': xgboost,
        'knn': knn,
        'lightgbm': lightgbm,
        'catboost': catboost,
        'gradient_boosting': gradient_boosting,
        'neuralnetwork': neuralnetwork,
    }

---
---
---


In [None]:
import nbimporter # pip install nbimporter
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import submission_framework as SF

In [None]:
# Per-user feature table, indexed by the 'person' column.
df_users = pd.read_csv('data/user-features.csv', low_memory=False)
df_users = df_users.set_index('person')

# Training labels, summed per 'person'.
df_y = pd.read_csv('data/labels_training_set.csv')
df_y = df_y.groupby('person').sum()

In [None]:
# Hand-picked feature columns, passed as `columns=` to the tuning helpers below.
columnas_a_mano = [
    'avg_events_per_session',
    'doy_last_conversion',
    'total_events',
    'dow_last_conversion',
    'has_checkout',
    'has_checkout_month_5',
    'total_checkouts_month_5',
    'timestamp_last_event',
    'dom_last_event',
    'doy_last_checkout',
    'days_to_last_event',
    'percentage_session_ad',
    'total_checkouts',
    'dow_last_checkout',
    'timestamp_last_checkout',
    'has_checkout_month_1',
    'total_sessions',
    'total_session_conversion',
    'woy_last_event',
    'total_conversions',
    'total_checkouts_last_week',
    'total_session_checkout',
    'total_events_last_week',
    'has_conversion_last_week',
    'total_checkouts_months_1_to_4',
]

In [None]:
def find_best_params(df_x, df_y, orig_model_with_name, default_params, list_of_progressive_params,
                     columns=None, seed=0, cv=5, normalize=False):
    """Tune a model greedily, one parameter grid at a time.

    Every grid in ``list_of_progressive_params`` is searched with a
    ``GridSearchCV`` scored by ROC AUC. The winning values of each step are
    accumulated and held fixed while the following grids are searched.

    Args:
        df_x: Feature data, forwarded to ``SF.full_framework_wrapper``.
        df_y: Labels, forwarded to ``SF.full_framework_wrapper``.
        orig_model_with_name: ``(name, estimator_class)`` pair. The class is
            instantiated here; ``'_GSF'`` is appended to the name.
        default_params: Keyword arguments always passed to the estimator.
            This dict is not modified.
        list_of_progressive_params: Iterable of parameter grids, searched in order.
        columns: Forwarded to ``SF.full_framework_wrapper``.
        seed: Passed to the estimator as ``random_state``. ``None`` or a
            negative value omits the argument, which is required for
            estimators without a ``random_state`` parameter
            (e.g. ``KNeighborsClassifier``).
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        normalize: Forwarded to ``SF.full_framework_wrapper``.

    Returns:
        A new dict holding ``default_params`` updated with the best value
        found for every searched parameter.
    """
    model_name, model_class = orig_model_with_name
    search_name = model_name + '_GSF'

    # A negative/None seed is treated as "do not pass random_state at all".
    seed_kwargs = {} if seed is None or seed < 0 else {'random_state': seed}
    acc_params = {}

    for params_grid in list_of_progressive_params:
        print(f"Best Params So Far: {default_params} {acc_params}\n\n")

        estimator = model_class(**default_params, **acc_params, **seed_kwargs)
        search = GridSearchCV(estimator, params_grid, cv=cv, verbose=1, scoring='roc_auc', n_jobs=4)

        # The first element of the returned pair exposes the fitted search's best_params_.
        model, _ = SF.full_framework_wrapper(df_x, df_y, (search_name, search),
                                             columns=columns, normalize=normalize)
        acc_params.update(model.best_params_)

    # Merge into a fresh dict so the caller's default_params is left untouched.
    return {**default_params, **acc_params}

In [None]:
def find_best_params_random(df_x, df_y, orig_model_with_name, default_params, params_grid,
                            n_iter=15, columns=None, seed=0, cv=5):
    """Tune a model with a single randomized search over ``params_grid``.

    Args:
        df_x: Feature data, forwarded to ``SF.full_framework_wrapper``.
        df_y: Labels, forwarded to ``SF.full_framework_wrapper``.
        orig_model_with_name: ``(name, estimator_class)`` pair. The class is
            instantiated here; ``'_RS'`` is appended to the name.
        default_params: Keyword arguments always passed to the estimator.
        params_grid: Parameter distributions/lists sampled by ``RandomizedSearchCV``.
        n_iter: Number of sampled parameter settings.
        columns: Forwarded to ``SF.full_framework_wrapper``.
        seed: Used as ``random_state`` for both the estimator and the
            randomized search, so the sampled candidates are reproducible.
            ``None`` or a negative value omits it from both.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    model_name, model_class = orig_model_with_name

    # A negative/None seed is treated as "do not pass random_state at all".
    seed_kwargs = {} if seed is None or seed < 0 else {'random_state': seed}

    # Seeding only the estimator left the candidate sampling itself random;
    # the search receives the same seed.
    search = RandomizedSearchCV(model_class(**default_params, **seed_kwargs), params_grid,
                                n_iter=n_iter, cv=cv, verbose=1, scoring='roc_auc', n_jobs=4,
                                **seed_kwargs)

    model, _ = SF.full_framework_wrapper(df_x, df_y, (model_name + '_RS', search), columns=columns)

    return model.best_params_

In [None]:
def find_best_params_gridsearch(df_x, df_y, orig_model_with_name, default_params, params_grid,
                                columns=None, seed=42, cv=5):
    """Tune a model with one exhaustive grid search over ``params_grid``.

    Args:
        df_x: Feature data, forwarded to ``SF.full_framework_wrapper``.
        df_y: Labels, forwarded to ``SF.full_framework_wrapper``.
        orig_model_with_name: ``(name, estimator_class)`` pair. The class is
            instantiated here; ``'_GS'`` is appended to the name.
        default_params: Keyword arguments always passed to the estimator.
            A ``random_state`` given here takes precedence over ``seed``.
        params_grid: Full parameter grid handed to ``GridSearchCV``.
        columns: Forwarded to ``SF.full_framework_wrapper``.
        seed: Passed to the estimator as ``random_state``. ``None`` or a
            negative value omits it (needed for estimators that do not
            accept ``random_state``).
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    model_name, model_class = orig_model_with_name

    # `seed` used to be accepted but ignored; it now seeds the estimator,
    # in line with the other tuning helpers.
    estimator_kwargs = dict(default_params)
    if seed is not None and seed >= 0:
        estimator_kwargs.setdefault('random_state', seed)

    search = GridSearchCV(model_class(**estimator_kwargs), params_grid,
                          cv=cv, verbose=1, scoring='roc_auc', n_jobs=4)

    model, _ = SF.full_framework_wrapper(df_x, df_y, (model_name + '_GS', search), columns=columns)

    return model.best_params_

## Decision Tree

> https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# One grid per step: find_best_params searches them in order and keeps each step's winner.
list_of_progressive_params = [
    {'criterion': ['gini', 'entropy']},
    {'max_features': np.arange(0.1, 0.8, 0.1)},
    {'max_depth': np.linspace(1, 32, 5)},
    {'min_samples_split': np.linspace(0.1, 1.0, 10)},
    {'min_samples_leaf': np.arange(0.0001, 0.5, 0.1)},
]

model_with_name = ('decision_tree', DecisionTreeClassifier)

best_params_decision_tree = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    columns=columnas_a_mano,
)
best_params_decision_tree

### Método Random

In [None]:
# Same search space as the greedy run, as one dict sampled by a randomized search.
list_of_progressive_params = {
    'criterion': ['gini', 'entropy'],
    'max_features': np.arange(0.1, 0.8, 0.1),
    'max_depth': np.linspace(1, 32, 5),
    'min_samples_split': np.linspace(0.1, 1.0, 10),
    'min_samples_leaf': np.arange(0.0001, 0.5, 0.1),
}

model_with_name = ('decision_tree', DecisionTreeClassifier)

best_params_decision_tree = find_best_params_random(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    n_iter=100, columns=columnas_a_mano,
)
best_params_decision_tree

### Método GridSearch

In [None]:
# Same search space again, explored exhaustively with one grid search.
list_of_progressive_params = {
    'criterion': ['gini', 'entropy'],
    'max_features': np.arange(0.1, 0.8, 0.1),
    'max_depth': np.linspace(1, 32, 5),
    'min_samples_split': np.linspace(0.1, 1.0, 10),
    'min_samples_leaf': np.arange(0.0001, 0.5, 0.1),
}

model_with_name = ('decision_tree', DecisionTreeClassifier)

best_params_decision_tree = find_best_params_gridsearch(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    columns=columnas_a_mano,
)
best_params_decision_tree

## Random Forest

> https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d

In [None]:
from sklearn.ensemble import RandomForestClassifier
   
# Greedy search for the random forest: one grid per step, in this order.
list_of_progressive_params = [
    {'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200]},
    {'criterion': ['gini', 'entropy']},
    {'max_features': np.arange(0.1, 0.4, 0.1)},
    {'max_depth': np.linspace(1, 32, 3)},
    {'min_samples_split': np.arange(0.1, 1.0, 0.1)},
    {'min_samples_leaf': np.linspace(0.1, 0.5, 5)},
]

model_with_name = ('random_forest', RandomForestClassifier)

best_params_random_forest = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    columns=columnas_a_mano,
)
best_params_random_forest

## XGBoost

> https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
import xgboost as xgb #conda install -c conda-forge xgboost 

# Please keep this: it scored very well and is hard to reproduce.
"""
0.8695

{'learning_rate': 0.1,
 'objective': 'binary:logistic',
 'n_estimators': 16,
 'scale_pos_weight': 2,
 'max_depth': 4,
 'min_child_weight': 5,
 'gamma': 0.0,
 'colsample_bytree': 0.7500000000000001,
 'subsample': 0.7,
 'colsample_bylevel': 0.65}
"""

# Greedy search; grids holding two keys are searched jointly within their step.
list_of_progressive_params = [
    {'objective': ['binary:logistic'], 'learning_rate': np.arange(0.1, 0.5, 0.1)},
    {'n_estimators': np.arange(16, 116, 15)},
    {'scale_pos_weight': np.arange(2, 6)},
    {'max_depth': np.arange(4, 12), 'min_child_weight': np.arange(1, 10)},
    {'gamma': np.arange(0, 0.5, 0.1)},
    {'subsample': np.arange(0.6, 1, 0.1), 'colsample_bytree': np.arange(0.6, 0.91, 0.05)},
    {'colsample_bylevel': np.arange(0.6, 0.91, 0.05)},
    # {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}  # Made results much worse, and Luis said it did not matter
]

model_with_name = ('xgbost', xgb.XGBClassifier)

best_params_xgboost = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    columns=columnas_a_mano,
)
best_params_xgboost

In [None]:
# Variant of the XGBoost search that also tries the 'reg:linear' objective on normalized features.
list_of_progressive_params = [
    {'objective': ['binary:logistic', 'reg:linear'], 'learning_rate': np.arange(0.1, 0.5, 0.1)},
    {'n_estimators': np.arange(16, 116, 15)},
    {'scale_pos_weight': np.arange(2, 6, 1)},
    {'max_depth': np.arange(4, 12, 1), 'min_child_weight': np.arange(1, 10, 1)},
    {'gamma': np.arange(0, 0.5, 0.1)},
    {'subsample': np.arange(0.6, 1, 0.1), 'colsample_bytree': np.arange(0.6, 0.91, 0.05)},
    {'colsample_bylevel': np.arange(0.6, 0.91, 0.05)},
    # {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}  # Made results much worse, and Luis said it did not matter
]

model_with_name = ('xgbost', xgb.XGBClassifier)

# The previous call passed columnas_a_mano in the default_params slot and {} as the
# grids, and read df_users_norm, which is never defined in this notebook. The arguments
# are now in signature order and normalization is requested through normalize=True.
# NOTE(review): assumes normalize=True is equivalent to the old df_users_norm scaling — confirm.
best_params_xgboost = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    columns=columnas_a_mano, normalize=True,
)
best_params_xgboost

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
    
# Greedy search for KNN over neighbour count, search algorithm and distance metric.
list_of_progressive_params = [
    {'n_neighbors': np.arange(1, 30)},
    {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
    {'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']},
]

model_with_name = ('knn', KNeighborsClassifier)

# NOTE(review): seed=-1 looks like a "no seed" sentinel, since KNeighborsClassifier has no
# random_state parameter — confirm find_best_params does not forward it to the estimator.
best_params_knn = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    seed=-1, normalize=True,
)
best_params_knn

## Light GBM

> https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm
> https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc

### Método original

In [None]:
import lightgbm as lgb #conda install -c conda-forge lightgbm 

"""
0.8688

{'objective': 'binary',
 'learning_rate': 0.01,
 'n_estimators': 190,
 'num_leaves': 27,
 'feature_fraction': 0.9000000000000001,
 'bagging_fraction': 0.8,
 'max_depth': 4,
 'lambda_l2': 2,
 'min_split_gain': 0.01,
 'min_child_weight': 10.00001}
"""

# Greedy search for LightGBM: one grid per step, in this order.
list_of_progressive_params = [
    {'objective': ['binary']},
    {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.3]},
    {'n_estimators': np.arange(25, 200, 15)},
    {'num_leaves': np.arange(24, 45, 3)},
    {'feature_fraction': np.arange(0.1, 0.91, 0.2)},
    {'bagging_fraction': np.arange(0.8, 1.01, 0.1)},
    {'max_depth': np.arange(3, 12, 1)},
    # {'lambda_l1': np.arange(0, 5)},  # hurt the score a lot
    {'lambda_l2': np.arange(0, 3)},
    {'min_split_gain': [0.001, 0.01, 0.1]},
    # `[1e-05] + np.arange(5, 11)` is a NumPy broadcast add, not a list concatenation:
    # it produced 5.00001 .. 10.00001 (hence the x.00001 values in the results recorded
    # above). The candidates are now built as a plain list: 1e-05 followed by 5..10.
    {'min_child_weight': [1e-05, *range(5, 11)]},
]

model_with_name = ('lightgbm', lgb.LGBMClassifier)

best_params_lightgbm = find_best_params(df_users, df_y, model_with_name, {}, list_of_progressive_params)
best_params_lightgbm

### Método Random (lightgbm)

In [None]:
# LightGBM search space as one dict, sampled by a randomized search.
list_of_progressive_params = {
    'objective': ['binary'],
    'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.3],
    'n_estimators': np.arange(25, 200, 15),
    'num_leaves': np.arange(24, 45, 3),
    'feature_fraction': np.arange(0.1, 0.91, 0.2),
    'bagging_fraction': np.arange(0.8, 1.01, 0.1),
    'max_depth': np.arange(3, 12, 1),
    'lambda_l2': np.arange(0, 3),
    'min_split_gain': [0.001, 0.01, 0.1],
    # `[1e-05] + np.arange(5, 11)` broadcast-added 1e-05 to every element
    # (5.00001 .. 10.00001) instead of concatenating; build the list explicitly.
    'min_child_weight': [1e-05, *range(5, 11)],
}

model_with_name = ('lightgbm', lgb.LGBMClassifier)

# Stored under a LightGBM name: the earlier copy-pasted `best_params_decision_tree`
# target overwrote the decision tree results.
best_params_lightgbm = find_best_params_random(
    df_users, df_y, model_with_name, {}, list_of_progressive_params, n_iter=100,
)
best_params_lightgbm

### Former Champion

In [None]:
# Last best parameters
# AUC: 0.0700 with the whole dataset
#   NOTE(review): 0.0700 is probably a typo; get_hiper_params records AUC 0.8700 for these values — confirm.
# AUC: 0.8711 with the hand-picked columns
params = dict(
    objective='binary',
    num_leaves=36,
    n_estimators=70,
    min_split_gain=0.01,
    min_child_weight=5.00001,
    max_depth=4,
    learning_rate=0.05,
    lambda_l2=0,
    feature_fraction=0.7000000000000001,
    bagging_fraction=1.0,
)

# Unlike the tuning cells, an already-instantiated classifier is handed to the framework here.
champion = lgb.LGBMClassifier(**params)
model_with_name = ('lightgbm', champion)
SF.full_framework_wrapper(df_users, df_y, model_with_name)

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing

# Greedy search for the MLP: one grid per step, in this order.
list_of_progressive_params = [
    {'hidden_layer_sizes': [(4, 7), (4, 4), (4, 3, 2)]},
    {'activation': ['relu', 'logistic']},
    {'alpha': [1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1]},
    # NOTE(review): the two beta lists look like (start, stop, step) triples that may have
    # been meant for np.arange; as written the three literal values are the candidates.
    {'beta_1': [0.7, 0.91, 0.05]},
    {'beta_2': [0.75, 0.86, 0.05]},
    {'early_stopping': [False]},
    {'epsilon': [1e-07, 1e-08]},
    {'learning_rate': ['constant', 'adaptive']},
    {'solver': ['adam', 'lbfgs']},
    {'validation_fraction': np.arange(0.15, 0.26, 0.05)},
]

model_with_name = ('neuralnetwork', MLPClassifier)

"""
min_max_scaler = preprocessing.MinMaxScaler()
df_users_norm = pd.DataFrame(min_max_scaler.fit_transform(df_users.values))
df_users_norm.columns = df_users.columns
df_users_norm.index = df_users.index
"""

# Normalization is requested through the framework instead of the manual scaling kept above.
best_params_neuralnetwork = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    normalize=True,
)
best_params_neuralnetwork

## CatBoost

> https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning-docpage/

In [None]:
import catboost as cb #conda install -c conda-forge catboost 

# Greedy search for CatBoost; the first step pins random_strength and eval_metric
# while searching the number of iterations.
list_of_progressive_params = [
    {'random_strength': [42], 'eval_metric': ['AUC'], 'iterations': [80, 100, 256, 465, 678, 1000]},
    {'learning_rate': [0.01, 0.05, 0.1, 0.3]},
    {'depth': np.arange(1, 12)},
    {'l2_leaf_reg': np.arange(2, 10)},
]

model_with_name = ('catboost', cb.CatBoostClassifier)

# Runs with cv=2 (the other searches use the default of 5) and verbose estimator output.
best_params_catboost = find_best_params(
    df_users, df_y, model_with_name, {'verbose': True}, list_of_progressive_params,
    cv=2, columns=columnas_a_mano,
)
best_params_catboost

## Gradient Boosting

> https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
> https://medium.com/all-things-ai/in-depth-parameter-tuning-for-gradient-boosting-3363992e9bae

In [None]:
from sklearn.ensemble import GradientBoostingClassifier as GBC  

# Greedy search for gradient boosting. Fixed settings are written as single-value grids.
list_of_progressive_params = [
    {'max_leaf_nodes': [None]},
    {'min_weight_fraction_leaf': [0]},
    {'learning_rate': [0.1]},
    {'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)},
    {'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)},
    {'max_features': list(range(1, len(columnas_a_mano)))},
    {'max_depth': np.linspace(1, 32, 32, endpoint=True)},
    # NOTE(review): the trailing 20 breaks the ascending sequence; the Random Forest grid
    # in this notebook ends in 200, so this may be a typo — confirm before changing.
    {'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 20]},
    # NOTE(review): np.arange(0.8, 1) uses the default step of 1, so 0.8 is the only candidate.
    {'subsample': np.arange(0.8, 1)},
    {'loss': ['deviance']},
    {'warm_start': [False]},
    {'presort': ['auto']},
]

model_with_name = ('gradient_boosting', GBC)

# default_params ({}) goes before the grids. Without it the grid list landed in the
# default_params slot and the call failed for lack of list_of_progressive_params.
best_params_boosting = find_best_params(
    df_users, df_y, model_with_name, {}, list_of_progressive_params,
    columns=columnas_a_mano,
)
best_params_boosting