# [75.06 / 95.58] Organización de Datos <br> Trabajo Práctico 2: Machine Learning
# Parameter Tuning

**Grupo 30: Datatouille**

**http://fdelmazo.github.io/7506-Datos/**

# Al ser un proceso muy costoso en tiempo y memoria, los resultados finales se guardan en variables (ver `get_hiper_params`)

In [None]:
def get_hiper_params():
    """Return the stored hyper-parameters for each model.

    The values are hard-coded so they can be reused without repeating the
    tuning process; they are wrapped in a function so that other modules
    can import them.

    Returns:
        dict: maps a model name to a dict of keyword arguments for that
        model. A fresh dict is built on every call.
    """
    decision_tree = dict(
        criterion='entropy',
        max_features=0.1,
        max_depth=1.0,
        min_samples_split=0.1,
        min_samples_leaf=0.1,
    )

    random_forest = dict(
        n_estimators=200,
        criterion='gini',
        max_features=0.3,
        max_depth=1.0,
        min_samples_split=0.1,
        min_samples_leaf=0.1,
    )

    xgboost = dict(
        learning_rate=0.1,
        objective='binary:logistic',
        n_estimators=200,
        scale_pos_weight=2,
        max_depth=1,
        min_child_weight=1,
        gamma=0.0,
        colsample_bytree=0.79,
        subsample=0.89,
        reg_alpha=100,
    )

    knn = dict(n_neighbors=21, weights='uniform', n_jobs=-1)

    naive_bayes = dict(var_smoothing=1e-09)

    lightgbm = dict(
        bagging_fraction=0.8,
        feature_fraction=0.1,
        lambda_l1=4,
        lambda_l2=0,
        max_depth=5,
        min_child_weight=8,
        min_split_gain=0.001,
        num_leaves=24,
    )

    return {
        'decision_tree': decision_tree,
        'random_forest': random_forest,
        'xgboost': xgboost,
        'knn': knn,
        'naive_bayes': naive_bayes,
        'lightgbm': lightgbm,
    }

---
---
---


In [None]:
import nbimporter # pip install nbimporter
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import submission_framework as SF

# User feature table, re-indexed by the 'person' column.
df_users = pd.read_csv('data/user-features.csv', low_memory=False)
df_users = df_users.set_index('person')

# Training labels: rows are grouped by 'person' and summed within each group.
df_y = pd.read_csv('data/labels_training_set.csv')
df_y = df_y.groupby('person').sum()

In [None]:
# Feature columns passed as `columns=` to the tuning helper in the cells below.
# Presumably a hand-picked subset ("a mano"); every name is expected to exist
# in the features table (TODO confirm against the feature-engineering step).
columnas_a_mano = [
    'dow_last_conversion',
    'has_conversion_last_week',
    'total_conversions_month_4',
    'total_session_checkouts',
    'doy_last_conversion',
    'timestamp_last_event',
    'dow_last_checkout',
    'total_checkouts',
    'has_checkout',
    'doy_last_checkout',
    'has_checkout_month_1',
    'timestamp_last_checkout',
    'total_sessions',
    'woy_last_event',
    'has_checkout_month_5',
    'avg_events_per_session',
]

In [None]:
def find_best_params(df_x, df_y, orig_model_with_name, columns, list_of_progressive_params,seed=42):
    """Tune a model's hyper-parameters progressively, one grid at a time.

    Each grid in ``list_of_progressive_params`` is explored with a 5-fold
    ``GridSearchCV``. The best values found for a grid are then frozen and
    passed as fixed constructor arguments to the estimator used for every
    following grid, so later grids are searched on top of earlier winners.

    Args:
        df_x: feature data, forwarded to ``SF.full_framework_wrapper``.
        df_y: label data, forwarded to ``SF.full_framework_wrapper``.
        orig_model_with_name: ``(name, estimator_class)`` tuple. The class is
            instantiated as ``estimator_class(**params, random_state=seed)``,
            so it must accept a ``random_state`` keyword.
        columns: forwarded as ``columns=`` to ``SF.full_framework_wrapper``.
        list_of_progressive_params: iterable of ``param_grid`` dicts, searched
            in the given order.
        seed: value passed as ``random_state`` to every estimator instance.

    Returns:
        dict: the accumulated ``best_params_`` of all searches. Empty when
        ``list_of_progressive_params`` is empty.
    """
    orig_model_name, orig_model = orig_model_with_name
    search_name = orig_model_name + '_GS'
    params = {}

    for params_grid in list_of_progressive_params:
        search = GridSearchCV(orig_model(**params, random_state=seed), params_grid, cv=5, verbose=1)

        # Use the caller-supplied `columns` rather than a module-level global,
        # so the function honours its own argument.
        # Assumes the wrapper returns the fitted search object first (it must
        # expose `best_params_`) -- TODO confirm in submission_framework.
        model, _auc = SF.full_framework_wrapper(df_x, df_y, (search_name, search), columns=columns)
        params.update(model.best_params_)

    return params

## Decision Tree

> https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Grids are searched in this order; the best value of each grid is fixed
# before the next grid is explored (see find_best_params).
list_of_progressive_params = [{'criterion':['gini','entropy']},
                              # NOTE(review): with float steps np.arange may include the
                              # stop value (here it can also yield ~0.4) -- confirm intended.
                              {'max_features': np.arange(0.1,0.4,0.1)},
                              # max_depth must be an integer for scikit-learn; linspace
                              # yields floats (8.75, 16.5, ...), so truncate them to ints.
                              {'max_depth': np.linspace(1, 32, 5, endpoint=True).astype(int)},
                              {'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)},
                              {'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)}
]

model_with_name = ('decision_tree', DecisionTreeClassifier)

best_params_decision_tree = find_best_params(df_users,df_y,model_with_name,columnas_a_mano, list_of_progressive_params) 
best_params_decision_tree

## Random Forest

> https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grids are searched in this order; the best value of each grid is fixed
# before the next grid is explored (see find_best_params).
list_of_progressive_params = [{'n_estimators':[1, 2, 4, 8, 16, 32, 64, 100, 200]},
                              {'criterion':['gini','entropy']},
                              # NOTE(review): with float steps np.arange may include the
                              # stop value (here it can also yield ~0.4) -- confirm intended.
                              {'max_features': np.arange(0.1,0.4,0.1)},
                              # max_depth must be an integer for scikit-learn; linspace
                              # yields floats (16.5), so truncate them to ints.
                              {'max_depth': np.linspace(1, 32, 3, endpoint=True).astype(int)},
                              {'min_samples_split': np.arange(0.1, 1.0, 0.1)},
                              {'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)}
                   ]  

model_with_name = ('random_forest', RandomForestClassifier)

best_params_random_forest = find_best_params(df_users,df_y,model_with_name,columnas_a_mano, list_of_progressive_params) 
best_params_random_forest

## XGBoost

> https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
import xgboost as xgb #conda install -c conda-forge xgboost 

# Grids are searched in this order; the best value of each grid is fixed
# before the next grid is explored (see find_best_params).
list_of_progressive_params = [
                              # NOTE(review): 'reg:linear' is a regression objective and was
                              # renamed 'reg:squarederror' in later XGBoost releases -- confirm
                              # it is still accepted by the installed version.
                              {'objective': ['binary:logistic','reg:linear'],'learning_rate':[0.1,0.3]},
                              {'n_estimators':np.arange(200,500,100)},
                              {'scale_pos_weight':np.arange(2,6,1)},
                              {'max_depth':np.arange(1,10,2),'min_child_weight':np.arange(1,10,2)},
                              {'gamma':np.arange(0,0.5,0.1)},
                              {'subsample':np.arange(0.6,1,0.1),'colsample_bytree':np.arange(0.6,1,0.1)},
                              {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}
                   ]

# The name matches the 'xgboost' key used for the stored hyper-parameters
# (it was previously misspelled as 'xgbost').
model_with_name = ('xgboost', xgb.XGBClassifier)

best_params_xgboost = find_best_params(df_users,df_y,model_with_name,columnas_a_mano, list_of_progressive_params) 
best_params_xgboost

## KNN

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# model_name = 'knn'

# rango = 5
# def knn_gridsearch(x_train, y_train, x_test, y_test):
#     y_train.shape = y_train.shape[0]
#     k_max = (0,0)
#     for k in range(1,100, rango):
#         knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', n_jobs=-1)
#         knn.fit(x_train, y_train)
#         score = knn.score(x_test, y_test)*100
#         if score > k_max[1]:
#             k_max = (k,score)
#         print("K: {}, {}".format(k, score))
#     a = k_max[0]
#     k_max = (0,0)
#     for k in range(a - rango, a + rango):
#         knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', n_jobs=-1)
#         knn.fit(x_train, y_train)
#         score = knn.score(x_test, y_test)*100
#         if score > k_max[1]:
#             k_max = (k,score)
#         print("K: {}, {}".format(k, score))
#     return k_max[0]

# X, y = SF.fr1_extract_X_y(df_users, df_y)
# X_train, X_test, y_train, y_test = SF.fr2_train_test_split(X, y, seed, test_size)    
# K = knn_gridsearch(X_train, y_train, X_test, y_test)

# def knn(X_train, y_train, seed, k=K):
#     knn = KNeighborsClassifier(n_neighbors=k, weights='uniform', n_jobs=-1)
#     y_train.shape = y_train.shape[0]
#     knn.fit(X_train, y_train)
#     return knn

# SF.full_framework_wrapper(f'KNN{K}',knn,columns=columnas_a_mano,verbosity=-1)

## LightGBM

In [None]:
import lightgbm as lgb #conda install -c conda-forge lightgbm 

# Grids are searched in this order; the best value of each grid is fixed
# before the next grid is explored (see find_best_params).
#
# np.arange defaults to step=1, so the fractional ranges used before
# (np.arange(0.1, 0.9), np.arange(0.8, 1), np.arange(0.001, 0.1)) each held a
# single value and nothing was actually searched. They now list several
# candidates; the first candidate of each grid is the value used before.
list_of_progressive_params = [
                             {'num_leaves': np.arange(24, 45)},
                             # 0.1, 0.2, ..., 0.8 (stop value 0.9 excluded, as before)
                             {'feature_fraction': np.linspace(0.1, 0.9, 8, endpoint=False)},
                             # 0.8, 0.85, 0.9, 0.95 (stop value 1.0 excluded, as before)
                             # NOTE(review): LightGBM documents that bagging_fraction only
                             # takes effect when bagging_freq is non-zero -- confirm whether
                             # bagging_freq should be set as well.
                             {'bagging_fraction': np.linspace(0.8, 1.0, 4, endpoint=False)},
                             {'max_depth': np.arange(5, 9)},
                             {'lambda_l1': np.arange(0, 5)},
                             {'lambda_l2': np.arange(0, 3)},
                             # Log-spaced candidates between the original bounds.
                             {'min_split_gain': [0.001, 0.01, 0.1]},
                             {'min_child_weight': np.arange(5, 50)}
                             ]

model_with_name = ('lightgbm', lgb.LGBMClassifier)

best_params_lightgbm= find_best_params(df_users,df_y,model_with_name,columnas_a_mano, list_of_progressive_params) 
best_params_lightgbm