In [1]:
import sys
import numpy as np
from skopt import gp_minimize, forest_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Integer, uniform, Log10, Categorical 

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.kernel_ridge import KernelRidge

from skopt.plots import plot_convergence
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [3]:
train   = pd.read_csv('train_cleaning.csv')
test    = pd.read_csv('test_cleaning.csv')
y_train = pd.read_csv('y_train.csv') 

In [4]:
y_train = y_train.values.ravel()

### Split Data into training and validation sets

In [5]:
X_train, X_val, Y_train, Y_val = train_test_split(train, y_train, test_size = 0.20, random_state=42)

## Tune hyperparameters

### Kernel Ridge

In [6]:
### Define model
KR = KernelRidge()

### Suggest parameters
params_KR = [Real(0, 1, name='alpha'),
             Categorical(['linear', 'polynomial'], name='kernel'),
            Integer(0, 5, name='degree'),
            Real(0.8, 3.5, name='coef0')] 


@use_named_args(params_KR) 
def KR_cross_val(**params):
    KR.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(KR, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

KR_forest = forest_minimize(KR_cross_val, params_KR, acq_func="EI", n_calls=100, random_state=0)
KR_gp = gp_minimize(KR_cross_val, params_KR, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(KR_forest.fun))
print("Best parameters using forest: {}".format(KR_forest.x))
print("Best score using gp: {}".format(KR_gp.fun))
print("Best parameters using gp: {}".format(KR_gp.x))

Best score using forest: 0.12017173630105528
Best parameters using forest: [0.8823747834898533, 'polynomial', 2, 3.271883237199301]
Best score using gp: 0.1201002868065733
Best parameters using gp: [0.9815970581715737, 'polynomial', 2, 3.5]


### Elastic Net

In [7]:
### Define model
ENet = ElasticNet()

### Suggest parameters
params_ENet = [Real(5e-8, 5e-4, name='alpha'),
              Real(0.2, 1.0, name='l1_ratio'),
              Integer(0, 5, name='random_state')]

@use_named_args(params_ENet) 
def ENet_cross_val(**params):
    ENet.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(ENet, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

ENet_forest = forest_minimize(ENet_cross_val, params_ENet, acq_func="EI", n_calls=100, random_state=0)
ENet_gp = gp_minimize(ENet_cross_val, params_ENet, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(ENet_forest.fun))
print("Best parameters using forest: {}".format(ENet_forest.x))
print("Best score using gp: {}".format(ENet_gp.fun))
print("Best parameters using gp: {}".format(ENet_gp.x))

Best score using forest: 0.11791052772354833
Best parameters using forest: [0.0003319688370362717, 0.9491280902552712, 3]
Best score using gp: 0.11780500178610491
Best parameters using gp: [0.00028349467970292434, 0.999990545429329, 0]


### Bayesian Ridge

In [8]:
### Define model
BRidge = BayesianRidge()

### Suggest parameters
params_BRidge = [Integer(100, 800, name='n_iter'),
                Real(6.e-5, 1.e-2, name='alpha_1'),
                Real(6.e-5, 1.e-2, name='alpha_2'),
                Real(6.e-5, 1.e-2, name='lambda_1'),
                Real(6.e-5, 1.e-2, name='lambda_2'),
                Real(1.e-5, 1.e-3, name='tol')]


@use_named_args(params_BRidge) 
def BRidge_cross_val(**params):
    BRidge.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(BRidge, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

BRidge_forest = forest_minimize(BRidge_cross_val, params_BRidge, acq_func="EI", n_calls=100, random_state=0)
BRidge_gp = gp_minimize(BRidge_cross_val, params_BRidge, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(BRidge_forest.fun))
print("Best parameters using forest: {}".format(BRidge_forest.x))
print("Best score using gp: {}".format(BRidge_gp.fun))
print("Best parameters using gp: {}".format(BRidge_gp.x))

Best score using forest: 0.12091791624380736
Best parameters using forest: [636, 0.006679796535760416, 0.009297122618697523, 0.004265266270456232, 0.00019827930480987256, 0.0003983852956390158]
Best score using gp: 0.12091745390933518
Best parameters using gp: [100, 6e-05, 0.01, 0.01, 6e-05, 1e-05]


### Lasso

In [9]:
### Define model
Lass = Lasso()

### Suggest parameters
params_Lass = [Real(5e-5, 5e-4, name='alpha'),
               Integer(1, 5, name='random_state')]

@use_named_args(params_Lass) 
def Lass_cross_val(**params):
    Lass.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(Lass, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

Lass_forest = forest_minimize(Lass_cross_val, params_Lass, acq_func="EI", n_calls=100, random_state=0)
Lass_gp = gp_minimize(Lass_cross_val, params_Lass, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(Lass_forest.fun))
print("Best parameters using forest: {}".format(Lass_forest.x))
print("Best score using gp: {}".format(Lass_gp.fun))
print("Best parameters using gp: {}".format(Lass_gp.x)) 

Best score using forest: 0.11781159209470064
Best parameters using forest: [0.0002649493027946075, 1]
Best score using gp: 0.11780330323019933
Best parameters using gp: [0.0002809545919333257, 5]


### XGBoost Regressor

In [13]:
### Define model
XGBoost = xgb.XGBRegressor()

### Suggest parameters
params_XGBoost = [Integer(1, 5, name='max_depth'),
          Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
          Real(0, 1, name='subsample'),
#           Real(0, 1, name='reg_alpha'),
#           Real(0, 1, name='reg_lambda'),
          Integer(1, 100, name='min_child_weight'),
          Real(0.1, 1, name='colsample_bytree'),
#           Integer(0, 10, name='random_state'),
          Integer(500, 2500, name='n_estimators'),
#           Real(0, 1, name='gamma')
         ]

@use_named_args(params_XGBoost) 
def XGBoost_cross_val(**params):
    XGBoost.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(XGBoost, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

XGBoost_forest = forest_minimize(XGBoost_cross_val, params_XGBoost, acq_func="EI", n_calls=100, random_state=0)
XGBoost_gp = gp_minimize(XGBoost_cross_val, params_XGBoost, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(XGBoost_forest.fun))
print("Best parameters using forest: {}".format(XGBoost_forest.x))
print("Best score using gp: {}".format(XGBoost_gp.fun))
print("Best parameters using gp: {}".format(XGBoost_gp.x))

Best score using forest: 0.11708911585189467
Best parameters using forest: [4, 0.008462445689673028, 0.96507271384635, 1, 0.5384179495416508, 2048]
Best score using gp: 0.11213851621648407
Best parameters using gp: [3, 0.022496892681737726, 0.6982452734370306, 1, 0.1, 2500]


### Gradient Boosting Regressor

In [14]:
### Define model
Graboost = GradientBoostingRegressor()

### Suggest parameters
params_Graboost = [Integer(1, 5, name='max_depth'),
          Real(5e-7, 5e-2, name='learning_rate'),
          #Integer(1, x_train.shape[1], name='max_features'),
          Categorical(['auto', 'sqrt', 'log2'], name='max_features'),
          Integer(2, 100, name='min_samples_split'),
          Categorical(['huber', 'ls', 'lad', 'quantile'], name='loss'),
          Integer(2, 100, name='min_samples_leaf'),
          Integer(50, 3000, name='n_estimators'),
          Integer(0, 5, name='random_state'),
         ]

@use_named_args(params_Graboost) 
def Graboost_cross_val(**params):
    Graboost.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(Graboost, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

Graboost_forest = forest_minimize(Graboost_cross_val, params_Graboost, acq_func="EI", n_calls=100, random_state=0)
Graboost_gp = gp_minimize(Graboost_cross_val, params_Graboost, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(Graboost_forest.fun))
print("Best parameters using forest: {}".format(Graboost_forest.x))
print("Best score using gp: {}".format(Graboost_gp.fun))
print("Best parameters using gp: {}".format(Graboost_gp.x))

Best score using forest: 0.11484191083214167
Best parameters using forest: [4, 0.009731934054429674, 'log2', 59, 'ls', 5, 2500, 5]
Best score using gp: 0.11158165300008473
Best parameters using gp: [4, 0.022295633577807207, 'sqrt', 100, 'huber', 2, 3000, 0]


### Ridge

In [15]:
### Define model
RidgeR = Ridge()

### Suggest parameters
params_RidgeR = [Real(0.05, 35, name='alpha'),
                Categorical(['auto', 'svd', 'cholesky'], name='solver')]

@use_named_args(params_RidgeR) 
def RidgeR_cross_val(**params):
    RidgeR.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(RidgeR, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

RidgeR_forest = forest_minimize(RidgeR_cross_val, params_RidgeR, acq_func="EI", n_calls=100, random_state=0)
RidgeR_gp = gp_minimize(RidgeR_cross_val, params_RidgeR, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(RidgeR_forest.fun))
print("Best parameters using forest: {}".format(RidgeR_forest.x))
print("Best score using gp: {}".format(RidgeR_gp.fun))
print("Best parameters using gp: {}".format(RidgeR_gp.x))

Best score using forest: 0.12088383125384887
Best parameters using forest: [9.57933749557496, 'svd']
Best score using gp: 0.12084343540191145
Best parameters using gp: [8.843480739403299, 'cholesky']


### LightGBM Regressor

In [20]:
### Define model
LGBM = lgb.LGBMRegressor()

### Suggest parameters
params_LGBM = [#Categorical(['gbdt','rf','dart', 'goss'], name='boosting_type'),
#              Integer(50, 10000, name='n_estimators'),
             Real(1e-7, 2e-2, name='learning_rate'),
             Integer(2, 100, name='num_leaves'),
             Categorical(['regression'], name='objective'),
#              Integer(1, 100, name='bagging_freq'),
             Real(0.1, 1.0, name='bagging_fraction'),
             Integer(50, 500, name='num_iterations'),
             Real(0.1, 1, name='feature_fraction'),
#              Real(0, 10, name='reg_alpha'),
             Integer(1, 100, name='min_data_in_leaf'),
             Integer(1, 100, name='max_depth')]

@use_named_args(params_LGBM) 
def LGBM_cross_val(**params):
    LGBM.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(LGBM, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

LGBM_forest = forest_minimize(LGBM_cross_val, params_LGBM, acq_func="EI", n_calls=100, random_state=0)
LGBM_gp = gp_minimize(LGBM_cross_val, params_LGBM, n_calls=100, acq_func="gp_hedge", random_state=0)

print("Best score using forest: {}".format(LGBM_forest.fun))
print("Best parameters using forest: {}".format(LGBM_forest.x))
print("Best score using gp: {}".format(LGBM_gp.fun))
print("Best parameters using gp: {}".format(LGBM_gp.x))

Best score using forest: 0.12631631587322953
Best parameters using forest: [0.012955227267013002, 49, 'regression', 0.3525455167623106, 476, 0.8492987114701519, 16, 90]
Best score using gp: 0.11860755354959966
Best parameters using gp: [0.016612303637686433, 30, 'regression', 1.0, 500, 0.21753146276979324, 7, 68]


### CatBoostRegressor

In [23]:
### Define model
CatB = CatBoostRegressor()

### Suggest parameters
params_CatB = [Integer(300, 1000, name='iterations'),
              Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
              Integer(0, 10, name='depth'),
              Categorical(['RMSE'], name='loss_function'),
#               Integer(0, 100, name='random_seed'),
#               Categorical(['Silent'],name='logging_level'),
#               Integer(0, 100, name='bagging_temperature'),
              Integer(0, 100, name='l2_leaf_reg'),
              Real(0.001, 1, name='rsm'),
              Integer(1, 255, name='one_hot_max_size')
              ]

@use_named_args(params_CatB) 
def CatB_cross_val(**params):
    CatB.set_params(**params)
    kf = KFold(5, shuffle=True, random_state=42).get_n_splits(train)
    rmse= np.sqrt(-np.mean(cross_val_score(CatB, X=train, y=y_train, scoring="neg_mean_squared_error", cv = kf, n_jobs=-1)))
    return(rmse)

CatB_forest = forest_minimize(CatB_cross_val, params_CatB, acq_func="EI", n_calls=100, random_state=0)
CatB_gp = gp_minimize(CatB_cross_val, params_CatB, acq_func="gp_hedge", n_calls=100, random_state=0)

print("Best score using forest: {}".format(CatB_forest.fun))
print("Best parameters using forest: {}".format(CatB_forest.x))
print("Best score using gp: {}".format(CatB_gp.fun))
print("Best parameters using gp: {}".format(CatB_gp.x))

Best score using forest: 0.14860429908957817
Best parameters using forest: [420, 0.03826404453243567, 3, 'RMSE', 1, 0.36722528261107323, 20]
Best score using gp: 0.12706474879033458
Best parameters using gp: [929, 0.022859933246184225, 2, 'RMSE', 0, 1.0, 194]
