In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_squared_error as mse


from xgboost import XGBRegressor as xgbr

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

In [39]:
# Read the labelled training set and the file to predict on, in that order.
diamonds, predict = map(pd.read_csv, ('input/train.csv', 'input/test.csv'))

In [40]:
# Separate the regression target (price) from the remaining feature columns.
y = diamonds['price']
X = diamonds.drop(columns=['price'])

In [41]:
# Ordinal encodings for the categorical columns: each label maps to its position
# in the tuple (presumably listed from lowest to highest grade — confirm).
clarity = {grade: rank for rank, grade in enumerate(
    ('I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'))}
cut = {grade: rank for rank, grade in enumerate(
    ('Fair', 'Good', 'Very Good', 'Premium', 'Ideal'))}
color = {grade: rank for rank, grade in enumerate(
    ('J', 'I', 'H', 'G', 'F', 'E', 'D'))}

In [42]:
def labeling(s, dic):
    """Look up the code that the mapping `dic` assigns to the label `s`.

    Parameters
    ----------
    s : hashable
        Label to translate.
    dic : mapping
        Lookup table from label to code.

    Returns
    -------
    The value stored under `s` in `dic`.

    Raises
    ------
    KeyError
        If `s` is not a key of `dic`.
    """
    code = dic[s]
    return code

In [43]:
# Replace each categorical label with its ordinal code, column by column.
# An unknown label raises KeyError inside labeling.
for column, mapping in (('clarity', clarity), ('cut', cut), ('color', color)):
    X[column] = X[column].apply(labeling, args=(mapping,))

In [44]:
# Remove the 'table' column from the features, then preview the first rows.
X = X.drop(columns='table')
X.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,x,y,z
0,0,0.53,2,3,2,63.4,5.09,5.13,3.24
1,1,0.41,4,6,2,63.0,4.8,4.75,3.01
2,2,0.32,4,1,3,61.6,4.37,4.39,2.7
3,3,0.31,4,2,5,61.2,4.34,4.37,2.66
4,4,1.35,3,0,3,60.5,7.19,7.12,4.33


In [45]:
# Hold out 25% of the rows (scikit-learn's default test_size, made explicit here).
# The fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=42)

In [46]:
# Hyperopt search space for the XGBoost regressor. quniform draws come back as
# floats; the objective casts n_estimators and max_depth to int.
space = {
    # quniform returns round(uniform(low, high) / q) * q. With q=25, a lower bound
    # of 10 lets draws in [10, 12.5) round to 0 trees; starting at 25 keeps every
    # draw a positive multiple of q.
    'n_estimators': hp.quniform('n_estimators', 25, 1000, 25),
    'learning_rate': hp.uniform('learning_rate', 0.0001, 1.0),
    'max_depth': hp.quniform('max_depth', 4, 16, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.uniform('subsample', 0.7, 1),
    'gamma': hp.uniform('gamma', 0.1, 0.5),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
}


In [47]:
def objetivo(space):
    """Hyperopt objective: train an XGBoost regressor and score it on the hold-out split.

    Parameters
    ----------
    space : dict
        One sampled point of the search space, with keys 'n_estimators',
        'learning_rate', 'max_depth', 'min_child_weight', 'subsample',
        'gamma' and 'reg_lambda'.

    Returns
    -------
    dict
        {'loss': <RMSE on (X_test, y_test)>, 'status': STATUS_OK}.

    Notes
    -----
    Reads X_train, y_train, X_test and y_test from the notebook's global scope.
    """
    modelo = xgbr(n_estimators=int(space['n_estimators']),  # sampled as float
                  learning_rate=space['learning_rate'],
                  max_depth=int(space['max_depth']),          # sampled as float
                  min_child_weight=space['min_child_weight'],
                  subsample=space['subsample'],
                  gamma=space['gamma'],
                  reg_lambda=space['reg_lambda'],
                  objective='reg:squarederror')

    # No eval_set / eval_metric here: without early stopping they do not affect
    # the fitted model, and the loss is computed explicitly below. Passing
    # eval_metric to fit() is also deprecated in XGBoost and rejected by recent
    # releases.
    modelo.fit(X_train, y_train)

    y_pred = modelo.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = mse(y_test, y_pred) ** 0.5

    return {'loss': rmse, 'status': STATUS_OK}

In [48]:
# Minimise the objective with 20 evaluations of the TPE algorithm; the trial
# history goes into a throwaway Trials object.
best = fmin(
    fn=objetivo,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials(),
)

100%|██████████| 20/20 [03:37<00:00, 10.89s/trial, best loss: 0.08946070104443866]


In [49]:
# Show the hyperparameter values fmin returned for the best trial.
best

{'gamma': 0.21085997005628046,
 'learning_rate': 0.16139112865744362,
 'max_depth': 11.0,
 'min_child_weight': 2.0,
 'n_estimators': 100.0,
 'reg_lambda': 0.21650145908164087,
 'subsample': 0.8338529782063951}

In [50]:
# Rebuild the regressor from the best trial. n_estimators and max_depth are
# cast to int before being passed to the estimator.
n_trees = int(best['n_estimators'])
tree_depth = int(best['max_depth'])

modelo = xgbr(
    objective='reg:squarederror',
    n_estimators=n_trees,
    max_depth=tree_depth,
    learning_rate=best['learning_rate'],
    min_child_weight=best['min_child_weight'],
    subsample=best['subsample'],
    gamma=best['gamma'],
    reg_lambda=best['reg_lambda'],
)

In [51]:
# Train the tuned model on the training split and report its RMSE on the
# hold-out split (square root of the mean squared error).
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

rmse_final = mse(y_test, y_pred) ** 0.5
print('RMSE: {}'.format(rmse_final))

RMSE: 0.08946070104443866


### Con este tipo de búsqueda de hiperparámetros (TPE de hyperopt, que no es un grid search propiamente dicho) no podemos poner la MSE como métrica; aun así obtenemos la RMSE, y yo calculo la MSE elevándola al cuadrado por mi cuenta.