## GridSearchCV hyperparameter tuning riddle
I fitted a basic XGBoost model to have something as a benchmark, then I wanted to optimize its hyperparameters with GridSearchCV. I expected the MAE to decrease as a result, but I found that it actually increased. I cannot wrap my head around it, because the parameter grid that I used contains the values that are used in the benchmark model, and the CV folds are also the same. It seems to me that GridSearchCV selects a model which is inferior to the one I started with.

What am I missing here? How can this happen?

In [1]:
# Load packages
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score, GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

# Silence every warning for the rest of the session, unless the interpreter was
# started with explicit warning options, in which case those are left in charge.
# Note that this also hides deprecation warnings raised by the libraries used below.
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
# Read the training and test tables from CSV files in the working directory.
train_file_path = 'housing_train.csv'
test_file_path = 'housing_test.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Target column.
y = train_data['SalePrice']

# Candidate input columns: everything in the training table except the target
# and the columns explicitly excluded here.
excluded_columns = [
    'SalePrice', 'Neighborhood', 'Exterior1st', 'Exterior2nd',
    'Alley', 'PoolQC', 'Fence', 'MiscFeature',
]
features = train_data.columns.drop(excluded_columns)

# Columns kept after one-hot encoding. Names of the form "<column>_<value>" refer
# to dummy columns produced by pd.get_dummies; the order here is the column order
# of the resulting design matrix.
features_short_list = [
    'OverallQual', 'GrLivArea', 'YearBuilt', 'TotalBsmtSF', 'OverallCond',
    'BsmtFinSF1', 'LotArea', 'YearRemodAdd', '2ndFlrSF', 'GarageCars',
    'Fireplaces', 'GarageArea', 'GarageYrBlt', 'BsmtQual_Ex', 'Functional_Typ',
    'PoolArea', 'ScreenPorch', 'MSZoning_RM', 'MasVnrArea', 'WoodDeckSF',
    'BsmtFinType1_GLQ', 'HalfBath', 'KitchenQual_Gd', 'CentralAir_N', '1stFlrSF',
    'KitchenAbvGr', 'BsmtFinSF2', 'TotRmsAbvGrd', 'SaleCondition_Family',
    'BsmtUnfSF', 'ExterQual_Gd', 'FullBath', 'KitchenQual_TA', 'BsmtFullBath',
    'LowQualFinSF', 'RoofMatl_WdShngl', 'Condition1_Norm', 'MSZoning_FV',
    'BsmtExposure_Gd', 'MoSold', 'Condition1_RRAe', 'MSZoning_RL',
    'SaleCondition_Abnorml', 'GarageQual_TA', 'BsmtExposure_No', 'FireplaceQu_Gd',
    'LotConfig_CulDSac', 'GarageType_Attchd', 'LotConfig_FR2', 'SaleType_CWD',
    'PavedDrive_Y', 'GarageType_Detchd', 'Condition1_PosN', 'Condition1_Artery',
    'MSZoning_C (all)', 'BsmtCond_Fa', 'LandSlope_Mod', 'HeatingQC_Ex',
    'RoofMatl_Tar&Grv', 'Functional_Mod', 'EnclosedPorch', 'LandSlope_Gtl',
    'BsmtFinType2_ALQ', 'BsmtFinType1_ALQ', 'BsmtFinType1_LwQ', '3SsnPorch',
]

X = train_data[features]
X_test = test_data[features]

# One-hot encode the training inputs, then keep only the short-listed columns.
X_dummies = pd.get_dummies(X)
X_one_hot_encoded = X_dummies.loc[:, features_short_list]

# Hold out a validation part of the encoded training data; the seed is fixed so
# the split is reproducible.
(X_train_one_hot_encoded, X_val_one_hot_encoded,
 y_train, y_val) = train_test_split(X_one_hot_encoded, y, random_state=1)

In [None]:
# GridSearchCV Brute-force
# Exhaustive search over the XGBoost hyperparameters listed in param_grid, scored
# by negative mean absolute error with a 3-fold KFold splitter. The search is run
# on the training part of the train/validation split only.
pipeline_XGB = Pipeline([('imputer', Imputer(axis=0)), ('xgbrg', XGBRegressor())])

param_grid = {
    "xgbrg__n_estimators": [100, 200, 300, 500, 1000, 2000],
    "xgbrg__learning_rate": [0.01, 0.05, 0.075, 0.1],
    "xgbrg__subsample": [0.6, 0.7, 0.8, 0.9, 1]
}

# Keyword arguments for the fit() call of the 'xgbrg' pipeline step
# ("<step>__<argument>" naming): early stopping monitored on the held-out
# validation part.
# NOTE(review): the evaluation set is handed over as a raw array, so it does not
# appear to pass through the pipeline's imputer step, while the training folds
# do - confirm this is intended.
fit_params = {"xgbrg__eval_set": [(np.array(X_val_one_hot_encoded),  y_val)],
              "xgbrg__early_stopping_rounds": 1000,
              "xgbrg__verbose": False}

searchCV = GridSearchCV(pipeline_XGB, cv=KFold(n_splits=3), scoring='neg_mean_absolute_error', n_jobs=4,
                        param_grid=param_grid)
# Fit-time arguments are forwarded through fit(); passing them as the
# `fit_params` constructor argument of GridSearchCV is deprecated.
searchCV.fit(np.array(X_train_one_hot_encoded), y_train, **fit_params)
print(searchCV.best_params_)

In [20]:
# Rebuild the pipeline with the hyperparameters selected by the grid search and
# report its cross-validated MAE.
best_params = searchCV.best_params_
xgb_best = XGBRegressor(
    n_estimators=best_params["xgbrg__n_estimators"],
    learning_rate=best_params["xgbrg__learning_rate"],
    subsample=best_params["xgbrg__subsample"],
    n_jobs=4,
    verbose=0,
)
pipeline_XGB_BF = make_pipeline(Imputer(), xgb_best)

pipeline_XGB_BF.fit(X_train_one_hot_encoded, y_train)

# Scored on the complete one-hot encoded training table (not only the training
# part of the split), reusing the splitter object of the grid search.
scores = cross_val_score(
    pipeline_XGB_BF,
    X_one_hot_encoded,
    y,
    cv=searchCV.cv,
    scoring='neg_mean_absolute_error',
)
print(scores)

# The scorer yields negated errors, so the sign is flipped for reporting.
mean_abs_error = -1 * scores.mean()
print('Mean Absolute Error %2f' % mean_abs_error)

[-14229.77359792 -15794.87814425 -15348.03788902]
Mean Absolute Error 15124.229877


In [21]:
# Benchmark model
# Fixed-hyperparameter XGBoost pipeline used as the reference point for the grid
# search result. No early stopping is configured, so no evaluation set is needed:
# `eval_set` is an argument of XGBRegressor.fit(), not of the constructor, and
# passing it to the constructor does not set up any evaluation.
# NOTE(review): `verbose` is likewise documented as a fit() argument - confirm
# that the installed xgboost version accepts it in the constructor.
pipeline_XGB_benchmark = make_pipeline(
    Imputer(),
    XGBRegressor(n_estimators=1000,
                 learning_rate=0.05,
                 subsample=1,
                 verbose=0))
pipeline_XGB_benchmark.fit(X_train_one_hot_encoded, y_train)

# Scored on the complete one-hot encoded training table, reusing the splitter
# object of the grid search so the folds match the other cross-validated score.
scores = cross_val_score(pipeline_XGB_benchmark, X_one_hot_encoded, y, cv=searchCV.cv, scoring='neg_mean_absolute_error')
print(scores)
print('Mean Absolute Error %2f' %(-1 * scores.mean()))

[-14263.05455916 -16332.07410646 -16203.03404707]
Mean Absolute Error 15599.387571
