### https://machinelearningmastery.com/feature-importance-and-feature-selection-with-xgboost-in-python/

## Set up Dataset

In [5]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../ames') # path the the directory
import config
from data_prep import clean, add_features, dummify

housing = clean(config.HOUSING_CSV)
housing = add_features(housing)

## Create testing and training sets

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
X = housing[config.CHOSEN_VARIABLES].copy()
dummy_df = dummify(housing, config.VARS_TO_DUMMIFY, drop_first=False)
X = pd.concat([X,dummy_df], axis=1)
y = np.log(housing['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## Import model library

In [7]:
from xgboost import XGBRegressor
from matplotlib import pyplot

## Grid Search for Optimal Parameters using Cross-Validation

In [None]:
# Number of trees in random forest
n_estimators = [40,100,1000] #[int(x) for x in np.linspace(start=10, stop=80, num=10)]
# Maximum number of levels in tree
max_depth = [i*2+3 for i in range(1,5)] + [None]
# Minimum number of samples required to split a node
learning_rates = [10**i for i in range(-3,2)]
# Method of selecting learning rate
colsample_bytree = [0.5, 0.8, 1]
# number of columns used by each tree
gamma = [0,1,5]
# Acts as a regularization parameter

bootstrap = [True]

param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
             'learning_rates': learning_rates,
             'colsample_bytree': colsample_bytree,
             'gamma': gamma}

In [None]:
%timeit
xgb = XGBRegressor(n_estimators =100,
                    max_depth=5,
                    learning_rates=0.001,
                    colsample_bytree=0.5,
                    gamma=0,
                    random_state=42)
from sklearn.model_selection import GridSearchCV
xgb_Grid = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=10, verbose=2, n_jobs=4)
xgb_Grid.fit(X_train,y_train)

## Select the best version of the model

In [None]:
print(xgb_Grid.best_params_)
xgb_best = xgb_Grid.best_estimator_

In [None]:
xgb_best.score(X_test, y_test)

In [8]:
xgb = XGBRegressor(n_estimators =100,
                    max_depth=None,
                    learning_rates=0.001,
                    colsample_bytree=0.5,
                    gamma=0,
                    random_state=42) 

xgb.fit(X_train, y_train) 

Parameters: { learning_rates } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, learning_rates=0.001, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [9]:
print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))

0.9935090348324125
0.9088619027301676


In [10]:
from sklearn.metrics import mean_squared_error
rms = mean_squared_error(y_test, xgb.predict(X_test), squared=False)
print(rms)

0.11789346822920295


## Feature Importance

In [13]:
from model_analysis import get_feature_importance, graph_importance
feats = get_feature_importance(xgb, X)
fig = graph_importance(feats, 'XGBoost')
fig.show()