In [None]:
import pandas as pd
import numpy as np

#import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
#from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

#from sklearn.ensemble import RandomForestClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.svm import SVC
#from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import StandardScaler

# to perform a hyperparameter scan using Bayesian Optimization
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

import time

In [None]:
# Load the train and test CSV files, using the 'Id' column as the row index
X, X_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

# Show the training frame as the cell output
X

In [None]:
# Remove rows with missing target, separate target from predictors.
# Rows whose SalePrice is NaN cannot be used for supervised training, so they
# are dropped first; y and X are then taken from the same filtered frame so
# their indices stay aligned.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
# Non-inplace drop: X is rebound to a new frame without the target column.
X = X.drop(columns=['SalePrice'])

In [None]:
# Per-column overview: number of missing values, number of distinct values, dtype
(
    X.isna().sum(),
    X.nunique(),
    X.dtypes,
)

In [None]:
# Categorical columns: every predictor stored with the "object" dtype
cat_cols = [col for col, col_dtype in X.dtypes.items() if col_dtype == "object"]

# cat2_cols = [cname for cname in X.columns if
#                     X[cname].nunique() >= 10 and 
#                     X[cname].dtype == "object"]

# Numerical columns: every predictor stored as int64 or float64
num_cols = [
    col
    for col, col_dtype in X.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [None]:
# Preprocessing for numerical data: impute missing values (SimpleImputer's
# default strategy), then standardize each column
num_transf = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('sca', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the most frequent
# category, then one-hot encode. handle_unknown='ignore' keeps categories that
# appear only at predict time from raising an error.
cat_transf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# cat2_transf = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=3.2))
# ])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transf, num_cols),
        ('cat', cat_transf, cat_cols)
    ])

# Define model.
# use_label_encoder is an XGBClassifier-only option; on a regressor it is just
# forwarded as an unused parameter and triggers warnings, so it is not passed.
# NOTE(review): tree_method='gpu_hist' needs a GPU-enabled XGBoost build; newer
# XGBoost releases spell this tree_method='hist' with device='cuda' - confirm
# against the installed version.
model = XGBRegressor(tree_method='gpu_hist', eval_metric='rmse', random_state=7)

# Bundle preprocessing and modeling code in a pipeline; the step name 'model'
# is what gives the search-space keys their 'model__' prefix
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

In [None]:
# Wider search space, kept for reference (currently disabled):
# params = {'model__n_estimators':np.arange(100, 800, 10),'model__learning_rate':np.arange(0.001, 0.2, 0.001),
#           'model__max_depth':np.arange(3, 12, 1), 'model__subsample':np.arange(0.1, 1, 0.1), 'model__colsample_bytree':np.arange(0.1, 1.1, 0.1),
#          'model__colsample_bylevel':np.arange(0.1, 1.1, 0.1), 'model__min_child_weight':np.arange(0,10,1), 'model__reg_alpha':np.arange(0,15,1), 
#               'model__reg_lambda':np.arange(0,30,1)}

# Active search space; the 'model__' prefix routes each entry to the 'model'
# step of the pipeline
params = {
    'model__n_estimators': np.arange(100, 800, 10),
    'model__learning_rate': np.arange(0.001, 0.2, 0.001),
    'model__max_depth': np.arange(3, 12, 1),
}

# 7-fold shuffled cross-validation with a fixed seed
kfold = KFold(n_splits=7, shuffle=True, random_state=7)

# Randomized search: 100 sampled candidates, scored by (negated) RMSE
scan = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=100,
    scoring='neg_root_mean_squared_error',
    cv=kfold,
    random_state=7,
)

# Run the search: preprocessing and model are fitted together on each fold
result = scan.fit(X, y)

# Report the best score (sign flipped back to a positive RMSE) and its parameters
print("Best: %f using %s" % (-1*result.best_score_, result.best_params_))



In [None]:
# In order to use result.best_params_ in the next model, we need to remove the
# pipeline step prefix "model__" from the keys.
#
# The renamed dict is built in a single pass over result.best_params_ rather
# than by popping/re-inserting keys on the dict being iterated: mutating a
# dict during iteration skips or revisits keys (or raises RuntimeError), which
# is why the earlier loop appeared to miss some "model__" entries. Building a
# new dict also removes the need for hard-coded per-key fix-ups, which raised
# KeyError for any parameter not in the current search space.
step_prefix = "model__"
best_parameters = {
    (name[len(step_prefix):] if name.startswith(step_prefix) else name): value
    for name, value in result.best_params_.items()
}
best_parameters

In [None]:
# Now we fit using the whole training data and the best parameters found in the scan.
#
# use_label_encoder is an XGBClassifier-only option and is not passed to the
# regressor. random_state=7 matches the seed of the model that was tuned, so
# the final model uses the same configuration that was cross-validated.
model_scan = XGBRegressor(
    **best_parameters,
    tree_method='gpu_hist',
    eval_metric='rmse',
    random_state=7,
)

# Defining the pipeline with the same preprocessing as before, but with the tuned model
pipe_scan = Pipeline(steps=[('preprocessing', preprocessor), ('model', model_scan)])

# Fitting on the whole training set (preprocessing is re-fitted as part of the pipeline)
pipe_scan.fit(X, y)

# Predict sale prices for the test set
pred_scan = pipe_scan.predict(X_test)

# Build the submission table (one row per test Id) and write it to disk
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': pred_scan})
output.to_csv('submission.csv', index=False)

output