In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# load the train dataset

df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv",index_col='Id')

#see the head of the data

df_train.head()

In [None]:
#Well, a lot of columns and we can see a lot of null values as well

# the data description also indicates a number of categories

# some categories maybe better stored as numericals ? 0-poor to 4-excellent

# let us first check the number of null values and see what columns we can realistically use

# we have 1460 rows of data

null_columns=df_train.columns[df_train.isnull().any()]

df_train[null_columns].isnull().sum()


*Using a simple imputer for missing values

In [None]:
df_train_knn = df_train

df_train_knn.info(verbose=True)

In [None]:
df_train_knn[['SalePrice']].describe()

In [None]:
# convert all objects into categorical data types

# use a for loop to convert all of these columns into categoris

df_train_2 = df_train_knn.select_dtypes(include='object')

columns_obj_test= df_train_2.columns

for col in columns_obj_test:
    df_train_knn[col] = df_train_knn[col].astype('category')
    
# let us see if this has worked

df_train_4 = df_train_knn.drop(columns=['Alley','MiscFeature','SalePrice'])

df_train_4.info()

# yes, it has

# Using Pipelines for Imputation

In [None]:
# we will use the KnnImputer for numeric imputations and Simple Imputer for categorical imputation

# let us first bring all the necessary modules

# guide on how to do this with categorical tranfromer as in https://medium.com/vickdata/a-simple-guide-to-scikit-learn-pipelines-4ac0d974bdcf

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from sklearn.pipeline import make_pipeline

from fancyimpute import KNN

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

numeric_transformer = Pipeline(steps=[('imputer',SimpleImputer(missing_values=np.nan, strategy='mean') ),('scaler',StandardScaler())])

categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])



In [None]:
#Next we use the ColumnTransformer to apply the transformations to the correct columns in the dataframe. 

# Before building this we have stored lists of the numeric and categorical columns using the pandas dtype method

numeric_features = df_train_4.select_dtypes(include=['int64', 'float64']).columns

categorical_features = df_train_4.select_dtypes(include=['category']).columns 

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


#transformed = preprocessor.fit_transform(df_train_4)

#df_train_clean = pd.DataFrame(data=transformed, index=df_train_4.index, columns=df_train_4.columns)

#df_train_clean.head()


In [None]:
# Wow, the amazing pipelines have done everything for us

# let us double check to make sure there are no null columns any more

#null_columns2=df_train_clean.columns[df_train_clean.isnull().any()]

#df_train_clean[null_columns2].isnull().sum()

Wow, that is all the null values dealt with and very effeciently

# Using Pipelines with Scikit learn models

In [None]:
# now while actually running a model we will run the pre-processing step above as just one part of a pipeline

# first let us bring the features and labels

X = df_train_4

y = df_train[['SalePrice']]

# let us import train test split

from sklearn.model_selection import train_test_split

# Split data into 70% train and 30% test

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size= 0.3,random_state= 1)


In [None]:
# let us bring the scikit classifiers and also bring in XG boost

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# let us use PCA as well 
 
# In some cases dimensionality reduction may filter out some noice and unnecessary details and result in higher performance

# but in general it won't, it will just speed up training

from sklearn.decomposition import PCA

# we need to import TruncatedSVD as PCA doesn't support sparse matrixes

from sklearn.decomposition import TruncatedSVD

SEED=1

ridge = Ridge(alpha=0.1)
knn = KNeighborsRegressor()
tree = DecisionTreeRegressor()
rf = RandomForestRegressor()
xgb= xgb.XGBRegressor()

# Define a list called classifier that contains the tuples (classifier_name, classifier)

classifiers = [('Ridge Regression', ridge),('K Nearest Neighbours', knn),('Decision Tree', tree),('Random Forest', rf),('XGBoost', xgb)]


In [None]:
# Iterate over the defined list of tuples containing the classifiers

for clf_name, clf in classifiers:
    
# create the full pipeline to the training set

   pipe = Pipeline(steps=[('preprocessor', preprocessor),('reducer',TruncatedSVD(n_components=30)),('regressor',  clf)])

   pipe.fit(X_train, np.ravel(y_train))
    
# Predict the labels of the test set

   y_pred = pipe.predict(X_test)
           
# Evaluate accuracies using cross_val_score
    
   cv_scores_rmse = np.sqrt(-1*(cross_val_score(pipe,X,np.ravel(y),cv=5,scoring='neg_mean_squared_error').mean()))
   cv_scores_r2 = cross_val_score(pipe,X,np.ravel(y),cv=5).mean()
          

# print the cv_scores for each classifier
   print('{:s} : {:.3f}'.format(clf_name, cv_scores_rmse))
   print('{:s} : {:.3f}'.format(clf_name,  cv_scores_r2))

Without any tuning, our XGBoost algorithm is already our best

Let us see how the PCA impacts our models and see how much the dimensionality reduction helps us 

In [None]:
# we are going to access step 2 which is at index 1

pipe.steps[1]

# let us see how the components explain the variances

var_sum = pipe.steps[1][1].explained_variance_ratio_.cumsum()

var_sum

In [None]:
# let us plot this to find the elbow in the plot

import matplotlib.pyplot as plt

var = pipe.steps[1][1].explained_variance_ratio_

plt.plot(var)
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

 * Let us just work with the XG Boost model, first with random hyper parameteres chosen by us

In [None]:

import xgboost as xgb


xgb= xgb.XGBRegressor(colsample_bytree=0.3, learning_rate= 0.1, max_depth= 4)

xgb_pipe1 = Pipeline(steps=[('preprocessor', preprocessor),('reducer',TruncatedSVD(n_components=30)),('regressor',  xgb)])

# let us see if we can put in an RFE for feature selection


xgb_pipe1.fit(X_train, np.ravel(y_train))
    
# Predict the labels of the test set

y_pred = xgb_pipe1.predict(X_test)

# Evaluate accuracies using cross_val_score
    
cv_scores_rmse = np.sqrt(-1*(cross_val_score(xgb_pipe1,X,np.ravel(y),cv=5,scoring='neg_mean_squared_error').mean()))
cv_scores_r2 = cross_val_score(xgb_pipe1,X,np.ravel(y),cv=5).mean()
    
# print the cv_scores for each classifier
print('{:s} : {:.3f}'.format(clf_name, cv_scores_rmse))
print('{:s} : {:.3f}'.format(clf_name,  cv_scores_r2))

An improvement in performace to 1%

Let us try to do Randomozed Search CV

In [None]:
# import random search cv

import xgboost as xgb

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import GridSearchCV

import time

t_start=time.time()

xgb= xgb.XGBRegressor(seed=123)

xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),('reducer',TruncatedSVD(n_components=30)),('regressor',  xgb)])

gbm_param_grid = {'regressor__learning_rate': np.arange(0.05,1.05,.05),'regressor__n_estimators': np.arange(50, 500, 50),'regressor__subsample': np.arange(0.05,1.05,.05),'regressor__max_depth' : np.arange(2, 6),'regressor__colsample_bytree': np.arange(0.05,1.05,.05)}

grid_mse = RandomizedSearchCV(estimator=xgb_pipe,param_distributions=gbm_param_grid,n_iter= 25, cv=4, verbose=1,n_jobs=-1)

# Fit randomized_mse to the data

grid_mse.fit(X_train, np.ravel(y_train))

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)

print("Lowest R squared found: ", np.sqrt(np.abs(grid_mse.best_score_)))

t_end=time.time()

print(t_end-t_start)


# Trying some dask

In [47]:
# let us see how the above work with dask hyperparamter method

from dask_ml.model_selection import RandomizedSearchCV

import xgboost as xgb

import time

t_start=time.time()

xgb= xgb.XGBRegressor(seed=123)

xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),('reducer',TruncatedSVD(n_components=30)),('regressor',  xgb)])

gbm_param_grid = {'regressor__learning_rate': np.arange(0.05,1.05,.05),'regressor__n_estimators': np.arange(50, 500, 50),'regressor__subsample': np.arange(0.05,1.05,.05),'regressor__max_depth' : np.arange(2, 6),'regressor__colsample_bytree': np.arange(0.05,1.05,.05)}

grid_mse = RandomizedSearchCV(estimator=xgb_pipe,param_distributions=gbm_param_grid,n_iter= 50, cv=4, n_jobs=-1)

# Fit randomized_mse to the data

grid_mse.fit(X_train, np.ravel(y_train))

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)

print("Lowest R squared found: ", np.sqrt(np.abs(grid_mse.best_score_)))

t_end=time.time()

print(t_end-t_start)

Best parameters found:  {'regressor__subsample': 0.8, 'regressor__n_estimators': 100, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.3, 'regressor__colsample_bytree': 0.7000000000000001}
Lowest R squared found:  0.9228162836303117
42.00663757324219


Dask Hyperparameter tuning was 20% faster! We need to try this on a larger data set though!

Why you use Dask Hyperparameter tuning, see reasons in https://ml.dask.org/hyper-parameter-search.html

# Let's use Dask to distribute work across a cluster

In [48]:
# let us see how the above work with dask hyperparamter method

from dask_ml.model_selection import RandomizedSearchCV

# let us import client to use my local machine as a cluster

from dask.distributed import Client

# we need joblib as well in backend

import joblib

import xgboost as xgb

import time

#create local cluster

client = Client(processes=False)             

t_start=time.time()

xgb= xgb.XGBRegressor(seed=123)

xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),('reducer',TruncatedSVD(n_components=30)),('regressor',  xgb)])

gbm_param_grid = {'regressor__learning_rate': np.arange(0.05,1.05,.05),'regressor__n_estimators': np.arange(50, 500, 50),'regressor__subsample': np.arange(0.05,1.05,.05),'regressor__max_depth' : np.arange(2, 6),'regressor__colsample_bytree': np.arange(0.05,1.05,.05)}

grid_mse = RandomizedSearchCV(estimator=xgb_pipe,param_distributions=gbm_param_grid,n_iter= 50, cv=4, n_jobs=-1)

with joblib.parallel_backend('dask'):
    
# Fit randomized_mse to the data

    grid_mse.fit(X_train, np.ravel(y_train))
    
    # Print the best parameters and lowest RMSE
    print("Best parameters found: ", grid_mse.best_params_)

    print("Lowest R squared found: ", np.sqrt(np.abs(grid_mse.best_score_)))
    

t_end=time.time()

print(t_end-t_start)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


Best parameters found:  {'regressor__subsample': 0.05, 'regressor__n_estimators': 200, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.05, 'regressor__colsample_bytree': 0.8}
Lowest R squared found:  0.9134398223065845
22.721360206604004


*** Let us now work on the test data with the Pipeline architecture and make predictions**

Now, we are talking with a 92.70 % R squared value

In [None]:
# load the test dataset

df_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv",index_col='Id')

#see the head of the data

df_test.head()

In [None]:
# check the info of the data

df_test.info()


In [None]:
pred = grid_mse.predict(df_test)

pred

In [None]:
# let us put this into the submission format

xgb_output=pd.DataFrame({'Id':df_test.index, 'SalePrice': pred}) 

# save to kaggle

xgb_output.to_csv('my_submission_xgb.csv', index=False)

print("Your submission was successfully saved!")

# Trying a Random Forest model with hyperparameter tuning

Our untuned random forest gave us an R-squared of 0.828; can we improve on this?

In [None]:
from sklearn.metrics import mean_squared_log_error

# instantiate the random forest regressor

rf = RandomForestRegressor()

rf_pipe = Pipeline(steps=[('preprocessor', preprocessor),('reducer',TruncatedSVD(n_components=30)),('regressor',  rf)])

# set the important parameters for random forest

# n_etimators will increase the number of trees built and bring in more diversity and a better model

# max_features, decreasing this limit from one will make the number of features low at every tree and bring more diversity in each tree output or less correlated trees

# n_jobs = -1 to use all CPU Cores (should be used within random search cv)

# max_depth to be limited to restrict over fitting and for model to finish quickly

# use random state to repeat the results

rf_param_grid = {'regressor__n_estimators': np.arange(100,1000,100),'regressor__max_features': np.arange(0.05,1.05,.05),'regressor__max_depth':np.arange(4,8,1)}

grid_mse_rf = RandomizedSearchCV(estimator=rf_pipe,param_distributions=rf_param_grid,n_iter= 20, cv=4, verbose=1,n_jobs=-1, random_state=123)

# Fit randomized_mse to the data

grid_mse_rf.fit(X_train, np.ravel(y_train))

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse_rf.best_params_)

print("Lowest R squared found: ", np.sqrt(np.abs(grid_mse_rf.best_score_)))

Wow, contrary to my earlier assumptions; hyperparameter tuning improved the random forest performance by a massive 10%

In [None]:
# let us predict the results with the new random forest regressor and see how it performs in kaggle

pred_rf = grid_mse_rf.predict(df_test)

# let us put this into the submission format

rf_output=pd.DataFrame({'Id':df_test.index, 'SalePrice': pred_rf}) 

# save to kaggle

rf_output.to_csv('my_submission_rf_withhyperparametertuning.csv', index=False)

print("Your submission was successfully saved!")

# Hyperparameter Tuning using Informed Search - Bayesian Optimization

In [None]:
# hyperparameter tuning is very important for models like XGBoost

# while random search cv gives us a good chance of finding optimal parameters, let us try a newer method of informed search

# Bayesian Optimization is what we will try

# detailed e.g. in datacamp and in https://towardsdatascience.com/an-introductory-example-of-bayesian-optimization-in-python-with-hyperopt-aae40fff4ff0

# let us import the hyperplot package which will do this for us

from hyperopt import fmin, tpe, hp

import hyperopt.pyll.stochastic as st

# first step is to build the grid or domain

space = {'regressor__learning_rate': hp.uniform('regressor__learning_rate',0.05,1.05),'regressor__n_estimators': hp.quniform('regressor__n_estimators',500,2000,100),'regressor__subsample': hp.uniform('regressor__subsample',0.05,1.05),'regressor__max_depth' : hp.quniform('regressor__max_depth', 2,6,1),'regressor__colsample_bytree': hp.uniform('regressor__colsample_bytree',0.05,1.05),}

# next, define the objective function

def objective(params):
    
 params = {'regressor__learning_rate': params['regressor__learning_rate'],'regressor__n_estimators': int(params['regressor__n_estimators']), 'regressor__subsample': params['regressor__subsample'],'regressor__max_depth': int(params['regressor__max_depth']), 'regressor__colsample_bytree': params['regressor__colsample_bytree']}
 
 import xgboost as xgb

 xgb1= xgb.XGBRegressor(seed=123)

 xgb_pipe = Pipeline(steps=[('preprocessor', preprocessor),('regressor',  xgb1)])

 best_score = cross_val_score(xgb_pipe, X, y,cv=10, n_jobs=-1).mean()
 loss = 1 - best_score
 
 return loss

print (st.sample(space))

In [None]:
# now let us run the optimization algorithm

best_result = fmin(fn=objective,space=space,max_evals=25,rstate=np.random.RandomState(42),algo=tpe.suggest)

best_result

In [None]:
# let us run the XgBoost algorim with the best_result hyperparameters

# import random search cv

import xgboost as xgb

xgb= xgb.XGBRegressor(seed=123,colsample_bytree=0.5351871929759634,learning_rate = 0.36944725103976733,max_depth=2,n_estimators=1700, subsample=0.50 )

xgb_pipe_bayes= Pipeline(steps=[('preprocessor', preprocessor),('regressor',  xgb)])

# Fit the pipeline object to the data

xgb_pipe_bayes.fit(X_train, np.ravel(y_train))

# Predict the labels of the test set

y_pred = xgb_pipe_bayes.predict(X_test)

# Evaluate accuracies using cross_val_score

cv_scores_r2 = cross_val_score(xgb_pipe_bayes,X,np.ravel(y),cv=5).mean()

# Print the best parameters and lowest RMSE

print("Lowest R squared found: ", cv_scores_r2)



Not great, the bayer hyper parameter optimization is giving us a worse off result than even if we do not set any hyper parameters

# TPOT algorithm run 

Let's now turn ourselves to our ML assistant TPOT

In [None]:
# Handy guide to TPOT is in DataCamp Hyper parameter course as well as in https://towardsdatascience.com/tpot-automated-machine-learning-in-python-4c063b3e5de9

# TPOT claims to do all pre-processing of data

# let us pick the raw housing data in that case without any pre-processing

# df_train holds our training data

# tpot seems to run into problems with one hot encoding in this dataset

# let us then use our own pre-processing steps as in pipeline, comment out everything else

#df_train.head()

#X = df_train.drop(columns=['SalePrice'])

#y = df_train['SalePrice']

In [None]:
# let us import train test split

#from sklearn.model_selection import train_test_split

# Split data into 70% train and 30% test

#X_train, X_test, y_train, y_test = train_test_split(X, y,test_size= 0.3,random_state= 1)

In [None]:
# let us now try tpot

# remember to not use TPOT default settings as that may run for well over an hour 

from tpot import TPOTRegressor

tpot = TPOTRegressor(generations=20, population_size=20,verbosity=2, offspring_size=20,config_dict='TPOT sparse',scoring='r2', cv=5,n_jobs=-1)

tpot_pipe = Pipeline(steps=[('preprocessor', preprocessor),('regressor',  tpot)])

tpot_pipe.fit(X_train, np.ravel(y_train))

tpot_pipe.score(X_test, np.ravel(y_test))

Does evaluate that an XGBoost Regressor is the best! But best result is 89% given that the default XGBoost gave 88% accuracy, this is a disappointment. Should really be running it for much more iterations for TPOT to be really useful 

In [None]:
# let us use TPOT's export function to export the code of the best pipeline

# acess tpot in pipeline

tpot_step = tpot_pipe[-1]

tpot_step.export('tpt.py')



In [None]:
tpot_step.evaluated_individuals_