In [1]:
import pandas as pd

# Google Drive "share" link of the CSV dataset.
url = "https://drive.google.com/file/d/1z1Zcpx1FpHokhSwmGSd7-hk9O_tdcKqW/view?usp=sharing"
# The file id is the second-to-last "/"-separated segment of the share link;
# append it to the download endpoint so pandas can read the file directly.
path = "https://drive.google.com/uc?export=download&id=" + url.rsplit("/", 2)[-2]

# Read the CSV behind the download link into a DataFrame and show its column labels.
data = pd.read_csv(filepath_or_buffer=path)
data.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive'],
      dtype='object')

## Split the data

In [2]:
# Work on a deep copy so the loaded `data` frame is never modified.
X = data.copy(deep=True)

In [3]:
# Split target and features without mutating anything in place.
# The previous `X.pop('Expensive')` removed the column from X as a side effect,
# so running this cell a second time raised KeyError. Deriving both objects
# from `data` makes the cell safe to re-run and gives the same X and y.
y = data['Expensive'].copy()
X = data.drop(columns='Expensive')

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

## Impute missing values

In [8]:
from sklearn.impute import SimpleImputer

# Fit the imputer on the training set only, then apply the same fitted
# imputer to the test set so no test-set information is used for fitting.
my_imputer = SimpleImputer()
X_imputed_train = my_imputer.fit_transform(X_train)  # fit + transform the train set
X_imputed_test = my_imputer.transform(X_test)        # transform only for the test set

## Modelling: Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Shallow tree with reasonably large leaves to limit overfitting.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can be broken
# differently on each run, so the fitted tree and its accuracy may change.
my_tree = DecisionTreeClassifier(max_depth=5,
                                 min_samples_leaf=10,
                                 random_state=1994
                                 )

In [11]:
# Train the tree on the imputed training features and their labels.
my_tree.fit(X_imputed_train, y_train)

## Check accuracy on the train set

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
# Predict labels for the same rows the tree was trained on.
train_pred = my_tree.predict(X=X_imputed_train)

In [14]:
# Display the predicted labels for the training rows.
train_pred

array([1, 0, 0, ..., 0, 0, 0])

In [15]:
# Share of training rows whose predicted label matches the true label.
accuracy_score(y_train, train_pred)

0.9289383561643836

## Check accuracy on the test set

In [16]:
# Predict labels for the held-out rows (imputed with the train-fitted imputer).
test_pred = my_tree.predict(X=X_imputed_test)

In [17]:
# Share of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, test_pred)

0.928082191780822

## Create Pipeline


In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
# Model step: a seeded decision tree with the manually chosen hyperparameters.
dtree = DecisionTreeClassifier(random_state=1994, min_samples_leaf=10, max_depth=5)
# Preprocessing step: fill missing values with each column's median.
imputer = SimpleImputer(strategy="median")

# Chain both steps so that imputation and modelling are fitted and applied together.
pipe = make_pipeline(imputer, dtree)

In [22]:
# Fit every pipeline step on the raw (non-imputed) training data.
pipe.fit(X=X_train, y=y_train)

## Use GridSearchCV to find the best parameters of the model

So far, we tuned the hyperparameters of the decision tree manually. This is not ideal, for two reasons:

- It's not efficient in terms of quickly finding the best combination of parameters.
- If we keep checking the performance on the test set over and over again, we might end up creating a model that fits that particular test set but does not generalize as well to new data. Test sets are meant to remain unseen until the very last moment of ML development — we have been cheating a bit!

Grid Search Cross Validation solves both issues:

In [23]:
# 1. Initialise the transformer and the model. Tunable hyperparameters are left
#    at their defaults because the grid search sets them. Only the tree's seed
#    is fixed: scikit-learn permutes features at every split, so without a seed
#    equally good splits can be chosen differently between runs and the best
#    parameters / scores reported by the search would not be reproducible.
imputer = SimpleImputer()
dtree = DecisionTreeClassifier(random_state=1994)

# 2. Create a pipeline
pipe = make_pipeline(imputer,
                     dtree)

# Display the pipeline to check its steps.
pipe

In [24]:
# Search space: 10 depths x 4 leaf sizes x 3 imputation strategies x 3 criteria
# = 360 candidate pipelines. Keys are written as "<step name>__<parameter>".
# NOTE(review): scikit-learn documents 'log_loss' and 'entropy' as the same
# Shannon-information-gain criterion, so one of the two may be redundant here.
param_grid = {
    "decisiontreeclassifier__max_depth": range(1, 11),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10, 2),
    "simpleimputer__strategy": ["mean", "median", "most_frequent"],
    "decisiontreeclassifier__criterion": ["gini", "entropy", "log_loss"],
}

In [25]:
from sklearn.model_selection import GridSearchCV

In [32]:
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,                # K in K-fold cross-validation (K-fold needs at least 2 folds)
    scoring='accuracy',  # performance metric used to compare the candidates
    # n_jobs=-1,         # uncomment to use all CPU cores (warning: may slow down other applications)
    verbose=3,           # print one line per individual fit during the search
)

In [33]:
# Run the cross-validated search over every candidate in the grid, using the training data only.
search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV] END decisiontreeclassifier__criterion=gini, decisiontreeclassifier__max_depth=1, decisiontreeclassifier__min_samples_leaf=3, simpleimputer__strategy=mean; total time=   0.0s
[CV] END decisiontreeclassifier__criterion=gini, decisiontreeclassifier__max_depth=1, decisiontreeclassifier__min_samples_leaf=3, simpleimputer__strategy=mean; total time=   0.0s
[CV] END decisiontreeclassifier__criterion=gini, decisiontreeclassifier__max_depth=1, decisiontreeclassifier__min_samples_leaf=3, simpleimputer__strategy=mean; total time=   0.0s
[CV] END decisiontreeclassifier__criterion=gini, decisiontreeclassifier__max_depth=1, decisiontreeclassifier__min_samples_leaf=3, simpleimputer__strategy=mean; total time=   0.0s
[CV] END decisiontreeclassifier__criterion=gini, decisiontreeclassifier__max_depth=1, decisiontreeclassifier__min_samples_leaf=3, simpleimputer__strategy=mean; total time=   0.0s
[CV] END decisiontreeclassifier__criterio

In [35]:
# Parameter combination that obtained the best mean cross-validated score.
search.best_params_

{'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 3,
 'simpleimputer__strategy': 'mean'}

In [36]:
# Mean cross-validated score (accuracy, as set via `scoring`) of the best
# parameter combination, computed on the validation folds of the training data.
search.best_score_

np.float64(0.9229485345365174)

In [37]:
# Training accuracy of the best pipeline found by the search
train_pred = search.predict(X=X_train)

accuracy_score(y_true=y_train, y_pred=train_pred)

0.940068493150685

In [31]:
# Testing accuracy of the best pipeline found by the search
test_pred = search.predict(X=X_test)

accuracy_score(y_true=y_test, y_pred=test_pred)

0.928082191780822

## Use GridSearchCV to find the best parameters of the pipeline

Add a scaler to the pipeline, and use GridSearchCV to tune the parameters of the scaler, as well as the parameters of the imputer and the decision tree. Scalers don't affect decision trees, but we'll need them for other models, so it's good to see how they fit into the pipeline.

This shows how Grid Search Cross Validation can be used to not only tune the parameters of the model but also the parameters of all the transformers in a pipeline, thus helping us find the best preprocessing strategy for our data.

In [38]:
from sklearn.preprocessing import StandardScaler

In [43]:
# Initialise the transformers and the model. Tunable hyperparameters are left at
# their defaults because the grid search sets them. The tree's seed is fixed:
# scikit-learn permutes features at every split, so without it equally good
# splits can be chosen differently between runs and the search results
# (best parameters and scores) would not be reproducible.
imputer = SimpleImputer()
scaler = StandardScaler()
dtree = DecisionTreeClassifier(random_state=1994)

In [44]:
# Build the pipeline; steps run in the order given: impute -> scale -> classify.
pipe = make_pipeline(imputer, scaler, dtree)

In [45]:
# Display the pipeline to check its steps.
pipe

In [46]:
# Parameter grid: 2 imputation strategies x 2 x 2 scaler switches
# x 12 depths x 7 leaf sizes = 672 candidate pipelines.
# Keys are written as "<step name>__<parameter>".
param_grid = {
    "simpleimputer__strategy": ["mean", "median"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10),
}

In [47]:
# Define the cross-validated grid search (5 folds). No `scoring` is passed, so
# the estimator's own default score is used, which is accuracy for classifiers.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

In [48]:
# Run the search over every candidate in the grid, using the training data only
search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 672 candidates, totalling 3360 fits


In [49]:
# Mean cross-validated score of the best parameter combination
# (accuracy: the default scorer for classifiers, since no `scoring` was passed).
search.best_score_

np.float64(0.922952202780529)

In [50]:
# Parameter combination (imputer, scaler and tree settings) that obtained
# the best mean cross-validated score.
search.best_params_

{'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 3,
 'simpleimputer__strategy': 'median',
 'standardscaler__with_mean': False,
 'standardscaler__with_std': True}

In [53]:
# Train accuracy of the best pipeline found by the search
pred_train = search.predict(X=X_train)

accuracy_score(y_true=y_train, y_pred=pred_train)

0.9392123287671232

In [51]:
# Test accuracy, step 1: predict labels for the held-out rows with the best pipeline
pred_test = search.predict(X=X_test)

In [52]:
# Share of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=pred_test)

0.9246575342465754