In [14]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV


pd.set_option('display.max_rows', 100)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### Load the data

In [15]:
house_data = pd.read_csv("../data/Housing_data/housing-classification-iter3.csv")

X = house_data.drop(columns=['Expensive'])
y = house_data['Expensive']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)#, stratify=categoric_features)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
884,7150,65.0,892,3,0,0,1,0,0,RL,Norm,GasA,Pave,Y,CBlock
641,7050,,1057,3,1,0,2,0,0,FV,Norm,GasA,Pave,Y,PConc
1357,12537,,1078,3,1,0,2,0,0,RL,Norm,GasA,Pave,Y,CBlock
469,9291,76.0,832,3,0,0,2,144,0,RL,RRNe,GasA,Pave,Y,PConc
222,11475,85.0,713,3,1,0,2,209,0,RL,RRAn,GasA,Pave,Y,CBlock


### Check the one-hot encoding

In [16]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_train_cat_enc = enc.fit_transform(X_train[categoric_features])
X_test_cat_enc = enc.transform(X_test[categoric_features])

# modify X_train
X_train_cat_enc = pd.DataFrame(X_train_cat_enc, columns=list(enc.get_feature_names()), index=X_train.index)

X_train_enc = pd.concat([X_train, X_train_cat_enc], axis=1, ignore_index=False).drop(columns=categoric_features)
X_train_enc.head()



Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,x0_C (all),...,x3_Grvl,x3_Pave,x4_N,x4_Y,x5_BrkTil,x5_CBlock,x5_PConc,x5_Slab,x5_Stone,x5_Wood
884,7150,65.0,892,3,0,0,1,0,0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
641,7050,,1057,3,1,0,2,0,0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1357,12537,,1078,3,1,0,2,0,0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
469,9291,76.0,832,3,0,0,2,144,0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
222,11475,85.0,713,3,1,0,2,209,0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


### Create the scaling/encoding pipelines for categorical and numerical data

In [17]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

categoric_features = list(X_train.select_dtypes(include=["object"]))
numeric_features = list(X_train.select_dtypes(exclude=["object"]))

# Apply the scaling pipeline to both categorical and numerical columns
full_processor = ColumnTransformer(transformers=[
    ('numerical', numeric_pipeline, numeric_features), 
    ('categorical', categorical_pipeline, categoric_features)
])

# pd.DataFrame(full_processor.fit_transform(X_train))

# Create a pipeline for the full model, including scaling transformations
tree_pipeline = Pipeline(steps=[
    ('preprocess', full_processor), 
    ('model', DecisionTreeClassifier())
])

### Get the pipeline parameters to decide what to vary

In [18]:
tree_pipeline.get_params()

{'memory': None,
 'steps': [('preprocess',
   ColumnTransformer(transformers=[('numerical',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='median')),
                                                    ('scale', MinMaxScaler())]),
                                    ['LotArea', 'LotFrontage', 'TotalBsmtSF',
                                     'BedroomAbvGr', 'Fireplaces', 'PoolArea',
                                     'GarageCars', 'WoodDeckSF', 'ScreenPorch']),
                                   ('categorical',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('one-hot',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   spars

### Use GridSearch to find the best parameters for the model

In [19]:
# automating the search of the different parameters for our model
param_grid = {
    'model__max_depth': range(1, 8),
    'model__min_samples_leaf': range(1, 10),
    'model__min_samples_split': range(2, 10),
    'model__criterion':['gini','entropy']
    }
search = GridSearchCV(tree_pipeline,     # you have defined this beforehand
                      param_grid,         # the parameter grid
                      cv=5,               # the value for K in K-fold Cross Validation
                      scoring='accuracy', # the performance metric to use
                      verbose=1, 
                      refit=True, 
                      n_jobs=-1)

# fit the model to the trainin data
_ = search.fit(X_train, y_train)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits


In [20]:
search.best_score_

0.9297861413741242

In [21]:
search.best_params_

{'model__criterion': 'entropy',
 'model__max_depth': 4,
 'model__min_samples_leaf': 7,
 'model__min_samples_split': 2}

In [22]:
accuracy_score(search.predict(X_train), y_train)

0.9375

In [23]:
accuracy_score(search.predict(X_test), y_test)

0.886986301369863

In [24]:
search.cv_results_

{'mean_fit_time': array([0.03027487, 0.02890191, 0.02905803, ..., 0.03342915, 0.02922044,
        0.0229363 ]),
 'std_fit_time': array([0.00351102, 0.00224645, 0.00336995, ..., 0.00307455, 0.00460674,
        0.00535479]),
 'mean_score_time': array([0.01177626, 0.01193171, 0.01288142, ..., 0.01205897, 0.00994892,
        0.00705085]),
 'std_score_time': array([0.00125704, 0.0014504 , 0.0044219 , ..., 0.00214823, 0.00154462,
        0.0010163 ]),
 'param_model__criterion': masked_array(data=['gini', 'gini', 'gini', ..., 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_model__max_depth': masked_array(data=[1, 1, 1, ..., 7, 7, 7],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_model__min_samples_leaf': masked_array(data=[1, 1, 1, ..., 9, 9, 9],
              mask=[False, False, 