In [1]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV


pd.set_option('display.max_rows', 100)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### Load the data

In [2]:
house_data = pd.read_csv("../data/Housing_data/housing-classification-iter3.csv")

X = house_data.drop(columns=['Expensive'])
y = house_data['Expensive']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=8)#, stratify=categoric_features)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

### Create the scaling/encoding pipelines for categorical and numerical data

In [3]:
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

categoric_features = list(X_train.select_dtypes(include=["object"]))
numeric_features = list(X_train.select_dtypes(exclude=["object"]))

# Apply the scaling pipeline to both categorical and numerical columns
full_processor = ColumnTransformer(transformers=[
    ('numerical', numeric_pipeline, numeric_features), 
    ('categorical', categorical_pipeline, categoric_features)
])

# pd.DataFrame(full_processor.fit_transform(X_train))

# Create a pipeline for the full model, including scaling transformations
tree_pipeline = Pipeline(steps=[
    ('preprocess', full_processor), 
    ('model', DecisionTreeClassifier())
])

### Use RandomizedSearchCV to find the best parameters for the model

In [None]:
'''
NEED TO SWITCH THIS CODE TO RANDOMIZED SEARCH
'''

# automating the search of the different parameters for our model
param_grid = {
    'model__max_depth': range(1, 8),
    'model__min_samples_leaf': range(1, 10),
    'model__min_samples_split': range(2, 10),
    'model__criterion':['gini','entropy']
    }
search = GridSearchCV(tree_pipeline,     # you have defined this beforehand
                      param_grid,         # the parameter grid
                      cv=5,               # the value for K in K-fold Cross Validation
                      scoring='accuracy', # the performance metric to use
                      verbose=1, 
                      refit=True, 
                      n_jobs=-1)

# fit the model to the trainin data
_ = search.fit(X_train, y_train)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits


In [None]:
search.best_score_

In [None]:
search.best_params_

In [None]:
accuracy_score(search.predict(X_train), y_train)

In [None]:
accuracy_score(search.predict(X_test), y_test)

In [None]:
search.cv_results_