In [35]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv('housing-classification-iter5.csv')

In [3]:
data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
0,8450,65.0,856,3,0,0,2,0,0,0,...,1,8,2003.0,548,61,0,0,0,2,2008
1,9600,80.0,1262,3,1,0,2,298,0,0,...,1,6,1976.0,460,0,0,0,0,5,2007
2,11250,68.0,920,3,1,0,2,0,0,0,...,1,6,2001.0,608,42,0,0,0,9,2008
3,9550,60.0,756,3,1,0,3,0,0,0,...,1,7,1998.0,642,35,272,0,0,2,2006
4,14260,84.0,1145,4,1,0,3,192,0,0,...,1,9,2000.0,836,84,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,1,7,1999.0,460,40,0,0,0,8,2007
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,1,7,1978.0,500,0,0,0,0,2,2010
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,1,9,1941.0,252,60,0,0,2500,5,2010
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,1,5,1950.0,240,0,112,0,0,4,2010


In [5]:
y = data.pop('Expensive')

In [39]:
# Since scikit-Learn models cannot deal with categorical features, we will keep only the numerical features
X = data
X_num = X.select_dtypes(include='number')

In [42]:
# splitting data
X_num_train, X_num_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=123)

In [43]:
# Impute missing values
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer().set_output(transform='pandas') # initialize
my_imputer.fit(X_num_train) # fit on the train set
X_num_imputed_train = my_imputer.transform(X_num_train) # transform the train set
X_num_imputed_test = my_imputer.transform(X_num_test) # transform the test set

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(SimpleImputer(strategy='mean'))

# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(SimpleImputer(strategy='constatnt', fill_value='N_A'),
                               OneHotEncoder(sparse_output=False))

In [29]:
# now transform the columns
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
preprocessor = make_column_transformer(
    (numeric_pipe, make_column_selector(dtype_include='number')),
    (categoric_pipe, make_column_selector(dtype_include='object'))
)
preprocessor

In [36]:
# Creating the full_pipeline (preprocessor + Decision Tree)
full_pipeline = make_pipeline(preprocessor,
                              KNeighborsClassifier()).set_output(transform='pandas')
full_pipeline

In [44]:
full_pipeline.fit(X_num_train, y_train)

In [46]:
y_train_predict = full_pipeline.predict(X_num_train)
accuracy_score(y_train, y_train_predict)

0.9452054794520548

In [47]:
y_test_predict = full_pipeline.predict(X_num_test)
accuracy_score(y_test, y_test_predict)

0.9041095890410958

In [67]:
# now we use GrisCV to deal with over-fitting
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6) # handle_unknown is set to ignore the values that are not familiar
)

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe, make_column_selector(dtype_include='object'))
)
preprocessor



from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, 
                              KNeighborsClassifier()).set_output(transform='pandas')

param_grid = {
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
    #"columntransformer__numeric_pipe__simpleimputer__strategy":["mean", "median"]
    #"kneighborsclassifier__": range(3, 12, 2)
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_num_imputed_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [68]:
search.best_params_

{'kneighborsclassifier__n_neighbors': 3,
 'kneighborsclassifier__weights': 'uniform'}

In [69]:
search.best_score_

0.9220938336818165

In [70]:
y_train_predict = search.predict(X_train)
accuracy_score(y_train, y_train_predict)

0.946917808219178

In [71]:
y_test_predict = search.predict(X_test)
accuracy_score(y_test, y_test_predict)

0.9075342465753424