In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn import set_config

In [9]:
X = pd.read_csv('housing-classification-iter5.csv'
               )
y = X.pop('Expensive')
X.head(3)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
0,8450,65.0,856,3,0,0,2,0,0,RL,...,1,8,2003.0,548,61,0,0,0,2,2008
1,9600,80.0,1262,3,1,0,2,298,0,RL,...,1,6,1976.0,460,0,0,0,0,5,2007
2,11250,68.0,920,3,1,0,2,0,0,RL,...,1,6,2001.0,608,42,0,0,0,9,2008


In [10]:
# SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

##### Preprocessing pipeline

In [11]:
# BUILD PIPELINE
X_cat = X.select_dtypes(exclude='number').copy()
X_num = X.select_dtypes(include='number').copy()

numeric_pipe = make_pipeline(
    SimpleImputer(strategy='mean'))

categoric_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N_A'),
                              OneHotEncoder(handle_unknown='ignore')
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', numeric_pipe, X_num.columns),
        ('cat_pipe', categoric_pipe, X_cat.columns),
    ]
 )    

##### Using Kneighbour classifier

In [13]:
# DEFINE MODELS AND PARAMETERS
from sklearn.neighbors import KNeighborsClassifier

knn_full_pipeline = make_pipeline(preprocessor, KNeighborsClassifier())

from sklearn.model_selection import GridSearchCV

param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "kneighborsclassifier__n_neighbors": range(2, 20),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

search = GridSearchCV(knn_full_pipeline,
                          param_grid,
                          cv=8,
                          verbose=1)
search.fit(X_train, y_train)
scores = {"knn" : search.best_score_}

scores

Fitting 8 folds for each of 72 candidates, totalling 576 fits


{'knn': 0.9178082191780822}