In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [3]:
data = pd.read_csv('housing-classification-iter6.csv')

In [4]:
data

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Attchd,RFn,TA,TA,Y,,GdPrv,Shed,WD,Normal
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Attchd,Unf,TA,TA,Y,,,,WD,Normal


In [5]:
y = data.pop('Expensive')
# Since scikit-Learn models cannot deal with categorical features, we will keep only the numerical features
X = data
X_num = X.select_dtypes(include='number')

In [6]:
# splitting data
X_num_train, X_num_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=123)

In [7]:
# Impute missing values
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer().set_output(transform='pandas') # initialize
my_imputer.fit(X_num_train) # fit on the train set
X_num_imputed_train = my_imputer.transform(X_num_train) # transform the train set
X_num_imputed_test = my_imputer.transform(X_num_test) # transform the test set

In [8]:
from sklearn.preprocessing import OneHotEncoder

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(SimpleImputer(strategy='mean'))

# create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(SimpleImputer(strategy='constatnt', fill_value='N_A'),
                               OneHotEncoder(sparse_output=False))

In [9]:
# now transform the columns
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
preprocessor = make_column_transformer(
    (numeric_pipe, make_column_selector(dtype_include='number')),
    (categoric_pipe, make_column_selector(dtype_include='object'))
)
preprocessor

In [10]:
from sklearn.svm import SVC
full_pipeline = make_pipeline(preprocessor,
                              SVC()).set_output(transform='pandas')
full_pipeline

In [11]:
full_pipeline.fit(X_num_train, y_train)

In [12]:
y_train_predict = full_pipeline.predict(X_num_train)
accuracy_score(y_train, y_train_predict)

0.875

In [13]:
y_test_predict = full_pipeline.predict(X_num_test)
accuracy_score(y_test, y_test_predict)

0.8732876712328768

In [None]:
# now we use GrisCV to deal with over-fitting
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6) # handle_unknown is set to ignore the values that are not familiar
)

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe, make_column_selector(dtype_include='object'))
)
preprocessor



from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, 
                              SVC()).set_output(transform='pandas')

param_grid = {
    # "svc__c": range(1, 50),
    "svc__kernel": ['poly', 'rbf', 'sigmoid', 'precomputed'],
    'svc__gamma': ['scale', 'auto'],
    'svc__degree': range(1, 50)
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_num_train, y_train)

Fitting 5 folds for each of 392 candidates, totalling 1960 fits


In [34]:
# usuing KNeighborClassifier
full_pipeline = make_pipeline(preprocessor,
                              KNeighborsClassifier()).set_output(transform='pandas')
full_pipeline.fit(X_num_train, y_train)

In [35]:
# now we use GrisCV to deal with over-fitting
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))
 
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6) # handle_unknown is set to ignore the values that are not familiar
)

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe, make_column_selector(dtype_include='object'))
)
preprocessor



from sklearn.model_selection import GridSearchCV

full_pipeline = make_pipeline(preprocessor, 
                              KNeighborsClassifier()).set_output(transform='pandas')

param_grid = {
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"],
    #"columntransformer__numeric_pipe__simpleimputer__strategy":["mean", "median"]
    #"kneighborsclassifier__": range(3, 12, 2)
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_num_imputed_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [36]:
search.best_params_

{'kneighborsclassifier__n_neighbors': 6,
 'kneighborsclassifier__weights': 'distance'}

In [42]:
y_train_predict = search.predict(X_train),
accuracy_score(y_train, y_train_predict)

NameError: name 'X_train' is not defined