In [242]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.pipeline import Pipeline

In [41]:
hr = pd.read_csv("HR_comma_sep.csv")
X = hr.drop('left', axis=1)
y = hr['left']

In [43]:

X_train, X_test, y_train, y_test = train_test_split(X, y, 
        random_state=24, test_size=0.3, stratify=y)

In [33]:
# Using standard scaler
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')
knn = KNeighborsClassifier(n_neighbors=2)
pipe = Pipeline([('CT',ct),('SCL',scaler_std),('LR',knn)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))

y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.9568793065125584
1.0085222147022455


In [37]:
# Using Min max scaler
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')
knn = KNeighborsClassifier(n_neighbors=3)
pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('LR',knn)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))

y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.9462102689486552
0.8468015081770698


In [57]:
# Grid search


kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)
pipe = Pipeline([('CT',ct),('SCL',None),('KNN',knn)])
params = {"KNN__n_neighbors" : list(range(1, 11)),
          'SCL':[scaler_mm, scaler_std, None]}

In [59]:
gcv = GridSearchCV(pipe, param_grid=params,
                   scoring = "neg_log_loss", 
                   cv=kfold, 
                   verbose=3)

In [61]:
gcv.fit(X,y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.202 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.262 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.142 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.106 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.118 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.274 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.238 total time=   0.1s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.166 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.106 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.106 total time=   0.0s
[CV 1/5] END .....KNN__n_neighbors=1, SCL=None;, score=-1.550 total 

In [63]:
print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

{'KNN__n_neighbors': 10, 'SCL': StandardScaler()}
-0.46380636508556156
(30, 15)


# Glass Identification Dataset


In [112]:
df = pd.read_csv("Glass.csv")
df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,building_windows_float_processed
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,building_windows_float_processed
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,headlamps
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,headlamps
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,headlamps
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,headlamps


In [114]:
encoder = LabelEncoder()
df['Type'] = encoder.fit_transform(df['Type'])

In [116]:
X = df.drop("Type", axis = 'columns')
y = df['Type']

In [158]:

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=24, 
                                                    test_size=0.3, 
                                                    stratify=y)

In [170]:
# Using Min max scaler
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')

knn = KNeighborsClassifier(n_neighbors=5)

pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))

y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.6
4.370012303184123


In [172]:
# Grid search


kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)
pipe = Pipeline([('CT',ct),('SCL',None),('KNN',knn)])
params = {"KNN__n_neighbors" : list(range(1, 21)),
          "KNN__metric" : ['cityblock', 'manhattan', 'minkowski', 'haversine'], 
          'SCL':[scaler_mm, scaler_std, None]}

In [174]:
gcv = GridSearchCV(pipe, param_grid=params,
                   scoring = "neg_log_loss", 
                   cv=kfold, 
                   verbose=3)

In [176]:
gcv.fit(X,y)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-10.897 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-6.706 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-10.897 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-9.220 total time=   0.0s
[CV 5/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-12.015 total time=   0.0s
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=-8.382 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=-6.706 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=-10.059 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1

300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"]

In [178]:
print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

{'KNN__metric': 'cityblock', 'KNN__n_neighbors': 19, 'SCL': MinMaxScaler()}
-1.1160978169091647
(240, 16)


# KNN Regression

In [210]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error

In [199]:
df = pd.read_csv("Boston.csv")
df


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [201]:
X = df.drop('medv', axis = 'columns')
y = df['medv']

In [206]:

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=24, 
                                                    test_size=0.3, 
                                                    )

In [212]:
# Using Min max scaler
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')

scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()

ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')

knn = KNeighborsRegressor()

pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
root_mean_squared_error(y_test, y_pred)

4.53483679274592

In [272]:
# Grid search
kfolds = StratifiedKFold(n_splits=5, 
                        random_state=24, 
                        shuffle=True)

kfold = KFold(n_splits=5, 
              random_state=24, 
              shuffle=True)

pipe = Pipeline([('CT',ct),('SCL',None),('KNN',knn)])
params = {"KNN__n_neighbors" : list(range(1, 21)),
          "KNN__metric" : ['cityblock', 'manhattan', 'minkowski', 'haversine'], 
          'SCL':[scaler_mm, scaler_std, None]}

gcv = GridSearchCV(pipe, 
                   param_grid=params,
                   scoring  = 'r2',
                   cv = kfold,
                   verbose=3)


In [274]:
gcv.fit(X,y)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.764 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.829 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.723 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.528 total time=   0.0s
[CV 5/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.755 total time=   0.0s
[CV 1/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=0.677 total time=   0.0s
[CV 2/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=0.872 total time=   0.0s
[CV 3/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=0.720 total time=   0.0s
[CV 4/5] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=Standa

300 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"]

In [276]:
print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

{'KNN__metric': 'cityblock', 'KNN__n_neighbors': 3, 'SCL': StandardScaler()}
0.7875954797972841
(240, 16)
