In [30]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
import os 
import warnings
warnings.filterwarnings('ignore')


In [31]:
hr = pd.read_csv("HR_comma_sep.csv")
X = hr.drop('left', axis=1)
y = hr['left']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
        random_state=24, test_size=0.3, stratify=y)

In [33]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')
knn = KNeighborsClassifier(n_neighbors=3)       # here tuning the K value 
pipe = Pipeline([('CT',ct),('SCL',None),('KNN',knn)])

# pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)
# print("Accuracy Score: ",accuracy_score(y_test, y_pred))

# # Log loss is more reliable in log term (Ideal-0 near to 0, more accurate model is)
# y_pred_prob = pipe.predict_proba(X_test)
# print("Log Loss: ",log_loss(y_test, y_pred_prob))
# pipe.get_params()

In [29]:
# Grid Search with ROC_AUC

kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)
params = {'KNN__n_neighbors': np.arange(1,11),
          'SCL':[scaler_mm, scaler_std, None]}
# Setting the parameter roc_auc in scoring
gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='roc_auc',          
                   cv=kfold, verbose=3)
gcv.fit(X,y)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.964 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.960 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.963 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.965 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=0.967 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.962 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.961 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.965 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.966 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=0.967 total time=   0.0s
[CV 1/5] END ......KNN__n_neighbors=1, SCL=None;, score=0.954 total time=   0.

In [35]:
# Grid Search with neg_log_loss

kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)
params = {'KNN__n_neighbors': np.arange(1,11),
          'SCL':[scaler_mm, scaler_std, None]}
# Setting the parameter roc_auc in scoring
gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='neg_log_loss',          
                   cv=kfold, verbose=3)
gcv.fit(X,y)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.202 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.262 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.142 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.106 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-1.118 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.274 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.238 total time=   0.1s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.166 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.106 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-1.106 total time=   0.0s
[CV 1/5] END .....KNN__n_neighbors=1, SCL=None;, score=-1.550 total 

In [47]:
# KNN for the Glass dataset
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets 
  
# metadata 
print(glass_identification.metadata) 
  
# variable information 
print(glass_identification.variables) 
print(y.shape)

{'uci_id': 42, 'name': 'Glass Identification', 'repository_url': 'https://archive.ics.uci.edu/dataset/42/glass+identification', 'data_url': 'https://archive.ics.uci.edu/static/public/42/data.csv', 'abstract': 'From USA Forensic Science Service; 6 types of glass; defined in terms of their oxide content (i.e. Na, Fe, K, etc)', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 214, 'num_features': 9, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Type_of_glass'], 'index_col': ['Id_number'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5WW2P', 'creators': ['B. German'], 'intro_paper': None, 'additional_info': {'summary': 'Vina conducted a comparison test of her rule-based system, BEAGLE, the nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is a product available through VRS Consulting, In

In [37]:
import sklearn
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=24,test_size=0.3,stratify=y['Type_of_glass'])


In [40]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')
knn = KNeighborsClassifier(n_neighbors=3)       # here tuning the K value 
pipe = Pipeline([('CT',ct),('SCL',None),('KNN',knn)])

In [41]:
kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)
# here we are checking which KNN value is best in the range
params = {'KNN__n_neighbors': np.arange(1,11),      
          'SCL':[scaler_mm, scaler_std, None]}
# Setting the parameter ng_log_loss in scoring
gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='neg_log_loss',          
                   cv=kfold, verbose=3)
gcv.fit(X,y)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-12.573 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-8.382 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-13.412 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-10.059 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=MinMaxScaler();, score=-12.015 total time=   0.0s
[CV 1/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-10.059 total time=   0.0s
[CV 2/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-7.544 total time=   0.0s
[CV 3/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-11.735 total time=   0.0s
[CV 4/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-10.059 total time=   0.0s
[CV 5/5] END KNN__n_neighbors=1, SCL=StandardScaler();, score=-13.731 total time=   0.0s
[CV 1/5] END ....KNN__n_neighbors=1, SCL=None;, score=-10.89

In [52]:
# Using MinMax Scaler

kfold = StratifiedKFold(n_splits=5, random_state=24, 
                        shuffle=True)

# Here we have added the KNN__metric to use different distances methods 
params = {'KNN__n_neighbors': np.arange(1,11),
          'KNN__metric':['cityblock','minkowski','manhatan','haversine'],
          'SCL':[scaler_mm, scaler_std, None]}
# Setting the parameter roc_auc in scoring
gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='neg_log_loss',          
                   cv=kfold)
gcv.fit(X,y)
print("Best Parameters: ",gcv.best_params_)
print("Best Score: ",gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

Best Parameters:  {'KNN__metric': 'cityblock', 'KNN__n_neighbors': np.int64(10), 'SCL': None}
Best Score:  -1.9992426425743097
(120, 16)
