# Tunability for KNeighborsClassifier

# Intro

In [17]:
import pandas as pd

random_state = pd.read_csv("../../data/vars.csv")['random_state'].iloc[0]
n_iter = pd.read_csv("../../data/vars.csv")['n_iter'].iloc[0]
n_iter_BS = pd.read_csv("../../data/vars.csv")['n_iter_BS'].iloc[0]

In [2]:
df_college = pd.read_csv("../../data/processed/college.csv")
df_credit = pd.read_csv("../../data/processed/credit.csv")
df_diabetes = pd.read_csv("../../data/processed/diabetes.csv")
df_penguins = pd.read_csv("../../data/processed/penguins.csv")

In [3]:
print(df_college.info())
print()
print(df_credit.info())
print()
print(df_diabetes.info())
print()
print(df_penguins.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   type_school            1000 non-null   object 
 1   school_accreditation   1000 non-null   object 
 2   gender                 1000 non-null   object 
 3   interest               1000 non-null   object 
 4   residence              1000 non-null   object 
 5   parent_age             1000 non-null   int64  
 6   parent_salary          1000 non-null   int64  
 7   house_area             1000 non-null   float64
 8   average_grades         1000 non-null   float64
 9   parent_was_in_college  1000 non-null   bool   
 10  will_go_to_college     1000 non-null   int64  
dtypes: bool(1), float64(2), int64(3), object(5)
memory usage: 79.2+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype 

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
import numpy as np


num_pipeline = Pipeline(steps=[
    ('num_impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

cat_pipeline = Pipeline(steps=[
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', KNeighborsClassifier())
])

# RandomizedSearchCV

In [5]:
knn_params = {
    'model__n_neighbors': range(1, 31),
    'model__weights': ['uniform', 'distance'],
    'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'model__metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

## Credit dataset

In [6]:
from sklearn.model_selection import RandomizedSearchCV

X = df_credit.iloc[:, 0:-1]
y = df_credit.iloc[:, -1]

random_search_model = RandomizedSearchCV(estimator = model_pipeline,
                                   param_distributions = knn_params,
                                   n_iter = n_iter,
                                   cv = 5,
                                   scoring = 'f1',
                                   random_state = random_state)

random_search_model.fit(X, y)

best_f1_credit = random_search_model.best_score_

print(f"Best f1 score: {best_f1_credit}")
print(f"Best params: {random_search_model.best_params_}")

random_search_model.best_estimator_

Best f1 score: 0.8451285323122008
Best params: {'model__weights': 'uniform', 'model__n_neighbors': 11, 'model__metric': 'minkowski', 'model__algorithm': 'brute'}


## Data frame with params, results and mean result

In [7]:
tested_params = random_search_model.cv_results_['params']

In [8]:
df_knn = pd.DataFrame(tested_params)
df_knn['credit_f1'] = random_search_model.cv_results_['mean_test_score']
df_knn

Unnamed: 0,model__weights,model__n_neighbors,model__metric,model__algorithm,credit_f1
0,distance,24,manhattan,auto,0.839104
1,distance,30,minkowski,auto,0.836725
2,distance,28,manhattan,kd_tree,0.840619
3,distance,10,chebyshev,ball_tree,0.653419
4,distance,27,manhattan,kd_tree,0.838621
...,...,...,...,...,...
95,distance,15,euclidean,kd_tree,0.828324
96,distance,26,minkowski,ball_tree,0.838821
97,distance,17,manhattan,auto,0.835890
98,distance,30,euclidean,ball_tree,0.836725


## College dataset

In [9]:
from sklearn.model_selection import GridSearchCV

X = df_college.iloc[:, 0:-1]
y = df_college.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_knn['college_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_college = grid_search_model.best_score_

print(f"Best f1 score: {best_f1_college}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.8451903226329179
Best params: {'model__algorithm': 'auto', 'model__metric': 'manhattan', 'model__n_neighbors': 6, 'model__weights': 'distance'}


## Diabetes dataset

In [10]:
from sklearn.model_selection import GridSearchCV

X = df_diabetes.iloc[:, 0:-1]
y = df_diabetes.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_knn['diabetes_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_diabetes = grid_search_model.best_score_

print(f"Best f1 score: {best_f1_diabetes}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.6491640095014931
Best params: {'model__algorithm': 'ball_tree', 'model__metric': 'euclidean', 'model__n_neighbors': 12, 'model__weights': 'distance'}


## Penguins dataset

In [11]:
from sklearn.model_selection import GridSearchCV

X = df_penguins.iloc[:, 0:-1]
y = df_penguins.iloc[:, -1]

formatted_params = []
for params in tested_params:
    formatted_param = {key: [value] for key, value in params.items()}
    formatted_params.append(formatted_param)

grid_search_model = GridSearchCV(model_pipeline,
                                 param_grid=formatted_params,
                                 scoring='f1',
                                 cv = 5)

grid_search_model.fit(X, y)

df_knn['penguins_f1'] = grid_search_model.cv_results_['mean_test_score']

best_f1_penguins = grid_search_model.best_score_


print(f"Best f1 score: {best_f1_penguins}")
print(f"Best params: {grid_search_model.best_params_}")
grid_search_model.best_estimator_

Best f1 score: 0.9966101694915255
Best params: {'model__algorithm': 'kd_tree', 'model__metric': 'euclidean', 'model__n_neighbors': 18, 'model__weights': 'distance'}


## Tunability 

In [12]:
df_knn['mean_f1'] = df_knn[['credit_f1', 'college_f1', 'diabetes_f1', 'penguins_f1']].mean(axis=1)
df_knn

Unnamed: 0,model__weights,model__n_neighbors,model__metric,model__algorithm,credit_f1,college_f1,diabetes_f1,penguins_f1,mean_f1
0,distance,24,manhattan,auto,0.839104,0.813815,0.641121,0.989474,0.820879
1,distance,30,minkowski,auto,0.836725,0.812504,0.628796,0.989474,0.816875
2,distance,28,manhattan,kd_tree,0.840619,0.810984,0.632397,0.989474,0.818368
3,distance,10,chebyshev,ball_tree,0.653419,0.797746,0.611785,0.960000,0.755738
4,distance,27,manhattan,kd_tree,0.838621,0.815281,0.620727,0.989474,0.816026
...,...,...,...,...,...,...,...,...,...
95,distance,15,euclidean,kd_tree,0.828324,0.822725,0.634524,0.993103,0.819669
96,distance,26,minkowski,ball_tree,0.838821,0.822061,0.639117,0.993103,0.823275
97,distance,17,manhattan,auto,0.835890,0.822727,0.629678,0.993103,0.820350
98,distance,30,euclidean,ball_tree,0.836725,0.812504,0.628796,0.989474,0.816875


In [13]:
# sigma star
sigma_star_row = df_knn[df_knn['mean_f1'] == max(df_knn['mean_f1'])]
sigma_star_row

Unnamed: 0,model__weights,model__n_neighbors,model__metric,model__algorithm,credit_f1,college_f1,diabetes_f1,penguins_f1,mean_f1
64,uniform,11,minkowski,brute,0.845129,0.819695,0.644186,0.993103,0.825528


### (3.3) Measuring Overall Tunability of a ML Algorithm

In [14]:
d_credit = sigma_star_row['credit_f1'].iloc[0] - best_f1_credit
d_college = sigma_star_row['college_f1'].iloc[0] - best_f1_college
d_diabetes = sigma_star_row['diabetes_f1'].iloc[0] - best_f1_diabetes
d_penguins = sigma_star_row['penguins_f1'].iloc[0] - best_f1_penguins

d = pd.Series([d_credit, d_college, d_diabetes, d_penguins])
print(d)
print(f"Mean: {d.mean()}")


0    0.000000
1   -0.025495
2   -0.004978
3   -0.003507
dtype: float64
Mean: -0.008495101420125079


# BayesSearchCV

In [16]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

knn_params = {
    'model__n_neighbors': Integer(1, 31),
    'model__weights': Categorical(['uniform', 'distance']),
    'model__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
    'model__metric': Categorical(['euclidean', 'manhattan', 'chebyshev', 'minkowski'])
}

## Credit dataset

In [18]:
X = df_credit.iloc[:, 0:-1]
y = df_credit.iloc[:, -1]

bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                search_spaces=knn_params,
                                n_iter=int(n_iter_BS),
                                random_state=random_state,
                                cv=5,
                                scoring='f1')
                                
bayes_search_model.fit(X, y)

best_BS_f1_credit = bayes_search_model.best_score_

print(f"Best f1 score: {best_BS_f1_credit}")
print(f"Best params: {bayes_search_model.best_params_}")
bayes_search_model.best_estimator_

Best f1 score: 0.8451285323122008
Best params: OrderedDict([('model__algorithm', 'ball_tree'), ('model__metric', 'euclidean'), ('model__n_neighbors', 11), ('model__weights', 'uniform')])


## College dataset

In [19]:
X = df_college.iloc[:, 0:-1]
y = df_college.iloc[:, -1]

bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                search_spaces=knn_params,
                                n_iter=int(n_iter_BS),
                                random_state=random_state,
                                cv=5,
                                scoring='f1')
                                
bayes_search_model.fit(X, y)

best_BS_f1_college = bayes_search_model.best_score_

print(f"Best f1 score: {best_BS_f1_college}")
print(f"Best params: {bayes_search_model.best_params_}")
bayes_search_model.best_estimator_



Best f1 score: 0.834868626246948
Best params: OrderedDict([('model__algorithm', 'brute'), ('model__metric', 'chebyshev'), ('model__n_neighbors', 1), ('model__weights', 'distance')])


## Diabetes dataset

In [20]:
X = df_diabetes.iloc[:, 0:-1]
y = df_diabetes.iloc[:, -1]

bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                search_spaces=knn_params,
                                n_iter=int(n_iter_BS),
                                random_state=random_state,
                                cv=5,
                                scoring='f1')
                                
bayes_search_model.fit(X, y)

best_BS_f1_diabetes = bayes_search_model.best_score_

print(f"Best f1 score: {best_BS_f1_diabetes}")
print(f"Best params: {bayes_search_model.best_params_}")
bayes_search_model.best_estimator_

Best f1 score: 0.6597307060755336
Best params: OrderedDict([('model__algorithm', 'kd_tree'), ('model__metric', 'manhattan'), ('model__n_neighbors', 18), ('model__weights', 'distance')])


## Penguins dataset

In [22]:
X = df_penguins.iloc[:, 0:-1]
y = df_penguins.iloc[:, -1]

bayes_search_model = BayesSearchCV(estimator=model_pipeline,
                                search_spaces=knn_params,
                                n_iter=int(n_iter_BS),
                                random_state=random_state,
                                cv=5,
                                scoring='f1')
                                
bayes_search_model.fit(X, y)

best_BS_f1_penguins = bayes_search_model.best_score_

print(f"Best f1 score: {best_BS_f1_penguins}")
print(f"Best params: {bayes_search_model.best_params_}")
bayes_search_model.best_estimator_

Best f1 score: 1.0
Best params: OrderedDict([('model__algorithm', 'brute'), ('model__metric', 'minkowski'), ('model__n_neighbors', 17), ('model__weights', 'uniform')])


## Tunability

### (3.3) Measuring Overall Tunability of a ML Algorithm

In [23]:
d_BS_credit = sigma_star_row['credit_f1'].iloc[0] - best_BS_f1_credit
d_BS_college = sigma_star_row['college_f1'].iloc[0] - best_BS_f1_college
d_BS_diabetes = sigma_star_row['diabetes_f1'].iloc[0] - best_BS_f1_diabetes
d_BS_penguins = sigma_star_row['penguins_f1'].iloc[0] - best_BS_f1_penguins

d_BS = pd.Series([d_BS_credit, d_BS_college, d_BS_diabetes, d_BS_penguins])
print(d_BS)
print(f"Mean: {d_BS.mean()}")

0    0.000000
1   -0.015174
2   -0.015545
3   -0.006897
dtype: float64
Mean: -0.009403809094261362
