In [3]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]
data

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3
...,...,...,...
339,4000.0,207.0,55.8
340,3400.0,202.0,43.5
341,3775.0,193.0,49.6
342,4100.0,210.0,50.8


In [6]:
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [8]:
target

0            Adelie Penguin (Pygoscelis adeliae)
1            Adelie Penguin (Pygoscelis adeliae)
2            Adelie Penguin (Pygoscelis adeliae)
4            Adelie Penguin (Pygoscelis adeliae)
5            Adelie Penguin (Pygoscelis adeliae)
                         ...                    
339    Chinstrap penguin (Pygoscelis antarctica)
340    Chinstrap penguin (Pygoscelis antarctica)
341    Chinstrap penguin (Pygoscelis antarctica)
342    Chinstrap penguin (Pygoscelis antarctica)
343    Chinstrap penguin (Pygoscelis antarctica)
Name: Species, Length: 342, dtype: object

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [25]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
print(f"accuracy is {cv_results['test_score'].mean():.2} +/- {cv_results['test_score'].std():.2} ")

accuracy is 0.88 +/- 0.042 


In [29]:
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00505,0.000591,0.004423,0.000212,5,{'classifier__n_neighbors': 5},0.979592,1.0,0.959184,0.918367,0.918367,0.959184,0.979167,0.959123,0.028819,1
1,0.004415,3.6e-05,0.005054,0.000834,51,{'classifier__n_neighbors': 51},0.979592,1.0,0.938776,0.918367,0.938776,0.959184,0.979167,0.959123,0.026675,1
2,0.004698,0.000505,0.00508,8.5e-05,101,{'classifier__n_neighbors': 101},0.897959,0.938776,0.918367,0.877551,0.857143,0.918367,0.9375,0.906523,0.028364,3


In [30]:
model.set_params(classifier__n_neighbors=5)
cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
print(f"accuracy with n = 5 is {cv_results['test_score'].mean():.2} +/- {cv_results['test_score'].std():.2} ")

model.set_params(classifier__n_neighbors=51)
cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
print(f"accuracy with n = 51 is {cv_results['test_score'].mean():.2} +/- {cv_results['test_score'].std():.2} ")

model.set_params(classifier__n_neighbors=101)
cv_results = cross_validate(model, data, target, cv=10, scoring="balanced_accuracy")
print(f"accuracy with n = 101 is {cv_results['test_score'].mean():.2} +/- {cv_results['test_score'].std():.2} ")


accuracy with n = 5 is 0.95 +/- 0.04 
accuracy with n = 51 is 0.94 +/- 0.039 
accuracy with n = 101 is 0.88 +/- 0.042 


In [31]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

In [32]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {"preprocessor": all_preprocessors, 
              "classifier__n_neighbors": [5, 51, 101]}
model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 n_jobs=2, cv=7)
model_grid_search.fit(data, target)

CPU times: user 133 ms, sys: 9.43 ms, total: 142 ms
Wall time: 606 ms


GridSearchCV(cv=7,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier',
                                        KNeighborsClassifier(n_neighbors=101))]),
             n_jobs=2,
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]})

In [33]:
cv_results = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
column_results = [f"param_{name}" for name in param_grid.keys()]
column_results += [
    "mean_test_score", "std_test_score", "rank_test_score"]
cv_results = cv_results[column_results]

def shorten_param(param_name):
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name


cv_results = cv_results.rename(shorten_param, axis=1)
cv_results

Unnamed: 0,param_preprocessor,n_neighbors,mean_test_score,std_test_score,rank_test_score
2,MinMaxScaler(),5,0.967869,0.028541,1
3,QuantileTransformer(n_quantiles=100),5,0.964954,0.020982,2
1,StandardScaler(),5,0.959123,0.028819,3
6,StandardScaler(),51,0.959123,0.026675,3
4,PowerTransformer(method='box-cox'),5,0.956207,0.022895,5
7,MinMaxScaler(),51,0.950316,0.026362,6
9,PowerTransformer(method='box-cox'),51,0.941569,0.033434,7
8,QuantileTransformer(n_quantiles=100),51,0.938715,0.037723,8
11,StandardScaler(),101,0.906523,0.028364,9
12,MinMaxScaler(),101,0.897716,0.030699,10


In [34]:
model_grid_search.best_params_

{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}

In [35]:
cv_results = cross_validate(
    model_grid_search, data, target, cv=10, n_jobs=2, return_estimator=True, scoring="balanced_accuracy"
)

In [36]:
print(f"accuracy is {cv_results['test_score'].mean():.2} +/- {cv_results['test_score'].std():.2} ")

accuracy is 0.94 +/- 0.044 
