In [1]:
import pandas as pd

# Load the raw penguin measurements from the shared datasets folder.
penguins = pd.read_csv("../datasets/penguins.csv")

# Name of the label column and of the numeric features used to predict it.
target_name = "Species"
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]

# Restrict the frame to the features plus the label, then discard every row
# that has a missing value in any of those columns.
penguins_non_missing = penguins.loc[:, [*columns, target_name]].dropna()

# Feature matrix and label vector, both taken from the same filtered rows.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) of the three
# numeric feature columns; shows that the features live on very different scales.
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [3]:
# Class distribution of the target: number of samples per species.
print(target.value_counts())

Species
Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: count, dtype: int64


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Reference pipeline: standardise the features, then classify with k-NN.
# The step names ("preprocessor", "classifier") are relied on elsewhere in the
# notebook through parameter keys such as "classifier__n_neighbors".
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# Same classifier without any preprocessing step, so k-NN sees the raw features.
model2 = Pipeline([("classifier", KNeighborsClassifier(n_neighbors=5))])

In [5]:
# List every parameter of the pipeline, including the nested
# "<step name>__<parameter>" keys (e.g. "classifier__n_neighbors") that are
# used later with set_params and as grid-search keys.
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [6]:
from sklearn.model_selection import cross_validate
# 10-fold cross-validation of the scaled k-NN pipeline, scored with balanced accuracy.
cv_results = cross_validate(
    model, data, target, cv=10, scoring="balanced_accuracy"
)
scores = cv_results["test_score"]
# The label names the metric actually requested above (balanced accuracy),
# so the printed number is not mistaken for plain accuracy.
print(
    "Balanced accuracy score via cross-validation \n"
    f"{scores.mean():.3f} ± {scores.std():.3f}"
)

Accuracy score via cross-validation 
0.952 ± 0.040


In [7]:
from sklearn.model_selection import cross_validate

from sklearn.base import clone

# Neighbourhood sizes to compare; the same list is reused as grid-search values.
n_neighbors = [5, 51, 101]

for n in n_neighbors:
    # Evaluate an unfitted copy instead of calling set_params on `model` itself:
    # mutating the shared pipeline would leave it at the last value of the loop,
    # making this cell non-idempotent and any earlier cell that uses `model`
    # give different results when re-run.
    candidate = clone(model).set_params(classifier__n_neighbors=n)
    cv_results = cross_validate(
        candidate, data, target, cv=10, scoring="balanced_accuracy"
    )
    scores = cv_results["test_score"]
    print(f"Para: {n} los resutados son: \n {scores}")

Para: 5 los resutados son: 
 [1.         1.         1.         0.91880342 0.88253968 0.95238095
 0.97777778 0.93015873 0.90793651 0.95238095]
Para: 51 los resutados son: 
 [0.95238095 0.97777778 1.         0.86324786 0.88253968 0.95238095
 0.95555556 0.95238095 0.93015873 0.95238095]
Para: 101 los resutados son: 
 [0.85714286 0.95238095 0.94444444 0.86324786 0.83492063 0.85714286
 0.83492063 0.88253968 0.83492063 0.9047619 ]


In [8]:
from sklearn.model_selection import cross_validate
# 10-fold cross-validation of the k-NN pipeline WITHOUT a preprocessing step,
# scored with balanced accuracy, for comparison with the scaled pipeline.
cv_results = cross_validate(
    model2, data, target, cv=10, scoring="balanced_accuracy"
)
scores = cv_results["test_score"]
# The label names the metric actually requested above (balanced accuracy),
# so the printed number is not mistaken for plain accuracy.
print(
    "Balanced accuracy score via cross-validation \n"
    f"{scores.mean():.3f} ± {scores.std():.3f}"
)

Accuracy score via cross-validation 
0.740 ± 0.087


In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Candidate values for the pipeline's "preprocessor" step, compared by the
# grid search. The order of this list is the order in which candidates are tried.
# NOTE(review): `None` is presumably interpreted by Pipeline as "skip this
# step" (no preprocessing) — confirm against the scikit-learn version in use.
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    # NOTE(review): n_quantiles is set explicitly, presumably to stay below the
    # number of training samples in each fold — confirm.
    QuantileTransformer(n_quantiles=100),
    # NOTE(review): Box-Cox is documented to require strictly positive inputs;
    # the describe() output for the three feature columns shows minima above 0.
    PowerTransformer(method="box-cox"),
]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the default split proportions; the fixed seed
# keeps the split identical across re-runs.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    random_state=42,
)

In [11]:
from sklearn.model_selection import GridSearchCV

# Search jointly over the preprocessing step and the number of neighbours.
# Keys follow the "<step name>__<parameter>" convention of the pipeline; the
# bare "preprocessor" key swaps out the whole step.
param_grid = dict(
    preprocessor=all_preprocessors,
    classifier__n_neighbors=n_neighbors,
)
# NOTE(review): no `scoring` is passed here, while the cross_validate calls in
# this notebook use "balanced_accuracy" — presumably the search then ranks
# candidates with the classifier's default score; confirm this is intended.
model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    n_jobs=2,
)
model_grid_search.fit(data_train, target_train)

In [12]:
# Score the fitted grid search on the held-out test split and report it.
accuracy = model_grid_search.score(data_test, target_test)
print(f"The test accuracy score of the grid-searched pipeline is: {accuracy:.2f}")

The test accuracy score of the grid-searched pipeline is: 0.95


In [13]:
# Tabulate the per-candidate grid-search results, best mean test score first.
cv_results = pd.DataFrame(model_grid_search.cv_results_)
cv_results = cv_results.sort_values(by="mean_test_score", ascending=False)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.007529,0.00557,0.007919,0.005611,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.961538,1.0,0.961538,1.0,1.0,0.961538,0.92,0.96,0.96,0.96,0.968462,0.023856,1
2,0.007366,0.005563,0.009138,0.005962,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.961538,1.0,0.961538,1.0,1.0,0.961538,0.92,0.96,0.96,0.96,0.968462,0.023856,1
3,0.008608,0.005582,0.009403,0.006094,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,0.961538,0.961538,1.0,0.961538,0.92,0.96,0.96,0.92,0.964462,0.027943,3
4,0.014422,0.00598,0.005487,0.002675,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.961538,1.0,0.961538,1.0,1.0,0.961538,0.92,0.96,0.96,0.92,0.964462,0.027943,3
6,0.00472,0.00275,0.00447,9.7e-05,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.961538,0.961538,0.961538,1.0,0.961538,0.923077,0.88,0.96,0.92,0.96,0.948923,0.031218,5
7,0.003682,0.000202,0.00453,0.000144,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.961538,0.961538,0.961538,1.0,0.923077,0.923077,0.88,0.96,0.88,1.0,0.945077,0.040628,6
9,0.007062,0.001312,0.004849,0.000616,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.961538,0.961538,0.961538,1.0,0.923077,0.923077,0.88,0.96,0.88,1.0,0.945077,0.040628,6
8,0.005273,0.001094,0.00509,0.000945,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",1.0,0.961538,0.961538,0.923077,0.923077,0.923077,0.88,0.92,0.88,0.92,0.929231,0.034967,8
11,0.003673,0.000101,0.004545,0.000121,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.846154,0.923077,0.884615,0.884615,0.807692,0.884615,0.84,0.88,0.8,0.92,0.867077,0.040278,9
12,0.003564,0.000293,0.004947,0.000515,101,MinMaxScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.884615,0.923077,0.884615,0.884615,0.807692,0.884615,0.84,0.84,0.8,0.92,0.866923,0.040816,10


In [14]:
from sklearn.model_selection import cross_validate
import numpy as np
# Nested cross-validation: the grid search itself is cross-validated over 10
# outer folds, and the fitted search of every fold is kept (return_estimator)
# so that its selected parameters can be inspected afterwards.
# NOTE(review): `model_grid_search` is built without a `scoring` argument, so
# its inner model selection presumably does not use balanced accuracy even
# though the outer folds are scored with it — confirm this is intended.
outer_cv = cross_validate(
    model_grid_search,
    data,
    target,
    cv=10,
    scoring="balanced_accuracy",
    return_estimator=True,
)
outer_scores = outer_cv["test_score"]
# The label names the metric actually requested above (balanced accuracy).
print(
    f"Nested CV balanced accuracy: "
    f"{outer_scores.mean():.2f} ± {outer_scores.std():.2f}"
)

Nested CV accuracy: 0.94 ± 0.04


In [18]:
# Report, for each outer fold, which parameter combination its grid search
# selected and the corresponding best inner cross-validation score.
for fold_number, fitted_search in enumerate(outer_cv["estimator"], start=1):
    print(f"Fold {fold_number} best parameters: {fitted_search.best_params_}")
    print(f"Fold {fold_number} best score: {fitted_search.best_score_}")

Fold 1 best parameters: {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Fold 1 best score: 0.9675268817204301
Fold 2 best parameters: {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Fold 2 best score: 0.9643010752688171
Fold 3 best parameters: {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
Fold 3 best score: 0.957741935483871
Fold 4 best parameters: {'classifier__n_neighbors': 51, 'preprocessor': StandardScaler()}
Fold 4 best score: 0.9610752688172044
Fold 5 best parameters: {'classifier__n_neighbors': 51, 'preprocessor': StandardScaler()}
Fold 5 best score: 0.9643010752688171
Fold 6 best parameters: {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
Fold 6 best score: 0.9675268817204301
Fold 7 best parameters: {'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
Fold 7 best score: 0.9641935483870968
Fold 8 best parameters: {'classifier__n_neighbors': 