In [1]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [3]:
data.describe()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

In [5]:
from sklearn.model_selection import cross_validate

model.set_params(classifier__n_neighbors = 5)
model.set_params(preprocessor = StandardScaler())

cv_results = cross_validate(model, data, target,
                            cv=10, scoring="balanced_accuracy",
                            return_train_score=True, n_jobs=2)
cv_results_1 = pd.DataFrame(cv_results)

In [6]:
cv_results_1

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.004151,0.003658,1.0,0.976199
1,0.004229,0.003742,1.0,0.974365
2,0.003807,0.003347,1.0,0.982369
3,0.003273,0.003227,0.918803,0.976518
4,0.003358,0.003,0.88254,0.965335
5,0.003284,0.002977,0.952381,0.976254
6,0.003259,0.003075,0.977778,0.978705
7,0.00345,0.002967,0.930159,0.981718
8,0.003564,0.003006,0.907937,0.975691
9,0.003479,0.003162,0.952381,0.973803


In [7]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'preprocessor': (StandardScaler(), "passthrough"),
    'classifier__n_neighbors': (5, 51, 101)}

model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 n_jobs=2, cv=2, return_train_score=True)
model_grid_search.fit(data, target)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=2,
             param_grid={'classifier__n_neighbors': (5, 51, 101),
                         'preprocessor': (StandardScaler(), 'passthrough')},
             return_train_score=True)

In [9]:
df = pd.DataFrame(model_grid_search.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,0.006692,0.000972,0.008611,0.001086,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.959064,0.953216,0.95614,0.002924,1,0.982456,0.982456,0.982456,0.0
1,0.002815,0.000392,0.008231,0.000927,5,passthrough,"{'classifier__n_neighbors': 5, 'preprocessor':...",0.71345,0.830409,0.77193,0.05848,4,0.847953,0.842105,0.845029,0.002924
2,0.004212,4e-05,0.008551,0.000398,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.935673,0.923977,0.929825,0.005848,2,0.94152,0.935673,0.938596,0.002924
3,0.002761,0.000135,0.00974,0.000551,51,passthrough,"{'classifier__n_neighbors': 51, 'preprocessor'...",0.701754,0.766082,0.733918,0.032164,5,0.754386,0.701754,0.72807,0.026316
4,0.004153,9.6e-05,0.010265,0.000535,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.777778,0.783626,0.780702,0.002924,3,0.80117,0.795322,0.798246,0.002924
5,0.002818,0.000414,0.011076,1.9e-05,101,passthrough,"{'classifier__n_neighbors': 101, 'preprocessor...",0.701754,0.760234,0.730994,0.02924,6,0.766082,0.707602,0.736842,0.02924


In [10]:
cv_results = cross_validate(model_grid_search, data, target,
                            cv=10, scoring="balanced_accuracy",
                            return_train_score=True, n_jobs=2)
cv_results = pd.DataFrame(cv_results)

In [11]:
cv_results

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.418259,0.003761,1.0,0.976199
1,0.341715,0.005468,1.0,0.974365
2,0.323719,0.00403,1.0,0.982369
3,0.391804,0.003847,0.918803,0.976518
4,0.426586,0.004769,0.88254,0.965335
5,0.421824,0.007492,0.952381,0.976254
6,0.369757,0.004422,0.977778,0.978705
7,0.363178,0.003825,0.930159,0.981718
8,0.381076,0.0039,0.907937,0.975691
9,0.341837,0.003627,0.952381,0.973803


In [8]:
# model.set_params(classifier__n_neighbors = 5)
# model.set_params(classifier__n_neighbors = 51)
model.set_params(classifier__n_neighbors = 101)

# model.set_params(preprocessor = "passthrough")
model.set_params(preprocessor = StandardScaler())

cv_results = cross_validate(model, data, target,
                            cv=10, scoring="balanced_accuracy",
                            return_train_score=True, n_jobs=2)
cv_results_2 = pd.DataFrame(cv_results)

In [9]:
cv_results_2

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.004778,0.0046,0.857143,0.869371
1,0.004695,0.004798,0.952381,0.855456
2,0.003639,0.003866,0.944444,0.877188
3,0.003713,0.004298,0.863248,0.898798
4,0.003504,0.003719,0.834921,0.888832
5,0.003506,0.003704,0.857143,0.89731
6,0.003733,0.004121,0.834921,0.880354
7,0.003533,0.003998,0.88254,0.883368
8,0.003654,0.003692,0.834921,0.894297
9,0.003683,0.00423,0.904762,0.880917


In [10]:
sum([cv_results_1.train_score > cv_results_2.train_score])

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: train_score, dtype: int64

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

all_n_branches = [5, 51, 101]

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': all_n_branches}

model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 n_jobs=2, cv=10, return_train_score=True,
                                scoring="balanced_accuracy")
model_grid_search.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier',
                                        KNeighborsClassifier(n_neighbors=101))]),
             n_jobs=2,
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]},
             return_train_score=True, scoring='balanced_accuracy')

In [14]:
df = pd.DataFrame(model_grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
1,0.003524,0.000125,0.003276,0.000141,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,...,0.982369,0.976518,0.965335,0.976254,0.978705,0.981718,0.975691,0.973803,0.976096,0.004497
2,0.003293,6.5e-05,0.003254,0.000111,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,...,0.982369,0.976518,0.965335,0.968338,0.970789,0.981718,0.970227,0.968338,0.973118,0.005572
3,0.004291,0.000198,0.003338,0.000149,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,...,0.957254,0.960758,0.953854,0.959871,0.962322,0.970248,0.967213,0.96177,0.962141,0.004489
4,0.007339,0.000804,0.003469,0.000126,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,...,0.974541,0.971142,0.959871,0.965887,0.968338,0.979267,0.964762,0.965887,0.968875,0.005149
6,0.003629,0.000244,0.00427,0.001403,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,...,0.943681,0.960389,0.959308,0.941016,0.93745,0.943467,0.954396,0.945928,0.948178,0.007352
8,0.004614,0.000338,0.003654,0.000207,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.857143,0.952381,1.0,...,0.914348,0.928131,0.929535,0.921619,0.915592,0.921057,0.921619,0.918606,0.922949,0.007404
9,0.006902,0.000511,0.003647,0.000173,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.977778,1.0,...,0.933507,0.941809,0.935551,0.924622,0.929524,0.924622,0.927073,0.932538,0.93003,0.006263
7,0.003502,0.000208,0.003689,0.0003,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.952381,1.0,...,0.94426,0.952562,0.94648,0.921609,0.93744,0.924622,0.935551,0.932538,0.934859,0.010463
11,0.003534,8e-05,0.003755,7.3e-05,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.952381,0.944444,...,0.877188,0.898798,0.888832,0.89731,0.880354,0.883368,0.894297,0.880917,0.882589,0.012665
12,0.003231,6.8e-05,0.003637,0.000112,101,MinMaxScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.857143,0.944444,...,0.885015,0.869466,0.875442,0.866974,0.871876,0.867527,0.869977,0.864524,0.865493,0.01312


In [44]:
model_grid_search.best_params_

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': all_n_branches}

model_grid_search = GridSearchCV(model, param_grid=param_grid,
                                 n_jobs=2, cv=10, return_train_score=True,
                                scoring="balanced_accuracy")
# model_grid_search.fit(data, target)

In [42]:
cv_results = cross_validate(model_grid_search, data, target,
                            cv=10, scoring="balanced_accuracy",
                            return_train_score=True, n_jobs=2,
                           return_estimator=True)
cv_results = pd.DataFrame(cv_results)

In [43]:
cv_results

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,3.904115,0.003503,"GridSearchCV(cv=10,\n estimator=Pi...",0.952381,0.965263
1,3.749147,0.003773,"GridSearchCV(cv=10,\n estimator=Pi...",0.92674,0.962857
2,4.124029,0.003291,"GridSearchCV(cv=10,\n estimator=Pi...",1.0,0.982369
3,3.725382,0.003313,"GridSearchCV(cv=10,\n estimator=Pi...",0.918803,0.976518
4,3.849141,0.004285,"GridSearchCV(cv=10,\n estimator=Pi...",0.88254,0.965335
5,3.865108,0.003389,"GridSearchCV(cv=10,\n estimator=Pi...",1.0,0.959871
6,3.710999,0.003289,"GridSearchCV(cv=10,\n estimator=Pi...",0.955556,0.970789
7,3.895823,0.003473,"GridSearchCV(cv=10,\n estimator=Pi...",0.930159,0.981718
8,3.929272,0.004383,"GridSearchCV(cv=10,\n estimator=Pi...",0.907937,0.975691
9,3.985982,0.003394,"GridSearchCV(cv=10,\n estimator=Pi...",0.952381,0.96177


In [44]:
display(cv_results_df["estimator"])

0    GridSearchCV(cv=10,\n             estimator=Pi...
1    GridSearchCV(cv=10,\n             estimator=Pi...
2    GridSearchCV(cv=10,\n             estimator=Pi...
3    GridSearchCV(cv=10,\n             estimator=Pi...
4    GridSearchCV(cv=10,\n             estimator=Pi...
5    GridSearchCV(cv=10,\n             estimator=Pi...
6    GridSearchCV(cv=10,\n             estimator=Pi...
7    GridSearchCV(cv=10,\n             estimator=Pi...
8    GridSearchCV(cv=10,\n             estimator=Pi...
9    GridSearchCV(cv=10,\n             estimator=Pi...
Name: estimator, dtype: object

In [45]:
for estimator in cv_results["estimator"]:
#     estimator.fit(data, target)
    print(estimator.best_params_)
    print()

{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}

{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}

{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}

{'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

{'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}

{'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}

