Cross-validation

- The full dataset should not be used to score a model: we need a train-test split so that the model is evaluated on data it has not seen during training

- Cross-validation is a way to evaluate the variability of our estimate of the generalization performance

In [1]:
from sklearn.model_selection import KFold, ShuffleSplit

# K-fold cross-validation with 5 folds. `shuffle=True` mixes the samples
# before they are split into folds; `random_state` makes that shuffling
# reproducible.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

# ShuffleSplit: the number of splits no longer determines the size of the
# train and test sets; here `test_size=0.2` sets the test fraction explicitly.
cv2 = ShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)

Make a pipeline

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn import datasets

# Load the iris dataset and pull out its features and class labels.
iris = datasets.load_iris()
data, target = iris.data, iris.target

# Chain the preprocessing step and the classifier into a single estimator.
model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=3),
)

# Evaluate the pipeline with 5 cross-validation splits; the result is a dict
# holding the fit times, score times and test scores of each split.
cv_result = cross_validate(model, data, target, cv=5)
cv_result

{'fit_time': array([0.00165319, 0.00120592, 0.00124788, 0.00145388, 0.00102401]),
 'score_time': array([0.00240159, 0.00202107, 0.00200582, 0.0020771 , 0.00177789]),
 'test_score': array([0.96666667, 0.96666667, 0.93333333, 0.9       , 1.        ])}

Encoding of categorical variables

In [None]:
from sklearn.compose import make_column_selector as selector

# NOTE(review): the selectors and the `data[...]` label indexing below expect
# `data` to be a pandas DataFrame. The `data` assigned from `iris.data` in the
# earlier cell appears to be a plain array -- confirm which dataset is meant
# to be loaded before this cell runs.

# Numerical features: every column whose dtype is not `object`.
numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(data)
data_numerical = data[numerical_columns]

# Categorical features: every column whose dtype is `object`.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
data_categorical = data[categorical_columns]

In [None]:
# Given a `data_categorical` DataFrame, ordinal-encode one of its columns.
from sklearn.preprocessing import OrdinalEncoder

# Name of the categorical column to encode. It defaults to the first
# categorical column so that the cell runs on a fresh kernel; replace it with
# an explicit column name (e.g. "education") as needed.
column_name = data_categorical.columns[0]

# Double brackets keep the selection as a one-column DataFrame rather than a
# Series.
column = data_categorical[[column_name]]

# `set_output(transform="pandas")` makes the encoder return a DataFrame.
encoder = OrdinalEncoder().set_output(transform="pandas")
column_encoded = encoder.fit_transform(column)

`OneHotEncoder` is an alternative encoder that prevents downstream
models from making a false assumption about the ordering of categories. For a
given feature, it creates as many new columns as there are possible
categories. For a given sample, the value of the column corresponding to the
category is set to `1` while all the columns of the other categories
are set to `0`.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Build a one-hot encoder that produces a dense result
# (`sparse_output=False`) and configure it to return a pandas DataFrame.
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")

# Fit the encoder on the selected column and encode it in one step.
education_encoded = encoder.fit_transform(column)

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Pipeline: one-hot encode the categorical features, then fit a logistic
# regression on the encoded columns.
model = make_pipeline(
    # handle_unknown="ignore": do not raise on categories unseen during fit.
    OneHotEncoder(handle_unknown="ignore"),
    # max_iter=500: cap on the number of solver iterations.
    LogisticRegression(max_iter=500),
)