In [65]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [66]:
# Load the breast-cancer dataset from sklearn once and reuse the returned
# object, instead of rebuilding the whole dataset for every field we read.
breast_cancer = load_breast_cancer(as_frame=True)
data = breast_cancer['data']                  # feature table (one row per sample)
target = breast_cancer['target']              # integer class label per sample
target_names = breast_cancer['target_names']  # label index -> class name

# Summarize the dataset size and the class balance.
print(f'The dataset contains {data.shape[0]} samples and {data.shape[1]} features.')
print(f'{target[target == 0].shape[0]} of the samples are assigned with label 0={target_names[0]}.')
print(f'{target[target == 1].shape[0]} of the samples are assigned with label 1={target_names[1]}.')

The dataset contains 569 samples and 30 features.
212 of the samples are assigned with label 0=malignant.
357 of the samples are assigned with label 1=benign.


In [67]:
# Create a sklearn pipeline that standardizes the features and then fits a
# logistic regression classifier. Keeping the scaler inside the pipeline means
# it is fitted together with the classifier on each training split.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression()),
])

# Shuffled, stratified 5-fold split; the fixed random_state keeps the folds
# (and therefore the reported score) reproducible across runs.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Evaluate the pipeline with 5-fold CV on the full dataset (no separate
# train/test split is made in this notebook), scoring each fold with the
# Matthews Correlation Coefficient (MCC), and report the mean over the folds.
cv_result = cross_validate(pipe, X=data, y=target, cv=cv, scoring=make_scorer(matthews_corrcoef))
print(f'MCC: {cv_result["test_score"].mean().round(3)}')

MCC: 0.941
