# Using k-fold Cross-Validation

In [1]:
# regular import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Apply the 'bmh' style sheet, then set the default font size for all figures.
plt.style.use('bmh')
# Assignment, not `==`: a comparison only evaluates to a boolean and leaves rcParams unchanged.
plt.rcParams['font.size'] = 11

### Data Loading and Train Test Split

In [3]:
# Load the breast cancer (WDBC) data file from the UCI repository.
# header=None: the first line is read as data, and the columns get integer labels.
data_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(filepath_or_buffer=data_uri, header=None)

In [4]:
# Preview the first two rows of the loaded frame.
df.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Separate the features from the targets:
# column 1 holds the class label, columns 2 onward are used as features.
feature_frame = df.loc[:, 2:]
label_column = df.loc[:, 1]
features = feature_frame.values
targets = label_column.values

In [7]:
le = LabelEncoder()
# Encode from the raw label column rather than from `targets` itself, so that
# re-running this cell does not refit the encoder on already-encoded integers
# (which would change le.classes_ and break later le.transform calls on 'M'/'B').
targets = le.fit_transform(df.loc[:, 1].values)
# Display the original class labels in the order of their integer codes.
le.classes_

array(['B', 'M'], dtype=object)

In [8]:
# Check how the fitted encoder maps each of the two class labels.
probe_labels = ["M", "B"]
le.transform(probe_labels)

array([1, 0])

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the samples as a test set; stratifying on the targets keeps
# the class proportions the same in both splits, and the seed fixes the split.
split_options = dict(test_size=0.2, stratify=targets, random_state=1)
feat_train, feat_test, tar_train, tar_test = train_test_split(
    features, targets, **split_options
)

### Building Pipeline

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [12]:
# Chain standardisation, a 2-component PCA projection and logistic regression
# into a single estimator.
scaler = StandardScaler()
projector = PCA(n_components=2)
classifier = LogisticRegression(random_state=1, solver='lbfgs')
pipe_lr = make_pipeline(scaler, projector, classifier)

# Fit on the training split, then predict labels for the held-out test split.
pipe_lr.fit(feat_train, tar_train)
tar_pred = pipe_lr.predict(feat_test)

In [13]:
# Score the fitted pipeline on the held-out test split and report it to three decimals.
test_accuracy = pipe_lr.score(feat_test, tar_test)
print("Test Accuracy: {:.3f}".format(test_accuracy))

Test Accuracy: 0.956


*Note: The make_pipeline function takes an arbitrary number of scikit-learn transformers (objects that support the fit and transform methods) as input, followed by a scikit-learn estimator that implements the fit and predict methods. In our preceding code example, we provided two transformers, StandardScaler and PCA, and a LogisticRegression estimator as inputs to the make_pipeline function, which constructs a scikit-learn Pipeline object from these objects.*

### The Holdout Method
The holdout method for model selection separates the data into three parts: a training set, a validation set, and a test set. The training set is used to fit the different models, and the performance on the validation set is then used for the model selection. A disadvantage of the holdout method is that the performance estimate may be very sensitive to how we partition the training set into the training and validation subsets; the estimate will vary for different samples of the data.

### K-fold Cross Validation (k = 10 is a good default)
In k-fold cross-validation, we randomly split the training dataset into k folds without replacement, where k − 1 folds are used for model training and the remaining fold is used for performance evaluation. This procedure is repeated k times so that we obtain k models and performance estimates.

In [14]:
from sklearn.model_selection import StratifiedKFold

In [50]:
# random_state is deliberately omitted: without shuffle=True it has no effect,
# and recent scikit-learn releases raise a ValueError for that combination.
# The folds are therefore the deterministic, unshuffled stratified splits.
#
# The splits are materialised as a list because .split() returns a generator,
# which would be exhausted after a single pass and leave any re-run of a
# consuming loop with nothing to iterate over.
kfold = list(
    StratifiedKFold(n_splits=10)
    .split(feat_train, tar_train)
)

In [28]:
from sklearn.base import clone

scores = []
# Each item from the splitter is a (train_indices, test_indices) tuple, which is
# unpacked next to the fold counter that enumerate supplies.
for k, (train, test) in enumerate(kfold):
    # Fit an unfitted copy for every fold so that pipe_lr itself is not refitted
    # on a fold subset and keeps the fit it had before this loop.
    fold_model = clone(pipe_lr)
    fold_model.fit(feat_train[train], tar_train[train])
    score = fold_model.score(feat_train[test], tar_train[test])
    scores.append(score)
    # Class dist. is the per-class sample count in this fold's training part.
    print("Fold {} Class dist.: {} Acc: {:.2f}".format(k + 1, np.bincount(tar_train[train]), score))

Fold 1 Class dist.: [256 153] Acc: 0.93
Fold 2 Class dist.: [256 153] Acc: 0.93
Fold 3 Class dist.: [256 153] Acc: 0.96
Fold 4 Class dist.: [256 153] Acc: 0.96
Fold 5 Class dist.: [256 153] Acc: 0.93
Fold 6 Class dist.: [257 153] Acc: 0.96
Fold 7 Class dist.: [257 153] Acc: 0.98
Fold 8 Class dist.: [257 153] Acc: 0.93
Fold 9 Class dist.: [257 153] Acc: 0.96
Fold 10 Class dist.: [257 153] Acc: 0.96


### Alternative Approach

In [53]:
from sklearn.model_selection import cross_val_score

In [54]:
# Cross-validate the pipeline on the training split with cv=10, using a single job.
scores = cross_val_score(
    estimator=pipe_lr,
    X=feat_train,
    y=tar_train,
    cv=10,
    n_jobs=1,
)

In [55]:
# Show the individual scores collected during cross-validation.
cv_report = 'CV accuracy scores: {}'.format(scores)
print(cv_report)

CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]


In [59]:
# Summarise the cross-validation scores as mean +/- standard deviation.
mean_accuracy = np.mean(scores)
accuracy_spread = np.std(scores)
print('CV Accuracy {:.3f} +/- {:.3f}'.format(mean_accuracy, accuracy_spread))

CV Accuracy 0.950 +/- 0.014


# References
[1] Sebastian Raschka, *Python Machine Learning*.