# Using k-fold cross-validation to assess model performance

In [1]:
# regular import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Notebook-wide plotting defaults: the 'bmh' style sheet with an 11pt base font.
plt.style.use('bmh')
plt.rcParams.update({'font.size': 11})

### Data Loading and Train Test Split

In [3]:
# Load the breast cancer data from the UCI repository.
# header=None: the first line is read as data, so columns get integer labels.
data_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(filepath_or_buffer=data_uri, header=None)

In [4]:
# Peek at the first two rows to check the column layout.
df.head(n=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Separate the class labels (column 1) from the feature matrix (columns 2 onward).
# Column 0 is not used in either array.
targets = df[1].values
features = df.loc[:, 2:].values

In [7]:
# Encode the string class labels as integers.
# The encoder is fitted on the raw label column of `df` rather than on `targets`,
# so re-running this cell gives the same result. Fitting on `targets` itself
# would, on a second run, refit on the already-encoded integers, replace the
# 'B'/'M' classes and break the later le.transform(["M", "B"]) call.
raw_labels = df.loc[:, 1].values
le = LabelEncoder()
targets = le.fit_transform(raw_labels)
le.classes_

array(['B', 'M'], dtype=object)

In [8]:
# Check the learned mapping by encoding one label of each class.
probe_labels = ["M", "B"]
le.transform(probe_labels)

array([1, 0])

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# class labels and uses a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, stratify=targets, random_state=1)
feat_train, feat_test, tar_train, tar_test = train_test_split(
    features, targets, **split_options
)

### Building Pipeline

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [12]:
# Chain standardization, a 2-component PCA projection and logistic regression
# into a single estimator that is fitted and scored as one unit.
scaler = StandardScaler()
pca = PCA(n_components=2)
clf = LogisticRegression(solver='lbfgs', random_state=1)
pipe_lr = make_pipeline(scaler, pca, clf)

### K-fold Cross Validation

#### A. Manual Approach

In [13]:
from sklearn.model_selection import StratifiedKFold

In [14]:
# Build the 10 stratified (train, validation) index splits of the training set.
# random_state is deliberately not passed: StratifiedKFold only uses it when
# shuffle=True, and combining it with the default shuffle=False raises a
# ValueError in recent scikit-learn releases. The unshuffled split is
# deterministic, so the folds are still reproducible.
# split() returns a one-shot generator; materializing it as a list lets the
# cells below be re-run without first re-creating the folds.
kfold = list(
    StratifiedKFold(n_splits=10).split(feat_train, tar_train)
)

In [15]:
# Fit the pipeline on each training fold and score it on the held-out fold,
# reporting the training-fold class counts alongside the accuracy.
scores = []
for fold_no, (fit_idx, val_idx) in enumerate(kfold, start=1):
    pipe_lr.fit(feat_train[fit_idx], tar_train[fit_idx])
    fold_acc = pipe_lr.score(feat_train[val_idx], tar_train[val_idx])
    scores.append(fold_acc)
    class_counts = np.bincount(tar_train[fit_idx])
    print("Fold: {}, Class dist.: {}, Acc.: {:.3f}".format(fold_no, class_counts, fold_acc))

Fold: 1, Class dist.: [256 153], Acc.: 0.935
Fold: 2, Class dist.: [256 153], Acc.: 0.935
Fold: 3, Class dist.: [256 153], Acc.: 0.957
Fold: 4, Class dist.: [256 153], Acc.: 0.957
Fold: 5, Class dist.: [256 153], Acc.: 0.935
Fold: 6, Class dist.: [257 153], Acc.: 0.956
Fold: 7, Class dist.: [257 153], Acc.: 0.978
Fold: 8, Class dist.: [257 153], Acc.: 0.933
Fold: 9, Class dist.: [257 153], Acc.: 0.956
Fold: 10, Class dist.: [257 153], Acc.: 0.956


In [16]:
# Summarize the per-fold accuracies as mean +/- standard deviation.
mean_acc, std_acc = np.mean(scores), np.std(scores)
print("Accuracy: {:.3f} +/- {:.3f}".format(mean_acc, std_acc))

Accuracy: 0.950 +/- 0.014


#### B. Scikit-Learn Approach

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
# Same 10-fold evaluation, with cross_val_score running the fold loop internally.
scores = cross_val_score(pipe_lr, feat_train, tar_train, cv=10, n_jobs=1)
summary = "Accuracy: {:.3f} +/- {:.3f}".format(np.mean(scores), np.std(scores))
print(summary)

Accuracy: 0.950 +/- 0.014


# References
[1] *Python Machine Learning* by Sebastian Raschka.