## Pipeline

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(
'https://archive.ics.uci.edu/ml/'
'machine-learning-databases'
'/breast-cancer-wisconsin/wdbc.data',
header=None
)

In [3]:
from sklearn.preprocessing import LabelEncoder

In [7]:
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values

In [8]:
le = LabelEncoder()

In [10]:
y = le.fit_transform( y )

In [12]:
le.classes_

array(['B', 'M'], dtype=object)

In [14]:
le.transform( ['B', 'M'] ) # confirm the labels.

array([0, 1])

In [15]:
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0, stratify=y )

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [20]:
pipeline = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression()
)

In [22]:
pipeline.fit( X_train, y_train ) 

In [23]:
y_pred = pipeline.predict( X_test )

In [25]:
score = pipeline.score( X_test, y_test )

In [26]:
score

0.9298245614035088

## Cross-Validation ( Holdout & K-fold )

### Holdout

In [None]:
'''
    * When we are trying to estimate the best hyperparamter of a model we tend to choose between
    different models with different values. The selsction of one of these model is called model 
    selection phase. In this phase, each model is tested with the Test dataset.
    Keep in mind, this particular dataset has never been used in the training process.
    To get the weights their right values we use other dataset that is a subset of the training set
    called `validation` set. 
    
    * The main disadvantage of the holdout method is it relis on a particular set to test. We should not forget
    that the performance of each model will vary depending on that dataset, and that is where K-fold method 
    comes into play.
'''

### K-Fold

In [27]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [28]:
sf = StratifiedKFold( n_splits=10 ).split( X_train, y_train )

In [30]:
scores = []
for i, (train,test) in enumerate( sf ):
    pipeline.fit( X_train[train], y_train[train] )
    score = pipeline.score( X_train[test], y_train[test] )
    scores.append(score)

In [31]:
scores

[0.9782608695652174,
 1.0,
 0.9777777777777777,
 0.9555555555555556,
 0.8888888888888888,
 0.9777777777777777,
 1.0]

In [32]:
from sklearn.model_selection import cross_val_score

In [None]:
'''
    The `n_job` parameter could be used to utilize CPU cores.
'''

In [34]:
scores = cross_val_score( estimator=pipeline, X=X_train, y=y_train, n_jobs=1, cv=10 )

In [38]:
scores_mean = np.mean(scores)
scores_stddiv = np.std(score)

In [40]:
scores_mean

0.9647826086956522

In [41]:
scores_stddiv

0.0