<a href="https://colab.research.google.com/github/DJCordhose/ml-workshop/blob/master/notebooks/process/export-pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pipeline and Export

* _Pipelines_ let you define a well-defined sequence of processing steps that is shared between training and production
  * they have the same interface as a plain classifier, so they are an easy replacement for a plain classifier
* _ColumnTransformer_ bundles transformations that are applied to each column individually  

Links
* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
* https://scikit-learn.org/0.21/modules/compose.html#column-transformer
* https://scikit-learn.org/stable/modules/model_persistence.html
* https://scikit-learn.org/0.21/modules/generated/sklearn.compose.ColumnTransformer.html

In [1]:
!rm -rf data && mkdir data && cd data && curl https://raw.githubusercontent.com/DJCordhose/ml-workshop/master/data/insurance-customers-1500.csv -O && cd -

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 26783  100 26783    0     0   157k      0 --:--:-- --:--:-- --:--:--  156k
/content


In [2]:
# confirm that the CSV was downloaded into data/
!ls -l data

total 28
-rw-r--r-- 1 root root 26783 Aug  3 16:38 insurance-customers-1500.csv


In [0]:
import pandas as pd
# the CSV uses ';' as its field separator
df = pd.read_csv('data/insurance-customers-1500.csv', sep=';')

In [4]:
# summary statistics of all columns (still including the 'group' label);
# kept in `stats` because they are pickled to stats/describe.pickle further down
stats = df.describe()
stats

Unnamed: 0,speed,age,miles,group
count,1500.0,1500.0,1500.0,1500.0
mean,122.492667,44.980667,30.434,0.998667
std,17.604333,17.1304,15.250815,0.816768
min,68.0,16.0,1.0,0.0
25%,108.0,32.0,18.0,0.0
50%,120.0,42.0,29.0,1.0
75%,137.0,55.0,42.0,2.0
max,166.0,100.0,84.0,2.0


In [0]:
y=df['group']

In [0]:
df.drop('group', axis='columns', inplace=True)

In [0]:
X = df.values

In [8]:
# inspect the raw feature matrix (the columns left after removing 'group')
X

array([[ 98.,  44.,  25.],
       [118.,  54.,  24.],
       [111.,  26.,  34.],
       ...,
       [138.,  41.,  45.],
       [100.,  31.,  28.],
       [100.,  58.,  55.]])

In [9]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; stratify on y and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# sanity check of the resulting shapes
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1200, 3), (1200,), (300, 3), (300,))

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

In [0]:
# no column-specific transformers are listed, so every column falls under
# `remainder` and is passed through the StandardScaler
column_trans = ColumnTransformer([], remainder=StandardScaler())

In [12]:
# quick look at the standardised features on the full data set;
# this is only a preview - column_trans is fitted again on X_train as part of the pipeline below
column_trans.fit_transform(X)

array([[-1.39175022, -0.05726625, -0.35642765],
       [-0.25528743,  0.52668598, -0.42201978],
       [-0.65304941, -1.10838025,  0.23390154],
       ...,
       [ 0.88117537, -0.23245192,  0.955415  ],
       [-1.27810395, -0.81640414, -0.15965125],
       [-1.27810395,  0.76026687,  1.61133633]])

In [13]:
# display PCA's default parameters; note that the pipeline below
# constructs its own PCA() instance rather than using this one
pca = PCA()
pca

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [0]:
from sklearn.ensemble import RandomForestClassifier

# small, regularised random forest used as the final pipeline step.
# random_state pins the forest's internal randomness so that the exported model
# and its recorded scores can be reproduced run-to-run (the train/test split is
# already seeded with 42). `clf` is no longer aliased here: the pipeline cell
# below binds `clf` to the full pipeline.
rf_clf = RandomForestClassifier(
    max_depth=9,
    min_samples_leaf=9,
    min_samples_split=3,
    n_estimators=7,
    n_jobs=-1,
    random_state=42,
)

In [0]:
# chain preprocessing and model into one estimator:
# standardise -> PCA -> random forest
pipeline_steps = [
    ('standardise', column_trans),
    ('reduce_dim', PCA()),
    ('clf', rf_clf),
]
clf = Pipeline(pipeline_steps)

In [16]:
# fit all pipeline steps on the training split only; %time reports how long that takes
%time clf.fit(X_train, y_train)

CPU times: user 110 ms, sys: 8.63 ms, total: 118 ms
Wall time: 205 ms


Pipeline(memory=None,
         steps=[('standardise',
                 ColumnTransformer(n_jobs=None,
                                   remainder=StandardScaler(copy=True,
                                                            with_mean=True,
                                                            with_std=True),
                                   sparse_threshold=0.3,
                                   transformer_weights=None, transformers=[],
                                   verbose=False)),
                ('reduce_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=9,
                                        max_features='auto',
                                        max_lea

In [17]:
# score on the data the pipeline was fitted on
train_score = clf.score(X_train, y_train)
train_score

0.8358333333333333

In [18]:
# score on the held-out test split
test_score = clf.score(X_test, y_test)
test_score

0.7766666666666666

In [19]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the whole pipeline on the training split; the test split is not used here
cross_val_scores = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=10)
cross_val_scores

array([0.70247934, 0.79166667, 0.74166667, 0.8       , 0.70833333,
       0.76666667, 0.825     , 0.78333333, 0.79166667, 0.77310924])

In [20]:
# mean score +/- 2 standard deviations of the fold scores
# (a rough spread estimate; it covers ~95% only if the fold scores are normally distributed)
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_scores.mean(), cross_val_scores.std() * 2))

Accuracy: 0.77 (+/- 0.08)


In [21]:
# gather all evaluation results in one mapping so they can be exported together
scores = dict(
    cross_val_scores=cross_val_scores,
    train_score=train_score,
    test_score=test_score,
)
scores

{'cross_val_scores': array([0.70247934, 0.79166667, 0.74166667, 0.8       , 0.70833333,
        0.76666667, 0.825     , 0.78333333, 0.79166667, 0.77310924]),
 'test_score': 0.7766666666666666,
 'train_score': 0.8358333333333333}

In [0]:
# start with an empty models/ directory
!rm -rf models && mkdir models

In [0]:
import pickle

# serialise the fitted pipeline; the with-block closes the file deterministically,
# so the pickle is fully flushed to disk before it is archived later
with open('models/model.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [0]:
# start with an empty stats/ directory
!rm -rf stats && mkdir stats

In [0]:
# export the data summary so we can later check:
# has the distribution of the data changed substantially?
# (the with-block makes sure the file is closed and flushed)
with open('stats/describe.pickle', 'wb') as describe_file:
    pickle.dump(stats, describe_file)

In [0]:
# export the scores so we can later check:
# has the score decreased since last time or is it suspiciously different?
# (the with-block makes sure the file is closed and flushed)
with open('stats/scores.pickle', 'wb') as scores_file:
    pickle.dump(scores, scores_file)

In [27]:
import numpy
import pandas
import sklearn

model_name = 'pipeline_pca_std_rf'
model_version = 2

# library versions used for training, together with the model's name and version
versions = dict(
    numpy=numpy.__version__,
    sklearn=sklearn.__version__,
    pandas=pandas.__version__,
    model=model_version,
    model_name=model_name,
)
versions

{'model': 2,
 'model_name': 'pipeline_pca_std_rf',
 'numpy': '1.16.4',
 'pandas': '0.24.2',
 'sklearn': '0.21.3'}

In [0]:
# we need to have exactly the same versions for serving
# (the with-block makes sure the file is closed and flushed)
with open('stats/versions.pickle', 'wb') as versions_file:
    pickle.dump(versions, versions_file)

In [29]:
# check that all three pickles were written to stats/
!ls -l stats

total 12
-rw-r--r-- 1 root root 1145 Aug  3 16:38 describe.pickle
-rw-r--r-- 1 root root  372 Aug  3 16:38 scores.pickle
-rw-r--r-- 1 root root  143 Aug  3 16:38 versions.pickle


In [30]:
# list the working directory
!ls -l

total 48
drwxr-xr-x 2 root root  4096 Aug  3 16:38 data
-rw-r--r-- 1 root root 31648 Aug  3 16:38 export-pipeline.ipynb
drwxr-xr-x 2 root root  4096 Aug  3 16:38 models
drwxr-xr-x 1 root root  4096 Aug  1 16:08 sample_data
drwxr-xr-x 2 root root  4096 Aug  3 16:38 stats


In [31]:
# we also need the code that created this to have it reproduceable
!cd models
!curl https://raw.githubusercontent.com/DJCordhose/ml-workshop/master/notebooks/process/export-pipeline.ipynb -O
!cd -

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 31648  100 31648    0     0   303k      0 --:--:-- --:--:-- --:--:--  303k
/


In [32]:
# check what ended up in models/
!ls -l models

total 60
-rw-r--r-- 1 root root 59482 Aug  3 16:38 model.pickle


In [33]:
# archive name encodes the model's name and version
archive_template = 'prod_{0}_{1}.tgz'
file_name = archive_template.format(model_name, model_version)
file_name

'prod_pipeline_pca_std_rf_2.tgz'

In [0]:
# bundle everything we need to reproduce this model and compare it to other versions:
# the models/, stats/ and data/ directories go into one gzipped tarball
# it is better to put this archive under version control
# often versioning data is a challenge simply because of its size
!tar -czf $file_name models stats data

In [35]:
# the .tgz archive should now show up in the working directory
!ls -l

total 76
drwxr-xr-x 2 root root  4096 Aug  3 16:38 data
-rw-r--r-- 1 root root 31648 Aug  3 16:38 export-pipeline.ipynb
drwxr-xr-x 2 root root  4096 Aug  3 16:38 models
-rw-r--r-- 1 root root 25145 Aug  3 16:38 prod_pipeline_pca_std_rf_2.tgz
drwxr-xr-x 1 root root  4096 Aug  1 16:08 sample_data
drwxr-xr-x 2 root root  4096 Aug  3 16:38 stats


In [36]:
# record when this notebook was last run
!date

Sat Aug  3 16:38:45 UTC 2019


# Exercise: Export your favorite model

* run this notebook
* modify to choose any strategy you like
* chain together different / additional steps in a pipeline
* can you add a normalizer?
* PCA to fewer dimensions?
* rather use T-SNE or UMAP?

Finally, download the complete archive for production; you will need it in the next step.
