# Pipeline and Export

* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
* https://scikit-learn.org/stable/modules/model_persistence.html

In [1]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [2]:
!cd data && curl https://raw.githubusercontent.com/DJCordhose/deep-learning-crash-course-notebooks/master/data/insurance-customers-1500.csv -O && cd -

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    27  100    27    0     0      1      0  0:00:27  0:00:13  0:00:14     8
/content


In [3]:
# list the contents of data/ in long format (shows the file sizes)
!ls -l data

total 4
-rw-r--r-- 1 root root 27 Jul 22 15:44 insurance-customers-1500.csv


In [0]:
import pandas as pd

# the file is semicolon-separated
df = pd.read_csv('data/insurance-customers-1500.csv', sep=';')
# fail fast if the file is not the expected CSV (e.g. a saved HTTP error page)
# instead of hitting a KeyError on the label column in a later cell
assert 'group' in df.columns, "unexpected columns %s - check the download" % list(df.columns)

In [5]:
# summary statistics of the loaded frame; kept in `stats` because they are
# pickled later for comparison between model versions
stats = df.describe()
stats

Unnamed: 0,500: Internal Server Error
count,0
unique,0


In [6]:
# label column: the class to predict
y = df['group']

KeyError: ignored

In [0]:
# remove the label column so only the features remain; the non-mutating form
# plus errors='ignore' keeps the cell safe to re-run once the column is gone
df = df.drop(columns='group', errors='ignore')

In [0]:
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the replacement
X = df.to_numpy()

In [0]:
from sklearn.model_selection import train_test_split

# hold out 20% of the samples for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# shapes of all four parts as a quick sanity check
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [0]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=13, min_samples_leaf=1, min_samples_split=8, n_estimators=14, n_jobs=-1)
%time clf.fit(X_train, y_train)

In [0]:
# mean accuracy on the data the model was fitted on
train_score = clf.score(X=X_train, y=y_train)
train_score

In [0]:
# mean accuracy on the held-out test split
test_score = clf.score(X=X_test, y=y_test)
test_score

In [0]:
from sklearn.model_selection import cross_val_score

# 5-fold cross validation on the training split, using all available cores
cross_val_scores = cross_val_score(
    estimator=clf, X=X_train, y=y_train, cv=5, n_jobs=-1
)
cross_val_scores

In [0]:
# report the mean fold score together with two standard deviations
# (roughly a 95% interval if the fold scores are normally distributed)
mean_accuracy = cross_val_scores.mean()
two_sigma = cross_val_scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sigma))

In [0]:
# gather all evaluation results in one dict so they can be stored together
scores = dict(
    cross_val_scores=cross_val_scores,
    train_score=train_score,
    test_score=test_score,
)
scores

In [0]:
!mkdir models

In [0]:
import pickle

# the context manager closes the file, so the model is guaranteed to be
# completely written to disk before the directory is archived
with open('models/rf.model', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [0]:
!mkdir stats

In [0]:
# has the distribution of the data changed substantially?
# (context manager makes sure the file is closed after writing)
with open('stats/describe.pickle', 'wb') as stats_file:
    pickle.dump(stats, stats_file)

In [0]:
# has the score decreased since last time or is it suspiciously different?
# (context manager makes sure the file is closed after writing)
with open('stats/scores.pickle', 'wb') as scores_file:
    pickle.dump(scores, scores_file)

In [0]:
import numpy
import pandas
import sklearn

# versions of the libraries used for training, keyed by package name
versions = {
    lib.__name__: lib.__version__
    for lib in (numpy, sklearn, pandas)
}
versions

In [0]:
# we need to have exactly the same versions for serving
# store the `versions` dict here (the scores already live in stats/scores.pickle)
with open('stats/versions.pickle', 'wb') as versions_file:
    pickle.dump(versions, versions_file)

In [0]:
# list the contents of stats/ in long format
!ls -l stats

In [0]:
# list the current working directory in long format
!ls -l

In [0]:
# we also need the code that created this to have it reproduceable
!cd models
!curl https://raw.githubusercontent.com/DJCordhose/ml-workshop/blob/master/notebooks/process/export.ipynb -O
!cd -

In [0]:
# list the contents of models/ in long format
!ls -l models

In [0]:
# bundle everything needed to reproduce this model and to compare it with
# other versions into a single gzip-compressed archive
# ideally this archive is put under version control
# versioning the data is often a challenge simply because of its size
!tar -czf prod.tgz models stats data

In [0]:
# list the current working directory in long format (prod.tgz is written here)
!ls -l

# Exercise: Export your favorite model

* choose any strategy you like
* save all the data you need for serving
* can you come up with some automated safety check for a new model?
* should you deploy to production automatically anyway?