# Now let's try to reload & reuse saved pipelines, will all code ready to use

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# AI Lab Imports

In [2]:
import sklearn
import sklearn_pandas
import pandas as pd
import ailab


Loading ailab...


# Read data 

In [3]:
df_train=pd.read_csv("titanic-train.csv")
df_test=pd.read_csv("titanic-test.csv")

df_full=pd.concat({"train":df_train,"test":df_train})

In [4]:
df_full

Unnamed: 0,Unnamed: 1,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
test,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
test,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
test,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
test,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
test,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
test,5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
test,6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
test,7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
test,8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
test,9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# Quick check
df_full.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1782 entries, (test, 0) to (train, 890)
Data columns (total 12 columns):
PassengerId    1782 non-null int64
Survived       1782 non-null int64
Pclass         1782 non-null int64
Name           1782 non-null object
Sex            1782 non-null object
Age            1428 non-null float64
SibSp          1782 non-null int64
Parch          1782 non-null int64
Ticket         1782 non-null object
Fare           1782 non-null float64
Cabin          408 non-null object
Embarked       1778 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 172.4+ KB


# Reload pipelines

## Joblib  (doesnt work, non deep serialization, requires all class code or packages to be preloaded)

In [6]:
# Test load saved with joblib (will miss global declarations)

import joblib
pipeline=joblib.load("../../models/pipeline_demo_best_v1.joblib")
pipeline.predict_proba(df_test)

AttributeError: module '__main__' has no attribute 'PrepPipeline'

## Cloudpickle (deep serialization)

In [7]:
import cloudpickle

# Test load saved with cloudpickle
pipeline_v1=joblib.load("../../models/pipeline_demo_best_v1.pkl")

In [8]:
# Check that even without loading classes, it's working (saved with cloudpickle)
# Check, should be the same values, check notes saved
pipeline_v1.predict_proba(df_test)

Transforming...
Notes: v1-grid search


array([[0.69611264, 0.30388736],
       [0.49747532, 0.50252468],
       [0.63720075, 0.36279925],
       [0.79257597, 0.20742403],
       [0.4368235 , 0.5631765 ],
       [0.73261574, 0.26738426],
       [0.27801083, 0.72198917],
       [0.37799104, 0.62200896],
       [0.67717343, 0.32282657],
       [0.75139785, 0.24860215],
       [0.77257597, 0.22742403],
       [0.47666784, 0.52333216],
       [0.2388737 , 0.7611263 ],
       [0.46599013, 0.53400987],
       [0.3377807 , 0.6622193 ],
       [0.45699465, 0.54300535],
       [0.65556339, 0.34443661],
       [0.75423403, 0.24576597],
       [0.49748175, 0.50251825],
       [0.59742984, 0.40257016],
       [0.29694251, 0.70305749],
       [0.75386996, 0.24613004],
       [0.25943805, 0.74056195],
       [0.61651132, 0.38348868],
       [0.53867309, 0.46132691],
       [0.65375986, 0.34624014],
       [0.31935952, 0.68064048],
       [0.75417434, 0.24582566],
       [0.41741581, 0.58258419],
       [0.6742858 , 0.3257142 ],
       [0.

## Load pipeline v2

In [9]:
# Load 
pipeline_v2=joblib.load("../../models/pipeline_demo_best_v2.pkl")

## Check fitted state & pipeline params

In [10]:
pipeline_v2.named_steps["prep"].show_params()

fit_state {'cols_with_nas': ['Age', 'Cabin', 'Embarked'], 'impute_age': 29.0, 'impute_cabin': 'G6'}
params {'add_missing_indicators': True, 'copy': True, 'impute_age': True, 'impute_cabin': True, 'notes': 'v2-default pipeline', 'train_filter': ''}


In [11]:
# Check, should be the same values, check notes saved
pipeline_v2.predict_proba(df_test)

Transforming...
Notes: v2-default pipeline


array([[0.66, 0.34],
       [0.48, 0.52],
       [0.52, 0.48],
       [0.8 , 0.2 ],
       [0.36, 0.64],
       [0.76, 0.24],
       [0.34, 0.66],
       [0.46, 0.54],
       [0.54, 0.46],
       [0.82, 0.18],
       [0.8 , 0.2 ],
       [0.5 , 0.5 ],
       [0.48, 0.52],
       [0.62, 0.38],
       [0.36, 0.64],
       [0.54, 0.46],
       [0.62, 0.38],
       [0.62, 0.38],
       [0.48, 0.52],
       [0.48, 0.52],
       [0.4 , 0.6 ],
       [0.66, 0.34],
       [0.2 , 0.8 ],
       [0.56, 0.44],
       [0.52, 0.48],
       [0.62, 0.38],
       [0.44, 0.56],
       [0.64, 0.36],
       [0.5 , 0.5 ],
       [0.64, 0.36],
       [0.54, 0.46],
       [0.72, 0.28],
       [0.58, 0.42],
       [0.48, 0.52],
       [0.48, 0.52],
       [0.66, 0.34],
       [0.54, 0.46],
       [0.58, 0.42],
       [0.82, 0.18],
       [0.54, 0.46],
       [0.56, 0.44],
       [0.58, 0.42],
       [0.72, 0.28],
       [0.38, 0.62],
       [0.28, 0.72],
       [0.82, 0.18],
       [0.48, 0.52],
       [0.62,

In [12]:
#Recheck v1 again, loading v2 should not affect v1
pipeline_v1.predict_proba(df_test)


Transforming...
Notes: v1-grid search


array([[0.69611264, 0.30388736],
       [0.49747532, 0.50252468],
       [0.63720075, 0.36279925],
       [0.79257597, 0.20742403],
       [0.4368235 , 0.5631765 ],
       [0.73261574, 0.26738426],
       [0.27801083, 0.72198917],
       [0.37799104, 0.62200896],
       [0.67717343, 0.32282657],
       [0.75139785, 0.24860215],
       [0.77257597, 0.22742403],
       [0.47666784, 0.52333216],
       [0.2388737 , 0.7611263 ],
       [0.46599013, 0.53400987],
       [0.3377807 , 0.6622193 ],
       [0.45699465, 0.54300535],
       [0.65556339, 0.34443661],
       [0.75423403, 0.24576597],
       [0.49748175, 0.50251825],
       [0.59742984, 0.40257016],
       [0.29694251, 0.70305749],
       [0.75386996, 0.24613004],
       [0.25943805, 0.74056195],
       [0.61651132, 0.38348868],
       [0.53867309, 0.46132691],
       [0.65375986, 0.34624014],
       [0.31935952, 0.68064048],
       [0.75417434, 0.24582566],
       [0.41741581, 0.58258419],
       [0.6742858 , 0.3257142 ],
       [0.

# Retrain loaded pipeline

In [13]:
pipeline_v2.fit(df_full.drop("Survived",axis=1),df_full.Survived)

Fitting...
Notes: v2-default pipeline
Transforming...
Notes: v2-default pipeline


Pipeline(memory=None,
     steps=[('prep', PrepPipeline(add_missing_indicators=True, copy=True, impute_age=True,
       impute_cabin=True, notes='v2-default pipeline', train_filter='')), ('featurize', DataFrameMapper(default=False, df_out=True,
        features=[(['PassengerId'], Imputer(axis=0, copy=True, missing_values='NaN...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [14]:
# Check new v2, predictions should be different
pipeline_v2.predict_proba(df_test)


Transforming...
Notes: v2-default pipeline


array([[0.9 , 0.1 ],
       [0.6 , 0.4 ],
       [0.72, 0.28],
       [0.92, 0.08],
       [0.3 , 0.7 ],
       [0.86, 0.14],
       [0.32, 0.68],
       [0.5 , 0.5 ],
       [0.52, 0.48],
       [0.88, 0.12],
       [0.92, 0.08],
       [0.68, 0.32],
       [0.44, 0.56],
       [0.82, 0.18],
       [0.48, 0.52],
       [0.5 , 0.5 ],
       [0.68, 0.32],
       [0.86, 0.14],
       [0.58, 0.42],
       [0.52, 0.48],
       [0.56, 0.44],
       [0.62, 0.38],
       [0.32, 0.68],
       [0.62, 0.38],
       [0.26, 0.74],
       [0.78, 0.22],
       [0.48, 0.52],
       [0.86, 0.14],
       [0.68, 0.32],
       [0.84, 0.16],
       [0.7 , 0.3 ],
       [0.86, 0.14],
       [0.46, 0.54],
       [0.6 , 0.4 ],
       [0.46, 0.54],
       [0.84, 0.16],
       [0.6 , 0.4 ],
       [0.56, 0.44],
       [0.92, 0.08],
       [0.48, 0.52],
       [0.54, 0.46],
       [0.6 , 0.4 ],
       [0.88, 0.12],
       [0.36, 0.64],
       [0.36, 0.64],
       [0.92, 0.08],
       [0.6 , 0.4 ],
       [0.84,