In [1]:
"""
Main purpose:
1. save, load, and predict is an important function any data scientist shall know how to do.
2. after searching for a while, I didn't find much straight forward example about how to do it with tf
    -main diff I found was that, tf has many diff levels and thus seems confusing to many beginners
3. this notebook mainly aim to explore how to do it with a simple & easy toy model.


Notes:
...notice that, for tf, feature's column names can't contain space or ()
"""

"\nMain purpose:\n1. save, load, and predict is an important function any data scientist shall know how to do.\n2. after searching for a while, I didn't find much straight forward example about how to do it with tf\n    -main diff I found was that, tf has many diff levels and thus seems confusing to many beginners\n3. this notebook mainly aim to explore how to do it with a simple & easy toy model.\n\n\nNotes:\n...notice that, for tf, feature's column names can't contain space or ()\n"

In [14]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import tensorflow as tf
import shutil

### 1. Prepare data

We load data from sklearn preset dataset. Iris is a realitve simple one with only 4 features.

In [3]:
# load iris data from sklearn
raw_data = datasets.load_iris()

print(type(raw_data))
print(raw_data.keys())

<class 'sklearn.utils.Bunch'>
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])


In [4]:
print([raw_data.feature_names] + [raw_data.target_names])

[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], array(['setosa', 'versicolor', 'virginica'], dtype='<U10')]


Notice:  
1. Here we rename the columns to make it compatible with tf, and we name the y-label as 'kind'.  
2. Also, tf will complain if you name it as 'target'

In [5]:
col_names = ['sepal_length','sepal_width', 'petal_length', 'petal_width']
features = pd.DataFrame(raw_data.data, columns = col_names)
target = pd.DataFrame(raw_data.target, columns = ["kind"])
print(features.shape, target.shape)
data = features.merge(target, how='outer', left_index=True, right_index=True)
df_train, df_eval = train_test_split(data, test_size = 0.2)
data.head()

(150, 4) (150, 1)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,kind
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### 2. Define Model
Here we use a very simple linear regressor as an example.

In [6]:

BATCH_SIZE = 32
N_EPOCHS = 1000
CSV_COLUMNS = data.columns.tolist()
FEATURES = CSV_COLUMNS[0:len(CSV_COLUMNS)-1]

print(FEATURES)

LABEL = "kind"
def make_input_fn(df, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(
        x = df,
        y = df["kind"],
        batch_size = 128,
        num_epochs = num_epochs,
        shuffle = True,
        queue_capacity = 1000,
        num_threads = 1
  )
def make_prediction_input_fn(df, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(
        x = df,
        y = None,
        batch_size = 128,
        num_epochs = num_epochs,
        shuffle = True,
        queue_capacity = 1000,
        num_threads = 1
    )


def make_feature_cols():
    input_columns = [tf.feature_column.numeric_column(k) for k in FEATURES]
    return input_columns

# just to show what make_feature_cols look like.
make_feature_cols()

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']


[_NumericColumn(key='sepal_length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='sepal_width', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='petal_length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='petal_width', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [7]:
tf.logging.set_verbosity(tf.logging.INFO)

OUTDIR = 'sample_iris'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

model = tf.estimator.LinearRegressor(
      feature_columns = make_feature_cols(), model_dir = OUTDIR)

model.train(input_fn = make_input_fn(df_train, num_epochs = 10))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'sample_iris', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x118e1df28>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into sample_iris/model.ckpt.
INFO:tensorflow:loss 

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x118e1dc88>

In [8]:
def print_rmse(model, name, df):
    metrics = model.evaluate(input_fn = make_input_fn(df, 1))
    print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))
print_rmse(model, 'validation', df_eval)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-30-04:37:48
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from sample_iris/model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-30-04:37:49
INFO:tensorflow:Saving dict for global step 10: average_loss = 0.16566299, global_step = 10, label/mean = 0.93333334, loss = 4.9698896, prediction/mean = 1.0338904
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10: sample_iris/model.ckpt-10
RMSE on validation dataset = 0.40701717138290405


#### 2.a restoring your model

In [17]:
# # continue training from last check point if needed:
# OUTDIR = 'sample_iris'
# model = tf.estimator.LinearRegressor(
#       feature_columns = make_feature_cols(), model_dir = OUTDIR)

# model.train(input_fn = make_input_fn(df_train, num_epochs = 20))

# #9-30-2018 cooool~

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'sample_iris', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1195ad3c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from sample_iris/model.ckpt-29
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving 

<tensorflow.python.estimator.canned.linear.LinearRegressor at 0x1195adc18>

In [18]:
# def print_rmse(model, name, df):
#     metrics = model.evaluate(input_fn = make_input_fn(df, 1))
#     print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))
# print_rmse(model, 'validation', df_eval)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-30-21:53:21
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from sample_iris/model.ckpt-48
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-30-21:53:21
INFO:tensorflow:Saving dict for global step 48: average_loss = 0.039258935, global_step = 48, label/mean = 0.93333334, loss = 1.177768, prediction/mean = 0.92304635
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 48: sample_iris/model.ckpt-48
RMSE on validation dataset = 0.19813866913318634


### 3. Export and Reload

Here's the main part of this example. To save the model, you need two things:
1. A directory to save your model
2. ** a feature space to tell what feature you used, with the shape, data type and default value.

The second one is a bit tricky one. What it expecting is a list of 
"tf.feature_column" items that you used when training the model.


In [9]:
# export to save the model 

feature_space = tf.feature_column.make_parse_example_spec(make_feature_cols())
print(feature_space)


{'sepal_length': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None), 'sepal_width': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None), 'petal_length': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None), 'petal_width': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}


In [10]:
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_space)
model.export_savedmodel('export', serving_input_receiver_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['serving_default', 'regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from sample_iris/model.ckpt-10
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: export/temp-b'1538282269'/saved_model.pb


b'export/1538282269'

In [11]:
export_dir = 'export/1538271973'

inputs = pd.DataFrame({
    "sepal_length": [5.1,5.9,6.9],
    'sepal_width':[2.3,3.0,2.4],
    'petal_length': [1.4,2.3,4.4],
    'petal_width': [0.5,1.0,1.2],
})

examples = []
for index,row in inputs.iterrows():
    feature = {}
    for c, v in row.iteritems():
        feature[c] = tf.train.Feature(float_list = tf.train.FloatList(value = [v]))
    example = tf.train.Example(
        features = tf.train.Features(feature = feature))
    examples.append(example.SerializeToString())

predict_fn = tf.contrib.predictor.from_saved_model(export_dir)
predict_fn({'inputs':examples})

INFO:tensorflow:Restoring parameters from export/1538271973/variables/variables


{'outputs': array([[0.538356  ],
        [0.79771024],
        [1.1938202 ]], dtype=float32)}

In [12]:
data['test'] = 0
data = data.drop('test',1)

In [13]:
predictions = model.predict(input_fn = make_prediction_input_fn(df_eval.drop("kind",1), 1))
# for i in range(6):
#     print(next(predictions))
prediction_df = pd.DataFrame(predictions)
prediction_df["predictions"] = prediction_df["predictions"].apply(lambda x: float(x))

prediction_df

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from sample_iris/model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,predictions
0,1.191456
1,1.255506
2,0.907552
3,0.547086
4,1.198182
5,1.057894
6,1.267296
7,0.52308
8,1.506444
9,1.571193
