### Install dependencies

> #### Note
> Because this cell updates some packages, you **must** restart the runtime using the button at the bottom of this cell's output. After the restart, rerun this cell.


In [None]:
# Install pinned TensorFlow and TensorBoard releases, then print the installed
# packages whose names match either one so the versions can be checked.
!pip install -U tensorflow==2.0.0 tensorboard==2.0.1
!pip freeze | grep -e tensorflow -e tensorboard

In [None]:
import json
import math
import os
import sys
import tempfile
import time
import urllib
import urllib.request

import numpy as np
import pandas as pd
import requests
import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle, resample

# Download example data
We download the sample dataset for use in our TFX pipeline.

In [None]:
# Download the example data into a fresh temporary directory.
_data_root = tempfile.mkdtemp(prefix='tfx-data')
DATA_PATH = (
    'https://raw.githubusercontent.com/tensorflow/'
    'tfx/master/tfx/examples/chicago_taxi_pipeline/'
    'data/simple/data.csv')

# Fetch first and write second: a failed download then cannot leave an empty
# data.csv behind, and the HTTP response is closed as soon as it has been read.
with urllib.request.urlopen(DATA_PATH) as response:
    contents = response.read()

with open(os.path.join(_data_root, 'data.csv'), 'wb') as f:
    f.write(contents)

# Build some convenience functions

In [None]:
def df_to_dataset(dataframe, target_col, shuffle=True, batch_size=32):
    '''Converts a Pandas dataframe into a batched Tensorflow dataset.

    Args:
        dataframe: source dataframe; it is copied, so the caller's frame
            is left untouched.
        target_col: name of the column to split off as the label.
        shuffle: when truthy, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch.

    Returns:
        A `tf.data.Dataset` of (feature dict, label) batches.
    '''
    features = dataframe.copy()
    targets = features.pop(target_col)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)


def build_model_input_columns(data):
    '''Builds one dense `tf.feature_column` for every model input feature.

    The feature groups come from the module-level key lists:
      * DENSE_FLOAT_FEATURE_KEYS -> scalar numeric columns
      * CATEGORICAL_FEATURE_KEYS -> identity columns wrapped in an embedding
      * VOCAB_FEATURE_KEYS       -> vocabulary-list columns, embedded when the
                                    vocabulary has more than 10 entries and
                                    one-hot (indicator) encoded otherwise
      * BUCKET_FEATURE_KEYS      -> hash-bucket columns, indicator encoded

    Args:
        data: Pandas dataframe containing the VOCAB_FEATURE_KEYS columns;
            the distinct values of each become that feature's vocabulary.

    Returns:
        dict mapping feature key to its feature column.
    '''
    outputs = {}

    for key in DENSE_FLOAT_FEATURE_KEYS:
        outputs[key] = tf.feature_column.numeric_column(
            key,
            shape=()
        )

    identity_buckets = VOCAB_SIZE + OOV_SIZE
    for key in CATEGORICAL_FEATURE_KEYS:
        identity_column = tf.feature_column.categorical_column_with_identity(
            key,
            num_buckets=identity_buckets,
            default_value=0
        )
        # Embedding width follows the fourth-root-of-category-count heuristic.
        outputs[key] = tf.feature_column.embedding_column(
            identity_column,
            dimension=math.ceil(identity_buckets**0.25)
        )

    for key in VOCAB_FEATURE_KEYS:
        _vocab = data[key].drop_duplicates().to_list()
        vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
            key,
            vocabulary_list=_vocab,
            dtype=tf.string,
            num_oov_buckets=OOV_SIZE
        )
        if len(_vocab) > 10:
            # Size the embedding from this feature's real category count
            # (vocabulary entries plus OOV buckets), not from VOCAB_SIZE,
            # which only describes the identity columns above.
            outputs[key] = tf.feature_column.embedding_column(
                vocab_column,
                dimension=math.ceil((len(_vocab) + OOV_SIZE)**0.25)
            )
        else:
            outputs[key] = tf.feature_column.indicator_column(vocab_column)

    for key in BUCKET_FEATURE_KEYS:
        hashed_column = tf.feature_column.categorical_column_with_hash_bucket(
            key,
            hash_bucket_size=FEATURE_BUCKET_COUNT
        )
        outputs[key] = tf.feature_column.indicator_column(hashed_column)

    return outputs

# Define column schemas

In [None]:
# Continuous features, fed to the model as numeric columns.
DENSE_FLOAT_FEATURE_KEYS = [
    'trip_miles',
    'fare',
    'trip_seconds',
]

# Integer-coded features, fed to the model as identity categorical columns.
CATEGORICAL_FEATURE_KEYS = [
    'trip_start_hour',
    'trip_start_day',
    'trip_start_month',
    'pickup_community_area',
    'dropoff_community_area',
]

# String features whose vocabulary is collected from the data.
VOCAB_FEATURE_KEYS = [
    'payment_type',
    'company',
]

# Pickup and dropoff coordinates are continuous, but we discretize them by
# hashing each value into one of FEATURE_BUCKET_COUNT buckets, which makes
# them marginally easier to deal with.
BUCKET_FEATURE_KEYS = [
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]

VOCAB_SIZE = 1000
OOV_SIZE = 10
FEATURE_BUCKET_COUNT = 10

# Column holding the prediction target.
LABEL_KEY = 'tips'

# Load and process data

In [None]:
# Read the CSV from the download directory, building the path with
# os.path.join exactly as the download cell did when writing the file.
data = pd.read_csv(os.path.join(_data_root, 'data.csv'))

# Drop the columns that none of the *_FEATURE_KEYS lists (or LABEL_KEY) use.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
data = data.drop(
    columns=[
        'pickup_census_tract',
        'dropoff_census_tract',
        'trip_start_timestamp',
    ]
)

In [None]:
# NOTE: this cell rewrites `data` in place (the label in particular), so
# re-run the load cell before running it a second time.

# Fill gaps: 0 for numeric columns of any width, '' for everything else.
for col in data.columns:
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].fillna(0)
    else:
        data[col] = data[col].fillna('')

# Binary label: 1 when the tip is more than 20% of the fare, otherwise 0.
data['tips'] = np.where(data['tips'] > data['fare'] * 0.2, 1, 0)

# Give every feature the dtype its Keras input is declared with, so training
# sees the same types as the serving request: integers for all identity
# categorical features (a column that contained gaps is read as float) and
# strings for the hashed bucket features.
data[CATEGORICAL_FEATURE_KEYS] = data[CATEGORICAL_FEATURE_KEYS].astype('int64')
data[BUCKET_FEATURE_KEYS] = data[BUCKET_FEATURE_KEYS].astype(str)

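We'll split the data into train, validation, and test subsets, resample the positive-label rows of the training subset (with replacement) to 1.5× their original count, and then convert all three subsets into TensorFlow datasets.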

In [None]:
# Hold out 20% of the rows for testing, then 20% of what is left for validation.
train, test = train_test_split(data, test_size=0.20, random_state=42)
train, val = train_test_split(train, test_size=0.20, random_state=42)

# Rebuild the training frame from its label-0 rows plus a bootstrap sample of
# its label-1 rows that is 1.5 times as large as the original label-1 group.
_negatives = train[train[LABEL_KEY] == 0]
_positives = train[train[LABEL_KEY] == 1]
_positives_resampled = resample(
    _positives,
    replace=True,
    n_samples=round(_positives.shape[0] * 1.5),
    random_state=42)

train = pd.concat([_negatives, _positives_resampled], axis=0)

In [None]:
_batch_size = 32

# Only the training data is shuffled (df_to_dataset's default).
train_ds = df_to_dataset(train, target_col=LABEL_KEY, batch_size=_batch_size)

# Validation and test data keep their row order.
val_ds, test_ds = (
    df_to_dataset(
        frame,
        target_col=LABEL_KEY,
        shuffle=False,
        batch_size=_batch_size)
    for frame in (val, test)
)

# Define model form

In [None]:
# One feature column per input feature, keyed by feature name. The full frame
# is passed (not just the training split), so the vocabularies of the
# VOCAB_FEATURE_KEYS columns cover every value seen in train, val and test.
model_columns = build_model_input_columns(data)

Things get mildly funky here. The Keras functional API expects its inputs to be `tf.keras.layers.Input` tensors, but we've got a boatload of `tf.feature_column`s instead. Here we simply create one Keras input per feature column, so that everything plays nicely together: we get to use the not-infuriating Keras APIs, with the added bonus of having the feature vocabularies available when we deploy the model down the road.

In [None]:
deep_layer_inputs, deep_columns = {}, []
wide_layer_inputs, wide_columns = {}, []

# Features whose Keras input falls back to a string dtype on the wide side.
_string_input_keys = BUCKET_FEATURE_KEYS + VOCAB_FEATURE_KEYS

for key, column in model_columns.items():
    goes_deep = key in DENSE_FLOAT_FEATURE_KEYS

    # Dtype used only when the feature column exposes no `dtype` field of
    # its own: string for the deep, bucket and vocabulary features, int64
    # for the remaining (identity categorical) ones.
    if goes_deep or key in _string_input_keys:
        fallback_dtype = tf.string
    else:
        fallback_dtype = tf.int64

    layer_input = tf.keras.Input(
        shape=(),
        name=key,
        dtype=column._asdict().get('dtype', fallback_dtype)
    )

    # Dense float features feed the deep tower; everything else goes wide.
    if goes_deep:
        deep_layer_inputs[key] = layer_input
        deep_columns.append(column)
    else:
        wide_layer_inputs[key] = layer_input
        wide_columns.append(column)

We'll use a somewhat janky implementation of Google's reference Wide & Deep recommender system architecture, described in [arXiv:1606.07792](https://arxiv.org/abs/1606.07792). It's not an awesome implementation, but it illustrates the idea well enough.

In [None]:
# Turn the raw Keras inputs into dense tensors through the feature columns.
deep_layer_outputs = tf.keras.layers.DenseFeatures(
    feature_columns=deep_columns)(deep_layer_inputs)
wide_layer_outputs = tf.keras.layers.DenseFeatures(
    feature_columns=wide_columns)(wide_layer_inputs)

# Deep tower: four Dense(relu) + BatchNormalization stages of shrinking width,
# named dense_1..dense_4 / norm_1..norm_4.
deep = deep_layer_outputs
for _index, _units in enumerate((100, 75, 50, 30), start=1):
    deep = tf.keras.layers.Dense(
        _units,
        activation='relu',
        name='dense_{}'.format(_index))(deep)
    deep = tf.keras.layers.BatchNormalization(
        name='norm_{}'.format(_index))(deep)

# Join the deep tower with the wide features and predict from BOTH. The output
# layer has to consume `combined`; attaching it to `deep` alone would leave
# every wide input disconnected from the model. Because the wide inputs are
# now live, the training frames must provide them with the dtypes declared on
# the corresponding Keras inputs.
combined = tf.keras.layers.concatenate([deep, wide_layer_outputs])
output = tf.keras.layers.Dense(
    1, activation='sigmoid', name='target')(combined)

model = tf.keras.Model(
    inputs=list(wide_layer_inputs.values()) + list(deep_layer_inputs.values()),
    outputs=output,
    name='tipping')
model.summary()

In [None]:
# Adadelta with learning rate 1.0 and rho 0.95. `learning_rate` is the
# supported argument name (`lr` is only a backward-compatibility alias);
# epsilon and decay are left at the optimizer's defaults.
optimizer = tf.keras.optimizers.Adadelta(
    learning_rate=1.0,
    rho=0.95)

In [None]:
# Binary classification setup: cross-entropy loss, accuracy reported per epoch.
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 passes over the training dataset, evaluating on the validation
# dataset after each one.
model.fit(train_ds, validation_data=val_ds, epochs=10)

# Export the model

In [None]:
export_path_base = tempfile.mkdtemp(prefix='export')
# Published through the environment so the model-server cell can read it as
# ${BASE_PATH}.
os.environ['BASE_PATH'] = export_path_base

# Directory layout: <base>/<model_name>/<version>. Built as a plain str so it
# prints readably and is passed to model.save() as an ordinary path.
export_path = os.path.join(export_path_base, 'tipping', '1')
print('Exporting trained model to', export_path)

model.save(export_path, save_format='tf')

# Install and launch TensorFlow Serving

In [None]:
%%bash
# Stop at the first failing command (including inside pipelines) so a broken
# install surfaces here instead of when the model server is launched.
set -euo pipefail

# The whole cell needs root (it writes under /etc/apt and runs apt-get), so no
# individual command is wrapped in sudo.

# Register the TensorFlow Serving apt repository and its signing key.
# curl -f makes an HTTP error fail the pipeline instead of feeding an error
# page to apt-key.
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" > /etc/apt/sources.list.d/tensorflow-serving.list
curl -fsSL https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -

apt-get update
apt-get install -y tensorflow-model-server

In [None]:
%%script bash --bg
# Start TensorFlow Serving as a background process (--bg) for the model named
# "tipping". BASE_PATH is the export directory that the export cell published
# via os.environ; the REST API is exposed on port 8501, the port the
# prediction request is sent to.
tensorflow_model_server \
    --model_name=tipping \
    --model_base_path="${BASE_PATH}/tipping" \
    --rest_api_port=8501

# Make online predictions

In [None]:
def convert(obj):
    '''`json.dumps` fallback that turns NumPy scalars into native Python values.

    Args:
        obj: a value the JSON encoder could not serialize on its own.

    Returns:
        `bool`, `int` or `float` equivalent of a NumPy boolean, integer or
        floating-point scalar of any width.

    Raises:
        TypeError: if `obj` is none of the supported NumPy scalar kinds.
    '''
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(type(obj).__name__))

# Use a single held-out row (position 5 of the test split, label removed) as
# the instance to score.
for_inference = (
    test
    .reset_index(drop=True)
    .drop(LABEL_KEY, axis=1)
    .loc[5, :]
    .to_dict()
)

# The bucket (coordinate) features are sent as strings.
for key in BUCKET_FEATURE_KEYS:
    for_inference[key] = str(for_inference[key])

# NOTE: from here on `data` is the JSON request body, no longer the dataframe
# that earlier cells stored under the same name.
_request = {
    'signature_name': 'serving_default',
    'instances': [for_inference],
}
data = json.dumps(_request, indent=4, default=convert)

print(data)

In [None]:
headers = {'content-type': 'application/json'}
_predict_url = 'http://localhost:8501/v1/models/tipping:predict'

# The model server is started in the background, so under "Run All" it may not
# be listening yet when this cell executes. Retry refused connections once per
# second for up to 30 attempts; any other error propagates immediately.
_max_attempts = 30
for _attempt in range(1, _max_attempts + 1):
    try:
        json_response = requests.post(
            _predict_url,
            data=data,
            headers=headers,
            timeout=10  # never hang the notebook on an unresponsive server
        )
        break
    except requests.exceptions.ConnectionError:
        if _attempt == _max_attempts:
            raise
        time.sleep(1)

print(json_response.text)