In [3]:
import tensorflow as tf
import numpy as np
import shutil
print(tf.__version__)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.14.0


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Read data

In [4]:
CSV_COLUMNS = ["dayofweek", "hourofday", "pickup_borough", "dropoff_borough", "trip_duration"]
LABEL_COLUMN = "trip_duration"
DEFAULTS = [[1], [0], [""], [""],  []]

In [14]:
# Create an input function reading a file using the Dataset API
# Then provide the results to the Estimator API
def read_dataset(filename, mode, batch_size = 512):
    def _input_fn():
        def decode_csv(records):
            columns = tf.decode_csv(records, record_defaults=DEFAULTS)
            features = dict(zip(CSV_COLUMNS, columns))
            features["dayofweek"] = features["dayofweek"] - 1
            label = features.pop(LABEL_COLUMN)
            return features, label
        
        # Create list of files that match pattern
        file_list = tf.gfile.Glob(filename)

        # Create dataset from file list
        dataset = (tf.data.TextLineDataset(file_list,compression_type="GZIP")  # Read text file
                   .map(decode_csv))  # Transform each elem by applying decode_csv fn

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_epochs = 5 # None # indefinitely
            dataset = dataset.shuffle(buffer_size=10*batch_size, seed=42)
        else:
            num_epochs = 1 # end-of-input after this

        dataset = dataset.repeat(num_epochs).batch(batch_size)
        return dataset
    return _input_fn

def get_train_input_fn():
    return read_dataset('gs://edml/data/taxi-trips/train/tlc_yellow_trips_2018-000*.csv',
                        mode = tf.estimator.ModeKeys.TRAIN)

def get_valid_input_fn():
    return read_dataset('gs://edml/data/taxi-trips/val/tlc_yellow_trips_2018-000*.csv',
                        mode = tf.estimator.ModeKeys.EVAL)

## Feature engineering

In [7]:
def get_wide_deep():
    
    borough_list = ["Manhattan", "Queens", "Brooklyn", "Bronx", "Staten Island", "EWR"]
        
    # One hot encode categorical features
    fc_dayofweek = tf.feature_column.categorical_column_with_identity(key="dayofweek", num_buckets = 7)
    fc_hourofday = tf.feature_column.categorical_column_with_identity(key="hourofday", num_buckets = 24)
    fc_pickuploc = tf.feature_column.categorical_column_with_vocabulary_list(key="pickup_borough", 
                                                                             vocabulary_list=borough_list)
    fc_dropoffloc = tf.feature_column.categorical_column_with_vocabulary_list(key="dropoff_borough", 
                                                                              vocabulary_list=borough_list)
    
    # Cross features to get combination of day and hour and pickup-dropoff locations
    fc_crossed_day_hr = tf.feature_column.crossed_column(keys = [fc_dayofweek, fc_hourofday], hash_bucket_size = 24 * 7)
    fc_crossed_pd_pair = tf.feature_column.crossed_column(keys = [fc_pickuploc, fc_dropoffloc], hash_bucket_size = 6*6)
    
    wide = [
        # Feature crosses
        fc_crossed_day_hr, fc_crossed_pd_pair,
        
        # Sparse columns
        fc_dayofweek, fc_hourofday,
        fc_pickuploc, fc_dropoffloc
    ]
    
    # Embedding_column to "group" together ...
    fc_embed_pd_pair = tf.feature_column.embedding_column(categorical_column = fc_crossed_pd_pair, dimension = 4)
    fc_embed_day_hr = tf.feature_column.embedding_column(categorical_column = fc_crossed_day_hr, dimension = 16)
    
    deep = [
        fc_embed_pd_pair,
        fc_embed_day_hr
    ]
    
    return wide, deep

### Serving input receiver function

In [8]:
# Create serving input function to be able to serve predictions later using provided inputs

def serving_input_receiver_fn():
    receiver_tensors = {
        'dayofweek' : tf.placeholder(dtype = tf.int64, shape = [None], name="dayofweek"),
        'hourofday' : tf.placeholder(dtype = tf.int64, shape = [None], name="hourofday"),
        'pickup_borough' : tf.placeholder(dtype = tf.int64, shape = [None], name="pickup_borough"), 
        'dropoff_borough' : tf.placeholder(dtype = tf.int64, shape = [None], name="dropoff_borough"),
    }
    
    features = {
        key: tf.expand_dims(tensor, -1)
        for key, tensor in receiver_tensors.items()
    }
        
    return tf.estimator.export.ServingInputReceiver(features = features, receiver_tensors = receiver_tensors)

## Build and train model

In [9]:
# Create estimator to train and evaluate
def train_and_evaluate(output_dir):
    
    EVAL_INTERVAL = 300
    wide, deep = get_wide_deep()
    
    run_config = tf.estimator.RunConfig(save_checkpoints_secs = EVAL_INTERVAL,
                                        tf_random_seed = 2810,
                                        keep_checkpoint_max = 3)
    
    # Add custom evaluation metric
    def my_rmse(labels, predictions):
        pred_values = tf.squeeze(input = predictions["predictions"], axis = -1)
        return {"rmse": tf.metrics.root_mean_squared_error(labels = labels, predictions = pred_values)}
    
    estimator = tf.estimator.DNNLinearCombinedRegressor(
        model_dir = output_dir,
        linear_feature_columns = wide,
        dnn_feature_columns = deep,
        dnn_hidden_units = [128, 64, 32],
        config = run_config)
    
    estimator = tf.contrib.estimator.add_metrics(estimator = estimator, metric_fn = my_rmse) 
    
    train_spec = tf.estimator.TrainSpec(
        input_fn = get_train_input_fn(),
        max_steps = 500)
    
    exporter = tf.estimator.LatestExporter('exporter', serving_input_receiver_fn = serving_input_receiver_fn)
    
    eval_spec = tf.estimator.EvalSpec(
        input_fn = get_valid_input_fn(),
        steps = None,
        start_delay_secs = 60, # start evaluating after N seconds
        throttle_secs = EVAL_INTERVAL,  # evaluate every N seconds
        exporters = exporter)
    
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

### Train model

In [10]:
OUTDIR = "gs://edml/data/taxi-trips/model"

In [15]:
%%bash

# gsutil rm gs://bucket/subdir/** will remove all objects under gs://bucket/subdir or any of its subdirectories.
gsutil rm gs://edml/data/taxi-trips/model** # start fresh each time

Removing gs://edml/data/taxi-trips/model/...
Removing gs://edml/data/taxi-trips/model/checkpoint...                          
Removing gs://edml/data/taxi-trips/model/events.out.tfevents.1572266092.tensorflow-20191018...
Removing gs://edml/data/taxi-trips/model/graph.pbtxt...                         
/ [4 objects]                                                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m rm ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Removing gs://edml/data/taxi-trips/model/model.ckpt-0.data-00000-of-00002...
Removing gs://edml/data/taxi-trips/model/model.ckpt-0.data-00001-of-00002...    
Removing gs://edml/data/taxi-trips/model/model.ckpt-0.index...                  
Removing gs://edml/data/taxi-trips/model/model.ckpt-0.meta...                   
Removing gs://edml/data/taxi-trips/model/mode

In [16]:
tf.summary.FileWriterCache.clear() # ensure filewriter cache is clear for TensorBoard events file
tf.logging.set_verbosity(v = tf.logging.INFO) # so loss is printed during training

In [17]:
train_and_evaluate(OUTDIR)

INFO:tensorflow:Using config: {'_num_worker_replicas': 1, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f698438f550>, '_save_checkpoints_secs': 300, '_global_id_in_cluster': 0, '_model_dir': 'gs://edml/data/taxi-trips/model', '_task_type': 'worker', '_save_checkpoints_steps': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_protocol': None, '_evaluation_master': '', '_tf_random_seed': 2810, '_num_ps_replicas': 0, '_experimental_max_worker_delay_secs': None, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_experimental_distribute': None, '_train_distribute': None, '_save_summary_steps': 100, '_master': '', '_keep_checkpoint_max': 3, '_log_step_count_steps': 100, '_device_fn': None, '_eval_distribute': None}
INFO:tensorflow:Using config: {'_num_worker_replicas': 1, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.s

KeyboardInterrupt: 