In [25]:
## What-If Tool from scratch - From CSV to trained binary classification model to What-If Tool usage

# This notebook shows the process of loading up a dataset from CSV, training a very simple classifier to
# predict one of the columns, then using the What-If Tool (WIT) to analyze the training dataset and the trained
# model.

# This notebook uses the UCI Census dataset and learning problem, detailed at
# https://archive.ics.uci.edu/ml/datasets/census+income, which predicts whether a person earns more than $50k
# given their census information.
# To customize this notebook to work on your own dataset, you only need to edit the sections marked with "USER: "

## Setup (install Jupyter, Tensorflow, and Tensorflow Serving in a virtualenv).
# NOTE: Use of a virtualenv, pip installation of tensorflow and docker use for TF Serving aren't the only way
# to set all this up. I just find it the simplest and safest to use.

# Step 1: Install Tensorflow using pip/virtualenv - See https://www.tensorflow.org/install/pip for instructions

# Step 2: Install Tensorflow Serving using docker - See https://www.tensorflow.org/serving/docker for instructions

# The next steps must be done from a terminal that has activated the virtualenv that was created in step 1

# Step 3: Install Jupyter to view and run this notebook
# > pip install jupyter

# Step 4: Run this notebook
# > jupyter notebook
# From the file selector that opens in the browser, select this notebook file.
# Run the cells.

In [26]:
## Load helper functions

import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow import data

# Writes a pandas dataframe to disk as a tfrecord file of tf.Example protos,
# using only the dataframe columns specified. Non-numeric columns are treated
# as strings.
def write_df_as_tfrecord(df, filename, columns=None):
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    writer = tf.python_io.TFRecordWriter(filename)
    if columns == None:
        columns = df.columns.values.tolist()
    for index, row in df.iterrows():
        example = tf.train.Example()
        for col in columns:
            if df[col].dtype is np.dtype(np.int64):
                example.features.feature[col].int64_list.value.append(row[col])
            elif df[col].dtype is np.dtype(np.float64):
                example.features.feature[col].float_list.value.append(row[col])
            elif row[col] == row[col]:
                example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))
        writer.write(example.SerializeToString())
    writer.close()


# Creates a tf feature spec from the dataframe and columns specified.
def create_feature_spec(df, columns):
    feature_spec = {}
    for f in columns:
        if df[f].dtype is np.dtype(np.int64):
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.int64)
        elif df[f].dtype is np.dtype(np.float64):
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.float32)
        else:
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.string)
    return feature_spec

# Parses a serialized tf.Example into input features and target feature from 
# the provided label feature name and feature spec.
def parse_tf_example(example_proto, label, feature_spec):
    parsed_features = tf.parse_example(serialized=example_proto, features=feature_spec)
    target = parsed_features.pop(label)
    return parsed_features, target

# An input function for providing input to a model from tf.Examples from tf record files.
def tfrecords_input_fn(files_name_pattern, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None, 
                       batch_size=64):
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TFRecordDataset(filenames=file_names)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda tf_example: parse_tf_example(tf_example, label, feature_spec))
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, target

# Creates simple numeric and categorical feature columns from a feature spec and a
# list of columns from that spec to use.
#
# NOTE: Models might perform better with some feature engineering such as bucketed
# numeric columns and hash-bucket/embedding columns for categorical features.
def create_feature_columns(columns, feature_spec):
    ret = []
    for col in columns:
        if feature_spec[col].dtype is tf.int64 or feature_spec[col].dtype is tf.float32:
            ret.append(tf.feature_column.numeric_column(col))
        else:
            ret.append(tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_list(col, list(df[col].unique()))))
    return ret

In [27]:
## Read the dataset from a CSV into dataframe and display a list of all columns and a preview of the data

# USER: Set the path to the CSV containing the dataset to train on (can be a web address or local path).
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# USER: Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# the column names, then set this to None.
csv_columns = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital-Status",
               "Occupation", "Relationship", "Race", "Sex", "Capital-Gain", "Capital-Loss",
               "Hours-per-week", "Country", "Target"]

# Read the dataset from the provided CSV and print out information about it.
df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)
print(df.columns.tolist())
df.head(5)

['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-Gain', 'Capital-Loss', 'Hours-per-week', 'Country', 'Target']


Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-Gain,Capital-Loss,Hours-per-week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [28]:
# USER: Set the name you want to give the directory the model will be saved to
model_name = 'trained_model'

# USER: Set the name you want to give the tfrecord dataset file
tfrecord_name = 'data.tfrecord'

# USER: Set the column in the dataset you wish for the model to predict
label_column = 'Target'

model_path = os.path.join(os.getcwd(), model_name)
tfrecord_path = os.path.join(os.getcwd(), tfrecord_name)

# USER: Make the label column numeric (0 and 1), for use in our model.
# In this case, examples with a target value of '<=50K' are considered to be in the '0' (negative) class
# and all other examples are considered to be in the '1' (positive) class.
df[label_column] = np.where(df[label_column] == '<=50K', 0, 1)

# USER: If the CSV needs any clean-up (such as removing problematic rows or creating new columns), do it here.

# USER: Set list of all columns from the dataset we will use for model input.
input_features = ['Age', 'Workclass', 'Education', 'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
                  'Capital-Gain', 'Capital-Loss', 'Hours-per-week', 'Country']

# Ensure the label column is not accidentally set as an input feature.
if label_column in input_features:
    input_features.remove(label_column)

# Create a list containing all input features and the label column
features_and_labels = input_features + [label_column]

In [29]:
# Write the records to disk as tf.Example protos in tf record file, for use in model training
# and later for use by WIT.
write_df_as_tfrecord(df, tfrecord_path, features_and_labels)

In [30]:
## Create and train the classifier

import functools

# Create a feature spec for the classifier
feature_spec = create_feature_spec(df, features_and_labels)

# Define and train the classifier
train_inpf = functools.partial(tfrecords_input_fn, tfrecord_path, feature_spec, label_column)
classifier = tf.estimator.LinearClassifier(
    feature_columns=create_feature_columns(input_features, feature_spec))
classifier.train(train_inpf, steps=10000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpq77g1dm4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f248d99c668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa

INFO:tensorflow:loss = 27.781816, step = 7101 (0.283 sec)
INFO:tensorflow:global_step/sec: 373.999
INFO:tensorflow:loss = 126.17555, step = 7201 (0.268 sec)
INFO:tensorflow:global_step/sec: 287.1
INFO:tensorflow:loss = 32.98658, step = 7301 (0.349 sec)
INFO:tensorflow:global_step/sec: 261.509
INFO:tensorflow:loss = 35.472523, step = 7401 (0.383 sec)
INFO:tensorflow:global_step/sec: 288.418
INFO:tensorflow:loss = 133.76158, step = 7501 (0.346 sec)
INFO:tensorflow:global_step/sec: 354.337
INFO:tensorflow:loss = 127.338425, step = 7601 (0.283 sec)
INFO:tensorflow:global_step/sec: 389.959
INFO:tensorflow:loss = 15.225655, step = 7701 (0.256 sec)
INFO:tensorflow:global_step/sec: 248.277
INFO:tensorflow:loss = 32.691788, step = 7801 (0.403 sec)
INFO:tensorflow:global_step/sec: 372.777
INFO:tensorflow:loss = 48.37611, step = 7901 (0.268 sec)
INFO:tensorflow:global_step/sec: 260.326
INFO:tensorflow:loss = 20.456463, step = 8001 (0.384 sec)
INFO:tensorflow:global_step/sec: 312.662
INFO:tensorfl

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f248d625da0>

In [32]:
## Save the classifier to disk for serving

# Uses a parsing serving input receiver function so that it can classify from serialized tf.Examples
# using the TensorFlow Serving Classify API.

serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
classifier.export_savedmodel(model_path, serving_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from /tmp/tmpq77g1dm4/model.ckpt-10000
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: /home/alejandro/Programming/bias-eval/trained_model/temp-b'1543751245'/saved_model.pb


b'/home/alejandro/Programming/bias-eval/trained_model/1543751245'

In [33]:
## Print out the What-If Tool usage instructions (serve model, launch TensorBoard, configure What-If Tool)
import urllib

docker_command = 'sudo docker run -p 8500:8500 --mount type=bind,source=%s,target=/models/my_model/ -e MODEL_NAME=my_model -t tensorflow/serving' % model_path
what_if_tool_path = ('http://localhost:6006/#whatif&inferenceAddress1=%s&modelName1=my_model&examplesPath=%s' % 
                     (urllib.parse.quote('localhost:8500'), urllib.parse.quote(tfrecord_path)))

print('Command to serve model:')
print (docker_command)
print ('\n')

print ('Command to launch tensorboard:')
print ('tensorboard --logdir .')
print ('\n')


print ('URL to view What-If Tool for your model and dataset:')
print (what_if_tool_path)

# To kill the served model, find the docker container ID through 'sudo docker container ls',
# then run 'sudo docker kill [containerId]'

Command to serve model:
sudo docker run -p 8500:8500 --mount type=bind,source=/home/alejandro/Programming/bias-eval/trained_model,target=/models/my_model/ -e MODEL_NAME=my_model -t tensorflow/serving


Command to launch tensorboard:
tensorboard --logdir .


URL to view What-If Tool for your model and dataset:
http://localhost:6006/#whatif&inferenceAddress1=localhost%3A8500&modelName1=my_model&examplesPath=/home/alejandro/Programming/bias-eval/data.tfrecord
