In [1]:
import time
import os
import urllib.parse

import numpy as np
import tensorflow as tf
from tensorflow.core.example.example_pb2 import Example
from tensorflow.core.example.feature_pb2 import Feature
from sklearn.model_selection import train_test_split
import pandas as pd

tf.logging.set_verbosity(tf.logging.INFO) 

# Get data
We are using the Census data. We are proposing a process to assess algorithmic bias in models, so instead of using the classification for whether people have >50k income, it will be used to classify if people should get a loan.

In [2]:
DOWNLOAD = False

csv_columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                   "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                   "hours-per-week", "native-country", "loan"]

label_column = 'loan'

csv_path = "data/adult.csv"

if DOWNLOAD:
    csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Read the dataset from the provided CSV and print out information about it.
df = pd.read_csv(csv_path, names=csv_columns, skiprows=[0], skipinitialspace=True)

# Converting Loan column to 1 if >50K
df[label_column] = df[label_column].apply(lambda x: ">50K" in x).astype(int)

for c in ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]:
        df[c] = pd.to_numeric(df[c])

# Delete unused column
del df["fnlwgt"] 

df.head(5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,loan
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
model_folder = 'trained_model'

examples_file = 'data.tfrecord'

model_path = os.path.join(os.getcwd(), model_folder)
examples_path = os.path.join(os.getcwd(), examples_file)

In [4]:
X = df.drop(label_column, axis=1)
y = df[label_column]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [6]:
BATCH_SIZE = 40
num_epochs = 1
shuffle = True

In [7]:
# Categorical base columns.
sex = tf.feature_column.categorical_column_with_vocabulary_list(
    key="sex",                                                           
    vocabulary_list=["female", "male"])
race = tf.feature_column.categorical_column_with_vocabulary_list(
    key="race",                                                             
    vocabulary_list=["Amer-Indian-Eskimo",
                     "Asian-Pac-Islander",
                     "Black", "Other", "White"])

education = tf.feature_column.categorical_column_with_hash_bucket(
  "education", hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket(
  "marital-status", hash_bucket_size=100)
relationship = tf.feature_column.categorical_column_with_hash_bucket(
  "relationship", hash_bucket_size=100)
workclass = tf.feature_column.categorical_column_with_hash_bucket(
  "workclass", hash_bucket_size=100)
occupation = tf.feature_column.categorical_column_with_hash_bucket(
  "occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
  "native-country", hash_bucket_size=1000)

In [8]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education-num")
capital_gain = tf.feature_column.numeric_column("capital-gain")
capital_loss  = tf.feature_column.numeric_column("capital-loss")
hours_per_week = tf.feature_column.numeric_column("hours-per-week")

In [9]:
all_columns = [
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(sex),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    tf.feature_column.embedding_column(education, dimension=8),
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

In [10]:

model_dir = "models/model_DEEP_" + str(int(time.time()))
m = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=all_columns,
            hidden_units=[100,100,100])



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/model_DEEP_1543768058', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdf8ea56668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [11]:
%%time 

train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_train,
        y=y_train,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

m.train(input_fn=train_input_fn)

print('training done')

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into models/model_DEEP_1543768058/model.ckpt.
INFO:tensorflow:loss = 47.412918, step = 1
INFO:tensorflow:global_step/sec: 53.1471
INFO:tensorflow:loss = 26.449947, step = 101 (1.882 sec)
INFO:tensorflow:global_step/sec: 39.5366
INFO:tensorflow:loss = 17.033653, step = 201 (2.529 sec)
INFO:tensorflow:global_step/sec: 51.4118
INFO:tensorflow:loss = 13.045663, step = 301 (1.

In [12]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        y=y_test,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

results = m.evaluate(input_fn=eval_input_fn)

print('\nAccuracy: %s' % results['accuracy'])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-02-16:27:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543768058/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-02-16:28:02
INFO:tensorflow:Saving dict for global step 977: accuracy = 0.8432798, accuracy_baseline = 0.7636401, auc = 0.9001138, auc_precision_recall = 0.75323397, average_loss = 0.32540274, global_step = 977, label/mean = 0.23635991, loss = 12.9749365, precision = 0.7253766, prediction/mean = 0.23578101, recall = 0.5422261
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 977: models/model_DEEP_1543768058/model.ckpt-977

Accuracy: 0.8432798


In [13]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

predictions = m.predict(input_fn=eval_input_fn)

print(list(predictions))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543768058/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[{'logits': array([-1.8445925], dtype=float32), 'logistic': array([0.13650905], dtype=float32), 'probabilities': array([0.86349094, 0.13650906], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}, {'logits': array([-5.7458534], dtype=float32), 'logistic': array([0.00318582], dtype=float32), 'probabilities': array([0.9968142 , 0.00318582], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}, {'logits': array([-5.7088413], dtype=float32), 'logistic': array([0.00330555], dtype=float32), 'probabilities': array([0.99669445, 0.00330555], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}, {'logits': array([-1.8541266], dtype=float

# Explainability with SHAP

In [14]:
import shap
shap.initjs()
from importlib import reload
reload(shap)

<module 'shap' from '/home/alejandro/anaconda3/lib/python3.6/site-packages/shap/__init__.py'>

In [15]:
actual_df = []

input_features = ["age", "workclass", "education", "education-num", "marital-status",
                   "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                   "hours-per-week", "native-country"]

def f(shap_X):
    tmp_df = pd.DataFrame(data=shap_X, columns=input_features)
    # Converting to int
    for c in ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]:
        tmp_df[c] = pd.to_numeric(tmp_df[c])
    
    predict_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=tmp_df,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

    pred_gen = m.predict(input_fn=predict_input_fn)
    pred_list = []
    for pred in pred_gen:
        pred_list.append(pred["logits"][0])
    pred_arr = np.array(pred_list)

    return pred_arr

explainer = shap.KernelExplainer(f, X_train.iloc[:100,:], link="logit")

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543768058/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [16]:
shap_values = explainer.shap_values(X_train.iloc[350,:], nsamples=500)
shap.force_plot(shap_values, X_display.iloc[350,:])

[24]
[[53]
 [23]
 [45]
 [38]
 [38]
 [39]
 [45]
 [19]
 [29]
 [22]
 [50]
 [30]
 [22]
 [53]
 [31]
 [24]
 [25]
 [44]
 [32]
 [55]
 [30]
 [55]
 [33]
 [47]
 [59]
 [59]
 [30]
 [47]
 [28]
 [61]
 [18]
 [31]
 [62]
 [30]
 [32]
 [62]
 [58]
 [55]
 [37]
 [31]
 [37]
 [53]
 [45]
 [27]
 [19]
 [69]
 [66]
 [50]
 [47]
 [51]
 [27]
 [34]
 [48]
 [35]
 [22]
 [48]
 [36]
 [26]
 [25]
 [28]
 [34]
 [50]
 [31]
 [21]
 [20]
 [33]
 [28]
 [50]
 [18]
 [19]
 [35]
 [21]
 [35]
 [61]
 [37]
 [41]
 [40]
 [36]
 [41]
 [46]
 [55]
 [35]
 [57]
 [58]
 [58]
 [49]
 [67]
 [45]
 [33]
 [37]
 [33]
 [60]
 [28]
 [31]
 [17]
 [64]
 [33]
 [22]
 [37]
 [30]]
['Private']
[['Private']
 ['Private']
 ['Private']
 ['Private']
 ['Private']
 ['Self-emp-not-inc']
 ['Private']
 ['Private']
 ['Private']
 ['Local-gov']
 ['Private']
 ['Private']
 ['Private']
 ['Private']
 ['Local-gov']
 ['Private']
 ['Private']
 ['Private']
 ['State-gov']
 ['Self-emp-not-inc']
 ['Private']
 ['Private']
 ['Local-gov']
 ['Federal-gov']
 ['Private']
 ['Private']
 ['Federal-gov

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543768058/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543768058/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


  return np.log(x/(1-x))
  outputs = ufunc(*inputs)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [17]:
shap.force_plot(shap_values, X_display.iloc[350,:])

NameError: name 'shap_values' is not defined

In [None]:
background = X_train.sample(100, replace=False)

In [None]:
e = shap.DeepExplainer((m.layers[0], background)

shap_values = e.shap_values(x_test[1:5])

shap.image_plot(shap_values, -x_test[1:5])