In [1]:
import time
import os
import urllib.parse

import numpy as np
import tensorflow as tf
from tensorflow.core.example.example_pb2 import Example
from tensorflow.core.example.feature_pb2 import Feature
from sklearn.model_selection import train_test_split
import pandas as pd

tf.logging.set_verbosity(tf.logging.INFO) 

# Get data
We are using the Census data. We are proposing a process to assess algorithmic bias in models, so instead of using the classification for whether people have >50k income, it will be used to classify if people should get a loan.

In [106]:
DOWNLOAD = False

csv_columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
                   "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                   "hours-per-week", "native-country", "loan"]

label_column = 'loan'

csv_path = "data/adult.csv"

if DOWNLOAD:
    csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Read the dataset from the provided CSV and print out information about it.
df = pd.read_csv(csv_path, names=csv_columns, skiprows=[0], skipinitialspace=True)

X_display = df.drop('loan', axis=1)
y_display = df['loan']

# Converting Loan column to 1 if >50K
df[label_column] = df[label_column].apply(lambda x: ">50K" in x).astype(int)

for c in ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]:
        df[c] = pd.to_numeric(df[c])
        
# int_columns = df.select_dtypes(['int64']).columns
# df[int_columns] = df[int_columns].astype('int64')

# cat_columns = df.select_dtypes(['object']).columns
# df[cat_columns] = df[cat_columns].astype('category')
# df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

# Delete unused column
del df["fnlwgt"] 

df.head(5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,loan
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [107]:
model_folder = 'trained_model'

examples_file = 'data.tfrecord'

model_path = os.path.join(os.getcwd(), model_folder)
examples_path = os.path.join(os.getcwd(), examples_file)

In [108]:
X = df.drop(label_column, axis=1)
y = df[label_column]

# #Normalising to increase accuracy
# dtypes = list(zip(X.dtypes.index, map(str, X.dtypes)))
# for k,dtype in dtypes:
#     if dtype == "float32":
#         X[k] -= X[k].mean()
#         X[k] /= X[k].std()

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [110]:
BATCH_SIZE = 40
num_epochs = 1
shuffle = True

In [111]:
# Categorical base columns.
sex = tf.feature_column.categorical_column_with_hash_bucket(
    key="sex", hash_bucket_size=1000)
race = tf.feature_column.categorical_column_with_hash_bucket(
    key="race", hash_bucket_size=1000)

education = tf.feature_column.categorical_column_with_hash_bucket(
      "education", hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket(
      "marital-status", hash_bucket_size=100)
relationship = tf.feature_column.categorical_column_with_hash_bucket(
      "relationship", hash_bucket_size=100)
workclass = tf.feature_column.categorical_column_with_hash_bucket(
      "workclass", hash_bucket_size=100)
occupation = tf.feature_column.categorical_column_with_hash_bucket(
      "occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
      "native-country", hash_bucket_size=1000)

In [112]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education-num")
capital_gain = tf.feature_column.numeric_column("capital-gain")
capital_loss  = tf.feature_column.numeric_column("capital-loss")
hours_per_week = tf.feature_column.numeric_column("hours-per-week")

In [113]:
all_columns = [
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(sex),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    tf.feature_column.embedding_column(education, dimension=8),
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

In [114]:

model_dir = "models/model_DEEP_" + str(int(time.time()))
m = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=all_columns,
            hidden_units=[100,100,100])



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/model_DEEP_1543784002', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fadb38c8710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [115]:
%%time 

train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_train,
        y=y_train,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

m.train(input_fn=train_input_fn)

print('training done')

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into models/model_DEEP_1543784002/model.ckpt.
INFO:tensorflow:loss = 556.74524, step = 1
INFO:tensorflow:global_step/sec: 78.4483
INFO:tensorflow:loss = 20.036163, step = 101 (1.275 sec)
INFO:tensorflow:global_step/sec: 93.4177
INFO:tensorflow:loss = 15.656116, step = 201 (1.070 sec)
INFO:tensorflow:global_step/sec: 89.9304
INFO:tensorflow:loss = 17.38699, step = 301 (1.112 sec)
INFO:tensorflow:global_step/sec: 84.6392
INFO:tensorflow:loss = 10.787235, step = 401 (1.181 sec)
INFO:tensorflow:global_step/sec: 92.7844
INFO:tensorflow:loss = 11.119894, step = 501 (1.078 sec)
INFO:tensorflow:global_step/sec: 90.287
INFO:tensorflow:loss = 10.590277, step = 601 (1.108 sec)
INFO:tensorflow:global_step/sec: 83.8991
INFO:t

In [116]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        y=y_test,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

results = m.evaluate(input_fn=eval_input_fn)

print('\nAccuracy: %s' % results['accuracy'])

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-02-20:53:38
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543784002/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-02-20:53:41
INFO:tensorflow:Saving dict for global step 977: accuracy = 0.8344764, accuracy_baseline = 0.7547344, auc = 0.8953799, auc_precision_recall = 0.7514547, average_loss = 0.34096223, global_step = 977, label/mean = 0.24526563, loss = 13.595347, precision = 0.76406777, prediction/mean = 0.21357425, recall = 0.47036728
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 977: models/model_DEEP_1543784002/model.ckpt-977

Accuracy: 0.8344764


In [117]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

predictions = m.predict(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543784002/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[{'logits': array([0.00666034], dtype=float32), 'logistic': array([0.5016651], dtype=float32), 'probabilities': array([0.49833494, 0.5016651 ], dtype=float32), 'class_ids': array([1]), 'classes': array([b'1'], dtype=object)}, {'logits': array([-6.0044193], dtype=float32), 'logistic': array([0.00246175], dtype=float32), 'probabilities': array([0.9975382 , 0.00246175], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}, {'logits': array([-3.4126134], dtype=float32), 'logistic': array([0.03190358], dtype=float32), 'probabilities': array([0.96809644, 0.03190358], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}, {'logits': array([-2.802343], dtype=float32

# Explainability with SHAP

In [118]:
import shap
shap.initjs()


In [120]:
actual_df = []

input_features = ["age", "workclass", "education", "education-num", "marital-status",
                   "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
                   "hours-per-week", "native-country"]

def f(shap_X):
    tmp_df = pd.DataFrame(data=shap_X, columns=input_features)
    # Converting to int
    for c in ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week"]:
        tmp_df[c] = pd.to_numeric(tmp_df[c])
    
    predict_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=tmp_df,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

    pred_gen = m.predict(input_fn=predict_input_fn)
    pred_list = []
    for pred in pred_gen:
        pred_list.append(pred["logistic"][0])
    pred_arr = np.array(pred_list)

    return pred_arr

explainer = shap.KernelExplainer(f, X_train.iloc[:100,:])


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543784002/model.ckpt-977
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [121]:
shap_values = explainer.shap_values(X_train.iloc[350,:], nsamples=500)


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [94]:
shap.force_plot(explainer.expected_value, shap_values, X_test.iloc[350,:])

In [None]:
background = X_train.sample(100, replace=False)

In [None]:
e = shap.DeepExplainer((m.layers[0], background)

shap_values = e.shap_values(x_test[1:5])

shap.image_plot(shap_values, -x_test[1:5])


# Census income classification with Keras

We use keras to perform this computation


In [38]:
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense, Flatten, Concatenate, concatenate, Dropout, Lambda
from keras.models import Model
from keras.layers.embeddings import Embedding
from tqdm import tqdm
import shap

Using TensorFlow backend.


In [48]:
# build model
input_els = []
encoded_els = []
for k,dtype in dtypes:
    input_els.append(Input(shape=(1,)))
    if dtype == "int8":
        e = Flatten()(Embedding(X_train[k].max()+1, 1)(input_els[-1]))
    else:
        e = input_els[-1]
    encoded_els.append(e)
encoded_els = concatenate(encoded_els)
layer1 = Dropout(0.5)(Dense(100, activation="relu")(encoded_els))
out = Dense(1)(layer1)

# train model
regression = Model(inputs=input_els, outputs=[out])
regression.compile(optimizer="adam", loss='binary_crossentropy')
regression.fit(
    [X_train[k].values for k,t in dtypes],
    y_train,
    epochs=50,
    batch_size=512,
    shuffle=True,
    validation_data=([X_test[k].values for k,t in dtypes], y_test)
)

Train on 39073 samples, validate on 9769 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fadb3a9a6a0>

In [97]:
def f(X):
    values = regression.predict([X[:,i] for i in range(X.shape[1])]).flatten()
    print(values)
    return values
explainer = shap.KernelExplainer(f, X.iloc[:100,:])

[-1108.6953     -31.40039    -33.025097   -37.684834   -30.669397
   -34.32478    -28.892414   -40.053234 -7103.692    -2621.888
   -50.615017   -31.368414   -24.938583   -35.976284   -34.442257
   -31.726748   -26.29388    -30.785225   -36.175884   -38.475044
   -44.05599    -33.490055   -30.51061   -793.0917     -40.818542
   -40.94003    -26.960817   -46.953228   -50.777164   -36.996326
   -34.55939    -29.394356  -552.31024    -30.420425   -17.77883
   -35.886227   -27.767893   -20.189445   -29.978155   -37.72492
   -30.19771    -39.863308   -34.194767   -36.992203   -26.295874
   -41.30956    -37.612843   -36.899494   -34.770714   -31.640896
   -28.8227     -21.974937  -743.6929     -44.627304   -44.06381
   -35.20274    -33.736774   -32.59049    -37.23187  -2534.9773
 -1223.3937     -29.03878    -36.634747   -38.78906    -38.342026
   -32.292484   -29.700134   -38.41095    -41.257504   -28.813925
   -23.476707   -31.757086   -44.836777   -28.25681    -45.636
   -29.07753    -34.7

In [98]:
shap_values = explainer.shap_values(X_train.iloc[350,:], nsamples=500)

[-27.706234]
[-1104.076      -18.26278    -27.703339 ...   -28.007942   -28.64113
   -27.706215]
[[ 0.40140185  0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.11587473  0.11587473 ...  0.11587473  0.11587473
   0.11587473]
 [ 0.          0.40140185  0.         ...  0.          0.
   0.        ]
 ...
 [-0.09004047  0.          0.         ...  0.          0.
   0.        ]
 [-0.07609808  0.         -0.07609808 ...  0.         -0.07609808
  -0.07609808]
 [ 0.         -0.09625731  0.         ... -0.09625731  0.
   0.        ]] [ 2.19013216e+00  2.89571273e+01 -1.08282982e-03  2.96031631e+01
  3.40697903e-06  2.96028880e+01  1.32772998e-01  2.95591067e+01
 -6.12666710e-04  2.96032142e+01 -2.51722926e-03  2.96036120e+01
 -6.36653374e-04  2.96030785e+01  9.00437770e-05  2.96028618e+01
  1.00218777e-05  2.96029202e+01  8.68064232e+01  4.52911663e+00
  1.30534451e+01  2.58046111e+01  2.40953182e-01  2.95208124e+01
  1.31072992e-05  2.96029263e+01  6.08119916e-01  1.8

In [99]:
shap.force_plot(explainer.expected_value, shap_values, X_test.iloc[350,:])