In [264]:
import time
import os
import urllib.parse

import numpy as np
import tensorflow as tf
from tensorflow.core.example.example_pb2 import Example
from tensorflow.core.example.feature_pb2 import Feature
import pandas as pd



In [265]:

tf.logging.set_verbosity(tf.logging.INFO) 
# Set to INFO for tracking training, default is WARN 

print("Using TensorFlow version %s" % (tf.__version__)) 

CATEGORICAL_COLUMNS = ["workclass", "education", 
                       "marital-status", "occupation", 
                       "relationship", "race", 
                       "sex", "native-country"]

# Columns of the input csv file
COLUMNS = ["age", "workclass", "fnlwgt", "education", 
           "education-num", "marital-status",
           "occupation", "relationship", "race", 
           "sex", "capital-gain", "capital-loss",
           "hours-per-week", "native-country", "income"]

# Feature columns for input into the model
FEATURE_COLUMNS = ["age", "workclass", "education", 
                   "education-num", "marital-status",
                   "occupation", "relationship", "race", 
                   "sex", "capital-gain", "capital-loss",
                   "hours-per-week", "native-country"]

INFERENCE_COLUMNS = ["Inference correct", "Inference label", "Inference score", "Inference value"]


Using TensorFlow version 1.12.0


In [295]:
df = pd.read_csv("adult_full.csv")

In [267]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [268]:
df.describe(include=[np.number])

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [269]:
df.describe(include=[np.object])

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [270]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [271]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [272]:
from sklearn.model_selection import train_test_split
BATCH_SIZE = 40

num_epochs = 1
shuffle = True

In [273]:

y = df["income"].apply(lambda x: ">50K" in x).astype(int)
del df["fnlwgt"] # Unused column
X = df.drop("income", axis=1)


In [274]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)


In [275]:

train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_train,
        y=y_train,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)



In [276]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X_test,
        y=y_test,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [277]:
full_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=X,
        y=y,
        batch_size=BATCH_SIZE,
        num_epochs=num_epochs,
        shuffle=shuffle)

In [278]:
# Categorical base columns.
sex = tf.feature_column.categorical_column_with_vocabulary_list(
    key="sex",                                                           
    vocabulary_list=["female", "male"])
race = tf.feature_column.categorical_column_with_vocabulary_list(
    key="race",                                                             
    vocabulary_list=["Amer-Indian-Eskimo",
                     "Asian-Pac-Islander",
                     "Black", "Other", "White"])

education = tf.feature_column.categorical_column_with_hash_bucket(
  "education", hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket(
  "marital-status", hash_bucket_size=100)
relationship = tf.feature_column.categorical_column_with_hash_bucket(
  "relationship", hash_bucket_size=100)
workclass = tf.feature_column.categorical_column_with_hash_bucket(
  "workclass", hash_bucket_size=100)
occupation = tf.feature_column.categorical_column_with_hash_bucket(
  "occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
  "native-country", hash_bucket_size=1000)

print('Categorical columns configured')

Categorical columns configured


In [279]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education-num")
capital_gain = tf.feature_column.numeric_column("capital-gain")
capital_loss  = tf.feature_column.numeric_column("capital-loss")
hours_per_week = tf.feature_column.numeric_column("hours-per-week")

In [280]:
wide_columns = [sex, race, native_country,
      education, occupation, workclass,
      marital_status, relationship]

deep_columns = [
    # Multi-hot indicator columns for columns with fewer possibilities
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(sex),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    # Embeddings for categories with more possibilities. Should have at least (possibilties)**(0.25) dims
    tf.feature_column.embedding_column(education, dimension=8),
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    # Numerical columns
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]


In [281]:

model_dir = "models/model_DEEP_" + str(int(time.time()))
m = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100,100,100])



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/model_DEEP_1543752757', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f59af459518>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [282]:
%%time 

m.train(input_fn=train_input_fn)

print('training done')

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into models/model_DEEP_1543752757/model.ckpt.
INFO:tensorflow:loss = 199.69298, step = 1
INFO:tensorflow:global_step/sec: 62.7243
INFO:tensorflow:loss = 14.9565525, step = 101 (1.595 sec)
INFO:tensorflow:global_step/sec: 88.4871
INFO:tensorflow:loss = 20.36529, step = 201 (1.130 sec)
INFO:tensorflow:global_step/sec: 86.3634
INFO:tensorflow:loss = 13.351946, step = 301 (1.158 sec)
INFO:tensorflow:global_step/sec: 89.7295
INFO:tensorflow:loss = 17.712923, step = 401 (1.115 sec)
INFO:tensorflow:global_step/sec: 85.5792
INFO:tensorflow:loss = 19.031654, step = 501 (1.168 sec)
INFO:tensorflow:global_step/sec: 82.9433
INFO:tensorflow:loss = 9.162474, step = 601 (1.205 sec)
INFO:tensorflow:Saving checkpoints for 652 int

In [283]:
results = m.evaluate(input_fn=eval_input_fn)
print('evaluate done')
print('\nAccuracy: %s' % results['accuracy'])



INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-02-12:12:54
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543752757/model.ckpt-652
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-02-12:12:57
INFO:tensorflow:Saving dict for global step 652: accuracy = 0.84154767, accuracy_baseline = 0.7575618, auc = 0.89905035, auc_precision_recall = 0.764175, average_loss = 0.33150694, global_step = 652, label/mean = 0.2424382, loss = 13.246041, precision = 0.7145098, prediction/mean = 0.23650843, recall = 0.57694745
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 652: models/model_DEEP_1543752757/model.ckpt-652
evaluate done

Accuracy: 0.84154767


In [284]:
feature_spec = tf.feature_column.make_parse_example_spec(deep_columns)


serving_input_receiver_fn = \
    tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

export_dir = m.export_savedmodel( \
            'export', serving_input_receiver_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: ['regression']
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543752757/model.ckpt-652
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: export/temp-b'1543752778'/saved_model.pb


In [285]:
predictions = m.predict(input_fn=full_input_fn)

all_preds = []
[[1,1,1,1],[2,2,2,2]]

for p, i in zip(predictions, y):
    i_correct = "correct" if p["class_ids"][0] == i else "incorrect"
    i_label = ">50K" if p["class_ids"][0] == 1 else "<=50K"
    i_score = p["probabilities"][p["class_ids"][0]]
    i_value = p["class_ids"][0]
    row = []
    row.append(i_correct)
    row.append(i_label)
    row.append(i_score)
    row.append(i_value)
    all_preds.append(row)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543752757/model.ckpt-652
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [286]:
p_df = pd.DataFrame(all_preds, columns=INFERENCE_COLUMNS)
p_df.head(5)

Unnamed: 0,Inference correct,Inference label,Inference score,Inference value
0,correct,<=50K,0.966029,0
1,correct,<=50K,0.509781,0
2,correct,<=50K,0.901539,0
3,incorrect,>50K,0.526714,1
4,correct,<=50K,0.87499,0


In [287]:
full_df = p_df.join(df)
full_df.head(5)

Unnamed: 0,Inference correct,Inference label,Inference score,Inference value,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,correct,<=50K,0.966029,0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,correct,<=50K,0.509781,0,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,correct,<=50K,0.901539,0,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,incorrect,>50K,0.526714,1,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,correct,<=50K,0.87499,0,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [255]:
# Create exammples

def write_df_as_tfrecord(df, filename, feature_cols):
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    writer = tf.python_io.TFRecordWriter(filename)
    if feature_cols == None:
        feature_cols = df.feature_cols.values.tolist()
    for index, row in df.iterrows():
        example = tf.train.Example()
        for col in feature_cols:
            if df[col].dtype is np.dtype(np.int64):
                example.features.feature[col].int64_list.value.append(row[col])
            elif df[col].dtype is np.dtype(np.float64):
                example.features.feature[col].float_list.value.append(row[col])
            elif row[col] == row[col]:
                example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))
        
        writer.write(example.SerializeToString())
    writer.close()
    
    
features_and_labels = FEATURE_COLUMNS + ["income"]
tfrecord_path = os.path.join(os.getcwd(), 'data.tfrecord')
write_df_as_tfrecord(full_df, tfrecord_path, features_and_labels)



In [210]:
# predictions = m.predict(input_fn=full_input_fn)

# predictions_path = os.path.join(os.getcwd(), 'predictions.tfrecord')

# def write_predictions_as_text(filename, predictions):
#     if not os.path.exists(os.path.dirname(filename)):
#         os.makedirs(os.path.dirname(filename))
    
#     with open(filename, "w") as f:
#         # Writing ID file
#         f.write("Inference correct\tInference label\tInference score\tInference value\n")
#         for p, i in zip(predictions, y):
#             i_correct = "correct" if p["class_ids"][0] == i else "incorrect"
#             i_label = ">50K" if p["class_ids"][0] == 1 else "<=50K"
#             i_score = p["probabilities"][p["class_ids"][0]]
#             i_value = p["class_ids"]
#             f.write("%s\t%s\t%s\t%s" % (
#                 i_correct, i_label, i_score, i_value))
#             f.write("\n")
    
# write_predictions_as_text(predictions_path, predictions)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from models/model_DEEP_1543599990/model.ckpt-652
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [256]:
import urllib.parse

what_if_tool_path = \
    ('http://localhost:6006/#whatif&inferenceAddress1=%s&examplesPath=%s' % 
    (urllib.parse.quote('localhost:8500'), urllib.parse.quote(tfrecord_path)))

print(what_if_tool_path)

http://localhost:6006/#whatif&inferenceAddress1=localhost%3A8500&examplesPath=/home/alejandro/Programming/bias-eval/data.tfrecord
