In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import numpy as np
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO) 
# Set to INFO for tracking training, default is WARN 

print("Using TensorFlow version %s" % (tf.__version__)) 
# This notebook is intended for tested for TF 1.3

CATEGORICAL_COLUMNS = ["workclass", "education", 
                       "marital_status", "occupation", 
                       "relationship", "race", 
                       "gender", "native_country"]

# Columns of the input csv file
COLUMNS = ["age", "workclass", "fnlwgt", "education", 
           "education_num", "marital_status",
           "occupation", "relationship", "race", 
           "gender", "capital_gain", "capital_loss",
           "hours_per_week", "native_country", "income_bracket"]

# Feature columns for input into the model
FEATURE_COLUMNS = ["age", "workclass", "education", 
                   "education_num", "marital_status",
                   "occupation", "relationship", "race", 
                   "gender", "capital_gain", "capital_loss",
                   "hours_per_week", "native_country"]

  from ._conv import register_converters as _register_converters


Using TensorFlow version 1.5.0


In [2]:
import pandas as pd

df = pd.read_csv("adult.test.csv", header=None, names=COLUMNS)

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
df.describe(include=[np.number])

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,16278.0,16278.0,16278.0,16278.0,16278.0,16278.0
mean,38.767416,189431.2,10.072368,1081.769382,87.915469,40.390466
std,13.85037,105711.4,2.567474,7584.547894,403.140665,12.479308
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,116738.5,9.0,0.0,0.0,40.0
50%,37.0,177829.5,10.0,0.0,0.0,40.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0


In [6]:
df.describe(include=[np.object])

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,gender,native_country,income_bracket
count,16278,16278,16278,16278,16278,16278,16278,16278,16278
unique,9,16,7,15,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,11208,5283,7401,2031,6521,13944,10857,14659,12433


In [7]:
df.corr()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
age,1.0,-0.076556,0.019944,0.076362,0.055304,0.077096
fnlwgt,-0.076556,1.0,-0.029951,-0.011705,0.007396,-0.003234
education_num,0.019944,-0.029951,1.0,0.130089,0.083133,0.134766
capital_gain,0.076362,-0.011705,0.130089,1.0,-0.031106,0.089421
capital_loss,0.055304,0.007396,0.083133,-0.031106,1.0,0.054926
hours_per_week,0.077096,-0.003234,0.134766,0.089421,0.054926,1.0


In [3]:
BATCH_SIZE = 40

def generate_input_fn(filename, num_epochs=None, shuffle=True, batch_size=BATCH_SIZE):
    df = pd.read_csv(filename, header=None, names=COLUMNS)
    labels = df["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
    del df["fnlwgt"] # Unused column
    del df["income_bracket"] # Labels column, already saved to labels variable
    
    return tf.estimator.inputs.pandas_input_fn(
        x=df,
        y=labels,
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=shuffle)

print('input function configured')

input function configured


In [4]:
%%time
# The layers module contains many utilities for creating feature columns.

# Categorical base columns.
gender = tf.feature_column.categorical_column_with_vocabulary_list(key="gender", 
                                                                   vocabulary_list=["female", "male"])
race = tf.feature_column.categorical_column_with_vocabulary_list(key="race",
                                                                 vocabulary_list=["Amer-Indian-Eskimo",
                                                                       "Asian-Pac-Islander",
                                                                       "Black", "Other",
                                                                       "White"])

education = tf.feature_column.categorical_column_with_hash_bucket(
  "education", hash_bucket_size=1000)
marital_status = tf.feature_column.categorical_column_with_hash_bucket(
  "marital_status", hash_bucket_size=100)
relationship = tf.feature_column.categorical_column_with_hash_bucket(
  "relationship", hash_bucket_size=100)
workclass = tf.feature_column.categorical_column_with_hash_bucket(
  "workclass", hash_bucket_size=100)
occupation = tf.feature_column.categorical_column_with_hash_bucket(
  "occupation", hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
  "native_country", hash_bucket_size=1000)

print('Categorical columns configured')

Categorical columns configured
Wall time: 0 ns


In [5]:
%%time
# Continuous base columns.
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss  = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

print('Continuous columns configured')

Continuous columns configured
Wall time: 1e+03 µs


In [6]:
# Transformations.
age_buckets = tf.feature_column.bucketized_column(
    age, boundaries=[ 18, 25, 30, 35, 40, 45, 50, 55, 60, 65 ])

education_occupation = tf.feature_column.crossed_column(
    ["education", "occupation"], hash_bucket_size=int(1e4))

age_race_occupation = tf.feature_column.crossed_column(
    [age_buckets, "race", "occupation"], hash_bucket_size=int(1e6))

country_occupation = tf.feature_column.crossed_column(
    ["native_country", "occupation"], hash_bucket_size=int(1e4))

print('Transformations complete')

Transformations complete


In [7]:
%%time
# Wide columns and deep columns.
wide_columns = [gender, race, native_country,
      education, occupation, workclass,
      marital_status, relationship,
      age_buckets, education_occupation,
      age_race_occupation, country_occupation]

deep_columns = [
    # Multi-hot indicator columns for columns with fewer possibilities
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(marital_status),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    tf.feature_column.indicator_column(race),
    # Embeddings for categories with more possibilities
    tf.feature_column.embedding_column(education, dimension=8),
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation, dimension=8),
    # Numerical columns
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]

print('wide and deep columns configured')

wide and deep columns configured
Wall time: 0 ns


In [8]:
%%time
def create_model_dir(model_type):
    return 'models/model_' + model_type + '_' + str(int(time.time()))

# If new_model=False, pass in the desired model_dir 
def get_model(model_type, new_model=False, model_dir=None):
    if new_model or model_dir is None:
        model_dir = create_model_dir(model_type) # Comment out this line to continue training a existing model
    print("Model directory = %s" % model_dir)
    
    m = None
    
    # Linear Classifier
    if model_type == 'WIDE':
        m = tf.estimator.LinearClassifier(
            model_dir=model_dir, 
            feature_columns=wide_columns)

    # Deep Neural Net Classifier
    if model_type == 'DEEP':
        m = tf.estimator.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50])

    # Combined Linear and Deep Classifier
    if model_type == 'WIDE_AND_DEEP':
        m = tf.contrib.learn.DNNLinearCombinedClassifier(
                model_dir=model_dir,
                linear_feature_columns=wide_columns,
                dnn_feature_columns=deep_columns,
                dnn_hidden_units=[100, 70, 50, 25])
        
    print('estimator built')
    
    return m, model_dir
    
MODEL_TYPE = 'WIDE_AND_DEEP'
model_dir = create_model_dir(model_type=MODEL_TYPE)
m, model_dir = get_model(model_type = MODEL_TYPE, model_dir=model_dir)

Model directory = models/model_WIDE_AND_DEEP_1518181689
Instructions for updating:
Please set fix_global_step_increment_bug=True and update training steps in your pipeline. See pydoc for details.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000000167D4D68>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'models/model_WIDE_AND_DEEP_1518181689'}
estimator built
Wall time: 1.6 s


In [9]:
%%time 

train_file = str("adult.data.csv") 
# "gs://cloudml-public/census/data/adult.data.csv"
# storage.googleapis.com/cloudml-public/census/data/adult.data.csv

m.train(input_fn=generate_input_fn(train_file), 
      steps=1000)

print('training done')

AttributeError: 'DNNLinearCombinedClassifier' object has no attribute 'train'