# wide model with sparse columns

In [11]:
import tempfile
import urllib.request

In [16]:
# train_file = tempfile.NamedTemporaryFile()
# test_file = tempfile.NamedTemporaryFile()
with urllib.request.urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data") as response, open('consus_train.csv','wb') as out_file:
    data=response.read()
    out_file.write(data)
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", 'consus_test.csv')

('consus_test.csv', <http.client.HTTPMessage at 0x17fa95d6b70>)

In [17]:
import pandas as pd
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"]
train_df = pd.read_csv('consus_train.csv',names=CSV_COLUMNS, skipinitialspace=True )
test_df = pd.read_csv('consus_test.csv',names=CSV_COLUMNS, skipinitialspace=True, skiprows=1 )


In [18]:
train_labels= (train_df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int) 
test_labels= (test_df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)

In [19]:
import tensorflow as tf

In [20]:
# convert to Tnesor
def input_fn(dataset, num_epochs=None, shuffle=True):
    df_data=pd.read_csv(tf.gfile.Open(dataset),names=CSV_COLUMNS, skipinitialspace=True, engine='python', skiprows=1)
    df_data=df_data.dropna(how='any', axis=0)
    labels = (df_data['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)
    return tf.estimator.inputs.pandas_input_fn(x=df_data, y=labels, num_epochs=num_epochs, shuffle=shuffle, batch_size=100)


# Feature engineering

In [21]:
# base categorical column
# know the set of all possible feature values of a column and there are only a few of them,
# Each key in the list will get assigned an auto-incremental ID starting from 0. 
gender = tf.feature_column.categorical_column_with_vocabulary_list('gender', ['Female', 'Male'])
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
    "marital_status", [
        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])


# don't know the set of possible values in advance
occupation = tf.feature_column.categorical_column_with_hash_bucket('occupation', hash_bucket_size=1000)
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

# base continuous column
age = tf.feature_column.numeric_column('age')
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

# Making Continuous Features Categorical through Bucketization
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

# Intersecting Multiple Columns with CrossedColumn
education_x_occupation = tf.feature_column.crossed_column(['education', 'occupation'], hash_bucket_size=1000)
# keys = string or Categorical_column

age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
    [age_buckets, "education", "occupation"], hash_bucket_size=1000)



In [26]:
# define linear model 
base_columns = [
    gender, native_country, education, occupation, workclass, relationship,
    age_buckets]

crossed_columns = [education_x_occupation, age_buckets_x_education_x_occupation, 
                   tf.feature_column.crossed_column(['native_country','occupation'], hash_bucket_size=1000)]
                   
feature_columns= base_columns+crossed_columns                  
m=tf.estimator.LinearClassifier(feature_columns=feature_columns)
                   


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Lei\\AppData\\Local\\Temp\\tmpe9f4lzng', '_keep_checkpoint_max': 5}


In [27]:
# Training and Evaluating Our Model
m.train(input_fn=input_fn('consus_train.csv'),steps=2000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Lei\AppData\Local\Temp\tmpe9f4lzng\model.ckpt.
INFO:tensorflow:loss = 69.3147, step = 1
INFO:tensorflow:global_step/sec: 199.866
INFO:tensorflow:loss = 25.6617, step = 101 (0.516 sec)
INFO:tensorflow:global_step/sec: 231.205
INFO:tensorflow:loss = 35.8934, step = 201 (0.417 sec)
INFO:tensorflow:global_step/sec: 240.301
INFO:tensorflow:loss = 30.2206, step = 301 (0.416 sec)
INFO:tensorflow:global_step/sec: 231.831
INFO:tensorflow:loss = 36.7884, step = 401 (0.431 sec)
INFO:tensorflow:global_step/sec: 228.2
INFO:tensorflow:loss = 27.7739, step = 501 (0.438 sec)
INFO:tensorflow:global_step/sec: 240.262
INFO:tensorflow:loss = 33.1913, step = 601 (0.432 sec)
INFO:tensorflow:global_step/sec: 231.702
INFO:tensorflow:loss = 44.075, step = 701 (0.416 sec)
INFO:tensorflow:global_step/sec: 240.319
INFO:tensorflow:loss = 41.0936, step = 801 (0.416 sec)
INFO:tensorflow:global_step/sec: 236.812
INFO:te

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x17fad40f7f0>

In [28]:
results = m.evaluate(input_fn = input_fn(dataset='consus_test.csv', num_epochs=1, shuffle=False), steps=None)

INFO:tensorflow:Starting evaluation at 2017-09-23-09:47:51
INFO:tensorflow:Restoring parameters from C:\Users\Lei\AppData\Local\Temp\tmpe9f4lzng\model.ckpt-2000
INFO:tensorflow:Finished evaluation at 2017-09-23-09:47:53
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.834101, accuracy_baseline = 0.763774, auc = 0.881853, auc_precision_recall = 0.69317, average_loss = 0.354358, global_step = 2000, label/mean = 0.236226, loss = 35.3945, prediction/mean = 0.229536


In [30]:
for key in sorted(results):
    print('%s: %s ' %(key, results[key] ))

accuracy: 0.834101 
accuracy_baseline: 0.763774 
auc: 0.881853 
auc_precision_recall: 0.69317 
average_loss: 0.354358 
global_step: 2000 
label/mean: 0.236226 
loss: 35.3945 
prediction/mean: 0.229536 


In [None]:
# Adding Regularization to Prevent Overfitting
m= tf.estimator.LinearClassifier(feature_columns=feature_columns, 
                                 optimizer=tf.train.FtrlOptimizer(l2_regularization_strength=1.0,learning_rate=0.1))


# deep model with dense columns

In [32]:
# Represents multi-hot representation of given categorical column.
# indicator_column(categorical_column)
deep_columns = [
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    # _DenseColumn that converts from sparse
    # embedding_column(categorical_column,dimension, 
    tf.feature_column.embedding_column(native_country, dimension =8),
    tf.feature_column.embedding_column(occupation, dimension = 8),
    age, 
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week]

# the number of dimensions is to start with a value on the order of log2⁡(n) or k*n ** (0.25), 
# where n is the number of unique features in a feature column and k is a small constant (usually smaller than 10).





In [34]:
# Combining Wide and Deep Models into One

m = tf.estimator.DNNLinearCombinedClassifier(linear_feature_columns=crossed_columns, 
                                             dnn_feature_columns=deep_columns, dnn_hidden_units=[100,50])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Lei\\AppData\\Local\\Temp\\tmpnzqi95s2', '_keep_checkpoint_max': 5}


In [36]:
# Training and Evaluating The Model
m.train(input_fn=input_fn(dataset='consus_train.csv'), steps=2000)
results = m.evaluate(input_fn=input_fn('consus_test.csv', num_epochs=1,shuffle=False), steps=None)
for key in sorted(results):
    print('%s: %s' %(key, results[key]))


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Lei\AppData\Local\Temp\tmpnzqi95s2\model.ckpt.
INFO:tensorflow:loss = 66.9162, step = 1
INFO:tensorflow:global_step/sec: 193.751
INFO:tensorflow:loss = 54.2068, step = 101 (0.501 sec)
INFO:tensorflow:global_step/sec: 213.085
INFO:tensorflow:loss = 44.7816, step = 201 (0.469 sec)
INFO:tensorflow:global_step/sec: 222.225
INFO:tensorflow:loss = 40.8512, step = 301 (0.453 sec)
INFO:tensorflow:global_step/sec: 221.738
INFO:tensorflow:loss = 52.2857, step = 401 (0.448 sec)
INFO:tensorflow:global_step/sec: 231.532
INFO:tensorflow:loss = 52.6999, step = 501 (0.449 sec)
INFO:tensorflow:global_step/sec: 228.229
INFO:tensorflow:loss = 41.6077, step = 601 (0.437 sec)
INFO:tensorflow:global_step/sec: 231.685
INFO:tensorflow:loss = 54.2148, step = 701 (0.432 sec)
INFO:tensorflow:global_step/sec: 220.034
INFO:tensorflow:loss = 48.9944, step = 801 (0.439 sec)
INFO:tensorflow:global_step/sec: 223.945
INFO