# wide learning

In [1]:
import tensorflow as tf

In [2]:
import urllib
import tempfile

In [3]:
# train_file = tempfile.NamedTemporaryFile()
# test_file = tempfile.NamedTemporaryFile()


In [3]:
import pandas as pd

In [7]:
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", filename='adult.data')

('adult.data', <http.client.HTTPMessage at 0x169049c5128>)

In [8]:
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", filename='adult_test.data')

('adult_test.data', <http.client.HTTPMessage at 0x169049c5470>)

In [4]:
CSV_COLUMNS = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket"]
train_df = pd.read_csv('adult.data',names=CSV_COLUMNS, skipinitialspace=True)
test_df = pd.read_csv('adult_test.data',names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)


In [5]:
train_labels = (train_df["income_bracket"].apply(lambda x: '>50K' in x)).astype(int)
test_labels = (test_df["income_bracket"].apply(lambda x: '>50K' in x)).astype(int)

In [6]:
train_df=train_df.dropna(how='any', axis=0)
test_df=test_df.dropna(how='any', axis=0)
train_input_fn=tf.estimator.inputs.pandas_input_fn(x=train_df,y=train_labels, num_epochs=None, shuffle=True, batch_size=100)
test_input_fn = tf.estimator.inputs.pandas_input_fn(x=test_df, y=test_labels, num_epochs=1, shuffle=False,batch_size=100)

In [8]:
gender = tf.feature_column.categorical_column_with_vocabulary_list(key='gender', vocabulary_list=['Female', 'Male'])

In [7]:
marital_status = tf.feature_column.categorical_column_with_vocabulary_list(key='marital_status', vocabulary_list=train_df['marital_status'].unique())

In [9]:
occupation=tf.feature_column.categorical_column_with_vocabulary_list(key='occupation', vocabulary_list=train_df['occupation'].unique())

In [10]:
education = tf.feature_column.categorical_column_with_vocabulary_list(
    "education", [
        "Bachelors", "HS-grad", "11th", "Masters", "9th",
        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
        "Preschool", "12th"
    ])
relationship = tf.feature_column.categorical_column_with_vocabulary_list(
    "relationship", [
        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
        "Other-relative"
    ])
workclass = tf.feature_column.categorical_column_with_vocabulary_list(
    "workclass", [
        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
    ])
native_country = tf.feature_column.categorical_column_with_hash_bucket(
    "native_country", hash_bucket_size=1000)

In [11]:
age = tf.feature_column.numeric_column("age")
education_num = tf.feature_column.numeric_column("education_num")
capital_gain = tf.feature_column.numeric_column("capital_gain")
capital_loss = tf.feature_column.numeric_column("capital_loss")
hours_per_week = tf.feature_column.numeric_column("hours_per_week")

In [12]:
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

In [15]:
education_x_occupation = tf.feature_column.crossed_column(
    ["education", "occupation"], hash_bucket_size=1000)
age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
    [age_buckets, "education", "occupation"], hash_bucket_size=1000)
country_x_occupation = tf.feature_column.crossed_column(['native_country', occupation], hash_bucket_size=1000)

In [16]:
base_columns = [
    gender, native_country, education, occupation, workclass, relationship,
    age_buckets,
]
crossed_columns = [
    tf.feature_column.crossed_column(
        ["education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
    tf.feature_column.crossed_column(
        ["native_country", "occupation"], hash_bucket_size=1000)
]

In [22]:
model = tf.estimator.LinearClassifier(feature_columns=base_columns+crossed_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Lei\\AppData\\Local\\Temp\\tmph09j3u5f', '_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_tf_random_seed': 1}


In [23]:
model.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Lei\AppData\Local\Temp\tmph09j3u5f\model.ckpt.
INFO:tensorflow:loss = 69.3147, step = 1
INFO:tensorflow:global_step/sec: 171.674
INFO:tensorflow:loss = 29.6601, step = 101 (0.591 sec)
INFO:tensorflow:global_step/sec: 180.372
INFO:tensorflow:loss = 31.1131, step = 201 (0.545 sec)
INFO:tensorflow:global_step/sec: 208.928
INFO:tensorflow:loss = 31.6184, step = 301 (0.479 sec)
INFO:tensorflow:global_step/sec: 198.143
INFO:tensorflow:loss = 29.4733, step = 401 (0.521 sec)
INFO:tensorflow:global_step/sec: 192.749
INFO:tensorflow:loss = 37.8348, step = 501 (0.503 sec)
INFO:tensorflow:global_step/sec: 187.708
INFO:tensorflow:loss = 31.6403, step = 601 (0.548 sec)
INFO:tensorflow:global_step/sec: 182.648
INFO:tensorflow:loss = 38.372, step = 701 (0.532 sec)
INFO:tensorflow:global_step/sec: 185.761
INFO:tensorflow:loss = 33.6536, step = 801 (0.538 sec)
INFO:tensorflow:global_step/sec: 172.309
INFO:

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x169049cf2b0>

In [24]:
evaluation=model.evaluate(input_fn=test_input_fn)

INFO:tensorflow:Starting evaluation at 2017-10-23-19:39:33
INFO:tensorflow:Restoring parameters from C:\Users\Lei\AppData\Local\Temp\tmph09j3u5f\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-10-23-19:39:36
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.832566, accuracy_baseline = 0.763774, auc = 0.883092, auc_precision_recall = 0.695403, average_loss = 0.352525, global_step = 1000, label/mean = 0.236226, loss = 35.2114, prediction/mean = 0.245728


In [25]:
model_reg = tf.estimator.LinearClassifier(feature_columns=base_columns+crossed_columns, 
                                          optimizer=tf.train.FtrlOptimizer(l1_regularization_strength=1.0, learning_rate=0.1,l2_regularization_strength=1.0))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Lei\\AppData\\Local\\Temp\\tmpngwmzt7x', '_log_step_count_steps': 100, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_tf_random_seed': 1}


In [26]:
model_reg.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Lei\AppData\Local\Temp\tmpngwmzt7x\model.ckpt.
INFO:tensorflow:loss = 69.3147, step = 1
INFO:tensorflow:global_step/sec: 143.704
INFO:tensorflow:loss = 41.6227, step = 101 (0.710 sec)
INFO:tensorflow:global_step/sec: 146.002
INFO:tensorflow:loss = 36.6987, step = 201 (0.685 sec)
INFO:tensorflow:global_step/sec: 174.256
INFO:tensorflow:loss = 38.318, step = 301 (0.558 sec)
INFO:tensorflow:global_step/sec: 185.245
INFO:tensorflow:loss = 31.0162, step = 401 (0.540 sec)
INFO:tensorflow:global_step/sec: 206.053
INFO:tensorflow:loss = 34.1197, step = 501 (0.485 sec)
INFO:tensorflow:global_step/sec: 148.027
INFO:tensorflow:loss = 37.7318, step = 601 (0.680 sec)
INFO:tensorflow:global_step/sec: 190.284
INFO:tensorflow:loss = 41.2929, step = 701 (0.537 sec)
INFO:tensorflow:global_step/sec: 162.594
INFO:tensorflow:loss = 42.7422, step = 801 (0.599 sec)
INFO:tensorflow:global_step/sec: 156.266
INFO:

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x169071067b8>

In [27]:
evaluation2=model_reg.evaluate(input_fn=test_input_fn)

INFO:tensorflow:Starting evaluation at 2017-10-23-19:42:49
INFO:tensorflow:Restoring parameters from C:\Users\Lei\AppData\Local\Temp\tmpngwmzt7x\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-10-23-19:42:52
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.837541, accuracy_baseline = 0.763774, auc = 0.885906, auc_precision_recall = 0.700899, average_loss = 0.348481, global_step = 1000, label/mean = 0.236226, loss = 34.8075, prediction/mean = 0.24273


# wide & deep learning

In [17]:
deep_columns = [
    # multi_hot representation, appropriate for categorical columns with only a few possible values 
    tf.feature_column.indicator_column(workclass),
    tf.feature_column.indicator_column(education),
    tf.feature_column.indicator_column(gender),
    tf.feature_column.indicator_column(relationship),
    
    # The higher the dimension of the embedding is, 
    # the more degrees of freedom the model will have to learn the representations of the features.
    # sparse, high-dimensional categorical features are first converted into a low-dimensional and dense real-valued vector, 
    tf.feature_column.embedding_column(native_country, dimension=8),
    tf.feature_column.embedding_column(occupation,dimension=8),
    #  concatenated with the continuous features
    age, 
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week
]

In [18]:
joint_model = tf.estimator.DNNLinearCombinedClassifier(dnn_hidden_units=[100,50], dnn_feature_columns=deep_columns,
        linear_feature_columns=crossed_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'C:\\Users\\Lei\\AppData\\Local\\Temp\\tmpuoe09nm2', '_tf_random_seed': 1, '_keep_checkpoint_max': 5}


In [19]:
joint_model.train(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into C:\Users\Lei\AppData\Local\Temp\tmpuoe09nm2\model.ckpt.
INFO:tensorflow:step = 1, loss = 800.224
INFO:tensorflow:global_step/sec: 129.886
INFO:tensorflow:step = 101, loss = 91.1717 (0.833 sec)
INFO:tensorflow:global_step/sec: 191.894
INFO:tensorflow:step = 201, loss = 47.9213 (0.460 sec)
INFO:tensorflow:global_step/sec: 223.785
INFO:tensorflow:step = 301, loss = 42.1696 (0.456 sec)
INFO:tensorflow:global_step/sec: 220.57
INFO:tensorflow:step = 401, loss = 56.4651 (0.438 sec)
INFO:tensorflow:global_step/sec: 231.468
INFO:tensorflow:step = 501, loss = 47.4517 (0.432 sec)
INFO:tensorflow:global_step/sec: 215.422
INFO:tensorflow:step = 601, loss = 47.1432 (0.470 sec)
INFO:tensorflow:global_step/sec: 223.558
INFO:tensorflow:step = 701, loss = 42.1435 (0.448 sec)
INFO:tensorflow:global_step/sec: 210.825
INFO:tensorflow:step = 801, loss = 45.5199 (0.468 sec)
INFO:tensorflow:global_step/sec: 231.469
INFO:

<tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier at 0x26ed8c90fd0>

In [20]:
evaluation = joint_model.evaluate(test_input_fn)

INFO:tensorflow:Starting evaluation at 2017-10-24-08:05:42
INFO:tensorflow:Restoring parameters from C:\Users\Lei\AppData\Local\Temp\tmpuoe09nm2\model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2017-10-24-08:05:44
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.830109, accuracy_baseline = 0.763774, auc = 0.856434, auc_precision_recall = 0.695002, average_loss = 0.401289, global_step = 1000, label/mean = 0.236226, loss = 40.0821, prediction/mean = 0.273192
