# Import  libralies to deal with ML model, matrixs and time

In [47]:
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import multiprocessing
import six
import time

In [48]:
#Use Infor to track training,set WARN as default value
print("Current Tensor Flow Version is %s" %(tf.__version__))

Current Tensor Flow Version is 1.13.1


In [49]:
tf.logging.set_verbosity(tf.logging.ERROR)

In [50]:
# Define some input formats for input csv file columns
COLUMNS = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', 'income_bracket']

In [51]:
# Define some input formats for selected feature columns to be used in the model
# fnlwgt and income_bracket not included
FEATURE_COLUMNS = ['age', 'workclass', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country']

In [52]:
# Define some input formats for caegorical columns, not including continuous values
CATEGORICAL_COLUMNS = ['workclass', 'education',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
                'native_country']

# Import pandas library to explore datasets


In [53]:
# to check if the data set is small enough to be handeled in memory all at once as one batch
# explore data properties and decide how to use data effectively and efficiently
import pandas as pd


In [54]:
#read the original file and check the first 5 rows
df = pd.read_csv("adult.data.csv", header = None, names=COLUMNS)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [55]:
#check some statistics of the dataset
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [56]:
# check some correlations of the variables
df.corr()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
education_num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital_gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital_loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours_per_week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0


#  Used Tensorflow directly to load data by batch for big and scalable datasets

In [57]:

#generate a function to facilitate the the input of data into our model
# use dictionary to map features to tensor values, this represents a key to value set linked up
# used a list to hold the tensor of values to represent the labes, which is the answer to be predicted


In [58]:
# create a function to transform original input file name to tensor values to be used later
# set a proper batch size according to the power of your CPUs
BATCH_SIZE=40
def generate_input_fn(filename, batch_size=BATCH_SIZE):
    def _input_fn():
        #pass in file name to get string as input
        filename_queue = tf.train.string_input_producer([filename])
        # read line by line
        reader = tf.TextLineReader()
        # only get up to batch_size number of lines to process at one time, pass in queue and size
        key, rows = reader.read_up_to(filename_queue, num_records=batch_size)
        # set record default as value or string, but it must match the columns original datatype
        record_defaults = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]
        # get batch size of rows to expand matrix dimession which is to shape new axies to hold record column values
        # now we have a new subset of data frame of one batch size number of rows and orignal number of colums
        rows = rows[:, np.newaxis]
        # read out and decode the new data subset by rows and columns
        columns = tf.decode_csv(rows, record_defaults=record_defaults)
        #use dictionary to map this tensor flow value of the columns to the original all COLUMN names in the csv file 
        #now we will link up all columns name to all column tensor values 
        all_columns = dict(zip(COLUMNS, columns))
        
        # Create the label as the answer sheet by pop the column that has the answer
        income_bracket = all_columns.pop('income_bracket')
        
        # Delete some columns that is not useful for the model to save computing power
        all_columns.pop('fnlwgt', 'this useless column not found')
        # repeat this function untill all useless columns are removed
        
        # repeat this function untill all useless columns are removed
        # all the rest columns from all_columns can be used as our selected feature columns
        features = all_columns
        
        #Compare results and set the lable answers to 0 and 1 as integer for output according to the rules
        labels = tf.to_int32(tf.equal(income_bracket, " >50k"))
        
        
        # output selected feature columns and lable answers
        return features, labels
    return _input_fn
# end of the input function
print('The input function has been successfully executed.')
                       

The input function has been successfully executed.


# Feature Selection by doing research on some column transformation in order  to train models and compare result accuracy

In [73]:
# Set up continuous value, lower dimensional and dense real value columns
age = layers.real_valued_column("age")
education_num = layers.real_valued_column("education_num")
capital_gain = layers.real_valued_column("capital_gain")
capital_loss = layers.real_valued_column("capital_loss")
hours_per_week = layers.real_valued_column("hours_per_week")

print('Continous value columns transformed.')

Continous value columns transformed.


In [74]:
#Use sparse colum with keys to handel columns with small amount of categories and we can easily pick a tensor value for it
#Use hash bucket for features with many categorical names
#import the layers library to create features
from tensorflow.contrib import layers

In [75]:
# use keys
gender = layers.sparse_column_with_keys(column_name="gender", keys=["female","male"])
race = layers.sparse_column_with_keys(column_name="race", keys=["Amer-Indian-Eskin","Asian-Pac-Islande","Black","Other","White"])


In [76]:
#use buckets, pick a approriate size according to your CPU power
education = layers.sparse_column_with_hash_bucket("education", hash_bucket_size = 1000)
marital_status = layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size = 100)
relationship = layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size = 100)
workclass = layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size = 100)
occupation = layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size = 1000)
native_country = layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size = 1000)

print('Sparse categorical columns transformed.')


Sparse categorical columns transformed.


# Use domain knowledge and intuitions to explore different combination of columns to get more value out of the dataset

In [77]:
# Categorical features can be of some relationship.
# we can cross the feture to explore if some meaningful categories are related to each other by groupping


In [78]:
# Age can be tranformed to categorical values by bucketizing due to different range of age have different influence to others
# try to set a bin of size 5 and a range by commensense
age_buckets = layers.bucketized_column(age,boundaries=[18,25,30,35,40,45,50,55,60,65])

In [79]:
# Try some feature crossing to eplore some more columns by intuition
education_occupation = layers.crossed_column([education, occupation], hash_bucket_size = int(1e4))

age_race_occupation = layers.crossed_column([age_buckets, race, occupation], hash_bucket_size = int(1e6))

country_occupation = layers.crossed_column([native_country, occupation], hash_bucket_size = int(1e4))

print('selected categorical crossing columns transformed as additional higher degree dimensions of new wide column fetures.')

selected categorical crossing columns transformed as additional higher degree dimensions of new wide column fetures.


In [80]:
# group the colums together by dimension complexity, representing by continous value deep and categorical wide

wide_columns = [gender, race, native_country, education, occupation, workclass, marital_status, 
                relationship, age_buckets, education_occupation, age_race_occupation, country_occupation]

In [81]:
# set dimention as a small number to start with but can increase later to refine model by using a higher dimension
deep_columns = [layers.embedding_column(workclass, dimension = 8),
                layers.embedding_column(education, dimension = 8),
                layers.embedding_column(marital_status, dimension = 8),
                layers.embedding_column(gender, dimension = 8),
                layers.embedding_column(relationship, dimension = 8),
                layers.embedding_column(race, dimension = 8),
                layers.embedding_column(native_country, dimension = 8),
                layers.embedding_column(occupation, dimension = 8),
                age,
                education_num,
                capital_gain,
                capital_loss,
                hours_per_week
               ]
print('continous and categorical columns grouped.')

continous and categorical columns grouped.


# Build severl ML models to compare results

In [82]:
#import tensor flow libary to build model
from tensorflow.contrib import learn
#List out directory of different model types
def create_model_dir(model_type):
    return 'models/model_' + model_type + '_' + str(int(time.time()))
# pass in model_type to run model, if new model or model_dir is not provided, use default model_type and directory
def get_model(model_type, new_model = False, model_dir = None):
    if new_model or model_dir is None:
        # if model not provided, use old model type dirctory to run model, show dirctory of model
        model_dir= create_model_dir(model_type)
    print('Currenty Model Dirctory =  %s' % model_dir)

    # set no model provide as default model m
    m = None

    # Try several different models
    # Linear Classifier
    if model_type == 'WIDE':
        m = learn.LinearClassifier(model_dir = model_dir, 
                                   feature_columns = wide_columns)
        
    # Deep Neural Network Classifier
    if model_type == 'DEEP':
        m = learn.DNNClassifier(
        model_dir=model_dir,
        feature_columns = deep_columns,
        hidden_units=[100,50])
        
    # Combined Linear and NN classifier
    if model_type == 'WIDE_AND_DEEP':
            m= learn.DNNLinearCombinedClassifier(model_dir=model_dir,
                                               linear_feature_columns=wide_columns,
                                               dnn_feature_columns= deep_columns,
                                               dnn_hidden_units=[100,70,50,25])

    print('Model successfuly built.')
    return m, model_dir

# set defaul model type and pass in model
MODEL_TYPE = 'WIDE_AND_DEEP'
#output model directory
model_dir = create_model_dir(model_type = MODEL_TYPE)
#call the function to run model
m, model_dir = get_model(model_type = MODEL_TYPE, model_dir=model_dir)


Currenty Model Dirctory =  models/model_WIDE_AND_DEEP_1572034090
Model successfuly built.


# Train the model by trying the train parameters

In [87]:
%%time
#import time, %% timeit must put on first line otherwise won't run. When hit enter, you can hear CPU fan boot up sound
# try different train steps and batch size depending on your Computer power
# need to show how much time used for training to compare model results later

#pass in train csv file
train_file = str("adult.data.csv")
#pass in test csv file
test_file = str("adult.test.csv")
#set a proper train step size
train_steps = 1000
#fit the model by passing all the parameters in
m.fit(input_fn=generate_input_fn(train_file,BATCH_SIZE), steps = train_steps)
print('Model successfuly Trained.')

Model successfuly Trained.
Wall time: 26.3 s


# Evalute the accuracy of the model by test dataset


In [88]:
TEST_STEPS=200
results = m.evaluate(input_fn=generate_input_fn(test_file),
                    steps =TEST_STEPS )

print('results')
print('Evaluation of the test data completed.')
print ('The model accuracy is: %s' %results['accuracy'])

results
Evaluation of the test data completed.
The model accuracy is: 1.0
