Install package dependencies


In [0]:
!pip install googledrivedownloader

Include packages

In [0]:
import numpy as np
import csv
import tensorflow as tf
from google_drive_downloader import GoogleDriveDownloader

This code is necessary in order to read the Income dataset from csv files, and turn it into a numerical representation.

In [0]:
# Vocabulary of every categorical column of the Income dataset, keyed by
# column name.
#
# Person.to_categorical one-hot encodes a value by its position in the
# matching list, so the ORDER of each list fixes the layout of the encoded
# features: reordering or inserting entries changes the vectors produced.
# A value that is not listed makes the lookup fail with ValueError.
#
# '?' is listed as an ordinary category (presumably the dataset's marker for
# a missing value -- TODO confirm against the CSV files).
# NOTE(review): unusual spellings such as 'Columbia', 'Trinadad&Tobago',
# 'Holand-Netherlands', 'Hong' and 'South' presumably mirror the raw values
# in the CSV files -- confirm before "correcting" any of them.
categoricals = {
    'workclass': [
        'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov',
        'State-gov', 'Without-pay', 'Never-worked', '?'
    ],
    'education': [
        'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
        'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
        '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool', '?'
    ],
    'marital_status': [
        'Married-civ-spouse', 'Divorced', 'Never-married',
        'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse', '?'
    ],
    'occupation': [
        'Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
        'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical',
        'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces', '?'
    ],
    'relationship': ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried', '?'],
    'race': ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black', '?'],
    'sex': ['Female', 'Male', '?'],
    'native_country': [
        'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
        'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China',
        'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica',
        'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic',
        'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala',
        'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador',
        'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands', '?'
    ],
    'income': ['<=50K', '>50K']
}


class Person(object):
    """One record of the Income dataset.

    The constructor receives the fields of a record in column order. Numeric
    fields are parsed with ``int`` (so they may be given as text); categorical
    fields are stored unchanged and are only encoded when ``to_numeric`` is
    read.
    """

    def __init__(self, age, workclass, fnlwgt, education, education_num,
                 marital_status, occupation, relationship, race, sex, capital_gain,
                 capital_loss, hours_per_week, native_country, income):
        # Integer-valued columns, parsed in column order; int() raises
        # ValueError on text that is not a valid integer.
        self.age, self.fnlwgt, self.education_num = (
            int(age), int(fnlwgt), int(education_num))
        self.capital_gain, self.capital_loss, self.hours_per_week = (
            int(capital_gain), int(capital_loss), int(hours_per_week))

        # Categorical columns are kept as given.
        self.workclass, self.education = workclass, education
        self.marital_status, self.occupation = marital_status, occupation
        self.relationship, self.race, self.sex = relationship, race, sex
        self.native_country, self.income = native_country, income

    @staticmethod
    def to_categorical(key, value):
        """Return the one-hot vector of ``value`` within ``categoricals[key]``.

        The vector has one entry per listed category and a 1 at the position
        of ``value``. Raises KeyError for an unknown ``key`` and ValueError
        when ``value`` is not one of the listed categories.
        """
        vocabulary = categoricals[key]
        one_hot = np.zeros(shape=len(vocabulary))
        position = vocabulary.index(value)
        one_hot[position] = 1
        return one_hot

    @property
    def to_numeric(self):
        """Numeric encoding of the record, one entry per column.

        Returns a list in column order: a one-element list for each integer
        column and a one-hot array for each categorical column (the income
        label comes last). The entries are not flattened.
        """
        def scalar(column):
            # Integer column -> single-element list.
            return [getattr(self, column)]

        def one_hot(column):
            # Categorical column -> one-hot array over its vocabulary.
            return self.to_categorical(column, getattr(self, column))

        encoded = [
            scalar('age'),
            one_hot('workclass'),
            scalar('fnlwgt'),
            one_hot('education'),
            scalar('education_num'),
            one_hot('marital_status'),
            one_hot('occupation'),
            one_hot('relationship'),
            one_hot('race'),
            one_hot('sex'),
            scalar('capital_gain'),
            scalar('capital_loss'),
            scalar('hours_per_week'),
            one_hot('native_country'),
            one_hot('income'),
        ]
        return encoded

# Fetch the dataset archive from Google Drive into ./income.zip and ask the
# downloader to unzip it. overwrite=True requests a fresh download even when
# the archive is already present.
# (Presumably the archive contains income_train.csv and income_test.csv,
# which load_income_dataset reads -- TODO confirm.)
GoogleDriveDownloader.download_file_from_google_drive(
    file_id='1Dr8ybk7vEFVdZzDi_YHkFoHSQQatqduS',
    dest_path='./income.zip',
    overwrite=True,
    unzip=True,
)


def load_csv(csv_name):
    """Read an Income CSV file into a 2-D numeric array.

    Every non-blank row is stripped of surrounding whitespace, turned into a
    ``Person`` and flattened from ``Person.to_numeric`` into one feature
    vector.

    Args:
        csv_name: path of the CSV file to read.

    Returns:
        A 2-D array with one row per record.

    Raises:
        ValueError: if the file holds no data rows, or a field cannot be
            encoded (non-integer numeric field, unknown category).
    """
    samples = []
    with open(csv_name, 'rt') as csv_file:
        for fields in csv.reader(csv_file):
            fields = [field.strip() for field in fields]

            # csv.reader yields an empty row for a blank line (e.g. a trailing
            # newline at the end of the file). Such rows carry no record and
            # would make Person(*fields) fail, so skip them.
            if not any(fields):
                continue

            parts = Person(*fields).to_numeric
            samples.append([item for part in parts for item in part])

    if not samples:
        raise ValueError('no data rows found in {!r}'.format(csv_name))

    return np.stack(samples)


def load_income_dataset():
    """Load the Income training and test sets and normalise their features.

    Reads ``income_train.csv`` and ``income_test.csv`` with ``load_csv``. The
    last two columns of each array are the one-hot income label; all other
    columns are features. Each feature column is divided by its maximum over
    the TRAINING set, and the same scale is applied to the test set.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of 2-D arrays.
    """
    train_samples = load_csv('income_train.csv')
    test_samples = load_csv('income_test.csv')

    x_train, y_train = train_samples[:, :-2], train_samples[:, -2:]
    x_test, y_test = test_samples[:, :-2], test_samples[:, -2:]

    # Compute the per-feature scale once, from the raw training features,
    # BEFORE x_train is modified in place. Recomputing it after normalising
    # x_train would give ~1 for every column and leave x_test unscaled.
    eps = np.finfo(np.float32).eps
    scale = np.max(x_train + eps, axis=0, keepdims=True)

    # A column that is all zeros in training has scale == eps; dividing a
    # non-zero test value by eps would blow it up, so leave it unscaled.
    scale[scale <= eps] = 1.0

    x_train /= scale
    x_test /= scale

    return x_train, y_train, x_test, y_test


Define some training parameters

In [0]:
# Training hyper-parameters
batch_size = 64        # number of samples fed per training step
n_epochs = 100         # number of passes over the training batches
learning_rate = 0.001  # step size handed to the optimizer

Load the Income dataset, split into training and test sets.

In [0]:
# Load the features and one-hot labels of both splits
x_train, y_train, x_test, y_test = load_income_dataset()

# Dataset dimensions: rows are samples, columns are features / classes
n_train_samples = y_train.shape[0]
n_test_samples, n_classes = y_test.shape
sample_dim = x_test.shape[1]

# Short summary of what was loaded
print('\n'.join([
    'Number of training examples: {}'.format(n_train_samples),
    'Number of test examples: {}'.format(n_test_samples),
    'Number of classes: {}'.format(n_classes),
    'Number of features: {}'.format(sample_dim),
]))

Define the placeholders for the input and the target. Can you guess the right shapes? (Hint: they both have two dimensions.)

In [0]:
# Define placeholders (2-d): the leading None leaves the batch size open,
# the second dimension is the number of features (x) or of classes (y)
x = tf.placeholder(tf.float32, shape=(None, sample_dim), name='x')
y = tf.placeholder(tf.float32, shape=(None, n_classes), name='y')

Define the Multi-Layer Perceptron model, using [tf.layers.dense](https://www.tensorflow.org/api_docs/python/tf/layers/dense).

In [0]:
# Multi-layer perceptron: five 512-unit and one 256-unit ReLU layers stacked
# on the input, followed by a softmax layer with one output per class
hidden_units = [512, 512, 512, 512, 512, 256]

h = x
for units in hidden_units:
    h = tf.layers.dense(h, units, activation=tf.nn.relu)

y_pred = tf.layers.dense(h, n_classes, activation=tf.nn.softmax)

Create a cross-entropy loss function.

In [0]:
# Define objective function: categorical cross-entropy.
# The per-example loss is the SUM over classes of -y * log(y_pred); the mean
# is then taken over the batch only. (A reduce_mean over the whole tensor
# would also average over the class axis and report the loss divided by the
# number of classes.)
# The small constant keeps the log finite when a softmax output reaches 0.
loss = -tf.reduce_mean(tf.reduce_sum(y * tf.log(y_pred + 0.00001), axis=1))

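Create an optimizer (see [tf.train.GradientDescentOptimizer](https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer) for plain gradient descent; the cell below uses [tf.train.AdamOptimizer](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer)).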

In [0]:
# Define optimizer: Adam, using the learning rate set in the training parameters
optimizer = tf.train.AdamOptimizer(learning_rate)

Define the tensorflow operation for the training step.

In [0]:
# Define one training iteration: the operation that asks the optimizer to
# minimize the loss
train_step = optimizer.minimize(loss=loss)

Train the model. This is usually implemented as a double loop over training epochs and batches. For each batch, perform one training step; at the end of each epoch, print the mean loss over its batches. Finally, measure the accuracy on the test set.

In [0]:
with tf.Session() as sess:

    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    # Number of full batches per epoch (samples left over after the last
    # full batch are not used for training)
    batches_per_epoch = n_train_samples // batch_size

    # Train
    for i in range(n_epochs):
        total_loss = 0
        for b in range(0, batches_per_epoch):
            # Get the b-th training batch
            start = b * batch_size
            end = start + batch_size
            x_batch, y_batch = x_train[start:end], y_train[start:end]

            # Run the training operator and compute the loss feeding the batch
            _, l = sess.run([train_step, loss], feed_dict={x: x_batch, y: y_batch})

            total_loss += l

        # Print mean loss among epoch
        print('Epoch {0}: {1}'.format(i, total_loss / batches_per_epoch))

    # Evaluate on the test set, one batch per session call instead of one
    # call per sample. A prediction is correct when the most probable class
    # (argmax of the network output) is the class marked in the one-hot
    # target; unlike rounding the probabilities, this also holds for more
    # than two classes.
    correct_predictions = 0
    for start in range(0, len(x_test), batch_size):
        x_t = x_test[start:start + batch_size]
        y_t = y_test[start:start + batch_size]
        y_p = sess.run(y_pred, feed_dict={x: x_t})

        correct_predictions += np.sum(np.argmax(y_p, axis=1) == np.argmax(y_t, axis=1))

    # float() keeps the ratio a true division whatever the integer type
    print('Test accuracy: {}'.format(float(correct_predictions) / len(x_test)))