# Adult

In [1]:
from keras.layers import Dense, Dropout, Input, Conv2D, MaxPooling2D, Flatten, Activation
from keras.models import Sequential, Model
import keras
from keras.datasets import cifar100
import numpy as np
from keras import optimizers
from sklearn.model_selection import train_test_split
from keras.callbacks import CSVLogger
import tensorflow as tf
import innvestigate
import innvestigate.utils as iutils
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from collections import namedtuple
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
import setGPU

setGPU: Setting GPU to: 5


In [3]:
# Session used by evaluate_model (through `sess.as_default()`) to evaluate the loss tensor.
sess = tf.InteractiveSession()

In [4]:
# Seed value handed to loading_adult when the dataset is loaded.
random_state = 42

In [5]:
# Directory that contains the `adult/` data folder (relative to the notebook).
path_to_data = '../data/'

### 1. Loading the Adult dataset

In [6]:
# credits: http://stackoverflow.com/questions/2356925/how-to-check-whether-string-might-be-type-cast-to-float-in-python
def is_float(string):
    """Tell whether ``float(string)`` would succeed.

    Parameters
    ----------
    string : object
        Value to probe, e.g. a raw CSV cell (text or number).

    Returns
    -------
    bool
        True when the value converts to ``float``; False for non-numeric text
        and for objects ``float()`` cannot accept at all (``None``, lists, ...).
    """
    try:
        float(string)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text such as "Private".
        # TypeError: unsupported objects such as None.
        # OverflowError: integers too large to be represented as a float.
        return False
    return True
    
def find_means_for_continuous_types(x):
    """Compute, for every column of ``x``, the mean of its float-convertible cells.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array (indexed as ``x[:, col]``); the column count is taken from
        the first row.

    Returns
    -------
    list
        One mean per column. Cells rejected by ``is_float`` are ignored; a
        column without any convertible cell yields 0.0.
    """
    # Tiny offset in the denominator so that a column with no numeric cell
    # divides by a non-zero number instead of raising.
    denominator_offset = 0.000000000000000000001
    column_means = []
    for column_index in range(len(x[0])):
        numeric_cells = [float(cell) for cell in x[:, column_index] if is_float(cell)]
        column_means.append(sum(numeric_cells) / (denominator_offset + len(numeric_cells)))
    return column_means

def prepare_data(raw_data, means, inputs, input_shape):
    """Turn raw Adult rows into a numeric feature matrix and 0/1 labels.

    Parameters
    ----------
    raw_data : numpy.ndarray
        2-D array with one row per person. The last column is the income
        label; the other columns are the raw attributes, ordered like
        ``inputs``.
    means : sequence
        Per-column means, indexed like the attribute columns. Only the
        entries belonging to continuous attributes are read.
    inputs : sequence of (name, values)
        ``values`` is ``("continuous",)`` for a numeric attribute, otherwise
        the tuple of category names of that attribute.
    input_shape : sequence of int
        Number of output features produced for each attribute.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``new_x`` holds one encoded row per person; ``new_y`` holds 1 for the
        label ``>50K`` and 0 for anything else.
    """
    x = raw_data[:, :-1]
    labels = raw_data[:, -1]

    # x:
    def flatten_persons_inputs_for_model(person_inputs, j_means, j_inputs, j_input_shape):
        """Encode one person's raw attributes as a flat list of numbers."""
        float_inputs = []
        for l in range(len(j_input_shape)):
            features_of_this_type = j_input_shape[l]
            is_feature_continuous = j_inputs[l][1][0] == 'continuous'

            if is_feature_continuous:
                mean = j_means[l]
                # Dividing by twice the column mean maps the mean itself to 0.5.
                # A zero mean would divide by zero, so such a column stays unscaled.
                scale_factor = 1 / (2 * mean) if mean else 1.0
                if is_float(person_inputs[l]):
                    float_inputs.append(float(person_inputs[l]) * scale_factor)
                else:
                    # Non-numeric cell: impute the column mean on the SAME scale
                    # as the present values (the raw mean would be an outlier).
                    float_inputs.append(mean * scale_factor)
            else:
                # Categorical attribute: one indicator per encoded category.
                for j in range(features_of_this_type):
                    feature_name = j_inputs[l][1][j]
                    if feature_name == person_inputs[l]:
                        float_inputs.append(1.)
                    else:
                        float_inputs.append(0)
        return float_inputs

    new_x = np.array([
        flatten_persons_inputs_for_model(person_inputs, means, inputs, input_shape)
        for person_inputs in x
    ])

    # y:
    # The label is compared after removing surrounding whitespace and a
    # trailing period, so both ">50K" and ">50K." count as the positive class.
    new_y = np.array([
        1 if str(label).strip().rstrip(".") == ">50K" else 0
        for label in labels
    ])
    return new_x, new_y



In [7]:
def loading_adult(path_to_data, random_state):
    """Load the Adult train and test files and return them as one encoded dataset.

    Parameters
    ----------
    path_to_data : str
        Prefix to which ``"/adult/adult.data"`` and ``"/adult/adult.test"``
        are appended.
    random_state : int
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Encoded features and 0/1 labels of the training rows followed by the
        test rows, as produced by ``prepare_data``.
    """
    # Attribute schema: ("continuous",) marks a numeric attribute, any other
    # tuple lists the category names of a categorical attribute.
    inputs = (
        ("age", ("continuous",)),
        ("workclass", ("Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", "Local-gov", "State-gov",
                       "Without-pay", "Never-worked")),
        ("fnlwgt", ("continuous",)),
        ("education", ("Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", "Assoc-acdm", "Assoc-voc", "9th",
                       "7th-8th", "12th", "Masters", "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool")),
        ("education-num", ("continuous",)),
        ("marital-status", ("Married-civ-spouse", "Divorced", "Never-married", "Separated", "Widowed",
                            "Married-spouse-absent", "Married-AF-spouse")),
        ("occupation", ("Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial", "Prof-specialty",
                        "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", "Farming-fishing", "Transport-moving",
                        "Priv-house-serv", "Protective-serv", "Armed-Forces")),
        ("relationship", ("Wife", "Own-child", "Husband", "Not-in-family", "Other-relative", "Unmarried")),
        ("race", ("White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black")),
        ("sex", ("Female", "Male")),
        ("capital-gain", ("continuous",)),
        ("capital-loss", ("continuous",)),
        ("hours-per-week", ("continuous",)),
        ("native-country", ("United-States", "Cambodia", "England", "Puerto-Rico", "Canada", "Germany",
                            "Outlying-US(Guam-USVI-etc)", "India", "Japan", "Greece", "South", "China", "Cuba", "Iran",
                            "Honduras", "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico", "Portugal",
                            "Ireland", "France", "Dominican-Republic", "Laos", "Ecuador", "Taiwan", "Haiti", "Columbia",
                            "Hungary", "Guatemala", "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
                            "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands")),
    )

    # Number of encoded features per attribute: attributes with at most two
    # listed values (continuous ones and the binary "sex") become a single
    # feature, every other attribute gets one feature per category.
    input_shape = [1 if len(values) <= 2 else len(values) for _, values in inputs]

    cols = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status",
            "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
            "Hours per week", "Country", "Target"]

    def read_split(file_path):
        """Read one Adult CSV file into an array with the columns ordered as ``cols``."""
        frame = pd.read_csv(
            file_path,
            names=cols,
            sep=r'\s*,\s*',  # also swallows the blanks around each comma
            engine='python',
            na_values="?")
        return frame[cols].values

    training_data = read_split(path_to_data + "/adult/adult.data")
    test_data = read_split(path_to_data + "/adult/adult.test")

    # Column means are computed over train and test rows together.
    means = find_means_for_continuous_types(np.concatenate((training_data, test_data), 0))

    x_train, y_train = prepare_data(training_data, means, inputs, input_shape)
    x_test, y_test = prepare_data(test_data, means, inputs, input_shape)
    X_temp = np.concatenate([x_train, x_test])
    y_temp = np.concatenate([y_train, y_test])
    return X_temp, y_temp

In [8]:
# NOTE: the stock `adult.test` file of the Adult dataset is formatted slightly
# differently from `adult.data`. Before running this cell, remove its first line
# and strip the period at the end of every line (in a text editor, replace `K.`
# with `K`).
X_temp, y_temp = loading_adult(path_to_data, random_state)
# Convert the 0/1 labels into a two-column categorical (one-hot) matrix.
y_temp =  keras.utils.to_categorical(y_temp, 2)

### Training the models

In [9]:
def create_adult_base():
    """Build the uncompiled dense classifier used for the Adult experiments.

    The network stacks four 20-unit ``tanh`` layers and a 2-unit ``softmax``
    output layer. Each layer gets its own ``RandomNormal(mean=0.0,
    stddev=0.01)`` kernel initializer and zero-initialized biases.

    Returns
    -------
    keras.models.Sequential
        The freshly created model (not compiled, not trained).
    """
    # Hyperparameters noted by the original author next to this architecture
    # (they are not applied inside this function):
    # batch_size = 512
    # epochs = 50
    # learning_rate = 0.01

    def make_dense(units, activation):
        # A new initializer object per layer, exactly one per Dense layer.
        return Dense(units, activation=activation,
                     kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None),
                     bias_initializer='zeros')

    layer_specs = [(20, 'tanh')] * 4 + [(2, 'softmax')]

    model = Sequential()
    for units, activation in layer_specs:
        model.add(make_dense(units, activation))
    return model

In [10]:
def evaluate_model(model,X,y):
    """Compute per-sample loss, prediction and gradient-attribution statistics.

    Parameters
    ----------
    model : keras model
        Trained classifier whose ``predict`` output is compared against ``y``.
    X : numpy.ndarray
        Input samples, one per row.
    y : numpy.ndarray
        Target rows in the same layout as the model output
        (presumably one-hot - confirm against the caller).

    Returns
    -------
    tuple
        ``(loss, prediction_var, analysis_var, analysis_1, analysis_2)``:
        the categorical cross-entropy of the predictions, the variance of each
        prediction row, and the variance, L1 norm and L2 norm of each row of
        the "gradient" analysis.
    """
    prediction = model.predict(X)
    # Keras losses take (y_true, y_pred) in that order: targets first,
    # predictions second.
    loss = keras.losses.categorical_crossentropy(tf.convert_to_tensor(y),tf.convert_to_tensor(prediction))
    # The loss is a symbolic tensor; evaluate it in the notebook-level session.
    with sess.as_default():
        loss = loss.eval()
    try:
        # noinspection PyUnresolvedReferences
        model_wo_softmax = iutils.keras.graph.model_wo_softmax(model)
    except Exception:
        # Best effort: if the softmax cannot be stripped, analyze the model as is.
        model_wo_softmax = model
    analyzer = innvestigate.create_analyzer("gradient", model_wo_softmax)
    analysis = analyzer.analyze(X)
    # All statistics below are reduced along axis 1, i.e. one value per sample.
    prediction_var = np.var(prediction,axis=1)
    analysis_var = np.var(analysis,axis=1)
    analysis_1 = np.linalg.norm(analysis,axis=1,ord=1)
    analysis_2 = np.linalg.norm(analysis,axis=1,ord=2)
    return loss, prediction_var, analysis_var, analysis_1, analysis_2

In [11]:
# Settings for the threshold experiments run in the next cell.
lr = 0.001  # learning rate given to the Adagrad optimizer
epochs = 20  # number of epochs passed to model.fit
points_per_model = 5000  # rows in each model's training slice and in its test slice
decay = 1e-7  # decay value given to the Adagrad optimizer
verbose = 0  # verbosity passed to model.fit
experiment_name = "adult"  # sub-directory of ThresholdExperiments/ that receives the CSV files

In [12]:
# For every seed: draw 40000 rows, train 4 models on disjoint slices of them and
# write the per-sample statistics of each model to its own CSV file.
for random_state in range(1):
    print(random_state)
    print("######")
    path="ThresholdExperiments/{}/{}/".format(experiment_name,random_state)
    # exist_ok=True keeps the cell re-runnable: without it a second run stops
    # with FileExistsError. CSV files from an earlier run are overwritten.
    os.makedirs(path, exist_ok=True)
    np.random.seed(random_state)
    main_indicies = np.random.choice(len(X_temp),40000,replace=False)
    X,y = X_temp[main_indicies], y_temp[main_indicies]
    start = time.time()
    for model_number in range(4):
        print(model_number, end=" ")
        # Model k trains on rows [2k*P, (2k+1)*P) and is tested on rows
        # [(2k+1)*P, (2k+2)*P) with P = points_per_model, so no row is shared
        # between models or between a model's train and test slice.
        start_train = 2*model_number*points_per_model
        start_test = start_train+points_per_model
        end_test = start_test+points_per_model
        x_train, x_test =  X[start_train:start_test], X[start_test:end_test]
        y_train, y_test =  y[start_train:start_test], y[start_test:end_test]
        optimizer = optimizers.Adagrad(lr=lr, decay=decay)
        model = create_adult_base()
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        model.fit(x_train, y_train, epochs=epochs, 
              validation_data=(x_test, y_test), verbose=verbose)
        train_loss, train_prediction_var, train_analysis_var, train_analysis_1, train_analysis_2 = evaluate_model(model,x_train,y_train)
        df = pd.DataFrame()
        df['train_loss'] = train_loss
        df['train_prediction_var'] = train_prediction_var
        df['train_analysis_var'] = train_analysis_var
        df['train_analysis_1'] = train_analysis_1
        df['train_analysis_2'] = train_analysis_2
        test_loss, test_prediction_var, test_analysis_var, test_analysis_1, test_analysis_2 = evaluate_model(model,x_test,y_test)
        df['test_loss'] = test_loss
        df['test_prediction_var'] = test_prediction_var
        df['test_analysis_var'] = test_analysis_var
        df['test_analysis_1'] = test_analysis_1
        df['test_analysis_2'] = test_analysis_2
        df.to_csv(os.path.join(path, "{}.csv".format(model_number)))
        # Seconds elapsed since the first model of this seed started (cumulative).
        print(time.time() - start)

0
######





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where







30.86101794242859
1 66.04473161697388
2 101.07218980789185
3 136.77771520614624
