In [1]:
import json
import numpy as np
import os
import pandas as pd
from shutil import rmtree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import sys
sys.path.insert(0, "../Code/")
from train import train_eval

SPLIT_SEED = 0


In [2]:
def load(dataset):
    if dataset == "adult":

        data = pd.read_csv("../fairness-comparison/fairness/data/preprocessed/adult_numerical-binsensitive.csv")

        colnames = list(data)

        for name in colnames:
            if name[:7] == "native-" or name[:8] == "marital-":
                data.drop(labels = [name], inplace = True, axis = 1)

        print("Features that we are considering")
        print(list(data))

        x_full = data.drop(labels = ["income-per-year"], inplace = False, axis = 1).values
        y_full = data["income-per-year"].values

    elif dataset == "ricci":

        data = pd.read_csv("../fairness-comparison/fairness/data/preprocessed/ricci_numerical-binsensitive.csv")

        print("Features that we are considering")
        print(list(data))

        x_full = data.drop(labels = ["Combine"], inplace = False, axis = 1).values
        y_full = 1 * (data["Combine"].values > 70)

    elif dataset == "german":

        data = pd.read_csv("../fairness-comparison/fairness/data/preprocessed/german_numerical-binsensitive.csv")

        print("Features that we are considering")
        print(list(data))

        x_full = data.drop(labels = ["credit"], inplace = False, axis = 1).values
        y_full = data["credit"].values % 2 # original: 2 -> bad, 1-> good

    print("Final Dimensions")
    print(x_full.shape)
    print(y_full.shape)
    
    return x_full, y_full

In [3]:
def run_baseline(x, y):

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = SPLIT_SEED)
    
    scaler = StandardScaler()

    scaler.fit(x_train)

    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    
    model_lr = LogisticRegression().fit(x_train, y_train)
    print("Linear Acc: ", model_lr.score(x_test, y_test))
    
    model_svm = SVC().fit(x_train, y_train)
    print("SVM Acc: ", model_svm.score(x_test, y_test))
    
    model_rf = RandomForestClassifier().fit(x_train, y_train)
    print("RF Acc: ", model_rf.score(x_test, y_test))
    
    pred_lr = model_lr.predict(x_test)
    pred_svm = model_svm.predict(x_test)
    pred_rf = model_rf.predict(x_test)

    print("Linear & SVM Agreement: ", np.mean(pred_lr == pred_svm))
    print("Linear & RF Agreement: ", np.mean(pred_lr == pred_rf))
    print("SV & RF Agreement: ", np.mean(pred_svm == pred_rf))


In [4]:
x, y = load("adult")
run_baseline(x, y)

Features that we are considering
['age', 'education-num', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'income-per-year', 'race-sex', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'education_10th', 'education_11th', 'education_12th', 'education_1st-4th', 'education_5th-6th', 'education_7th-8th', 'education_9th', 'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate', 'education_HS-grad', 'education_Masters', 'education_Preschool', 'education_Prof-school', 'education_Some-college', 'occupation_Adm-clerical', 'occupation_Armed-Forces', 'occupation_Craft-repair', 'occupation_Exec-managerial', 'occupation_Farming-fishing', 'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct', 'occupation_Other-service', 'occupation_Priv-house-serv', 'occupation_Prof-specialty', 'occupation_Protective-serv', 'occupat



Linear Acc:  0.8387481766343986
SVM Acc:  0.8421959952260973
RF Acc:  0.8301286301551518




Linear & SVM Agreement:  0.9474870706802812
Linear & RF Agreement:  0.8813154754011404
SV & RF Agreement:  0.8871502453255536


In [5]:
def eval_adult(sess, pred, perf_op, X, Y, x, y):

        n = x.shape[0]
        diffs = np.zeros((3))
        
        pred_original = sess.run(pred, {X: x})
        
        # Test the affect of increasing Age (feature at index 0)
        x_inc = np.copy(x)
        x_inc[:, 5] += 1
        pred_inc = sess.run(pred, {X: x_inc})
        diffs[0] = np.mean(pred_inc - pred_original)
        
        # Test the affect of changing Race (feature at index 2)    
        x_swap = np.copy(x)
        x_swap[:, 2] = (x_swap[:, 2] + 1) % 2
        pred_swap = sess.run(pred, {X: x_swap})
        diffs[1] = np.sqrt(np.mean((pred_swap - pred_original) ** 2))
        
        # Test the affect of changing Sex (feature at index 3)    
        x_swap = np.copy(x)
        x_swap[:, 3] = (x_swap[:, 3] + 1) % 2
        pred_swap = sess.run(pred, {X: x_swap})
        diffs[2] = np.sqrt(np.mean((pred_swap - pred_original) ** 2))
        
        out = {}
        out["Model Acc"] = sess.run(perf_op, {X: x, Y: y})
        out["Mean affect of increasing age by 1"] = diffs[0]
        out["RMSE of changing Race"] = diffs[1]
        out["RMSE of changing Sex"] = diffs[2]
        print(out)
        #with open("tests.txt", "w") as outfile:
            #json.dump(out, outfile)

In [6]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = SPLIT_SEED)
    
if os.path.exists("TB/Adult/"):
    rmtree("TB/Adult/")

train_eval(x_train, np.expand_dims(y_train, 1), "binary_classification", 
           hidden_layer_sizes = [150, 150, 150], learning_rate = 0.001, stopping_epochs = 100, tol = 0.0001, 
           eval_func = eval_adult, x_test = x_test, y_test = np.expand_dims(y_test, 1), flag = "UCI",
           name = "TB/Adult/")

train_eval(x_train, np.expand_dims(y_train, 1), "binary_classification", 
           hidden_layer_sizes = [150, 150, 150], learning_rate = 0.001, stopping_epochs = 100, tol = 0.0001, 
           eval_func = eval_adult, x_test = x_test, y_test = np.expand_dims(y_test, 1), flag = "UCI",
           name = "TB/Adult/",
           heuristics = [["inv", 0, 4, 1.0], ["inv", 2, 0.5, 1000.0], ["inv", 3, 0.5, 1000.0]])

INFO:tensorflow:Restoring parameters from ./model.cpkt
{'Model Acc': 0.85055035, 'Mean affect of increasing age by 1': 0.011059025302529335, 'RMSE of changing Race': 0.4230669438838959, 'RMSE of changing Sex': 0.6683206558227539}
INFO:tensorflow:Restoring parameters from ./model.cpkt
{'Model Acc': 0.8441851, 'Mean affect of increasing age by 1': 0.003944131545722485, 'RMSE of changing Race': 0.0002972083166241646, 'RMSE of changing Sex': 0.00036269091651774943}


In [7]:
x, y = load("german")
run_baseline(x, y)

Features that we are considering
['month', 'credit_amount', 'investment_as_income_percentage', 'sex', 'residence_since', 'age', 'number_of_credits', 'people_liable_for', 'credit', 'sex-age', 'status_A11', 'status_A12', 'status_A13', 'status_A14', 'credit_history_A30', 'credit_history_A31', 'credit_history_A32', 'credit_history_A33', 'credit_history_A34', 'purpose_A40', 'purpose_A41', 'purpose_A410', 'purpose_A42', 'purpose_A43', 'purpose_A44', 'purpose_A45', 'purpose_A46', 'purpose_A48', 'purpose_A49', 'savings_A61', 'savings_A62', 'savings_A63', 'savings_A64', 'savings_A65', 'employment_A71', 'employment_A72', 'employment_A73', 'employment_A74', 'employment_A75', 'other_debtors_A101', 'other_debtors_A102', 'other_debtors_A103', 'property_A121', 'property_A122', 'property_A123', 'property_A124', 'installment_plans_A141', 'installment_plans_A142', 'installment_plans_A143', 'housing_A151', 'housing_A152', 'housing_A153', 'skill_level_A171', 'skill_level_A172', 'skill_level_A173', 'skill_



In [8]:
def eval_german(sess, pred, perf_op, X, Y, x, y):

        n = x.shape[0]
        diffs = np.zeros((2))
        
        pred_original = sess.run(pred, {X: x})
        
        # Test the affect of changing Sex (feature at index 3)    
        x_swap = np.copy(x)
        x_swap[:, 3] = (x_swap[:, 3] + 1) % 2
        pred_swap = sess.run(pred, {X: x_swap})
        diffs[0] = np.sqrt(np.mean((pred_swap - pred_original) ** 2))
        
        # Test the affect of increasing Age (feature at index 5)
        x_inc = np.copy(x)
        x_inc[:, 5] += 1
        pred_inc = sess.run(pred, {X: x_inc})
        diffs[1] = np.mean(pred_inc - pred_original)

        out = {}
        out["Model Acc"] = sess.run(perf_op, {X: x, Y: y})
        out["RMSE of changing Sex"] = diffs[0]
        out["Mean affect of increasing age by 1"] = diffs[1]
        print(out)
        #with open("tests.txt", "w") as outfile:
            #json.dump(out, outfile)


In [9]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = SPLIT_SEED)
    
if os.path.exists("TB/German/"):
    rmtree("TB/German/")

train_eval(x_train, np.expand_dims(y_train, 1), "binary_classification", 
           hidden_layer_sizes = [150, 150, 150], learning_rate = 0.001, stopping_epochs = 100, tol = 0.0001, 
           eval_func = eval_german, x_test = x_test, y_test = np.expand_dims(y_test, 1), flag = "UCI", 
           name = "TB/German/")

train_eval(x_train, np.expand_dims(y_train, 1), "binary_classification", 
           hidden_layer_sizes = [150, 150, 150], learning_rate = 0.001, stopping_epochs = 100, tol = 0.0001, 
           eval_func = eval_german, x_test = x_test, y_test = np.expand_dims(y_test, 1), flag = "UCI", 
           name = "TB/German/",
           heuristics = [["inv", 3, 0.5, 1000.0], ["inv", 5, 3.0, 1.0]])



INFO:tensorflow:Restoring parameters from ./model.cpkt
{'Model Acc': 0.712, 'RMSE of changing Sex': 0.10801751911640167, 'Mean affect of increasing age by 1': 0.0799630880355835}
INFO:tensorflow:Restoring parameters from ./model.cpkt
{'Model Acc': 0.756, 'RMSE of changing Sex': 0.0003071436076425016, 'Mean affect of increasing age by 1': -0.0002613307151477784}
