In [1]:
import numpy as np
import os
import pandas as pd
from shutil import rmtree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import sys
sys.path.insert(0, "../Code/")
from train import train_eval

# Seed handed to train_test_split as random_state so the train/test split is repeatable.
SPLIT_SEED = 0


In [2]:
def load(dataset, data_dir = "../fairness-comparison/fairness/data/preprocessed"):
    """Load one preprocessed fairness dataset as feature and label arrays.

    Parameters
    ----------
    dataset : str
        One of "adult", "ricci" or "german".
    data_dir : str, optional
        Directory containing "<dataset>_numerical-binsensitive.csv".
        Defaults to the location used by the fairness-comparison checkout.

    Returns
    -------
    x_full : numpy.ndarray
        Every column of the CSV except the label column (for "adult", also
        without the "native-*" and "marital-*" columns).
    y_full : numpy.ndarray
        The label column: used as-is for "adult", thresholded to
        1 if "Combine" > 70 else 0 for "ricci", and taken modulo 2 for
        "german" (so the original 1 -> good stays 1 and 2 -> bad becomes 0).

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported names.
    """
    # Name of the label column in each supported dataset.
    label_columns = {
        "adult": "income-per-year",
        "ricci": "Combine",
        "german": "credit",
    }

    # Fail early with a clear message; an unknown name used to fall through
    # every branch and only crash later on the undefined x_full.
    if dataset not in label_columns:
        raise ValueError(
            "Unknown dataset {!r}; expected one of {}".format(dataset, sorted(label_columns))
        )

    data = pd.read_csv(os.path.join(data_dir, dataset + "_numerical-binsensitive.csv"))

    if dataset == "adult":
        # Drop the native-* and marital-* columns in a single,
        # non-mutating call instead of one in-place drop per column.
        excluded = [name for name in data.columns if name.startswith(("native-", "marital-"))]
        data = data.drop(columns = excluded)

    # Note: the printed list still includes the label column.
    print("Features that we are considering")
    print(list(data))

    label = label_columns[dataset]
    x_full = data.drop(columns = [label]).values
    raw_labels = data[label].values

    if dataset == "ricci":
        y_full = 1 * (raw_labels > 70)
    elif dataset == "german":
        y_full = raw_labels % 2 # original: 2 -> bad, 1-> good
    else:
        y_full = raw_labels

    print("Final Dimensions")
    print(x_full.shape)
    print(y_full.shape)

    return x_full, y_full

In [3]:
def run_baseline(x, y, seed = None):
    """Fit three baseline classifiers and print their accuracy and agreement.

    The data is split 75/25 into train and test sets, standardized with
    statistics fitted on the training portion only, and used to fit a
    logistic regression, an SVM and a random forest. Test accuracy of each
    model and the pairwise fraction of identical test predictions are printed.

    Parameters
    ----------
    x : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of `x`.
    seed : int, optional
        Seed for the train/test split and the random forest. When omitted,
        the notebook-level SPLIT_SEED is used.

    Returns
    -------
    None
    """
    # Resolve the default at call time so edits to SPLIT_SEED take effect
    # without re-running this definition cell.
    if seed is None:
        seed = SPLIT_SEED

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = seed)

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    models = {
        "Linear": LogisticRegression(),
        "SVM": SVC(),
        # Seeded so the forest, and therefore the printed numbers, are
        # reproducible across runs.
        "RF": RandomForestClassifier(random_state = seed),
    }

    # Predict once per model and reuse the predictions for both the accuracy
    # and the agreement figures.
    predictions = {}
    for label, model in models.items():
        model.fit(x_train, y_train)
        predictions[label] = model.predict(x_test)
        print(label + " Acc: ", np.mean(predictions[label] == y_test))

    print("Linear & SVM Agreement: ", np.mean(predictions["Linear"] == predictions["SVM"]))
    print("Linear & RF Agreement: ", np.mean(predictions["Linear"] == predictions["RF"]))
    print("SVM & RF Agreement: ", np.mean(predictions["SVM"] == predictions["RF"]))


In [4]:
# Load the Adult dataset, then print baseline accuracies and pairwise model agreement.
x, y = load("adult")
run_baseline(x, y)

Features that we are considering
['age', 'education-num', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'income-per-year', 'race-sex', 'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc', 'workclass_State-gov', 'workclass_Without-pay', 'education_10th', 'education_11th', 'education_12th', 'education_1st-4th', 'education_5th-6th', 'education_7th-8th', 'education_9th', 'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors', 'education_Doctorate', 'education_HS-grad', 'education_Masters', 'education_Preschool', 'education_Prof-school', 'education_Some-college', 'occupation_Adm-clerical', 'occupation_Armed-Forces', 'occupation_Craft-repair', 'occupation_Exec-managerial', 'occupation_Farming-fishing', 'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct', 'occupation_Other-service', 'occupation_Priv-house-serv', 'occupation_Prof-specialty', 'occupation_Protective-serv', 'occupat

TypeError: Invalid parameters passed: {'seed': 0}

In [None]:
# Load the German credit dataset, then print baseline accuracies and pairwise model agreement.
# x and y are rebound here; the train_eval cell below reads these globals.
x, y = load("german")
run_baseline(x, y)

In [None]:

# Split whichever dataset the last load(...) cell left in x and y,
# holding out 25% for testing with the shared split seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.25, random_state = SPLIT_SEED
)

# Remove the directory left by an earlier run so this run starts clean.
# NOTE(review): presumably where train_eval writes its TensorBoard logs - confirm.
tb_dir = "TB/none"
if os.path.exists(tb_dir):
    rmtree(tb_dir)

# Labels are passed with an extra axis inserted at position 1.
train_eval(
    x_train,
    np.expand_dims(y_train, 1),
    "binary_classification",
    hidden_layer_sizes = [150, 150, 150],
    learning_rate = 0.001,
    stopping_epochs = 100,
    tol = 0.0001,
    x_test = x_test,
    y_test = np.expand_dims(y_test, 1),
)
