In [None]:
# Short label of the model evaluated by this notebook (change it when running
# a different model).
model_name = "LR"

# Results folder for this model. The value is appended directly to
# os.getcwd() when the result paths are built, hence the leading "/".
model_folder_name = "/LR"

In [None]:
import pandas as pd
import os
import subprocess
import numpy as np
import time

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc,roc_curve,confusion_matrix

In [None]:
#############################
# list experiment parameters
# note that if the number of parties is different than 3 -
# you need to create the required configuration files that are used by the viff package
#############################
parties=3

# Directory layout (all relative to the current working directory):
#   public_path  - data shared by all parties (the constants/*.ini files are written there)
#   private_path - one sub-folder per party, under a folder named after the number of parties
#   ds_path      - the input datasets (<dataset_name>.csv)
public_path=os.getcwd()+"/MyDistExperiment/public"
private_path=os.getcwd()+"/MyDistExperiment/private/" + str(parties) + "p"

ds_path=os.getcwd()+"/MyDistExperiment/datasets"

# Repair levels 0.0, 0.1, ..., 1.0.
# .tolist() turns the NumPy scalars into plain Python floats: str(lams) is
# written to lam.ini further down, and a list of np.float64 is rendered as
# "[np.float64(0.0), ...]" by NumPy >= 2, while plain floats always give
# "[0.0, 0.1, ...]".
lams=np.around(np.arange(0.0,1.1,0.1),decimals=2).tolist()
print(lams)

# repair types to evaluate
#repairs=["III_min","I_min","I_med"]
repairs=["III_min"]

# datasets to evaluate (file names without the .csv extension)
#datasets=["synthetic9"]
#datasets=["propublica-recidivism_numerical-binsensitive_race_1attr"]
datasets=["propublica-recidivism_numerical-binsensitive_race"]

print(datasets)


In [None]:
#############################
# after choosing a dataset
# we split it with the same random seed to train and test
# we split it with the same random seed to each of the parties
# here we define the functions for creating these partitions
#############################

def create_train_test_splits(num,TRAINING_PERCENT,df):
    """Create `num` reproducible shuffled train/test partitions of `df`.

    Split number i shuffles the row positions with random seed i, so the same
    arguments always produce the same partitions.

    Parameters
    ----------
    num : int
        Number of (train, test) pairs to create; seeds 0..num-1 are used.
    TRAINING_PERCENT : float
        Fraction of the rows assigned to the training part. The cut position
        is int(len(df) * TRAINING_PERCENT).
    df : pandas.DataFrame
        Data to split. Rows are selected by position (iloc).

    Returns
    -------
    list of tuple
        `num` tuples (train, test) of DataFrames.
    """
    # Both values are the same for every split, so compute them once.
    n = len(df)
    split_ix = int(n * TRAINING_PERCENT)

    splits = []
    for seed in range(num):
        # A private RandomState seeded with `seed` produces the same
        # permutation as np.random.seed(seed) + np.random.shuffle, without
        # reseeding the process-wide generator as a side effect.
        order = np.arange(n)
        np.random.RandomState(seed).shuffle(order)

        train = df.iloc[order[:split_ix]]
        test = df.iloc[order[split_ix:]]
        splits.append((train, test))

    return splits

###########################
#we usually use num=1 since the results are exact in our MPC repair process
###########################

def create_random_splits_for_parties(df,num=1,PARTIES=None,SPLIT_PERCENTS=None):
    """Randomly partition the rows of `df` among the parties.

    Partitioning number i shuffles the row positions with random seed i+100
    and hands consecutive chunks of the shuffled order to the parties.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to distribute. Rows are selected by position (iloc).
    num : int, default 1
        Number of independent partitionings to create.
    PARTIES : int, optional
        Number of parties. When omitted, the notebook-level `parties` setting
        is read at call time (so re-running the parameters cell is honoured).
    SPLIT_PERCENTS : list of float, optional
        Fraction of the rows for each party; must have one entry per party and
        sum to 1. Defaults to equal shares.

    Returns
    -------
    list of list
        `num` lists, each holding one DataFrame per party.

    Raises
    ------
    RuntimeError
        If the fractions do not sum to 1 or their count differs from PARTIES.
    """
    if PARTIES is None:
        PARTIES = parties
    if SPLIT_PERCENTS is None:
        SPLIT_PERCENTS=[1.0/PARTIES]*PARTIES

    # Floating-point sums of valid fractions can land a hair above or below 1,
    # so the same small tolerance is applied on both sides.
    tolerance = 1e-13
    total = sum(SPLIT_PERCENTS)
    if total - 1 > tolerance:
        raise RuntimeError("The sum of the inputs of split percents is larger than 1")
    if 1 - total > tolerance:
        raise RuntimeError("The sum of the inputs of split percents is lower than 1")
    if len(SPLIT_PERCENTS)>PARTIES:
        raise RuntimeError("The length of the inputs of split percents is larger than the number of parties")
    if len(SPLIT_PERCENTS)<PARTIES:
        raise RuntimeError("The length of the inputs of split percents is lower than the number of parties")

    n = len(df)
    splits = []
    for i in range(num):
        # A private RandomState gives the same permutation as
        # np.random.seed(i+100) + np.random.shuffle, without touching the
        # process-wide generator.
        order = np.arange(n)
        np.random.RandomState(i + 100).shuffle(order)

        parties_dfs = []
        start_split_ix = 0
        end_split_ix = 0
        for ell in range(PARTIES):
            end_split_ix = end_split_ix + int(n * SPLIT_PERCENTS[ell])
            if ell == PARTIES - 1:
                # The last party takes whatever the int() truncations left over.
                end_split_ix = n
            parties_dfs.append(df.iloc[order[start_split_ix:end_split_ix]])
            start_split_ix = end_split_ix

        splits.append(parties_dfs)

    return splits

In [None]:
%%capture
for dataset_name in datasets:
    
    ####################
    # define dataset related properties
    # note: ignore_features are the features to ignore in the repair stage
    ####################
    
    df=None
    
    dataset_path=ds_path+"/" + dataset_name + ".csv"
    df=pd.read_table(dataset_path,delimiter=",")
    print(dataset_name)
    
    
    if dataset_name.startswith(("synthetic")):
        sensitive_feature="Socioeconomic_status"
        priv_value="Privileged"
        target_feature="Success"
        positive_class_val=0
        ignore_features=[]


    if dataset_name.startswith(("propublica-recidivism")):
        sensitive_feature="race"
        #sensitive_feature="sex"
        priv_value=1
        target_feature="two_year_recid"
        positive_class_val=1
        #positive_class_val=0
        ignore_features=[]

        

    # features - all features that are not in the target, sensitive, ignore
    all_features=[col for col in df.columns if col not in [sensitive_feature,target_feature,ignore_features]]
    print(all_features)

    alphas=[]
    betas=[]
    max_bin_sizes=[]
    features=[]
    for attr in all_features:
        #number of unique values
        num_vals=len(list(set(df[attr])))
        if num_vals>2:
            features.append(attr)
            alphas.append(min(df[attr]))
            betas.append(max(df[attr]))
            max_bin_sizes.append(num_vals)
        else:
            ignore_features.append(attr)

    print(ignore_features)
    print(features)
    print(alphas)
    print(betas)    

 
    print("######")
    print("#bins#")
    print("######")

    #####
    #only at most 10% of the min between sum(priv_sizes) and sum(unpriv_sizes), since B<<nu,nv
    #####

    n_v=sum(df[sensitive_feature]==priv_value)
    n_u=sum(df[sensitive_feature]!=priv_value)
    print(n_v,n_u)

    lower_n=min(n_u,n_v)
    print(lower_n)
    bin_nums_list=[]

    bnlst=[1,2,3,4,6,8,9,10]

    print(bnlst)
    
    for b in bnlst:
        bin_nums=[b]*len(features)
        bin_nums_list.append(bin_nums)

    print(bin_nums_list)
    print(len(bin_nums_list))
    
    
    
    #####################################
    # creating partitioning for parties
    #####################################

    df_splits=create_random_splits_for_parties(df,1,parties,[1/parties]*parties)

    priv_sizes=[]
    unpriv_sizes=[]
    for ell in range(0,len(df_splits[0])):
        df_splits[0][ell].insert(len(df_splits[0][ell].columns),"party",ell)
        df_party=df_splits[0][ell]
        df_party.to_csv(private_path+"/party"+str(ell)+"/"+dataset_name+"_partial.csv",index=True,index_label="index")
        priv_sizes.append(sum(df_party[sensitive_feature]==priv_value))
        unpriv_sizes.append(sum(df_party[sensitive_feature]!=priv_value))

    df_with_parties=pd.concat(df_splits[0]).sort_index()
    print(df_with_parties)

    ignore_features.append('party')
    ignore_features.append('index')

    print(priv_sizes)
    print(unpriv_sizes)
    
    ###########
    # save constant parameters
    ###########
    
    with open(public_path+"/constants/parties.ini", 'w') as f:
        (f.write(str(parties)))
    with open(public_path+"/constants/repairs.ini", 'w') as f:
        (f.write(str(repairs)))
    with open(public_path+"/constants/dataset_name.ini", 'w') as f:
        (f.write(str(dataset_name)))
    with open(public_path+"/constants/lam.ini", 'w') as f:
        (f.write(str(lams)))
    with open(public_path+"/constants/features.ini", 'w') as f:
        (f.write(str(features)))
    with open(public_path+"/constants/ignore_features.ini", 'w') as f:
        (f.write(str(ignore_features)))
    with open(public_path+"/constants/sensitive_feature.ini", 'w') as f:
        (f.write(sensitive_feature+","+str(priv_value)))
    with open(public_path+"/constants/target_feature.ini", 'w') as f:
        (f.write(target_feature))
    with open(public_path+"/constants/alphas.ini", 'w') as f:
        (f.write(str(alphas)))
    with open(public_path+"/constants/betas.ini", 'w') as f:
        (f.write(str(betas)))
    with open(public_path+"/constants/priv_sizes.ini", 'w') as f:
        (f.write(str(priv_sizes)))
    with open(public_path+"/constants/unpriv_sizes.ini", 'w') as f:
        (f.write(str(unpriv_sizes)))


    
    ###########
    # loop through bin_nums_list
    ###########
        
    num_train_test_splits=10
    train_percent=2.0/3.0

    with open(os.getcwd()+ model_folder_name +'/gran_results_' + dataset_name + "_" + sensitive_feature+ "_p" + str(parties) + ".csv", 'a') as f:
        f.write("dataset_name,sensitive_feature,priv_value,lam,max_bin,bin_nums,parties,n_u,n_v,priv_sizes,unpriv_sizes,alphas,betas,number_of_features,iteration,DFNR,DFPR,SUM_DIFFS,ACC,bin_repair_for_all_lams_together_time[sec],repair_type\n")

    
    with open(os.getcwd()+ model_folder_name +'/results_' + dataset_name + "_" + sensitive_feature+ "_p" + str(parties) + ".csv", 'a') as f:
        f.write("dataset_name,sensitive_feature,priv_value,lam,max_bin,bin_nums,parties,n_u,n_v,priv_sizes,unpriv_sizes,alphas,betas,number_of_features,AVG_DFNR,AVG_DFPR,SUM_DIFFS,AVG_ACC,bin_repair_for_all_lams_together_time[sec],repair_type\n")


    ###########
    # loop through bin_nums_list
    #     change bin_nums.ini
    #     run *3* (or more) parties' notebooks
    #     loop through lambda values
    #         create train-test splits
    #         run ML model
    #         record results
    ###########


    #########
    # loop through bin_nums_list
    #########

    for bin_nums in bin_nums_list:
    #for bin_nums in [bin_nums_list[0]]:

        bin_repair_for_all_lams_together_start_time = time.time()

        #########
        # change bin_nums.ini
        #########
        with open(public_path+"/constants/bin_nums.ini", 'w') as f:
            (f.write(str(bin_nums)))

        #########
        # convert notebooks to python files
        #########
        for i in range(0,parties):
            player_py_path='MyMultiDistExp_player' + str(i) + '.ipynb'
            print(player_py_path)
            python_command = 'jupyter nbconvert --to python ' + player_py_path
            process = subprocess.Popen(python_command.split(), stdout=subprocess.PIPE)

        #########
        # run *3* (or more) python files - one for each party
        #########
        python_command=""
        for i in range(0,parties):
            player_py_path='MyMultiDistExp_player' + str(i) + '.py'
            python_command = python_command + ' start python ' + player_py_path + ' &'
        python_command=python_command[1:-1]
        !$python_command

        bin_repair_for_all_lams_together_time = time.time()-bin_repair_for_all_lams_together_start_time

        #########
        #loop through lambda values and repair types
        #########
        for repair_type in repairs:
            for lam in lams:
                #################
                ##concat all datasets - to test ML's performance on repaired dataset
                #################
                print('####################################')
                print('#Datasets of parties:')
                print('####################################')
                df_full_repaired = pd.DataFrame({}) 
                for i in range(0,parties):
                    df_tmp_repaired=pd.read_table(private_path+"/party"+str(i)+"/" + dataset_name + "_sens-" + str(sensitive_feature) + "_rep_lam"+str(lam)+"_bins" + str(bin_nums[0]) + "_rep_type" + repair_type + ".csv",delimiter=",",index_col="index")
                    print(df_tmp_repaired)
                    df_full_repaired=pd.concat([df_full_repaired,df_tmp_repaired])

                df_full_repaired=df_full_repaired.sort_index()

                ################
                ##create train-test splits from the repaired dataset
                ##execute ML model on each split
                ################

                df_train_test_splits=create_train_test_splits(num_train_test_splits,train_percent,df_full_repaired)

                Sum_DFNR=0
                Sum_DFPR=0
                Sum_Acc=0

                for i in range(0,num_train_test_splits):
                    train=df_train_test_splits[i][0]
                    test=df_train_test_splits[i][1]
                    train.insert(len(train.columns),"isTraining",1)
                    test.insert(len(test.columns),"isTraining",0)

                    df_full_repaired_t=pd.concat([train,test])
                    print('####################################')
                    print('#Full dataset:')
                    print('####################################')
                    print(df_full_repaired_t)

                    #####################################
                    ## Here change values such that the y vectors will have 1 where it is equal to positive_class_val and 0 elsewhere.
                    ######################################

                    y_train=np.array(df_full_repaired_t[target_feature][df_full_repaired_t["isTraining"]==1])
                    y_train=(y_train==positive_class_val).astype(int)

                    y_test=np.array(df_full_repaired_t[target_feature][df_full_repaired_t["isTraining"]==0])
                    y_test=(y_test==positive_class_val).astype(int)


                    sensitive_train=np.array(df_full_repaired_t[sensitive_feature][df_full_repaired_t["isTraining"]==1])
                    sensitive_test=np.array(df_full_repaired_t[sensitive_feature][df_full_repaired_t["isTraining"]==0])

                    more_features=[col for col in ignore_features if col not in ['party', 'index']]
                    X_train=df_full_repaired_t[[*features,*more_features]][df_full_repaired_t["isTraining"]==1]
                    X_test=df_full_repaired_t[[*features,*more_features]][df_full_repaired_t["isTraining"]==0]

                    X_test_non_protected=X_test[sensitive_test==priv_value]
                    X_test_protected=X_test[sensitive_test!=priv_value]

                    y_test_non_protected=y_test[sensitive_test==priv_value]
                    y_test_protected=y_test[sensitive_test!=priv_value]

        
                    #****************************
                    # LR model
                    #****************************

                    logreg = LogisticRegression(penalty='l2',max_iter=1000)
                    logreg.fit(X_train, y_train)
                    print('Regular LR:')
                    
                    test_reslts=logreg.predict(X_test)
                    print("dataset: " + dataset_name + " repair: "+repair_type+" bins:" + str(bin_nums) + " lam:" + str(lam))
                    print(test_reslts)
                    test_accuracy=accuracy_score(y_test, test_reslts)
                    print(test_accuracy)


                    
                    ###################
                    # compute results
                    ###################

                    print('####################################')
                    print('#y_test_non_protected:')
                    print('####################################')
                    print(y_test_non_protected)
                    print('####################################')
                    print('#y_pred_non_protected:')
                    print('####################################')
                    y_pred_non_protected=logreg.predict(X_test_non_protected)
                    print(y_pred_non_protected)


                    print('####################################')
                    print('#cnf_matrix - non_protected:')
                    print('####################################')
                    cnf_matrix=confusion_matrix(y_test_non_protected,y_pred_non_protected,labels=[0,1])

                    print(cnf_matrix)

                    TN_np=cnf_matrix[0,0]
                    FP_np=cnf_matrix[0,1]
                    FN_np=cnf_matrix[1,0]
                    TP_np=cnf_matrix[1,1]
                    TNR_np=(0 if TN_np==0 else TN_np/(TN_np+FP_np))
                    TPR_np=(0 if TP_np==0 else TP_np/(TP_np+FN_np))
                    FPR_np=(0 if FP_np==0 else FP_np/(TN_np+FP_np))      #1-TNR_np
                    FNR_np=(0 if FN_np==0 else FN_np/(TP_np+FN_np))      #1-TPR_np

                    print("TN:", TN_np," FP:", FP_np," FN:", FN_np, " TP:", TP_np)
                    print("TNR:", TNR_np," FPR:", FPR_np," FNR:", FNR_np, " TPR:", TPR_np)

                    print('####################################')
                    print('#y_test_protected:')
                    print('####################################')
                    print(y_test_protected)        

                    print('####################################')
                    print('#y_pred_protected:')
                    print('####################################')
                    y_pred_protected=logreg.predict(X_test_protected)
                    print(y_pred_protected)


                    print('####################################')
                    print('#cnf_matrix - protected:')
                    print('####################################')
                    cnf_matrix=confusion_matrix(y_test_protected,y_pred_protected,labels=[0,1])

                    print(cnf_matrix)

                    TN_p=cnf_matrix[0,0]
                    FP_p=cnf_matrix[0,1]
                    FN_p=cnf_matrix[1,0]
                    TP_p=cnf_matrix[1,1]
                    TNR_p=(0 if TN_p==0 else TN_p/(TN_p+FP_p))    
                    TPR_p=(0 if TP_p==0 else TP_p/(TP_p+FN_p)) 
                    FPR_p=(0 if FP_p==0 else FP_p/(TN_p+FP_p))      #1-TNR_p
                    FNR_p=(0 if FN_p==0 else FN_p/(TP_p+FN_p))      #1-TPR_p

                    print("TN:", TN_p," FP:", FP_p," FN:", FN_p, " TP:", TP_p)
                    print("TNR:", TNR_p," FPR:", FPR_p," FNR:", FNR_p, " TPR:", TPR_p)


                    print('####################################')
                    print('#Results for this split:')
                    print('####################################')
                    print('|DFNR|: ',abs(FNR_np-FNR_p))
                    print('|DFPR|: ',abs(FPR_np-FPR_p))
                    print('Accuracy: ',test_accuracy)


                    Sum_DFNR=Sum_DFNR+abs(FNR_np-FNR_p)
                    Sum_DFPR=Sum_DFPR+abs(FPR_np-FPR_p)
                    Sum_Acc=Sum_Acc+test_accuracy
                    
                    DFNR=FNR_np-FNR_p
                    DFPR=FPR_np-FPR_p
                    SUM_DIFFS_granular=abs(DFNR)+abs(DFPR)
                    
                    valList_gran=[dataset_name,sensitive_feature,priv_value,lam,max(bin_nums),str(bin_nums),
                                     parties,n_u,n_v,str(priv_sizes),str(unpriv_sizes),str(alphas),str(betas),len(features),str(i),
                                     DFNR,DFPR,SUM_DIFFS_granular,test_accuracy,bin_repair_for_all_lams_together_time,repair_type]


                    with open(os.getcwd()+ model_folder_name +'/gran_results_' + dataset_name + "_" + sensitive_feature + "_p" + str(parties) +".csv", 'a') as f:
                        for item in valList_gran:
                            f.write('"%s",' % item)
                        f.write('\n')
                                        
                    print('\n')

        #************
        # Here we save and record all results
        #************

                AVG_DFNR=Sum_DFNR/num_train_test_splits
                AVG_DFPR=Sum_DFPR/num_train_test_splits
                SUM_DIFFS=abs(Sum_DFPR/num_train_test_splits)+abs(Sum_DFNR/num_train_test_splits)
                AVG_ACC=Sum_Acc/num_train_test_splits
                print('lambda: ', lam)
                print('AVG_DFNR: ',AVG_DFNR)
                print('AVG_DFPR: ',AVG_DFPR)
                print('SUM_DIFFS: ',SUM_DIFFS)
                print('AVG_ACC: ',AVG_ACC)



                valList=[dataset_name,sensitive_feature,priv_value,lam,max(bin_nums),str(bin_nums),
                                 parties,n_u,n_v,str(priv_sizes),str(unpriv_sizes),str(alphas),str(betas),len(features),
                                 AVG_DFNR,AVG_DFPR,SUM_DIFFS,AVG_ACC,bin_repair_for_all_lams_together_time,repair_type]

                with open(os.getcwd()+ model_folder_name +'/results_' + dataset_name + "_" + sensitive_feature + "_p" + str(parties) +".csv", 'a') as f:
                    for item in valList:
                        f.write('"%s",' % item)
                    f.write('\n')

                    