# Setup

In [None]:
%load_ext autoreload
%autoreload 2

import time
import string
import inspect
import numpy as np

import cvxpy as cp
import gurobipy

import copy, signal
import csv, datetime
import random
import itertools
from tqdm import tqdm

import numpy.random as random_numpy
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
from scipy.stats import uniform

from birkhoff import birkhoff_von_neumann_decomposition

rng = random_numpy.default_rng(1234)
CORES = 5 ## Number of parallel threads to run

from birkhoff_edited import fast_decomposition

import warnings
warnings.filterwarnings("ignore")

## Global Variables

In [None]:
# Base directory that is prepended (by plain string concatenation) to the data and
# model paths used below, so it must end with a path separator.
# Ideally, enter an absolute path.
home_folder = "./"

In [None]:
def debug(str):
    """Return the source text of a ``print`` call showing an expression and its value.

    Nothing is printed by this function itself: the returned string has to be
    passed to ``eval``/``exec`` by the caller, e.g. ``eval(debug('x'))`` prints
    ``x = <value of x>``.
    """
    template = 'print("{0}","=",eval("{0}"))'
    return template.format(str)

# Helper functions and ranking utilities

In [None]:
# Execute utils.py inside this namespace; the context manager closes the file
# handle, which the bare open(...).read() call left open.
with open('utils.py') as _source_file:
    exec(_source_file.read())

In [None]:
# Execute helper_funcs.py inside this namespace; the context manager closes the
# file handle, which the bare open(...).read() call left open.
with open('helper_funcs.py') as _source_file:
    exec(_source_file.read())

# Algorithms

In [None]:
# Execute algorithms.py inside this namespace; the context manager closes the
# file handle, which the bare open(...).read() call left open.
with open('algorithms.py') as _source_file:
    exec(_source_file.read())

# Simulation with real-world image data

The code below is adapted from the [Noisy-Fair-Subset-Selection](https://github.com/AnayMehrotra/Noisy-Fair-Subset-Selection) repository

In [None]:
import cv2
import random
import pickle

from numpy import matlib
from varname import nameof

### Helper functions (1)

In [None]:
# CSV file with the image labels; each row is read as
# occupation, image name, gender, skin tone (see stats_occupations_ds).
data_file = home_folder+'occupations_labels.csv'

In [None]:
def stats_occupations_ds(file, verbose=0):
    """Load the occupations label file and count gender / skin-tone labels per occupation.

    Every non-empty CSV row is read as ``occupation, image, gender, skin_tone``
    (further columns are ignored; a row with fewer than four columns raises
    ``IndexError``). Blank lines are skipped.

    Args:
        file: path of the CSV file.
        verbose: if truthy, print one CSV-style line per occupation:
            ``occupation , men , women , dark , light``.

    Returns:
        ``(occupations, occupations_stats)`` where ``occupations[o]`` is the list
        of ``{'image', 'gender', 'skin_tone'}`` records of occupation ``o`` (the
        image field is ``"<occupation>/<image>"``) and ``occupations_stats[o]``
        holds the keys ``total``, ``tot_men``, ``tot_women``, ``tot_dark`` and
        ``tot_light``.
    """
    occupations = {}
    # newline='' is the way the csv module asks files to be opened.
    with open(file, newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            if not row:
                continue  # csv.reader yields [] for a blank line
            occupations.setdefault(row[0], []).append(
                {'image': row[0]+"/"+row[1], 'gender': row[2], 'skin_tone': row[3]})

    occupations_stats = {}
    for o, images in occupations.items():
        occupations_stats[o] = {
            "total": len(images),
            "tot_men": sum(img['gender'] == 'Male' for img in images),
            "tot_women": sum(img['gender'] == 'Female' for img in images),
            "tot_dark": sum(img['skin_tone'] == 'dark' for img in images),
            "tot_light": sum(img['skin_tone'] == 'light' for img in images),
        }

    # prints file in csv format.
    if verbose:
        for o, counts in occupations_stats.items():
            print(o, ",", counts["tot_men"], ",", counts["tot_women"], ",",
                  counts["tot_dark"], ",", counts["tot_light"])
    return occupations, occupations_stats

# Module-level copies of the per-occupation image records and their count summaries.
occupations, occupations_stats = stats_occupations_ds(data_file)

def filter_occupations(tau_rule=0.8, least_number_of_humans=50, verbose=0):
    """Split the occupations of `data_file` into women-typical, men-typical and neutral.

    Only occupations with at least `least_number_of_humans` images labelled
    'Male' or 'Female' are considered. Such an occupation is women-typical if
    the fraction of 'Female' among those images is >= `tau_rule`, otherwise
    men-typical if the fraction of 'Male' is >= `tau_rule`, otherwise neutral.
    An occupation with zero such images raises ZeroDivisionError if it passes
    the count filter (i.e. when `least_number_of_humans` <= 0).

    Returns:
        ``(women_typical, men_typical, neutral)``, three lists of occupation names.
    """
    _, stats = stats_occupations_ds(data_file)
    women_typical, men_typical, neutral = [], [], []
    for name, counts in stats.items():
        labelled = counts["tot_men"] + counts["tot_women"]
        if labelled < least_number_of_humans:
            continue
        if counts["tot_women"]/labelled >= tau_rule:
            women_typical.append(name)
        elif counts["tot_men"]/labelled >= tau_rule:
            men_typical.append(name)
        else:
            neutral.append(name)
    if verbose:
        groups = (("women_typical:", women_typical),
                  ("men_typical:", men_typical),
                  ("neutral:", neutral))
        for heading, members in groups:
            print(heading)
            for name in members:
                print(name, end=",")
    return women_typical, men_typical, neutral

# With tau_rule=0 every occupation passes the first (women-typical) test, so the
# first returned list is simply the list of all occupations.
all_occ, _, _ = filter_occupations(0,0,0)
all_occ.sort()

In [None]:
# Predictions using the classifier can be computationally heavy.
# To avoid recomputing them, you can use our precomputed results on the Occupations dataset:
# set `runClassifier = False` to use the precomputed results,
# or set `runClassifier = True` to recompute the predictions.

# Remarks:
# To run the classifier you need a suitable Caffe installation.
# We refer the user to https://www.pyimagesearch.com/2018/02/26/face-detection-with-opencv-and-deep-learning/ for a tutorial on the face detector we use.

In [None]:
# False: reuse the stored predictions; True: re-run face cropping and the gender classifier.
# NOTE: generate_bins reads this value as a default argument, i.e. once, when that function is defined.
runClassifier = False

### 3.A Preprocessing occupations dataset to generate cropped images (optional)

In [None]:
# Folder with the raw (uncropped) dataset images; only needed when the cropping
# pipeline is run.
occ_folder=""

if runClassifier:
    # The original line was a bare expression, so occ_folder silently stayed "".
    occ_folder = home_folder+"Occupations_dataset_images/"

# Folder the cropped face images are written to and read from.
crop_folder=home_folder+"image-subset-selection/Occupations-dataset/Occupations-Datatset-2019/Occupations_dataset_images_cropped/"

#### 3.A.1 Helper functions

In [None]:
def detect_faces(img, net, thresh=0.5,verbose=0):
    """Run the face detector `net` on the image file `img`.

    source: https://www.pyimagesearch.com/2018/02/26/face-detection-with-opencv-and-deep-learning/#download-the-code

    Args:
        img: path of the image file.
        net: network object providing ``setInput`` and ``forward``.
        thresh: a detection is kept only if its confidence is > `thresh`.
        verbose: if truthy, draw the kept boxes and show the image in a window
            (blocks until a key is pressed).

    Returns:
        A list of ``[startX, startY, endX, endY]`` boxes in pixel coordinates,
        or ``-1`` (after printing "skipped!") if the image could not be read.
    """
    image = cv2.imread(img)
    if image is None:  # imread signals an unreadable file by returning None
        print("skipped!")
        return -1
    (h, w) = image.shape[:2]
    # resize to a fixed 300x300 input blob and subtract the per-channel means
    blob = cv2.dnn.blobFromImage(cv2.resize(image, (300, 300)), 1.0, (300, 300), (104.0, 177.0, 123.0))
    # pass the blob through the network and obtain the detections and predictions
    net.setInput(blob)
    detections = net.forward()
    faces=[]
    # Loop over the detections. They are indexed along axis 2 below, so the loop
    # bound must be the length of that axis (the original used shape[1]).
    # NOTE(review): callers only use the first crop ("_0"); this presumably stays
    # the top-confidence face because the detector sorts its output - confirm.
    for i in range(detections.shape[2]):
        # extract the confidence (i.e., probability) associated with the prediction
        confidence = detections[0, 0, i, 2]
        if confidence <= thresh:
            continue
        # entries 3:7 are box corners relative to the image size; scale to pixels
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        (startX, startY, endX, endY) = box.astype("int")
        faces.append([startX, startY, endX, endY])
        if verbose:
            # draw the bounding box of the face along with the associated probability
            text = "{:.2f}%".format(confidence * 100)
            y = startY - 10 if startY - 10 > 10 else startY + 10
            cv2.rectangle(image, (startX, startY), (endX, endY), (0, 0, 255), 2)
            cv2.putText(image, text, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 0, 255), 2)
    # show the output image
    if verbose:
        cv2.imshow("Output", image)
        cv2.waitKey(0)
        cv2.destroyWindow("Output")
    return faces

In [None]:
def extract_faces(image, faces, verbose=0):
    """Cut every face box out of `image`, enlarged by a 40% margin on each side.

    Where the enlarged box reaches beyond the image, the missing pixels are
    filled by repeating the nearest border row/column of the part that does lie
    inside the image.

    Args:
        image: array indexed as ``[row, column, channel]``.
        faces: iterable of ``[startX, startY, endX, endY]`` boxes (inclusive corners).
        verbose: if truthy, show each crop in a window (blocks until a key is pressed).

    Returns:
        A list with one ``uint8`` array per box, or ``-1`` (after printing
        "skipped!") as soon as a box has no overlap with the image.
    """
    pad = 0.4  ## cropping margin, as a fraction of the box size
    img_h, img_w = image.shape[0], image.shape[1]
    cropped_faces = []
    for box in faces:
        # size of face
        box_h = box[3] - box[1] + 1
        box_w = box[2] - box[0] + 1
        # box with margin; may reach outside the image
        x0 = int(round(box[0] - pad * box_w))
        y0 = int(round(box[1] - pad * box_h))
        x1 = int(round(box[2] + pad * box_w))
        y1 = int(round(box[3] + pad * box_h))
        # part of that box which lies inside the image
        cx0, cy0 = max(x0, 0), max(y0, 0)
        cx1, cy1 = min(x1, img_w - 1), min(y1, img_h - 1)
        if cx0 > cx1 or cy0 > cy1:
            print("skipped!")
            print("box, image.shape", "=", ([int(v) for v in box], image.shape))
            return -1
        region = image[cy0:cy1 + 1, cx0:cx1 + 1, :]
        # mode="edge" repeats the border rows/columns, which is what the margin
        # filling needs (and avoids the deprecated numpy.matlib.repmat).
        margins = ((cy0 - y0, y1 - cy1), (cx0 - x0, x1 - cx1), (0, 0))
        new_img = np.pad(region, margins, mode="edge").astype(np.uint8)
        cropped_faces.append(new_img)
        if verbose:
            cv2.imshow("padded_image", new_img)
            cv2.waitKey(0)
            cv2.destroyWindow("padded_image")
    return cropped_faces

#### 3.A.2 Load model

In [None]:
if runClassifier:
    # Network definition and trained weights of the Caffe-format face detector,
    # both expected directly under home_folder.
    prototxt=home_folder+"deploy.prototxt.txt"
    model=home_folder+"res10_300x300_ssd_iter_140000.caffemodel"
    # `net` is the detector that is later passed to solve() / detect_faces().
    net = cv2.dnn.readNetFromCaffe(prototxt, model)

#### 3.A.3 Code to crop and save images

In [None]:
if runClassifier:
    def solve(folder, occupation, num,net):
        """Detect, crop and save the faces of image number `num` of `occupation`.

        The input is read from ``folder + "<occupation>/<num as 6 digits>.jpg"``;
        the i-th crop is written to ``crop_folder`` under the same relative name
        with ``"_<i>"`` appended. Nothing is written if the image cannot be read
        or a face cannot be cropped.
        """
        # Six-digit, zero-padded file name. Identical to the original for
        # 1..100, and also correct beyond 100 (the original yielded 7 digits).
        file = occupation+"/"+f"{num:06d}"
        print(occupation, num)
        tmp = folder+file+".jpg"
        face_boxes = detect_faces(tmp,net)
        if face_boxes == -1: return
        image = cv2.imread(tmp)
        cropped_faces=extract_faces(image, face_boxes)
        if cropped_faces == -1: return
        for i, f in enumerate(cropped_faces):
            target = crop_folder+file+"_"+str(i)+".jpg"
            # imwrite reports failure (e.g. missing target folder) only via its return value
            if not cv2.imwrite(target, f):
                print(f"could not write {target}")

#### 3.A.4 Run classifier and save images

In [None]:
if runClassifier:
    # Crop the faces of images 1..100 of every occupation, occupations in sorted order.
    occ = sorted(occupations_stats)
    for o in occ:
        for i in range(1, 101):
            solve(occ_folder, o, i, net)

In [None]:
if runClassifier:
    # the face detector is no longer needed; drop the reference to save RAM
    del net

### 3.B Helper functions (2; pre-processing and generating predictions)

In [None]:
def predict_in_parallel(imgs, llim=0, rlim=0):
    """Classify the first cropped face of ``imgs[llim:rlim]`` and pickle the predictions.

    Despite the name, the images are processed one after another; the slice
    limits let the caller split the work into separately stored chunks. With the
    default limits the slice is empty.

    Args:
        imgs: image records; ``x['image']`` is the relative image name.
        llim, rlim: slice limits into `imgs`.

    Side effects:
        Writes the array of predictions (one row per image, ``[0, 0]`` for an
        image that could not be loaded or classified) to the file
        ``pred_for_tau00_min00_<llim>_<rlim>`` in the working directory.
    """
    # Set the right path to your model definition file and pretrained model weights.
    MODEL_FILE = home_folder + '../rothe2016deep/gender-prediction/models/gender.prototxt'
    PRETRAINED = home_folder + '../rothe2016deep/gender-prediction/models/gender.caffemodel'

    # NOTE(review): `caffe` is not imported in the part of the file visible here;
    # it must be importable / already in scope before this function is called.
    caffe.set_mode_cpu() # load the model
    # caffe.set_device(0)

    net = caffe.Classifier(MODEL_FILE, PRETRAINED, channel_swap=(2,1,0), raw_scale=255, image_dims=(256, 256))
    print("successfully loaded classifier")

    # Not storing the images (keeps the memory requirement down)
    pred = []
    for x in tqdm(imgs[llim:rlim]):
        # only the first crop ("_0") of every image is classified
        IMAGE_FILE = crop_folder + x['image'].split('.')[0] + "_0" + ".jpg"
        try:
            img = caffe.io.load_image(IMAGE_FILE)
            pred.extend(net.predict([img]))
        except Exception:
            # best effort: keep the row so that indices stay aligned with imgs
            print(f"found bad image: {x['image']}!")
            pred.append([0,0])
    pred=np.array(pred)
    # the original called debug(...) here, which only builds a string and prints nothing
    print("len(pred)", "=", len(pred))
    # pred[i] = (percent_woman, percent_man)
    # close the file deterministically: the caller reads it back right away
    with open('pred_for_tau00_min00_'+str(llim)+"_"+str(rlim), 'wb') as file:
        pickle.dump(pred, file)

In [None]:
def get_images_with_humans(o_list=[]):
    """Return the image records of the occupations in `o_list` whose gender is not 'NA'.

    The label file `data_file` is re-read on every call; an occupation missing
    from it raises KeyError.
    """
    by_occupation, _ = stats_occupations_ds(data_file)
    return [record
            for name in o_list
            for record in by_occupation[name]
            if record['gender'] != 'NA']

def get_images_all(o_list=[]):
    """Return every image record of the occupations in `o_list`, in list order.

    The label file `data_file` is re-read on every call; an occupation missing
    from it raises KeyError.
    """
    by_occupation, _ = stats_occupations_ds(data_file)
    collected = []
    for name in o_list:
        collected.extend(by_occupation[name])
    return collected

def ok_pred(pred):
    """Tell whether `pred` is a usable prediction: its first two entries sum to more than 0.5.

    (Images that could not be classified are stored as ``[0, 0]``.)
    """
    return bool(pred[0] + pred[1] > 0.5)

def generate_bins(imgs, num_bins=20, predict_now=runClassifier, verbose=0):
    """Load (or recompute) the predictions for `imgs` and bin them by predicted score.

    The first entry of every usable prediction (see `ok_pred`) is placed into one
    of the ``num_bins - 1`` bins delimited by ``np.linspace(0, 1, num_bins)``;
    for every bin the fraction of images labelled 'Female' is computed.

    Args:
        imgs: image records; ``imgs[i]['pred']`` is set to the i-th prediction.
        num_bins: number of bin edges.
        predict_now: if truthy, re-run the classifier (slow) and overwrite the
            stored predictions; otherwise read the stored predictions.
        verbose: if truthy, print summary statistics.

    Returns:
        ``(pred, bins, frac_f)``: the prediction array, the bin edges and the
        per-bin fraction of 'Female' images (length ``num_bins - 1``).
    """
    merged_path = home_folder+'pre-predicted-labels/pred_for_tau0_6_min00'
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    if predict_now:
        # predictions take time; run only once.
        chunks = [(0, 500), (500, 2500), (2500, 4500), (4500, 6000)]
        for llim, rlim in chunks:
            predict_in_parallel(imgs, llim, rlim)

        pred = []
        for llim, rlim in chunks:
            with open('pred_for_tau00_min00_'+str(llim)+"_"+str(rlim), 'rb') as file:
                pred.extend(list(pickle.load(file, encoding='latin1')))
        print("len(pred)", "=", len(pred))
        # Note: pred[i] = (percent_woman, percent_man)

        with open(merged_path, 'wb') as file:
            pickle.dump(pred, file)
    else:
        ## Use values calculated earlier to save computation
        with open(merged_path, 'rb') as file:
            pred = list(pickle.load(file, encoding='latin1'))
    # same return type in both branches (the recompute branch used to return a list)
    pred = np.array(pred)

    has_pred = [ok_pred(p) for p in pred]
    if verbose:
        missing = sum(1 for ok in has_pred if not ok)
        print("Percentage of images without bounding boxes: ", missing/len(imgs)*100)

    for i in range(len(imgs)): imgs[i]['pred'] = pred[i]

    # Generating bins
    bins = np.linspace(0, 1, num_bins)
    scores = np.array([p[0] for p, ok in zip(pred, has_pred) if ok], dtype=float)
    # digitize returns num_bins for a score of exactly 1.0 (and 0 for a negative
    # one); clip so that such scores land in the last / first bin instead of
    # raising a KeyError as before.
    digi = np.clip(np.digitize(scores, bins), 1, num_bins - 1)
    genders = [imgs[i]['gender'] for i in range(len(imgs)) if has_pred[i]]
    assert(len(genders)==len(scores))
    is_female = np.array([gen == 'Female' for gen in genders], dtype=bool)
    is_male = np.array([gen == 'Male' for gen in genders], dtype=bool)
    cntf = int(is_female.sum())  # females which have a prediction

    arr_cnt_bin = np.bincount(digi, minlength=num_bins)[1:num_bins]
    cnt_bin_f = np.bincount(digi[is_female], minlength=num_bins)[1:num_bins]
    frac_f = cnt_bin_f/(arr_cnt_bin+1e-5)  # 1e-5 avoids 0/0 for empty bins

    # Print
    if verbose:
        def percent(hits, total):
            return round(hits/total*100, 2) if total else float('nan')

        print(f"Number of images with predictions: {len(scores)}")
        print(f"Number of images marked women with predictions: {cntf}")
        print(f"Number of bins: {num_bins}")
        print(f"Fraction of females in bins: ")
        print("np.round(frac_f, 2)", "=", np.round(frac_f, 2))
        print(f"Number of images in bins: ")
        print("arr_cnt_bin", "=", arr_cnt_bin)
        # a score >= 0.5 counts as a 'Female' prediction
        acc = int(np.sum(is_female & (scores >= 0.5)))
        print(f"Accuracy on women={percent(acc, cntf)}%")
        acc = int(np.sum(is_male & (scores < 0.5)))
        print(f"Accuracy on men={percent(acc, len(scores)-cntf)}%")
    #
    return pred, bins, frac_f

def get_prediction_stats(imgs,num_bins=20,verbose=1):
    """Bin the predictions stored in `imgs` and compute the per-bin fraction of 'Female' images.

    Expects images from `get_prediction_and_image` (this pre-processes them):
    every record must carry a usable ``'pred'`` entry (see `ok_pred`), otherwise
    an AssertionError is raised.

    Args:
        imgs: image records with ``'pred'`` and ``'gender'`` entries.
        num_bins: number of bin edges, ``np.linspace(0, 1, num_bins)``.
        verbose: if truthy, print summary statistics.

    Returns:
        ``(pred, bins, frac_f)``: the predictions of `imgs` as an array, the bin
        edges and the per-bin fraction of 'Female' images (length ``num_bins - 1``).
    """
    bins = np.linspace(0, 1, num_bins)
    # The original returned the module-level `pred`, which is unrelated to `imgs`.
    pred = np.array([im['pred'] for im in imgs])
    scores = np.array([im['pred'][0] for im in imgs if ok_pred(im['pred'])], dtype=float)
    assert(len(scores)==len(imgs)) # Expects images from `get_prediction_and_image`
    # digitize returns num_bins for a score of exactly 1.0 (and 0 for a negative
    # one); clip so that such scores land in the last / first bin.
    digi = np.clip(np.digitize(scores, bins), 1, num_bins - 1)
    is_female = np.array([im['gender'] == 'Female' for im in imgs], dtype=bool)
    is_male = np.array([im['gender'] == 'Male' for im in imgs], dtype=bool)
    cntf = int(is_female.sum())  # females which have a prediction
    if verbose: print(len(imgs), len(scores))

    arr_cnt_bin = np.bincount(digi, minlength=num_bins)[1:num_bins]
    cnt_bin_f = np.bincount(digi[is_female], minlength=num_bins)[1:num_bins]
    frac_f = cnt_bin_f/(arr_cnt_bin+1e-5)  # 1e-5 avoids 0/0 for empty bins

    # Print
    if verbose:
        def percent(hits, total):
            return round(hits/total*100, 2) if total else float('nan')

        print(f"Number of images with predictions: {len(scores)}")
        print(f"Number of images marked women with predictions: {cntf}")
        print(f"Number of bins: {num_bins}")
        print(f"Fraction of females in bins: ")
        print("np.round(frac_f, 2)", "=", np.round(frac_f, 2))
        print(f"Number of images in bins: ")
        print("arr_cnt_bin", "=", arr_cnt_bin)
        # a score >= 0.5 counts as a 'Female' prediction
        acc = int(np.sum(is_female & (scores >= 0.5)))
        print(f"Accuracy on women={percent(acc, cntf)}%")
        acc = int(np.sum(is_male & (scores < 0.5)))
        print(f"Accuracy on men={percent(acc, len(scores)-cntf)}%")
    #
    return pred, bins, frac_f

def calibrate_pred(pred, c_bins=[], c_frac_f=[]):
    """Replace a raw prediction by the 'Female' fraction of the score bin it falls into.

    Args:
        pred: prediction; only ``pred[0]`` is used.
        c_bins, c_frac_f: custom bin edges and per-bin fractions. If either is
            empty, the module-level ``bins`` and ``frac_f`` are used instead.

    Returns:
        ``[f, 1 - f]`` where ``f`` is the fraction stored for the bin of ``pred[0]``.
    """
    if len(c_frac_f)!=0 and len(c_bins)!=0:
        edges, fractions = c_bins, c_frac_f
    else:
        edges, fractions = bins, frac_f
    # digitize returns len(edges) for a score on the top edge (index out of
    # range before) and 0 for a score below the first edge (silently wrapped to
    # the last bin before); clip both to the nearest bin.
    idx = int(np.clip(np.digitize(pred[0], edges) - 1, 0, len(fractions) - 1))
    return [fractions[idx], 1-fractions[idx]]

In [None]:
import pickle

#### 3.B.1 Generating all images

In [None]:
# Records of every image (over all occupations) whose gender label is not 'NA'.
imgs = get_images_with_humans(all_occ) ## All images with humans

#### 3.B.2 Load predictions and bin them

In [None]:
# Module-level predictions and calibration table; calibrate_pred falls back to
# `bins` / `frac_f` when it is not given custom ones.
pred, bins, frac_f = generate_bins(imgs, num_bins=20, verbose=1)

### 3.C Helper functions (3; sample candidates and utilities)

In [None]:
def get_prediction_and_image(o_list=[],custom_cal=0):
    """Return the images of the occupations in `o_list` that have a usable prediction.

    All images with a gender label are loaded and their predictions attached
    (via `generate_bins`); a record is kept if `ok_pred` accepts its prediction
    and the occupation part of its image name is in `o_list`.

    Args:
        o_list: occupation names to keep.
        custom_cal: if truthy, also compute a calibration table from the kept
            images with `get_prediction_stats`.

    Returns:
        ``(imgs_new, c_bins, c_frac_f)``: array of the kept records, and the
        custom bin edges / fractions (two empty lists unless `custom_cal`).
    """
    candidates = get_images_with_humans(all_occ)
    pred, _bins, _frac_f = generate_bins(candidates)
    ## Remove bad images from images and add pred values to images
    kept = []
    for idx, record in enumerate(candidates):
        if not ok_pred(pred[idx]):
            continue
        occupation = record['image'].split('/')[0]
        if occupation in o_list:
            record['pred'] = pred[idx]
            kept.append(record)
    imgs_new = np.array(kept)
    #
    if not custom_cal:
        return imgs_new, [], []
    _, c_bins, c_frac_f = get_prediction_stats(imgs=imgs_new,verbose=0)
    return imgs_new, c_bins, c_frac_f

def gen_candidates_image(m,imgs=[],c_bins=[], c_frac_f=[]):
    """Draw `m` images at random (without replacement) and build their group matrices.

    Args:
        m: number of candidates to draw; must not exceed ``len(imgs)``.
        imgs: image records with ``'pred'`` and ``'gender'`` entries.
        c_bins, c_frac_f: calibration table handed on to `calibrate_pred`.

    Returns:
        ``(subset, trueP, P)``: the drawn records as an array; ``trueP`` with
        row 0 set to 1 where the record's gender is 'Female' and row 1 its
        complement; ``P`` with the calibrated predictions, one column per candidate.

    Raises:
        Exception: if `m` is larger than the number of images.
    """
    if len(imgs) < m:
        raise Exception("m larger than the number of images!")

    ## shuffle images (in imgs) and pick m of them
    chosen = random.sample(list(imgs),m)

    ## construct q from the bins calculated earlier.
    calibrated = [calibrate_pred(c['pred'], c_bins=c_bins, c_frac_f=c_frac_f) for c in chosen]
    P = np.array(calibrated).T

    trueP = np.zeros_like(P)
    for col, candidate in enumerate(chosen):
        trueP[0, col] = (candidate['gender'] == 'Female')
        trueP[1, col] = 1 - trueP[0, col]

    return np.array(chosen), trueP, P

utility_type_dict={0:'DCG (100/log(r+1))', 1:'Unif[0,1]', 2:'100/(r+1)', 3:'100-r'}
def gen_utility_image(subset, utility_type=0, multiplier=1, n=100, m=100):
    """Build the (m x n) utility matrix of the candidates in `subset`.

    The number in every image file name is used as the image's rank ``r``. It
    is turned into a score according to `utility_type` (see
    `utility_type_dict`), with a small uniform random term from the module-level
    `rng` added; the score of a candidate is then multiplied by the position
    discount ``1/log(j+2)`` of every position ``j = 0..n-1``.

    Args:
        subset: image records; ``img['image']`` looks like ``"<occupation>/<number>.<ext>"``.
        utility_type: key of `utility_type_dict`.
        multiplier: must be 1 (asserted for every image).
        n: number of positions (columns).
        m: number of candidates; must equal the number of scores produced.

    Returns:
        Array of shape ``(m, n)``.
    """
    scores = []
    for record in subset:
        file_name = record['image'].split('/')[1]
        r = int(file_name.split('.')[0]) ## rank
        assert(multiplier==1)
        if utility_type == 0:
            scores.append(100/np.log(r+1) + rng.uniform(0,1))
        elif utility_type == 1:
            scores.append(rng.uniform(0,1))
        elif utility_type == 2:
            scores.append(100/(r+1) + 0.1*rng.uniform(0,1))
        elif utility_type == 3:
            scores.append(100-r + rng.uniform(0,1))

    column = np.array(scores).reshape((m,1))
    discounts = [1/np.log(position+2) for position in range(n)]
    return column * discounts

In [None]:
# Compute Qgrp
p = 2
m = 4494
n = 100
Qgrp_image = np.zeros((2,2))

# Sample
imgs, c_bins, c_frac_f = get_prediction_and_image(all_occ,custom_cal=0)
cnd, trueP, P = gen_candidates_image(m, imgs, c_bins = c_bins, c_frac_f = c_frac_f)
W = gen_utility_image(cnd, utility_type=0, multiplier=1, n=n, m=m)


print("Fraction Female: ", np.mean(trueP[0, :]))
print("Fraction Male: ", np.mean(trueP[1, :]))
print('')

tmp = ['Female', 'Male']
for (i,gi), (j,gj) in itertools.product(enumerate(tmp), enumerate(tmp)):
    tot = np.sum( [i == np.argmax(P[:, t]) for t in range(m)])
    typ_t = np.sum( [(gj == cnd[t]['gender'])*(i == np.argmax(P[:, t])) for t in range(m)])
    Qgrp_image[i][j] = typ_t/tot
eval(debug('Qgrp_image'))

In [None]:
# custom_cal defaults to 0, so c_bins and c_frac_f come back as empty lists here
# and calibrate_pred uses the module-level `bins` / `frac_f` instead.
_, c_bins, c_frac_f = get_prediction_and_image(all_occ)

### 3.D Main code 

In [None]:
def plot_res(results_mean, results_std, utility_mean, utility_std, ITER = 100,\
             fairness_measure = 'Fairness measure', name_occ_list = '', name_occ_list2 = '',\
             exp = 'Synthetic data', m = 100, n = 50, g = 2, ylims=(0.6,1.0), save = True, useILP=False):
    """Draw the three result figures of an experiment.

    The figures are: fairness vs. constraint, utility vs. constraint and utility
    vs. fairness. Utilities are divided by the largest mean utility of algorithm
    2 (the greedy algorithm); error bars are standard errors (std / sqrt(ITER)).

    Args:
        results_mean, results_std: arrays of shape (constraint steps, algorithms)
            with the mean / standard deviation of the fairness measure.
        utility_mean, utility_std: same shape, for the utility.
        ITER: number of runs behind every mean.
        fairness_measure: label of the fairness axis.
        name_occ_list, name_occ_list2, exp, m, n, g: only used in the title.
        ylims, useILP: accepted for compatibility, not used.
        save: if truthy call `pdf_savefig()` for every figure, else `plt.show()`.
    """
    num_of_alg = results_mean.shape[1]
    num_of_const_steps = results_mean.shape[0]

    algo_names = ['This work', 'SJ', 'CSV [Greedy]', 'MC', 'GAK [Det-Greedy]', 'Uncons']
    algo_colors = ['#2FA3EB', '#F2B93F', '#F06B56', '#4DF06D', '#604EE6', '#000000', '#804539', 'purple', 'black']

    title = f'{exp}\n$(m,n,g)=$({m},{n},{g}),ITER={ITER},occ_lists=[{name_occ_list},{name_occ_list2}].'
    alpha_label = '(Looser constraint)\tFairness const. ($\\alpha$)\t(Stricter constraint)'
    x_axis = np.linspace(2, 1, num_of_const_steps)
    sqrt_iter = np.sqrt(ITER)

    # 2 corresponds to the greedy algorithm
    max_utility = np.max(utility_mean[:, 2])

    def finish(ax):
        # title, legend, tick size and output are shared by the three figures
        plt.title(title, fontsize=15)
        plt.legend(loc='best', shadow=False, fontsize=20)
        plt.tick_params(axis='both', which='major', labelsize=16)
        ax.tick_params(axis='both', which='major', labelsize=16)
        if save: pdf_savefig()
        else: plt.show()

    # Plot: const vs fairness measure
    fig, ax = plt.subplots()
    for i in range(num_of_alg):
        plt.errorbar(x_axis, results_mean[:, i].T, yerr=results_std[:, i].T / sqrt_iter,\
                     fmt=('--' if i == 2 else '-'),\
                     color=algo_colors[i], label=algo_names[i], linewidth=4, alpha=0.9)
    ax.invert_xaxis()
    plt.ylim(np.min(results_mean) - 0.02, np.max(results_mean) + 0.02)
    ax.set_ylabel(f'(Less fair)\t\t\t{fairness_measure}\t\t\t(More fair)',fontsize=23)
    ax.set_xlabel(alpha_label,fontsize=23)
    finish(ax)

    # Plot: const vs utility measure
    fig, ax = plt.subplots()
    for i in range(num_of_alg):
        plt.errorbar(x_axis, utility_mean[:, i].T / max_utility,\
                     yerr=utility_std[:, i].T / sqrt_iter / max_utility,\
                     color=algo_colors[i], label=algo_names[i], linewidth=4, alpha=0.7)
    ax.invert_xaxis()
    ax.set_ylabel(f'Utility',fontsize=23)
    ax.set_xlabel(alpha_label,fontsize=23)
    # The curves are normalised by max_utility, so the limits must be as well
    # (the original used the raw utilities here, unlike the third figure).
    plt.ylim(np.min(utility_mean) / max_utility - 0.02, np.max(utility_mean) / max_utility + 0.02)
    finish(ax)

    # Plot: fairness vs utility measure
    fig, ax = plt.subplots()
    for i in range(num_of_alg):
        plt.errorbar(results_mean[:, i].T, utility_mean[:, i].T / max_utility,\
                     xerr=results_std[:, i].T / sqrt_iter,\
                     yerr=utility_std[:, i].T / sqrt_iter / max_utility,\
                     color=algo_colors[i], label=algo_names[i], linewidth=4, alpha=0.7)
    plt.ylim(np.min(utility_mean) / max_utility - 0.02, np.max(utility_mean) / max_utility + 0.02)
    plt.xlim(np.min(results_mean) - 0.02, np.max(results_mean) + 0.02)
    ax.set_ylabel(f'Utility',fontsize=23)
    ax.set_xlabel(f'(Less fair)'+'\t'*23+f'{fairness_measure}'+'\t'*23+'(More fair)',fontsize=23)
    finish(ax)

In [None]:
def run_syn_exp_image(ITERS=20, num_of_const_steps=5, rND_k=n-1, dist=None,\
                fairness_measure=compute_weighted_risk_diff,\
                fairness_measure_name='Risk diff.', m=100, n=50, g=2,\
                occ_list=[], occ_list2=[], name_occ_list = 'NA', name_occ_list2 = 'NA',\
                verbose=False, useILP=False):
    """Compare six ranking algorithms on image data over a range of fairness constraints.

    For every constraint level ``gamma`` in ``np.linspace(2, 1, num_of_const_steps)``
    and every run, ``m`` candidates are drawn from two pools (images of
    `occ_list` and of `occ_list2`, in proportion to the pool sizes). Gender
    labels and predictions of the second pool are flipped. The algorithms rank
    the candidates and the fairness measure and utility of every ranking are
    recorded; means and standard deviations are plotted with `plot_res`.

    Args:
        ITERS: runs per constraint level.
        num_of_const_steps: number of constraint levels.
        rND_k: passed as ``k`` to `fairness_measure`. NOTE: the default is bound
            when the function is defined, from the module-level ``n``, not from
            the ``n`` argument.
        dist: group distribution passed to `fairness_measure`; uniform if None.
        fairness_measure: callable computing the fairness of a ranking.
        fairness_measure_name: axis label for the plots.
        m, n, g: number of candidates, ranking positions and groups.
        occ_list, occ_list2: occupations of the two candidate pools.
        name_occ_list, name_occ_list2: names of the pools, for the plot title.
        verbose: print intermediate results.
        useILP: use `noisy_rank_ilp` instead of `noisy_rank_cvz_rounding`.

    Returns:
        ``(results_mean, results_std, utility_mean, utility_std)``, arrays of
        shape ``(num_of_const_steps, 6)``; algorithm order: this work, LP with
        Birkhoff decomposition, greedy, subset selection, det-greedy, unconstrained.
    """
    num_of_alg = 6
    max_failures = 10  # a run is given up after more than this many failed attempts

    imgs, _, _ = get_prediction_and_image(occ_list)
    imgs2, _, _ = get_prediction_and_image(occ_list2)

    if dist is None: dist = np.ones(g) / g

    # split the m candidates between the pools in proportion to the pool sizes
    ma = int(len(imgs)*1.0/(len(imgs)+len(imgs2)) * m)
    mb = m-ma

    results_mean = np.zeros((num_of_const_steps, num_of_alg))
    results_std = np.zeros((num_of_const_steps, num_of_alg))

    utility_mean = np.zeros((num_of_const_steps, num_of_alg))
    utility_std = np.zeros((num_of_const_steps, num_of_alg))

    for ijk, gamma in enumerate(np.linspace(2, 1, num_of_const_steps)):

        # fix fairness constraints
        L = np.zeros((g,n))
        U = get_const_from_dist([0.5*gamma, 0.5*gamma], m, n, g)

        results_per_const = [[] for _ in range(num_of_alg)]
        utility_per_const = [[] for _ in range(num_of_alg)]

        for exp_run in tqdm(range(ITERS)):
            failures = 0
            succeeded = False
            while not succeeded:
                try:
                    # Generate data
                    # (c_bins / c_frac_f are the module-level calibration tables)
                    cnda, truePa, Pa = gen_candidates_image(ma, imgs, c_bins=c_bins, c_frac_f=c_frac_f)
                    cndb, truePb, Pb = gen_candidates_image(mb, imgs2, c_bins=c_bins, c_frac_f=c_frac_f)

                    # Flip the gender of the second pool on copies: the sampled
                    # records are the very dicts stored in imgs2, so flipping them
                    # in place (as before) changed the pool for all later runs.
                    cndb = [dict(record) for record in cndb]
                    for record in cndb:
                        record['gender'] = 'Male' if record['gender'] == 'Female' else 'Female'
                    cnd = np.array(list(cnda)+cndb)

                    trueP = np.concatenate([truePa.T, (1-truePb).T]).T
                    P = np.concatenate([Pa.T, (1-Pb).T]).T
                    PT = np.round(P) # P thresholded
                    W = gen_utility_image(cnd, utility_type=2, multiplier=1, n=n, m=m)

                    # Find fair ranking
                    if useILP:
                        x_our = noisy_rank_ilp(W, P, L, U)
                    else:
                        x_our = noisy_rank_cvz_rounding(W, P, L, U, verbose = False)

                    x_greedy = greedy_fair_ranking(W, PT, L, U)

                    _, birkhoff = noisy_rank_basic_rounding(W, PT, L, U, getBirkhoff=True)
                    a, rankings = extractBirkhoff(birkhoff, n) # Compute Birkhoff decomposition
                    if verbose: print(f'Number of rankings = {len(rankings)}')

                    x_SS = subset_selection_algorithm(W[:, 0], P, L[:, -1], U[:, -1], n)
                    Lp = get_lower_const_from_dist_linkedIn_det_greedy([0.5*1, 0.5*1], m, n, g)
                    Up = get_upper_const_from_dist_linkedIn_det_greedy([0.5*1, 0.5*1], m, n, g)
                    x_det_greedy = linkedIn_det_greedy(W, PT, Lp, Up, n) # this assumes that W is a rank 1 metric

                    x_uncons = greedy_fair_ranking(W, PT, np.zeros_like(L), np.ones_like(L)*2*n)

                except Exception as exc:
                    # Exception (not a bare except) so that Ctrl-C still stops the run
                    failures += 1
                    if failures > max_failures:
                        print(f"run {exp_run} skipped after {failures} failed attempts: {exc!r}")
                        break
                else:
                    succeeded = True

            if not succeeded:
                # Do not record anything: the x_* variables are either undefined
                # or left over from the previous run.
                continue

            # Compute fairness measures
            rd_our = fairness_measure(x_our, trueP, dist, m, n, g, k=rND_k, verbose=False, P=P)
            rd_greedy = fairness_measure(x_greedy, trueP, dist, m, n, g, k=rND_k, verbose=False, P=P)

            # the LP solution is a mixture of rankings: weight every ranking by a[i]
            rd_LP = 0
            for i, r in enumerate(rankings):
                rd_LP += a[i] * fairness_measure(r, trueP, dist, m, n, g, k=rND_k)

            rd_SS = fairness_measure(x_SS, trueP, dist, m, n, g, k=rND_k, verbose=False, P=P)
            rd_det_greedy = fairness_measure(x_det_greedy, trueP, dist, m, n, g, k=rND_k)
            rd_uncons = fairness_measure(x_uncons, trueP, dist, m, n, g, k=rND_k)

            # Print and store results
            if verbose: print('$'*15, rd_our, rd_LP, rd_greedy)
            fairness_values = [rd_our, rd_LP, rd_greedy, rd_SS, rd_det_greedy, rd_uncons]
            for store, value in zip(results_per_const, fairness_values):
                store.append(value)

            u_LP = 0
            for i, r in enumerate(rankings):
                u_LP += a[i] * get_utility(W, r)
            utility_values = [get_utility(W, x_our), u_LP, get_utility(W, x_greedy),\
                              get_utility(W, x_SS), get_utility(W, x_det_greedy),\
                              get_utility(W, x_uncons)]
            for store, value in zip(utility_per_const, utility_values):
                store.append(value)

        results_mean[ijk] = np.array([np.mean(results_per_const[i]) for i in range(num_of_alg)])
        results_std[ijk]  = np.array([np.std(results_per_const[i]) for i in range(num_of_alg)])

        utility_mean[ijk] = np.array([np.mean(utility_per_const[i]) for i in range(num_of_alg)])
        utility_std[ijk] = np.array([np.std(utility_per_const[i]) for i in range(num_of_alg)])

    # pass the label and the pool names (the original passed the function object
    # and the constant 'NA' instead of these parameters)
    plot_res(results_mean, results_std, utility_mean, utility_std, ITER = ITERS,\
             fairness_measure = fairness_measure_name, name_occ_list = name_occ_list,\
             name_occ_list2 = name_occ_list2,\
             exp = 'UPDATED -- Image data (DCG Utility)', m = m, n = n, g = g, save = True, ylims=(0.45,0.9))

    return results_mean, results_std, utility_mean, utility_std

### 3.E Running simulation

In [None]:
# Occupations in which at least 80% of the gender-labelled images show women /
# show men / neither (no minimum number of images).
women_typical08, men_typical08, neutral08 = filter_occupations(
    tau_rule=0.8, least_number_of_humans=0, verbose=0)

# sort the three lists in place
women_typical08.sort()
men_typical08.sort()
neutral08.sort()

In [None]:
# Experiment with compute_weighted_risk_diff as the fairness measure.
run_syn_exp_image(
    ITERS=1000, num_of_const_steps=10, rND_k=5, dist=np.ones(2)/2,
    fairness_measure=compute_weighted_risk_diff, m=500, n=25, g=2,
    occ_list=men_typical08, occ_list2=women_typical08,
    name_occ_list='men_typical08', name_occ_list2='women_typical08',
    verbose=False,
)

In [None]:
# Same experiment with compute_weighted_selec_lift as the fairness measure.
run_syn_exp_image(
    ITERS=1000, num_of_const_steps=10, rND_k=5, dist=np.ones(2)/2,
    fairness_measure=compute_weighted_selec_lift, m=500, n=25, g=2,
    occ_list=men_typical08, occ_list2=women_typical08,
    name_occ_list='men_typical08', name_occ_list2='women_typical08',
    verbose=False,
)