In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib widget
import copy
import numpy as np
import pickle 
 
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import rc, rcParams


from tqdm.notebook import tqdm
from scipy.integrate import quad

import pandas as pd

np.random.seed(0)

from helpers import *
from framework import *
from selection_and_validation_framework import *

home_folder = './'
out_folder = home_folder + 'figures/'

In [None]:
# When True, the "Load dataset" cell skips the raw Semantic Scholar processing
# and reads the previously written Omega_SS / pdf_latent_SS / pdf_biased_SS
# objects instead. Set to False to rebuild them from the raw pickles.
LOAD_PROCESSED_DATA = True

## Semantic Scholar simulation

### Helper code from "Interventions for Ranking in the Presence of Implicit Bias" (FAccT 2020)

In [None]:
#######################################
## Plot tradeoff when setting different
## thresholds for names
#######################################
def plot_tradeoff_for_names(author, name):
    """Count authors per inferred-gender bucket for a range of confidence thresholds and plot the counts.

    Parameters
    ----------
    author : dict
        Maps an author key to a record; only ``record['name']`` is read here.
    name : dict
        Maps a lower-cased first name to a dict with keys ``'p'`` (confidence of
        the majority gender) and ``'gender'`` (``'m'`` or ``'f'``).

    Returns
    -------
    (num_m, num_f, num_u, num_no) : tuple of lists
        One entry per threshold: number of authors classified male, female,
        unknown, and "no usable first name" (first token of length <= 2).

    Side effects: prints the bucket sizes per threshold and saves the figure to
    ``out_folder + 'count_vs_thresh_tradeoff2.eps'``.
    """
    thresh = [0.5,0.6,0.7,0.8,0.9,0.93,0.95,0.97,0.98,0.99]

    # The first-name lookup does not depend on the threshold, so resolve every
    # author once instead of once per threshold.
    matched = []          # (p, gender) for authors whose first name is in `name`
    n_unmatched = 0       # first name not present in `name`
    n_no_first_name = 0   # initials / very short first token
    for a in author.values():
        a_first_name = a['name'].split(' ')[0].split('.')[0].split('-')[0].lower()
        a_first_name = unidecode.unidecode(a_first_name)
        if len(a_first_name) <= 2:
            n_no_first_name += 1
        elif a_first_name in name:
            matched.append((name[a_first_name]['p'], name[a_first_name]['gender']))
        else:
            n_unmatched += 1

    num_m, num_f, num_u, num_no = [], [], [], []
    pbar = tqdm(thresh)
    for x in pbar:
        pbar.set_description(f"Threshold: {x}")
        n_female = sum(1 for p, g in matched if p > x and g == 'f')
        n_male = sum(1 for p, g in matched if p > x and g == 'm')
        # Matched names that are not confident enough count as unknown.
        n_unknown = n_unmatched + len(matched) - n_female - n_male
        print("Length male, female, unknown, no_first_name:", n_male, n_female, n_unknown, n_no_first_name)
        print("Total sum:", n_male+n_female+n_unknown+n_no_first_name, "vs unique authors", len(author))
        num_m.append(n_male)
        num_f.append(n_female)
        num_u.append(n_unknown)
        num_no.append(n_no_first_name)

    plt.figure(figsize=(9,5))
    plt.plot(thresh, num_m,'--', label="Men",  linewidth=4)
    plt.plot(thresh, num_f, label="Women",  linewidth=4)
    plt.plot(thresh, num_u, ':', label="Unknown first name",  linewidth=4)
    plt.legend(loc='best', shadow=False, fontsize=20)
    plt.ticklabel_format(style='sci', axis='y', scilimits=(-1,3))
    plt.ylabel("Number of authors",fontsize=20)
    plt.xlabel("Threshold",fontsize=20)
    plt.ylim(0,0.5*len(author))
    plt.savefig(out_folder+'count_vs_thresh_tradeoff2.eps', format='eps', dpi=500)
    # plt.show()
    return num_m, num_f, num_u, num_no

In [None]:
#################################################
## Find male and female authors given a threshold
#################################################
def partition_authors_by_gender(thresh, author, name):
    """Split author records into male / female / unknown / no-first-name lists.

    An author is labelled male or female only when the first name is found in
    ``name`` and its confidence ``'p'`` is strictly greater than ``thresh``.
    First tokens of length <= 2 (e.g. initials) go to ``no_first_name``;
    everything else is ``unknown``. Prints the bucket sizes before returning
    ``(male, female, unknown, no_first_name)``.
    """
    male, female = [], []
    unknown, no_first_name = [], []

    for record in tqdm(author.values()):
        # First token of the full name, cut at '.' and '-', lower-cased and ASCII-folded.
        token = record['name'].split(' ')[0].split('.')[0].split('-')[0].lower()
        token = unidecode.unidecode(token)

        if len(token) <= 2:
            no_first_name.append(record)
            continue
        if token not in name:
            unknown.append(record)
            continue

        confident = name[token]['p'] > thresh
        if confident and name[token]['gender'] == 'f':
            female.append(record)
        elif confident and name[token]['gender'] == 'm':
            male.append(record)
        else:
            unknown.append(record)

    sizes = (len(male), len(female), len(unknown), len(no_first_name))
    print("Length male, female, unknown, no_first_name:", *sizes)
    print("Total sum:", sum(sizes), "vs unique authors", len(author))
    return male, female, unknown, no_first_name

In [None]:
#######################################
## Plot number of authors by year
#######################################
def plot_number_of_authors_by_first_publication_yr(male, female):
    """Plot, for a fixed list of years, how many authors had first published by that year.

    Parameters
    ----------
    male, female : iterable of dict
        Author records; only ``record['first_publish']`` is read.

    Returns
    -------
    (male_yr, female_yr) : tuple of lists
        Cumulative author counts (``first_publish <= yr``) for each year in
        ``[1936, 1940, 1950, ..., 2020]``.

    Side effects: prints the earliest ``first_publish`` value (capped at 2000,
    the starting value of the running minimum) and saves the figure as
    ``num_by_first_pub_yr.eps`` / ``.png`` under ``out_folder``.
    """
    min_yr = min([2000] + [m['first_publish'] for m in male] + [f['first_publish'] for f in female])

    print("First citation in the dataset at", min_yr)

    yrs = [1936,1940,1950,1960,1970,1980,1990,2000,2010,2020]
    male_yr, female_yr = [], []

    for yr in tqdm(yrs):
        male_yr.append(sum(1 for m in male if m['first_publish'] <= yr))
        female_yr.append(sum(1 for f in female if f['first_publish'] <= yr))

    # Start a fresh figure: without this the curves are drawn on whatever
    # figure is currently active (e.g. the one left by a previous plot helper).
    plt.figure()
    plt.plot(yrs, male_yr, '--', label="Male", linewidth=4)
    plt.plot(yrs, female_yr, label="Female", linewidth=4)

    plt.legend(loc='best', shadow=False, fontsize=20)

    plt.ticklabel_format(style='sci', axis='y', scilimits=(-1,3))
    plt.xlabel("Year", fontsize=20)
    plt.ylabel("Number of authors", fontsize=20)

    plt.savefig(out_folder+'num_by_first_pub_yr.eps', format='eps', dpi=150)
    plt.savefig(out_folder+'num_by_first_pub_yr.png', format='png', dpi=150)

    # plt.show()
    return male_yr, female_yr

In [None]:
#################################################
## Find unweighted citations of authors
#################################################
def distribution_of_citations(male, female, after_yr=1990, plot=0):
    """Collect citation counts of authors whose first publication is in or after ``after_yr``.

    Parameters
    ----------
    male, female : iterable of dict
        Author records with keys ``'first_publish'`` and ``'citations'``.
    after_yr : int, default 1990
        Keep only authors with ``first_publish >= after_yr``.
    plot : bool or int, default 0
        When truthy, plot the pdf of the strictly positive citation counts of
        both groups and save it as ``citation_unweighted_<after_yr>.eps`` /
        ``.png`` under ``out_folder``.

    Returns
    -------
    (male_cit_u, female_cit_u) : tuple of numpy arrays
    """
    male_cit_u = np.array([m['citations'] for m in male if m['first_publish'] >= after_yr])
    female_cit_u = np.array([f['citations'] for f in female if f['first_publish'] >= after_yr])

    ## make a histogram of distribution
    if plot:
        plt.figure(figsize=(8.2,5))
        powerlaw.plot_pdf(male_cit_u[male_cit_u>0], label="Male", color='b', linewidth=4)
        powerlaw.plot_pdf(female_cit_u[female_cit_u>0], label="Female", color='orange', linewidth=4)
        plt.xscale('linear')
        plt.ticklabel_format(style='sci', axis='x', scilimits=(-1,3))
        plt.legend(loc='best', shadow=False, fontsize=20)
        # The label and file names use the caller's `after_yr`, so they always
        # match the filter that was applied to the data above.
        plt.xlabel("Total Citations (Papers after "+str(after_yr)+")",fontsize=20)
        plt.ylabel("Probability Mass Function", fontsize=20)
        plt.savefig(out_folder+'citation_unweighted_'+str(after_yr)+'.eps', format='eps', dpi=500)
        plt.savefig(out_folder+'citation_unweighted_'+str(after_yr)+'.png', format='png', dpi=150)
        # plt.show()

    return male_cit_u, female_cit_u




### Load dataset

In [None]:
if not LOAD_PROCESSED_DATA:
    # NOTE(review): pickle.load can execute arbitrary code; only load these
    # files if they come from a trusted source.
    with open(home_folder + "semantic-scholar/citation_count_ss_sorted2019-07-16_19_30_59_180426.pickle", "rb") as fh:
        citation = pickle.load(fh)
    with open(home_folder + "semantic-scholar/author_ss_dict_id_and_citation-count2019-07-2403-07-59-289763.pickle", "rb") as fh:
        author = pickle.load(fh)
    print("done: loading")


    #######################################
    ## Read the name database and convert to
    ## more manageable python objects
    #######################################

    # name[first_name] = {'m': count, 'f': count}, accumulated over all yearly files.
    name={}
    for index in tqdm(range(1880,2019)):
        # `with` guarantees the file is closed even if a line fails to parse.
        with open(home_folder + 'semantic-scholar/names/yob'+str(index)+".txt","r") as f:
            if index%20==0:print("File no:", index)
            for tmp in f:
                # Each line is "<name>,<F|M>,<count>".
                n, gender, cnt = tmp.split(',')
                n=n.lower()
                if n not in name: name[n]={'m':0, 'f':0}
                if gender=='F':name[n]['f']+=int(cnt)
                elif gender=='M':name[n]['m']+=int(cnt)
                else: print("Error:", tmp)

    # Attach the majority gender and its share 'p' to every name (ties go to 'f').
    for n in name:
        if name[n]['m'] > name[n]['f']:
            name[n]['p'] = name[n]['m']/(name[n]['m']+name[n]['f'])
            name[n]['gender']='m'
        else:
            name[n]['p'] = name[n]['f']/(name[n]['f']+name[n]['m'])
            name[n]['gender']='f'

    print("Number of names", len(name)) ## 98400


    num_m, num_f, num_u, num_no = plot_tradeoff_for_names(author, name)
    print("done: plot_tradeoff_for_names")

    male, female, unknown, no_first_name = partition_authors_by_gender(0.9, author, name)
    print("done: partition_authors_by_gender")

    male_yr, female_yr = plot_number_of_authors_by_first_publication_yr(male, female)
    print("done: plot_number_of_authors_by_first_publication_yr")

    male_cit_u, female_cit_u = distribution_of_citations(male, female, after_yr=1980, plot=1)
    print("done: distribution_of_citations")


    write(male_cit_u, 'male_cit_u')
    write(female_cit_u, 'female_cit_u')

    # Rescale citation counts before building the common grid Omega.
    male_cit_scaled = male_cit_u / 5000
    female_cit_scaled = female_cit_u / 5000

    # pdf_latent is built from the male citations, pdf_biased from the female ones.
    Omega = get_omega([male_cit_scaled, female_cit_scaled])
    pdf_latent = get_pdf_from_utils(male_cit_scaled, Omega)
    pdf_biased = get_pdf_from_utils(female_cit_scaled, Omega)

    write(Omega, 'Omega_SS')
    write(pdf_latent, 'pdf_latent_SS')
    write(pdf_biased, 'pdf_biased_SS')
else:
    # Reuse the objects written by a previous run of the branch above.
    Omega = read_obj('Omega_SS')
    pdf_latent = read_obj('pdf_latent_SS')
    pdf_biased = read_obj('pdf_biased_SS')

In [None]:
# Plot the two empirical distributions on the common grid Omega.
# In the processing branch pdf_latent is built from the male citation counts
# and pdf_biased from the female ones, so the legend names those groups.
# NOTE(review): assumes the cached *_SS objects were written by that branch.
plt.figure().clear()

plt.plot(Omega, pdf_latent, label='Male authors (pdf_latent)')
plt.plot(Omega, pdf_biased, label='Female authors (pdf_biased)')

plt.legend(fontsize=22)
plt.xlim(0, 0.005)

plt.show()

In [None]:
# Same comparison after dropping the first 10 grid points and renormalizing
# each curve over the remaining points.
# In the processing branch pdf_latent is built from the male citation counts
# and pdf_biased from the female ones, so the legend names those groups.
# NOTE(review): assumes the cached *_SS objects were written by that branch.
plt.figure().clear()

plt.plot(Omega[10:], pdf_latent[10:] / sum(pdf_latent[10:]), label='Male authors (pdf_latent)')
plt.plot(Omega[10:], pdf_biased[10:] / sum(pdf_biased[10:]), label='Female authors (pdf_biased)')

plt.legend(fontsize=22)
plt.xlim(0, 0.05)

plt.show()
# plt.close()

In [None]:
# Reference density built from grid points 10..1799 of Omega.
f_0 = get_pdf_uniform_omega(Omega[10:1800])

# Sum of p * log(p / f_0(omega)) over the bins of pdf_latent with positive mass.
# NOTE(review): f_0 is built from Omega[10:1800] but the sum runs over every
# bin of pdf_latent — confirm the two supports are meant to differ.
kl_div_fd = sum(
    mass * np.log(mass / f_0(Omega[idx]))
    for idx, mass in enumerate(pdf_latent)
    if mass > 0
)
print(f'kl_div_fd = {kl_div_fd}')

In [None]:
def merge_omega_and_pdf(omega, pdf_latent, pdf_biased, window_sz):
    """Coarsen a grid and two pdfs by merging consecutive windows of ``window_sz`` points.

    For every window the new grid point is the mean of the merged ``omega``
    values and each new pdf value is the sum of the merged pdf values. The
    last window may be shorter than ``window_sz``.

    Returns three Python lists: ``(omega_new, pdf_latent_new, pdf_biased_new)``.
    """
    n_points = len(omega)
    omega_new, pdf_latent_new, pdf_biased_new = [], [], []

    for start in range(0, n_points, window_sz):
        # Indices covered by this window, clipped at the end of the grid.
        members = range(start, min(start + window_sz, n_points))

        omega_new.append(sum(omega[k] for k in members) / len(members))
        pdf_latent_new.append(sum(pdf_latent[k] for k in members))
        pdf_biased_new.append(sum(pdf_biased[k] for k in members))

    return omega_new, pdf_latent_new, pdf_biased_new

### Validation simulations

In [None]:
# Keep grid points lb..ub-1 and merge every 4 consecutive points into one bin.
lb, ub = 10, 1800
Omega_new, pdf_latent_new, pdf_biased_new = merge_omega_and_pdf(
    omega=Omega[lb:ub],
    pdf_latent=pdf_latent[lb:ub],
    pdf_biased=pdf_biased[lb:ub],
    window_sz=4,
)

# Parameter grids passed to generate_plots in the following cells.
alpha_list = np.logspace(-4, 2, 100)
tau_list = np.logspace(0.1, 1, 10)

names_of_prior = ['uniform']
prior_list = [get_pdf_uniform_omega(Omega_new)]

# NOTE(review): Omega_new is already the truncated and merged grid, so the
# extra [10:1800] slice drops its first 10 merged bins, while the sum below
# still runs over every bin of pdf_latent_new — confirm this is intended.
f_0 = get_pdf_uniform_omega(Omega_new[10:1800])

# Sum of p * log(p / f_0(omega)); bins with non-positive mass contribute 0.
kl_div_fd = sum(
    mass * np.log(mass / f_0(Omega_new[idx])) if mass > 0 else 0
    for idx, mass in enumerate(pdf_latent_new)
)
print(f'kl_div_fd = {kl_div_fd}')
# Total variation between the original (unmerged, untruncated) pdfs.
print(f'tv in data: {sum(abs(pdf_latent - pdf_biased)) / 2}')

In [None]:
# Run the validation over the full alpha grid and the full tau grid.
generate_plots(
    latent_utilities=None,
    biased_utilities=None,
    pdf_latent=pdf_latent_new,
    pdf_biased=pdf_biased_new,
    Omega=Omega_new,
    err_func=err_func_pareto,
    alpha_list=alpha_list,
    tau_list=tau_list,
    prior_list=prior_list,
    names_of_prior=names_of_prior,
    print_verbose=False,
    plot_fd=False,
    plot_prior=False,
    plot=False,
    xlim=(0, 0.3),
    trunc_length=None,
    verbose=False,
    FAIL_CNT_THRESH=1000,
)

#### Our framework with $\alpha=1$

In [None]:
# Same run with alpha fixed to 1; only tau is swept.
generate_plots(
    latent_utilities=None,
    biased_utilities=None,
    pdf_latent=pdf_latent_new,
    pdf_biased=pdf_biased_new,
    Omega=Omega_new,
    err_func=err_func_pareto,
    alpha_list=[1],
    tau_list=tau_list,
    prior_list=prior_list,
    names_of_prior=names_of_prior,
    print_verbose=False,
    plot_fd=False,
    plot_prior=False,
    plot=False,
    xlim=(0, 0.3),
    trunc_length=None,
    verbose=False,
    FAIL_CNT_THRESH=1000,
)

#### Our framework with $\tau=Ent(f_D)$

In [None]:
# Same run with tau fixed to the kl_div_fd value computed above; only alpha is swept.
generate_plots(
    latent_utilities=None,
    biased_utilities=None,
    pdf_latent=pdf_latent_new,
    pdf_biased=pdf_biased_new,
    Omega=Omega_new,
    err_func=err_func_pareto,
    alpha_list=alpha_list,
    tau_list=[kl_div_fd],
    prior_list=prior_list,
    names_of_prior=names_of_prior,
    print_verbose=False,
    plot_fd=False,
    plot_prior=False,
    plot=False,
    xlim=(0, 0.3),
    trunc_length=None,
    verbose=False,
    FAIL_CNT_THRESH=1000,
)

#### Multiplicative bias

In [None]:
def ok(i, tmp):
    """Return True when ``i`` is a valid non-negative index into ``tmp``."""
    return 0 <= i < len(tmp)


def _multiplicative_bias_pdf(beta, omega, pdf_latent, index_of):
    """Push the mass of ``pdf_latent`` through the map v -> beta * (v + min) - min.

    Each grid value ``v`` is mapped to ``w``, snapped to the grid point that
    ``np.searchsorted`` returns for ``w``, and the mass of ``v`` is added at
    that point. ``index_of`` maps a grid value to its index in ``omega``.
    """
    omega_min = min(omega)  # hoisted: constant for the whole grid
    pdf = np.zeros_like(pdf_latent)
    for i, v in enumerate(omega):
        w = beta * (v + omega_min) - omega_min
        w = omega[np.searchsorted(omega, w)]
        pdf[index_of[w]] += pdf_latent[i]
    return pdf


best_tv = 1e10
best_param = {'beta': -1, 'shift': 0}


Omega_new, pdf_latent_new, pdf_biased_new = merge_omega_and_pdf(omega=Omega[lb:ub], pdf_latent=pdf_latent[lb:ub], pdf_biased=pdf_biased[lb:ub], window_sz=4)

mp = {v: i      for i, v in enumerate(Omega_new)}

mu = 0

# The beta transform does not depend on the shift, so compute it once per beta
# instead of once per (shift, beta) pair. Iteration order of the search below
# is unchanged, so ties are still resolved the same way.
beta_grid = np.logspace(-4, 0, 1000)
pdf_by_beta = [_multiplicative_bias_pdf(b, Omega_new, pdf_latent_new, mp) for b in beta_grid]

# Normalized target distribution; also loop-invariant.
pdf_biased_target = np.asarray(pdf_biased_new) / sum(pdf_biased_new)

pbar = tqdm([0] + list(range(-len(Omega_new), len(Omega_new), 1)))
for shift in pbar:
    pbar.set_description(f"best_tv={np.round(best_tv, 2)} || best_param={np.round(best_param['beta'], 2), best_param['shift']}")
    for beta, pdf_scaled in zip(beta_grid, pdf_by_beta):
        # get_shift writes its result into `tmp`; hand it a private copy of the
        # cached pdf so the cache stays intact whatever get_shift does to its input.
        tmp = copy.deepcopy(pdf_scaled)
        get_shift(pdf_scaled.copy(), tmp, s=shift)
        pdf_syn_biased_utilities_ = tmp

        cur_tv = sum(abs(pdf_syn_biased_utilities_/sum(pdf_syn_biased_utilities_) - pdf_biased_target)) / 2

        if cur_tv < best_tv:
            best_tv = cur_tv
            best_param['beta'] = beta
            best_param['shift'] = shift

plt.figure().clear()
x = list(Omega_new)
print(f'best_tv={best_tv}')
beta = best_param['beta']
shift = best_param['shift']
print(f'beta={beta}')

# Rebuild the synthetic biased pdf for the best (beta, shift) pair.
pdf_syn_biased_utilities_ = _multiplicative_bias_pdf(beta, Omega_new, pdf_latent_new, mp)

tmp = copy.deepcopy(pdf_syn_biased_utilities_)
get_shift(pdf_syn_biased_utilities_, tmp, s=shift)
pdf_syn_biased_utilities_ = tmp

# Note: the curves are plotted unnormalized, whereas the TV above compares normalized pdfs.
plt.plot(x, pdf_syn_biased_utilities_, label='Synthetic Biased Utilities', linewidth=4)
plt.plot(x, pdf_biased_new, label='Biased Utilities in Data', linewidth=4)

plt.legend()
plt.show()
plt.close()

#### Implicit variance

In [None]:
def add_gaussian(pdf_latent, omega, mu, sigma):
    """Smooth ``pdf_latent`` on the grid ``omega`` with a Gaussian kernel and renormalize.

    For every grid point ``v`` the result is
    ``sum_j N(v - omega[j]; mu, sigma) * pdf_latent[j]``, where ``N`` is the
    normal density with mean ``mu`` and standard deviation ``sigma``; the
    vector is then divided by its sum.

    Parameters
    ----------
    pdf_latent : sequence of float
        Mass at each grid point; same length as ``omega``.
    omega : sequence of float
        1-D grid of utility values.
    mu, sigma : float
        Mean and standard deviation of the additive Gaussian noise.

    Returns
    -------
    numpy.ndarray of length ``len(omega)`` summing to 1.

    The kernel is evaluated as one ``len(omega) x len(omega)`` matrix instead
    of a Python double loop, so memory grows quadratically with the grid size.
    """
    omega = np.asarray(omega, dtype=float)
    pdf_latent = np.asarray(pdf_latent, dtype=float)

    # diffs[i, j] = omega[i] - omega[j]
    diffs = omega[:, None] - omega[None, :]
    kernel = np.exp(-(diffs - mu) ** 2 / 2 / sigma / sigma) / np.sqrt(2 * np.pi) / sigma

    biased_utilities = kernel @ pdf_latent
    return biased_utilities / biased_utilities.sum()

In [None]:
best_tv = 1e10
best_param = {'mu': 0, 'sigma': -1, 'shift': 0}

Omega_new, pdf_latent_new, pdf_biased_new = merge_omega_and_pdf(omega=Omega[lb:ub], pdf_latent=pdf_latent[lb:ub], pdf_biased=pdf_biased[lb:ub], window_sz=4)

mu = 0

# The Gaussian smoothing does not depend on the shift, so compute it once per
# sigma instead of once per (shift, sigma) pair. Iteration order of the search
# below is unchanged, so ties are still resolved the same way.
sigma_grid = np.logspace(-2, 1, 100)
pdf_by_sigma = [
    add_gaussian(pdf_latent=pdf_latent_new, omega=Omega_new, mu=mu, sigma=s)
    for s in sigma_grid
]

pbar = tqdm([0] + list(range(-len(Omega_new), len(Omega_new), 1)))
for shift in pbar:
    pbar.set_description(f"best_tv={np.round(best_tv, 2)} || best_param={np.round(best_param['mu'], 2), np.round(best_param['sigma'], 2), best_param['shift']}")
    for sigma, pdf_smoothed in zip(sigma_grid, pdf_by_sigma):
        # get_shift writes its result into `tmp`; hand it a private copy of the
        # cached pdf so the cache stays intact whatever get_shift does to its input.
        tmp = copy.deepcopy(pdf_smoothed)
        get_shift(pdf_smoothed.copy(), tmp, s=shift)
        pdf_syn_biased_utilities_ = tmp

        # NOTE(review): unlike the multiplicative-bias search, neither side is
        # renormalized here before taking the total variation — confirm intended.
        cur_tv = sum(abs(pdf_syn_biased_utilities_ - pdf_biased_new)) / 2

        if cur_tv < best_tv:
            best_tv = cur_tv
            best_param['mu'] = mu
            best_param['sigma'] = sigma
            best_param['shift'] = shift

plt.figure().clear()
x = list(Omega_new)
print(f'best_tv={best_tv}')

# Rebuild the synthetic biased pdf for the best (mu, sigma, shift) triple.
pdf_syn_biased_utilities_ = add_gaussian(pdf_latent=pdf_latent_new, omega=Omega_new, mu=best_param['mu'], sigma=best_param['sigma'])
shift = best_param['shift']
tmp = copy.deepcopy(pdf_syn_biased_utilities_)
get_shift(pdf_syn_biased_utilities_, tmp, s=shift)
pdf_syn_biased_utilities_ = tmp

plt.plot(x, pdf_syn_biased_utilities_, label='Synthetic Biased Utilities', linewidth=4)
plt.plot(x, pdf_biased_new, label='Biased Utilities in Data', linewidth=4)

plt.legend()
plt.show()
plt.close()