# Gender Bias Measurement of Information Retrieval
https://arxiv.org/abs/2005.00372

## Step 1: calculate_bias_documents
Step 1 calculates the gender bias of each document in the collection. Set `collection_path` to the path of the collection in TSV format. For the MS MARCO collection, this should refer to the `collection.tsv` file. The code does not apply any pre-processing to the text other than converting it to lower case.
* Input: collection.tsv, wordlist_genderspecific.txt
* Output: docs_bias_bool.pkl, docs_bias_tc.pkl, docs_bias_tf.pkl

In [None]:
import collections
import numpy as np
import pickle

import copy
from csv import DictWriter

In [None]:
import os
# The collection is looked up at <parent of the working directory>/MSMARCO/collection.tsv.
path_directory = os.path.split(os.getcwd())[0]
path_collection = os.path.join(
    path_directory,
    'MSMARCO',
    'collection.tsv',
)
path_collection

In [None]:
# Inputs: a (preprocessed) version of the MS MARCO collection.tsv and the
# list of gender-specific words.
collection_path = path_collection
wordlist_path = "resources/wordlist_genderspecific.txt"

# Make sure the output folder exists before any pickle is written into it.
if not os.path.isdir("data"):
    os.makedirs("data")

# One output pickle per gender-magnitude variant
# (term count, boolean, term frequency).
docs_bias_save_paths = {
    'tc': "data/docs_bias_tc.pkl",
    'bool': "data/docs_bias_bool.pkl",
    'tf': "data/docs_bias_tf.pkl",
}


In [None]:
# Sets of gender-related words, read from a file whose rows look like
# "<word>,<f|m>". Rows are lower-cased before they are parsed.
genderwords_feml = set()
genderwords_male = set()

# A context manager closes the file handle deterministically.
with open(wordlist_path) as wordlist_file:
    for l in wordlist_file:
        vals = l.strip().lower().split(',')
        # Blank or malformed rows carry no gender tag; skip them instead of
        # failing with an IndexError on vals[1].
        if len(vals) < 2:
            continue
        if vals[1] == 'f':
            genderwords_feml.add(vals[0])
        elif vals[1] == 'm':
            genderwords_male.add(vals[0])

print(len(genderwords_feml), len(genderwords_male))

In [None]:
# Functions to convert to lower case and get gender magnitudes: term count, term frequency and boolean
def get_tokens(text):
    """Lower-case ``text`` and split it on single space characters.

    No other normalisation is applied: consecutive spaces produce empty
    tokens and punctuation stays attached to the neighbouring word.
    """
    lowered = text.lower()
    tokens = lowered.split(" ")
    return tokens

def get_bias(tokens):
    """Compute the gender magnitude of a token list in three variants.

    Tokens are matched against the module-level sets ``genderwords_feml`` and
    ``genderwords_male``; a token found in the female set is not also counted
    as male.

    Args:
        tokens: iterable of token strings (see ``get_tokens``).

    Returns:
        Three tuples ``(bias_tc, bias_tf, bias_bool)``, each laid out as
        ``(female - male, female, male)``:
          * ``bias_tc``   -- total occurrence counts, as floats;
          * ``bias_tf``   -- ``log(total count + 1)``;
          * ``bias_bool`` -- ``sign(total count)``, i.e. 1 when at least one
            gendered token occurs and 0 otherwise.
    """
    text_cnt = collections.Counter(tokens)

    cnt_feml = 0
    cnt_male = 0
    for word, occurrences in text_cnt.items():
        if word in genderwords_feml:
            cnt_feml += occurrences
        elif word in genderwords_male:
            cnt_male += occurrences

    # NOTE(review): the tf variant takes the log of the *summed* count. The
    # per-term sums of log(count + 1) and the text length that this function
    # used to accumulate were never part of any returned value, so they have
    # been dropped; confirm that log-of-sum is the intended definition.
    log_feml = np.log(cnt_feml + 1)
    log_male = np.log(cnt_male + 1)
    sign_feml = np.sign(cnt_feml)
    sign_male = np.sign(cnt_male)

    bias_tc = (float(cnt_feml - cnt_male), float(cnt_feml), float(cnt_male))
    bias_tf = (log_feml - log_male, log_feml, log_male)
    bias_bool = (sign_feml - sign_male, sign_feml, sign_male)

    return bias_tc, bias_tf, bias_bool

# Quick sanity check on a sample sentence; as the last expression of the
# notebook cell, the three returned tuples are displayed.
get_bias (get_tokens("a war day and many boys , women and men"))

In [None]:
# Gender magnitude for all documents in the collection.
# docs_bias[variant][docid] -> (female - male, female, male)
docs_bias = {'tc': {}, 'tf': {}, 'bool': {}}
empty_cnt = 0
with open(collection_path) as fr:
    for i, line in enumerate(fr):
        line = line.strip()
        if not line:
            # A blank line (e.g. at the end of the file) has no document id.
            continue

        # Split on the first tab only, so that a text which itself contains
        # tabs is kept instead of being treated as missing.
        vals = line.split('\t', 1)
        docid = int(vals[0])
        if len(vals) == 2:
            _text = vals[1]
        else:
            # Documents without text are still stored (with zero bias) and
            # counted in empty_cnt.
            _text = ""
            empty_cnt += 1

        _res = get_bias(get_tokens(_text))
        docs_bias['tc'][docid], docs_bias['tf'][docid], docs_bias['bool'][docid] = _res

        # Progress indicator for large collections.
        if i % 1000000 == 0:
            print(i)

print('done!')
print('number of skipped documents: %d' % empty_cnt)

In [None]:
# Persist every bias variant to its own pickle file.
for _method, _bias_by_doc in docs_bias.items():
    print(_method)
    _out_path = docs_bias_save_paths[_method]
    with open(_out_path, mode='wb') as fw:
        pickle.dump(_bias_by_doc, fw)


## Step 2: calculate_bias_runs
Using the pre-calculated document biases, Step 2 calculates gender bias scores for each query for the given retrieval run files.
* Inputs: .run files for each model, docs_bias_tc/tf/bool.pkl (from step 1), queries_gender_annotated.csv
* Outputs: e.g.: run_bias_bi_msmarco_L6_bool_ARaB.pkl

In [None]:
# Rank cut-offs at which the ranking bias is evaluated.
at_ranklist = [5, 10, 20, 30, 40]

# CSV file with the annotated queries.
queries_gender_annotated_path = "resources/queries_gender_annotated.csv"

# TREC run files of the evaluated models, keyed by experiment name.
experiments_paths = {
    'bi_msmarco_L6': 'reranker_trec_run/results_bi_msmarco_L6.run',
    'bi_msmarco_L12': 'reranker_trec_run/results_bi_msmarco_L12.run',
    'cross_msmarco_L6': 'reranker_trec_run/results_cross_msmarco_L6.run',
    'cross_msmarco_L12': 'reranker_trec_run/results_cross_msmarco_L12.run',
}

# Document bias pickles written by Step 1, keyed by variant.
docs_bias_paths = {
    'tc': "data/docs_bias_tc.pkl",
    'tf': "data/docs_bias_tf.pkl",
    'bool': "data/docs_bias_bool.pkl",
}

# Output folder for the per-query ranking bias pickles.
save_path_base = "data/all"
if not os.path.exists(save_path_base):
    os.makedirs(save_path_base)

In [None]:
# Load the per-document bias dictionaries saved by Step 1, one per variant.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this notebook.
docs_bias = {}
for _method, _pickle_path in docs_bias_paths.items():
    print(_method)
    with open(_pickle_path, 'rb') as fr:
        docs_bias[_method] = pickle.load(fr)

In [None]:
# Load the ids of the gendered queries (in our case all of them): the id is
# the first comma-separated field of every row. Only these queries are kept
# when the run files are read.
with open(queries_gender_annotated_path, 'r') as fr:
    qryids_filter = {row.strip().split(',')[0] for row in fr}

print(len(qryids_filter))

In [None]:
# Loading run files
#
# runs_docs_bias[experiment][variant][query id] is the list of document bias
# tuples of that query, in the order the documents appear in the run file.

runs_docs_bias = {}

for exp_name in experiments_paths:

    run_path = experiments_paths[exp_name]
    runs_docs_bias[exp_name] = {}

    for _method in docs_bias_paths:
        runs_docs_bias[exp_name][_method] = {}

    with open(run_path) as fr:
        for line in fr:
            # Split on any run of whitespace so that tab-separated or
            # multi-space run files are parsed too, instead of having every
            # line silently dropped by the field-count check below.
            vals = line.split()
            if len(vals) != 6:
                continue

            qryid = vals[0]  # kept as a string, like the ids in qryids_filter
            docid = int(vals[2])

            if qryid not in qryids_filter:
                continue

            for _method in docs_bias_paths:
                # setdefault keeps the documents already collected for a query
                # even when its lines are not contiguous in the run file.
                runs_docs_bias[exp_name][_method].setdefault(qryid, []).append(docs_bias[_method][docid])

    for _method in docs_bias_paths:
        print("Number of effective queries in %s using %s : %d" % (exp_name, _method, len(runs_docs_bias[exp_name][_method])))
    print()
print('done!')

# Independent copy of the loaded structure.
runs_docs_bias_FM = copy.deepcopy(runs_docs_bias)

In [None]:
# Display the deep copy of the per-run document bias lists (notebook cell output).
runs_docs_bias_FM

In [None]:
def calc_RaB_q(bias_list, at_rank):
    """Rank bias of one query: mean document bias over the top ``at_rank`` results.

    Args:
        bias_list: per-document ``(bias, female, male)`` tuples in ranking order.
        at_rank: cut-off; when fewer documents are available, all are used.

    Returns:
        Tuple ``(bias, female, male)`` with the mean of each component.
    """
    top_docs = bias_list[:at_rank]
    bias_val, bias_feml_val, bias_male_val = (
        np.mean([doc[component] for doc in top_docs]) for component in range(3)
    )
    return bias_val, bias_feml_val, bias_male_val
       
    
def calc_ARaB_q(bias_list, at_rank):
    """Average rank bias: mean of ``calc_RaB_q`` over the cut-offs 1..``at_rank``.

    Cut-offs larger than the number of available documents are left out of
    the average.

    Args:
        bias_list: per-document ``(bias, female, male)`` tuples in ranking order.
        at_rank: largest cut-off to include.

    Returns:
        Tuple ``(bias, female, male)`` with the mean of each component over
        the included cut-offs.
    """
    deepest = min(at_rank, len(bias_list))
    per_cutoff = [calc_RaB_q(bias_list, cutoff) for cutoff in range(1, deepest + 1)]

    bias_val = np.mean([rab[0] for rab in per_cutoff])
    bias_feml_val = np.mean([rab[1] for rab in per_cutoff])
    bias_male_val = np.mean([rab[2] for rab in per_cutoff])

    return bias_val, bias_feml_val, bias_male_val

def calc_nDRaB_q(bias_list, at_rank):
    """Discounted rank bias: weighted mean of the top-ranked document biases.

    The document at rank ``r`` (1-based) gets the weight ``1 / log2(r + 1)``
    and the weighted mean is normalised by the mean weight.

    When fewer than ``at_rank`` documents are available, the weights are
    built for the available documents only, so the result is the weighted
    mean over those documents (matching how ``calc_RaB_q`` averages over
    whatever is available). Previously this case raised a broadcasting
    ``ValueError``.

    Args:
        bias_list: per-document ``(bias, female, male)`` tuples in ranking order.
        at_rank: cut-off rank.

    Returns:
        Tuple ``(bias, female, male)``; all three are NaN when there is no
        document to average over.
    """
    depth = max(0, min(at_rank, len(bias_list)))
    if depth == 0:
        return np.nan, np.nan, np.nan

    top_docs = bias_list[:depth]
    # One weight per available document, so both arrays always have equal length.
    weight = 1 / np.log2(np.arange(1, depth + 1) + 1)
    norm = np.mean(weight)

    bias_val = np.mean(np.asarray([x[0] for x in top_docs]) * weight) / norm
    bias_feml_val = np.mean(np.asarray([x[1] for x in top_docs]) * weight) / norm
    bias_male_val = np.mean(np.asarray([x[2] for x in top_docs]) * weight) / norm

    return bias_val, bias_feml_val, bias_male_val

# Toy ranking of ten documents, each given as (bias, female, male), used to
# eyeball the three metrics at cut-off 10.
_test = [
    (0.0, 0.0, 0.0),
    (3, 3, 0.0),
    (0, 0, 0.0),
    (0, 0, 0.0),
    (0, 0, 0.0),
    (0, 0, 0.0),
    (0, 0.0, 0.0),
    (-5, 0.0, 5),
    (0, 0.0, 0.0),
    (-2, 0.0, 2),
]
#_test = [(10, 0, 0),(1, 1, 0),(0, 0, 0),(1, 1, 0),(0, 0, 0),(-1, 1, 0),(-1, 1, 0),(0, 0, 0),(-1, 1, 0),(1, 1, 0)]

for _label, _metric_fn in (('RaB_q', calc_RaB_q), ('ARaB_q', calc_ARaB_q), ('nDRaB_q', calc_nDRaB_q)):
    print(_label, _metric_fn(_test, 10))


In [None]:
# Per-query ranking bias for every metric, indexed as
# [experiment][variant][cut-off][query id] -> (bias, female, male).
qry_bias_RaB = {}
qry_bias_ARaB = {}
qry_bias_nDRaB = {}

print('Calculating ranking bias ...')

for exp_name in experiments_paths:
    qry_bias_RaB[exp_name] = {}
    qry_bias_ARaB[exp_name] = {}
    qry_bias_nDRaB[exp_name] = {}

    for _method in docs_bias_paths:
        print(exp_name, _method)

        _bias_lists = runs_docs_bias[exp_name][_method]
        _targets = (
            (calc_RaB_q, qry_bias_RaB),
            (calc_ARaB_q, qry_bias_ARaB),
            (calc_nDRaB_q, qry_bias_nDRaB),
        )
        for _metric_fn, _store in _targets:
            # One {query id: metric value} dictionary per cut-off.
            _store[exp_name][_method] = {
                at_rank: {
                    qry_id: _metric_fn(_docs, at_rank)
                    for qry_id, _docs in _bias_lists.items()
                }
                for at_rank in at_ranklist
            }

print('done!')


In [None]:
# Write one pickle per (experiment, variant, metric); the metric is encoded
# in the file name suffix.
_metric_stores = (
    ('_RaB.pkl', qry_bias_RaB),
    ('_ARaB.pkl', qry_bias_ARaB),
    ('_nDRaB.pkl', qry_bias_nDRaB),
)

for exp_name in experiments_paths:
    for _method in docs_bias_paths:
        save_path = save_path_base + "/run_bias_%s_%s" % (exp_name, _method)

        print(save_path)

        for _suffix, _store in _metric_stores:
            with open(save_path + _suffix, 'wb') as fw:
                pickle.dump(_store[exp_name][_method], fw)


## Step 3: bias_metrics
Step 3 calculates the final gender bias metrics for each experiment.

In [None]:
experiments = list(experiments_paths)

metrics = ['RaB', 'ARaB', 'nDRaB']
methods = ['tf', 'bool']

# qry_bias_paths[metric][experiment][variant] -> pickle file written in Step 2.
qry_bias_paths = {
    metric: {
        exp_name: {
            _method: save_path_base + '/run_bias_%s_%s_%s.pkl' % (exp_name, _method, metric)
            for _method in methods
        }
        for exp_name in experiments
    }
    for metric in metrics
}

# Folder that receives the result tables.
results_path = 'results'
if not os.path.isdir(results_path):
    os.makedirs(results_path)

In [None]:
# Load the Step 2 pickles back into
# qry_bias_perqry[metric][experiment][variant][cut-off][query id].
qry_bias_perqry = {}

for metric in metrics:
    _by_experiment = {}
    qry_bias_perqry[metric] = _by_experiment
    for exp_name in experiments:
        _by_method = {}
        _by_experiment[exp_name] = _by_method
        for _method in methods:
            _path = qry_bias_paths[metric][exp_name][_method]
            print(_path)
            with open(_path, 'rb') as fr:
                _by_method[_method] = pickle.load(fr)

In [None]:
# Keep only the queries whose last CSV field is 'n': query id -> query text.
queries_effective = {}
with open(queries_gender_annotated_path, 'r') as fr:
    for row in fr:
        fields = row.strip().split(',')
        if fields[-1] != 'n':
            continue
        # Everything between the id and the last field is the query text; a
        # text that contained commas is re-joined with spaces.
        queries_effective[fields[0]] = ' '.join(fields[1:-1])
len(queries_effective)

In [None]:
# Save results per query (F, M)
#
# One row per (cut-off, variant, model, metric, query); F and M are the
# second and third component of the stored per-query metric tuple.
_fieldnames = ['rank', 'method', 'model', 'metric', 'query', 'F', 'M']
results = []

for at_rank in at_ranklist:
    for _method in methods:
        for exp_name in experiments:
            for metric in metrics:
                for qryid in queries_effective:
                    _bias = qry_bias_perqry[metric][exp_name][_method][at_rank][qryid]
                    results.append(dict(
                        rank=at_rank,
                        method=_method,
                        model=exp_name,
                        metric=metric,
                        query=qryid,
                        F=_bias[1],
                        M=_bias[2]))

path = os.path.join(results_path, 'FM_per_query' + '.csv')

# The csv module requires newline='' on the file object; without it every
# row ends in "\r\r\n" on Windows. Explicit field names also let an empty
# result list produce a header-only file instead of failing on results[0].
with open(path, 'w', newline='') as f:
    writer = DictWriter(f, fieldnames=_fieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(results)

In [None]:
# Load results and save in needed format
import pandas as pd
import numpy as np

df = pd.read_csv(path, sep='\t')

# P takes the F score when the third character of the query id is 'F' and the
# M score otherwise; CP is the respective other score.
_third_char_is_F = df['query'].astype(str).str[2] == 'F'
df['P'] = np.where(_third_char_is_F, df['F'], df['M'])
df['CP'] = np.where(_third_char_is_F, df['M'], df['F'])
df['P-CP'] = df['P'] - df['CP']
df.drop(columns=['F', 'M'], inplace=True)

# save per query results (P, CP, P-CP)
path_per_query = os.path.join(results_path, 'res_per_query' + '.csv')
df.to_csv(path_per_query, sep='\t', index=False)

In [None]:
# Per-topic averages; the topic is the first character of the query id.
df['topic'] = df['query'].astype(str).str[0]

_topic_group_cols = ['topic', 'rank', 'method', 'model', 'metric']
_topic_value_cols = ['P', 'CP', 'P-CP']
df_topic = df.groupby(_topic_group_cols)[_topic_value_cols].mean()

path_per_topic = os.path.join(results_path, 'res_per_topic' + '.csv')
df_topic.to_csv(path_per_topic, sep='\t')

In [None]:
# Averages over all queries of all topics, per (cut-off, variant, model, metric).
_avg_group_cols = ['rank', 'method', 'model', 'metric']
_avg_value_cols = ['P', 'CP', 'P-CP']
df_averaged = df.groupby(_avg_group_cols)[_avg_value_cols].mean()

path_averaged = os.path.join(results_path, 'res_averaged' + '.csv')
df_averaged.to_csv(path_averaged, sep='\t')