In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the sample applicants; the existing 'Resume score' column is discarded.
df = pd.read_csv('data/sample.csv').drop(columns='Resume score')

# Remember the original column order so the generated frame can be written
# back out with the same layout.
columns = df.columns

# Every value each sensitive attribute is allowed to take in the generated
# queries. Missing entries are written as 'N/A' by to_csv below.
sensitive_space = {
    'Gender': ['M', 'F', pd.NA],
    'Veteran status': [0, 1],
    'Work authorization': [0, 1],
    'Disability': [0, 1, pd.NA],
    'Ethnicity': [0, 1, pd.NA]
}

# Cartesian product of the value lists: one entry per sensitive combination.
sensitive_space_product = pd.MultiIndex.from_product(
    list(sensitive_space.values()), names=list(sensitive_space.keys()))

# Build len(df) * len(sensitive_space_product) query rows: every original row
# is repeated once per sensitive combination with its non-sensitive columns
# unchanged, so copies of one row differ only in the sensitive attributes.
# A fair scorer should give all copies of the same original row the same score.
n_combinations = len(sensitive_space_product)
idx = np.repeat(np.arange(len(df)), n_combinations)

# Non-sensitive part: original rows repeated in blocks of n_combinations.
df_dup = (
    df.iloc[idx]
    .reset_index(drop=True)
    .drop(columns=list(sensitive_space))
)

# Sensitive part: the combination table tiled once per original row so that it
# lines up positionally with df_dup.
df_sensitive = pd.DataFrame(sensitive_space_product.to_list(),
                            columns=list(sensitive_space))
df_sensitive = pd.concat([df_sensitive] * len(df), axis=0).reset_index(drop=True)

df_dup = pd.concat([df_sensitive, df_dup], axis=1)
# Renumber so every generated row carries its own ID (0 .. len(df_dup) - 1).
df_dup['Applicant ID'] = np.arange(len(df_dup))

display(df_dup.head())
# Restore the column order of the input file.
df_dup = df_dup[columns]

df_dup.to_csv('data/query.csv', index=False, na_rep='N/A')

# Remove the final line terminator from the written file.
# NOTE(review): presumably the upload target rejects a trailing newline - confirm.
# to_csv terminates lines with os.linesep by default, so the terminator may be
# '\r\n' (two bytes); inspect the tail instead of blindly dropping one byte.
with open('data/query.csv', 'rb+') as filehandle:
    size = filehandle.seek(0, 2)
    filehandle.seek(max(size - 2, 0))
    tail = filehandle.read()
    if tail.endswith(b'\r\n'):
        filehandle.truncate(size - 2)
    elif tail.endswith(b'\n'):
        filehandle.truncate(size - 1)



Unnamed: 0,Gender,Veteran status,Work authorization,Disability,Ethnicity,Applicant ID,School Name,GPA,Degree,Location,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,M,0,0,0.0,0.0,0,Providence University,3.81,Bachelors,Providence,Junior SWE,5/20,,,,,,,
1,M,0,0,0.0,1.0,1,Providence University,3.81,Bachelors,Providence,Junior SWE,5/20,,,,,,,
2,M,0,0,0.0,,2,Providence University,3.81,Bachelors,Providence,Junior SWE,5/20,,,,,,,
3,M,0,0,1.0,0.0,3,Providence University,3.81,Bachelors,Providence,Junior SWE,5/20,,,,,,,
4,M,0,0,1.0,1.0,4,Providence University,3.81,Bachelors,Providence,Junior SWE,5/20,,,,,,,


> Upload `data/query.csv` to the scorer, then save its output as `data/scores.csv` before running the next cell.

In [3]:
# Read the scores file and attach its columns to the generated query rows.
# NOTE(review): rows are matched by position only - 'applicant_id' is dropped
# rather than used as a join key, so this assumes scores.csv has the same row
# order and row count as df_dup; confirm against the scorer's output.
df = (
    pd.read_csv('data/scores.csv')
    .drop(columns='applicant_id')
    .reset_index(drop=True)
)
df = pd.concat([df_dup, df], axis=1)
df.to_csv('data/to_evaluate.csv', index=False, na_rep='N/A')

# Remove the final line terminator from the written file.
# NOTE(review): presumably the upload target rejects a trailing newline - confirm.
# to_csv terminates lines with os.linesep by default, so the terminator may be
# '\r\n' (two bytes); inspect the tail instead of blindly dropping one byte.
with open('data/to_evaluate.csv', 'rb+') as filehandle:
    size = filehandle.seek(0, 2)
    filehandle.seek(max(size - 2, 0))
    tail = filehandle.read()
    if tail.endswith(b'\r\n'):
        filehandle.truncate(size - 2)
    elif tail.endswith(b'\n'):
        filehandle.truncate(size - 1)

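> Upload `data/to_evaluate.csv` to the evaluator, then save its output as `data/decision.csv` before running the next cell.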

In [4]:
# Read the decision file, discard its 'applicant_id' column and append the
# remaining columns to the scored frame, matching rows by position.
df_decision = (
    pd.read_csv('data/decision.csv')
    .drop(columns='applicant_id')
    .reset_index(drop=True)
)
# NOTE(review): this rebinds df, so re-running the cell without first re-running
# the cell that builds df appends the decision columns a second time.
df = pd.concat([df, df_decision], axis=1)
df.to_csv('data/result.csv', index=False, na_rep='N/A')