# Algorithmic Fairness, Accountability, and Ethics, Spring 2025

## Mandatory Assignment 2

Please use the following code to prepare the dataset.
 

In [1]:
from folktables.acs import adult_filter
from folktables import ACSDataSource
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the 2018 1-Year ACS person-level survey for California.
# download=True allows folktables to fetch the files over the network.
data_source = ACSDataSource(
    survey_year="2018",
    horizon="1-Year",
    survey="person",
)
acs_data = data_source.get_data(states=["CA"], download=True)

# ACS column codes used as model inputs.
feature_names = [
    "AGEP",   # Age
    "CIT",    # Citizenship status
    "COW",    # Class of worker
    "ENG",    # Ability to speak English
    "SCHL",   # Educational attainment
    "MAR",    # Marital status
    "HINS1",  # Insurance through a current or former employer or union
    "HINS2",  # Insurance purchased directly from an insurance company
    "HINS4",  # Medicaid
    "RAC1P",  # Recoded detailed race code
    "SEX",    # Sex
]

# Column the binary label is derived from.
target_name = "PINCP"  # Total person's income

def data_processing(data, features, target_name:str, threshold: float = 35000):
    """Filter ACS person records and build the modelling inputs.

    Applies the adult filter (adapted from Folktables), derives a boolean
    label ``data[target_name] > threshold`` and one-hot encodes the
    categorical features.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw records. Must contain ``SEX``, ``RAC1P``, ``AGEP``, ``PINCP``,
        ``WKHP``, ``PWGTP``, ``target_name`` and every name in ``features``.
        The frame is not modified.
    features : sequence of str
        Columns to keep as model inputs.
    target_name : str
        Column the label is derived from.
    threshold : float, default 35000
        A row is labelled True when its ``target_name`` value is strictly
        greater than this value.

    Returns
    -------
    df_processed : pandas.DataFrame
        ``features`` after one-hot encoding (first level of each encoded
        column dropped; ``ENG`` additionally gets a NaN indicator).
    df : pandas.DataFrame
        Filtered, un-encoded frame with columns
        ``features + ["target", target_name]``, kept for bias analysis.
    target : numpy.ndarray
        Boolean labels, row-aligned with the two frames.
    sex : numpy.ndarray
        ``SEX`` values of the retained rows (group membership).
    """
    features = list(features)

    ### Adult Filter (STARTS) (from Folktables)
    # One combined mask selects the same rows as applying the six conditions
    # one after another (comparisons against NaN evaluate to False).
    keep = (
        data["SEX"].notnull()
        & data["RAC1P"].notnull()
        & (data["AGEP"] > 16)
        & (data["PINCP"] > 100)
        & (data["WKHP"] > 0)
        & (data["PWGTP"] >= 1)
    )
    # Explicit copy so the column assignment below acts on an independent
    # frame instead of a slice (avoids pandas' SettingWithCopyWarning).
    df = data[keep].copy()
    ### Adult Filter (ENDS)

    ### Groups of interest
    sex = df["SEX"].values

    ### Target
    df["target"] = df[target_name] > threshold
    target = df["target"].values

    # Keep the frame before one-hot encoding for the bias analysis.
    df = df[features + ["target", target_name]]
    df_processed = df[features].copy()

    # Encode only the categorical columns that were actually requested, so a
    # reduced feature list does not fail on a missing column.
    categorical = ["HINS1", "HINS2", "HINS4", "CIT", "COW", "SCHL", "MAR", "SEX", "RAC1P"]
    cols = [name for name in categorical if name in features]
    if cols:
        df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=False, columns=cols, drop_first=True)
    if "ENG" in features:
        # ENG contains missing values, so it also gets a NaN indicator column.
        df_processed = pd.get_dummies(df_processed, prefix=None, prefix_sep='_', dummy_na=True, columns=["ENG"], drop_first=True)
    return df_processed, df, target, sex

# Encoded features, the pre-encoding frame, the boolean labels and the SEX
# group vector, all row-aligned.
data, data_original, target, group = data_processing(
    acs_data,
    feature_names,
    target_name,
)

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
    group_train,
    group_test,
) = train_test_split(data, target, group, test_size=0.2, random_state=0)

Downloading data for 2018 1-Year person survey for CA...


In [18]:
# Inspect the filtered, pre-encoding frame returned by data_processing:
# the selected features plus the boolean `target` and the raw PINCP value.
data_original

Unnamed: 0,AGEP,CIT,COW,ENG,SCHL,MAR,HINS1,HINS2,HINS4,RAC1P,SEX,target,PINCP
0,30,1,6.0,1.0,14.0,1,2,2,1,8,1,True,48500.0
6,21,1,4.0,,16.0,5,2,1,2,1,1,False,7700.0
7,65,1,2.0,1.0,22.0,5,2,2,2,1,1,False,17200.0
10,33,1,1.0,1.0,14.0,3,2,2,1,1,1,False,12000.0
13,18,1,2.0,,19.0,5,2,1,2,1,2,False,300.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
378801,38,5,1.0,1.0,22.0,1,1,2,2,6,1,True,565280.0
378802,39,5,1.0,1.0,22.0,1,1,2,2,6,2,True,210000.0
378807,61,1,1.0,,19.0,1,1,2,2,1,1,True,105000.0
378811,69,4,7.0,2.0,24.0,1,2,2,2,6,1,False,30000.0


In [16]:
# Class balance of the label over all filtered rows (before the train/test split).
np.unique(target, return_counts=True)

(array([False,  True]), array([ 87516, 108149]))

In [8]:
# Share of training rows per value of SEX_2, the one-hot indicator for SEX == 2
# (the SEX == 1 level is dropped by drop_first=True in data_processing).
X_train.SEX_2.value_counts(normalize=True)

0    0.528263
1    0.471737
Name: SEX_2, dtype: float64

In [21]:
# Label distribution within each sex group (SEX == 1 is reported as men,
# SEX == 2 as women).
men_target_share = data_original.loc[data_original["SEX"] == 1, "target"].value_counts(normalize=True)
print(f"men's target : {men_target_share}")

women_target_share = data_original.loc[data_original["SEX"] == 2, "target"].value_counts(normalize=True)
print(f"women's target : {women_target_share}")

men's target : True     0.607118
False    0.392882
Name: target, dtype: float64
women's target : False    0.508121
True     0.491879
Name: target, dtype: float64


In [11]:
# Relative frequency of each RAC1P code among the filtered records.
data_original.RAC1P.value_counts(normalize=True)

1    0.618435
6    0.167168
8    0.116490
2    0.043733
9    0.041939
3    0.006613
7    0.003256
5    0.002300
4    0.000066
Name: RAC1P, dtype: float64

 "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    }

In [22]:
# Label distribution for RAC1P == 1 ("White alone") versus every other code.
white_target_share = data_original.loc[data_original["RAC1P"] == 1, "target"].value_counts(normalize=True)
print(f"white people's target : {white_target_share}")

non_white_target_share = data_original.loc[data_original["RAC1P"] != 1, "target"].value_counts(normalize=True)
print(f"non-white people's target : {non_white_target_share}")

white people's target : True     0.582723
False    0.417277
Name: target, dtype: float64
non-white people's target : True     0.504105
False    0.495895
Name: target, dtype: float64
