#**Bias Detection**

#Equalized Odds

###For 'sex' attribute

In [None]:
import pandas as pd

# Load the UCI Adult census data. The raw file has no header row, so the
# column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]

# skipinitialspace=True strips the blank after each delimiter, so a missing
# value is read as '?' rather than ' ?'; the NA sentinel has to match that form.
df = pd.read_csv(url, names=columns, na_values='?', skipinitialspace=True)

protected_attribute = 'sex'
positive_outcome = '>50K'

protected_groups = df[protected_attribute].unique()

# Base rate: share of each group whose recorded income is the positive outcome.
bias_results = {}
for group in protected_groups:
    group_data = df[df[protected_attribute] == group]
    positive_outcome_count = group_data[group_data['income'] == positive_outcome].shape[0]
    total_count = group_data.shape[0]
    proportion_positive = positive_outcome_count / total_count
    bias_results[group] = proportion_positive

# NOTE(review): equalized odds compares the true-positive and true-negative
# rates of a model's predictions across groups, but no predictions exist at
# this point in the notebook. The loop below puts race == 'White' in the place
# a prediction would occupy, so the reported gaps describe how the share of
# White individuals within each income class differs between groups. Revisit
# once classifier output is available.
tpr_results = {}
tnr_results = {}

for group in protected_groups:
    group_data = df[df[protected_attribute] == group]
    true_positive_count = group_data[(group_data['income'] == positive_outcome) & (group_data['race'] == 'White')].shape[0]
    true_negative_count = group_data[(group_data['income'] != positive_outcome) & (group_data['race'] == 'White')].shape[0]
    total_positive = group_data[group_data['income'] == positive_outcome].shape[0]
    total_negative = group_data[group_data['income'] != positive_outcome].shape[0]
    tpr_results[group] = true_positive_count / total_positive
    tnr_results[group] = true_negative_count / total_negative

# Gap between the best- and worst-off group for each rate.
max_tpr = max(tpr_results.values())
min_tpr = min(tpr_results.values())
max_tnr = max(tnr_results.values())
min_tnr = min(tnr_results.values())

equalized_odds_tpr = max_tpr - min_tpr
equalized_odds_tnr = max_tnr - min_tnr

print("Bias Results:")
for group, proportion in bias_results.items():
    print(f"{group}: Proportion of positive outcome = {proportion:.2f}")

print(f"\nEqualized Odds (TPR): {equalized_odds_tpr:.2f}")
print(f"Equalized Odds (TNR): {equalized_odds_tnr:.2f}")


Bias Results:
Male: Proportion of positive outcome = 0.31
Female: Proportion of positive outcome = 0.11

Equalized Odds (TPR): 0.04
Equalized Odds (TNR): 0.07


In [None]:
protected_attribute = 'sex'
positive_outcome = '>50K'
protected_groups = df[protected_attribute].unique()

# Dataset-wide label counts do not depend on the group, so compute them once
# instead of on every loop iteration.
is_positive = df['income'] == positive_outcome
total_positive = int(is_positive.sum())
total_negative = int((~is_positive).sum())

equalized_odds_tpr = {}
equalized_odds_tnr = {}

# NOTE(review): no classifier predictions are involved here. Each value is the
# group's share of all positive (resp. negative) labels in the dataset, not a
# true-positive / true-negative rate; the variable names and printed labels
# are kept unchanged so the recorded output still matches.
for group in protected_groups:
    in_group = df[protected_attribute] == group
    true_positive_count = int((in_group & is_positive).sum())
    true_negative_count = int((in_group & ~is_positive).sum())
    equalized_odds_tpr[group] = true_positive_count / total_positive
    equalized_odds_tnr[group] = true_negative_count / total_negative

equalized_odds_tpr_value = max(equalized_odds_tpr.values()) - min(equalized_odds_tpr.values())
equalized_odds_tnr_value = max(equalized_odds_tnr.values()) - min(equalized_odds_tnr.values())

print("Equalized Odds (TPR):", equalized_odds_tpr_value)
print("Equalized Odds (TNR):", equalized_odds_tnr_value)


Equalized Odds (TPR): 0.6992730519066446
Equalized Odds (TNR): 0.22394822006472492


###For 'race' attribute

In [None]:
protected_attribute = 'race'
positive_outcome = '>50K'
protected_groups = df[protected_attribute].unique()

# Base rate: share of each group whose recorded income is the positive outcome.
bias_results = {}
for group in protected_groups:
    group_data = df[df[protected_attribute] == group]
    positive_outcome_count = group_data[group_data['income'] == positive_outcome].shape[0]
    total_count = group_data.shape[0]
    proportion_positive = positive_outcome_count / total_count
    bias_results[group] = proportion_positive

# NOTE(review): equalized odds needs a model's predictions, and none exist at
# this point. The loop below puts sex == 'Female' in the place a prediction
# would occupy, so the reported gaps describe how the share of women within
# each income class differs between racial groups. Revisit once classifier
# output is available.
tpr_results = {}
tnr_results = {}

female_value = 'Female'

for group in protected_groups:
    group_data = df[df[protected_attribute] == group]
    true_positive_count = group_data[(group_data['income'] == positive_outcome) & (group_data['sex'] == female_value)].shape[0]
    true_negative_count = group_data[(group_data['income'] != positive_outcome) & (group_data['sex'] == female_value)].shape[0]
    total_positive = group_data[group_data['income'] == positive_outcome].shape[0]
    total_negative = group_data[group_data['income'] != positive_outcome].shape[0]
    tpr_results[group] = true_positive_count / total_positive
    tnr_results[group] = true_negative_count / total_negative

# Gap between the best- and worst-off group for each rate.
max_tpr = max(tpr_results.values())
min_tpr = min(tpr_results.values())
max_tnr = max(tnr_results.values())
min_tnr = min(tnr_results.values())

equalized_odds_tpr = max_tpr - min_tpr
equalized_odds_tnr = max_tnr - min_tnr

print("Bias Results:")
for group, proportion in bias_results.items():
    print(f"{group}: Proportion of positive outcome = {proportion:.2f}")

print(f"\nEqualized Odds (TPR): {equalized_odds_tpr:.2f}")
print(f"Equalized Odds (TNR): {equalized_odds_tnr:.2f}")


Bias Results:
White: Proportion of positive outcome = 0.26
Black: Proportion of positive outcome = 0.12
Asian-Pac-Islander: Proportion of positive outcome = 0.27
Amer-Indian-Eskimo: Proportion of positive outcome = 0.12
Other: Proportion of positive outcome = 0.09

Equalized Odds (TPR): 0.19
Equalized Odds (TNR): 0.17


In [None]:
protected_attribute = 'race'
positive_outcome = '>50K'
protected_groups = df[protected_attribute].unique()

# Dataset-wide label counts do not depend on the group, so compute them once
# instead of on every loop iteration.
is_positive = df['income'] == positive_outcome
total_positive = int(is_positive.sum())
total_negative = int((~is_positive).sum())

equalized_odds_tpr = {}
equalized_odds_tnr = {}

# NOTE(review): no classifier predictions are involved here. Each value is the
# group's share of all positive (resp. negative) labels in the dataset, not a
# true-positive / true-negative rate; the variable names and printed labels
# are kept unchanged so the recorded output still matches.
for group in protected_groups:
    in_group = df[protected_attribute] == group
    true_positive_count = int((in_group & is_positive).sum())
    true_negative_count = int((in_group & ~is_positive).sum())
    equalized_odds_tpr[group] = true_positive_count / total_positive
    equalized_odds_tnr[group] = true_negative_count / total_negative

equalized_odds_tpr_value = max(equalized_odds_tpr.values()) - min(equalized_odds_tpr.values())
equalized_odds_tnr_value = max(equalized_odds_tnr.values()) - min(equalized_odds_tnr.values())

print("Equalized Odds (TPR):", equalized_odds_tpr_value)
print("Equalized Odds (TNR):", equalized_odds_tnr_value)


Equalized Odds (TPR): 0.9044764698380309
Equalized Odds (TNR): 0.8273867313915858


#Statistical Parity

###For 'race' attribute

In [None]:
protected_attribute = 'race'
positive_outcome = '>50K'

protected_groups = df[protected_attribute].unique()

# Positive-outcome rate per group, keyed in order of first appearance.
bias_results = {}
for group in protected_groups:
    incomes = df.loc[df[protected_attribute] == group, 'income']
    bias_results[group] = int((incomes == positive_outcome).sum()) / len(incomes)

# Statistical parity difference: spread between the highest and lowest rate.
rates = list(bias_results.values())
statistical_parity = max(rates) - min(rates)

print("Bias Results:")
for group in bias_results:
    print(f"{group}: Proportion of positive outcome = {bias_results[group]:.2f}")

print(f"\nStatistical Parity for race attribute: {statistical_parity:.2f}")


Bias Results:
White: Proportion of positive outcome = 0.26
Black: Proportion of positive outcome = 0.12
Asian-Pac-Islander: Proportion of positive outcome = 0.27
Amer-Indian-Eskimo: Proportion of positive outcome = 0.12
Other: Proportion of positive outcome = 0.09

Statistical Parity for race attribute: 0.17


###For 'sex' attribute

In [None]:

protected_attribute = 'sex'
positive_outcome = '>50K'

protected_groups = df[protected_attribute].unique()

# Positive-outcome rate per group, keyed in order of first appearance.
bias_results = {}
for group in protected_groups:
    incomes = df.loc[df[protected_attribute] == group, 'income']
    bias_results[group] = int((incomes == positive_outcome).sum()) / len(incomes)

# Statistical parity difference: spread between the highest and lowest rate.
rates = list(bias_results.values())
statistical_parity = max(rates) - min(rates)

print("Bias Results:")
for group in bias_results:
    print(f"{group}: Proportion of positive outcome = {bias_results[group]:.2f}")

print(f"\nStatistical Parity for sex attribute: {statistical_parity:.2f}")


Bias Results:
Male: Proportion of positive outcome = 0.31
Female: Proportion of positive outcome = 0.11

Statistical Parity for sex attribute: 0.20


#Bias Mitigation

#Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
import plotly.graph_objects as go

In [None]:
# Load the Adult dataset from a local CSV.
# NOTE(review): the path is an absolute, environment-specific location; the
# file must be placed there before this cell can run.
# Later cells compare against values such as ' Male' and ' ?', i.e. string
# fields in this file keep a leading space.
data = pd.read_csv('/content/adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


#Visual Representation of Bias

In [None]:
import plotly.express as px
features = ['sex', 'race', 'native-country']
threshold = 1  # percent: categories rarer than this are left out of the pie
small_counts = {}

for feature in features:
    # Percentage share of every category of this feature.
    shares = data[feature].value_counts(normalize=True).mul(100)
    is_small = shares < threshold
    # Record how much of the population the omitted categories add up to.
    small_counts[feature] = shares[is_small].sum()
    counts = shares[~is_small]
    fig = px.pie(names=counts.index, values=counts.values, title=f'Distribution of {feature}')
    fig.show()


In [None]:
# Number of rows per income level, shown as a pie chart.
counts = data['salary'].value_counts()
# Title typo fixed: "datatopoints" -> "datapoints".
fig = px.pie(names=counts.index, values=counts.values, title="Pie of number of datapoints per income level")
fig.show()

The fraction of the population that earns at most 50K is about three times the fraction that earns more than 50K.

In [None]:
# Category values in `data` carry a leading space, hence ' Male' / ' Female'.
male_data = data[data.sex == ' Male']
female_data = data[data.sex == ' Female']

titles = ['Distribution of income categories among males', 'Distribution of income categories among females']
# The loop variable is deliberately not called `df`: that name is used for a
# notebook-level DataFrame elsewhere and would be silently overwritten here.
for gender_data, title in zip([male_data, female_data], titles):
    counts = gender_data['salary'].value_counts()
    fig = px.pie(names=counts.index, values=counts.values, title=title)
    fig.show()

The male population has considerably more datapoints than the female population; in the higher-income category there are more than three times as many men as women.

In [None]:
# Number of rows per race category, shown as a pie chart.
counts = data['race'].value_counts()
fig = px.pie(names=counts.index, values=counts.values, title="Distribution of race categories")
fig.show()

In [None]:
# One pie chart of the salary split for every race present in the data.
unique_races = data['race'].unique()
for race in unique_races:
    race_data = data[data['race'] == race]
    counts = race_data['salary'].value_counts()
    fig = px.pie(
        names=counts.index,
        values=counts.values,
        title=f"Distribution of salary among the population of {race} race",
    )
    fig.show()

Different races are represented by very different numbers of datapoints.
The income distribution also differs from one race to another.

#Pre-Processing the Data

In [None]:
# Work on a copy so the preprocessing below leaves the loaded `data` frame untouched.
df = data.copy()
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [None]:
# Remove the 'education' column; the numeric 'education-num' column is kept.
df = df.drop(columns=['education'])

In [None]:

# ' ?' marks a missing value in this file: turn it into NaN everywhere, then
# discard every row that has at least one missing field.
df = df.replace(' ?', np.nan).dropna(how='any')

In [None]:
# Collapse native-country to a binary flag: US -> 0, everything else -> 1.
US_LABEL, NON_US_LABEL = (0, 1)
is_us = df['native-country'] == ' United-States'
df['native-country'] = np.where(is_us, US_LABEL, NON_US_LABEL).astype(int)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516.0,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,0,<=50K
1,50,Self-emp-not-inc,83311.0,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,0,<=50K
2,38,Private,215646.0,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,0,<=50K
3,53,Private,234721.0,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,0,<=50K
4,28,Private,338409.0,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,1,<=50K


In [None]:

# Integer-encode every non-numeric column. A separate encoder is kept per
# column so a code can be mapped back to its category later through
# label_encoders[column].classes_; a single shared encoder would only
# remember the last column it was fitted on.
label_encoders = {}

non_numeric_columns = df.select_dtypes(exclude=['int', 'float']).columns
for column in non_numeric_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

print("Unique values for 'native-country' after encoding:", df['native-country'].unique())
print("Unique values for 'workclass' after encoding:", df['workclass'].unique())
print("Unique values for 'occupation' after encoding:", df['occupation'].unique())


Unique values for 'native-country' after encoding: [0 1]
Unique values for 'workclass' after encoding: [5 4 2 0 1 3 6]
Unique values for 'occupation' after encoding: [ 0  3  5  9  7 11 13  4  6 12  2 10  1  8]


Binarize capital gain and capital loss: any value greater than 0 becomes 1, everything else 0.

In [None]:


# Replace the capital amounts with 0/1 indicators of "any gain" / "any loss".
for capital_column in ('capital-gain', 'capital-loss'):
    df[capital_column] = df[capital_column].gt(0).astype(int)

df.head()


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,5,77516.0,13.0,4,0,1,4,1,1,0,40.0,0,0
1,50,4,83311.0,13.0,2,3,0,4,1,0,0,13.0,0,0
2,38,2,215646.0,9.0,0,5,1,4,1,0,0,40.0,0,0
3,53,2,234721.0,7.0,2,5,0,2,1,0,0,40.0,0,0
4,28,2,338409.0,13.0,2,9,5,2,0,0,0,40.0,1,0


Normalize continuous features

In [None]:
continuous_features = ['age', 'fnlwgt', 'education-num', 'hours-per-week']

# Z-score each continuous column: subtract its mean and divide by its
# (sample) standard deviation, all four columns in one vectorised step.
continuous_block = df[continuous_features]
df[continuous_features] = (continuous_block - continuous_block.mean()) / continuous_block.std()

df.head()


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0.034201,5,-1.062283,1.12874,4,0,1,4,1,1,0,-0.078119,0,0
1,0.866407,4,-1.007427,1.12874,2,3,0,4,1,0,0,-2.326712,0,0
2,-0.041455,2,0.245281,-0.438117,0,5,1,4,1,0,0,-0.078119,0,0
3,1.093373,2,0.425848,-1.221545,2,5,0,2,1,0,0,-0.078119,0,0
4,-0.798006,2,1.407378,1.12874,2,9,5,2,0,0,0,-0.078119,1,0


#Train a machine learning algorithm on the Data

In [None]:

# Shuffle once, then hold out 25% of the rows for testing. Both steps are
# seeded so a fresh run reproduces the same split and the same metrics.
data_shuffled = df.sample(frac=1, random_state=1).reset_index(drop=True)
X = data_shuffled.drop(['salary'], axis=1)
y = data_shuffled['salary']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)


In [None]:
# Baseline model: a 50-tree random forest with a fixed seed, fitted on the
# training split and used to predict labels for the held-out split.
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=1)
rf_classifier.fit(x_train, y_train)
prediction = rf_classifier.predict(x_test)


In [None]:
# Test features plus the true label, the predicted label and a per-row flag
# saying whether the two agree.
test_df = x_test.assign(salary=y_test, pred=prediction)
test_df['accurate'] = test_df['pred'].eq(test_df['salary'])
test_df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,pred,accurate
32347,0.790752,4,-1.485336,-0.438117,0,13,1,4,1,0,0,2.420318,0,0,0,True
574,-1.100627,2,-1.13096,-0.438117,4,7,3,4,1,0,0,-0.078119,0,0,0,True
22980,1.320338,2,1.217297,-0.438117,2,6,0,4,1,0,0,-0.078119,0,0,0,True
39070,-1.024972,2,0.430553,1.12874,4,3,3,4,0,0,0,-0.078119,0,0,0,True
17215,1.547304,5,1.073856,-0.046402,2,3,0,4,1,0,1,-0.078119,0,0,1,False


In [None]:
# Share of correctly classified test rows, expressed as a percentage.
accuracy_percentage = 100 * test_df['accurate'].mean()
print(f"Accuracy: {accuracy_percentage:.2f}%")

Accuracy: 83.50%


#Functions used for plotting

In [None]:
# Encoded labels assumed by the helper functions below:
#   sex:    0 = Female, 1 = Male
#   salary: 0 = low income, 1 = high income

In [None]:
def evaluate_gender_performance(results_df, print_stats=False):
    """Summarise prediction quality overall and separately for each gender.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``sex`` (0 = Female, 1 = Male), ``salary``
        (true label, 1 = high income, 0 = low income), ``pred`` (predicted
        label) and ``accurate`` (boolean, whether ``pred`` equals ``salary``).
    print_stats : bool, optional
        Accepted for backward compatibility; it has no effect.

    Returns
    -------
    dict
        ``accuracy_overall`` plus, for each gender label ``G`` in
        ``("Female", "Male")``: ``accuracy_G``, ``positive_rate_G``,
        ``negative_rate_G``, ``true_positive_rate_G`` and
        ``true_negative_rate_G``. A rate taken over an empty selection is NaN.
    """
    summaries = {'accuracy_overall': results_df.accurate.mean()}

    for code, label in [(0, "Female"), (1, "Male")]:
        rows = results_df[results_df.sex == code]

        summaries['accuracy_' + label] = rows.accurate.mean()
        # Share of this gender predicted as high / low income.
        summaries['positive_rate_' + label] = (rows['pred'] == 1).mean()
        summaries['negative_rate_' + label] = (rows['pred'] == 0).mean()
        # Accuracy restricted to truly high-income rows is the true-positive
        # rate; restricted to truly low-income rows, the true-negative rate.
        summaries['true_positive_rate_' + label] = rows[rows.salary == 1].accurate.mean()
        summaries['true_negative_rate_' + label] = rows[rows.salary == 0].accurate.mean()

    return summaries

In [None]:
def model_summary(model_name, title, summary):
    """Show a grouped bar chart comparing per-gender metrics.

    Parameters
    ----------
    model_name : str
        Label of the evaluated model; used as the chart heading when
        ``title`` is empty.
    title : str
        Chart heading. When non-empty it takes precedence over ``model_name``.
    summary : dict
        Must provide ``<metric>_Male`` and ``<metric>_Female`` for each of
        accuracy, positive_rate, negative_rate, true_positive_rate and
        true_negative_rate.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    metrics = ["accuracy", "positive_rate", "negative_rate", "true_positive_rate", "true_negative_rate"]
    gender_colours = [
        ("Male", 'rgba(31, 119, 180, 0.5)'),
        ("Female", 'rgba(255, 127, 14, 0.5)'),
    ]

    # One trace per gender spanning all metrics. With barmode='group' every
    # trace reserves its own slot in each category, so emitting one trace per
    # individual bar would squeeze and offset the bars and repeat the legend
    # entries once per metric.
    traces = [
        go.Bar(
            x=metrics,
            y=[summary[f"{metric}_{gender}"] for metric in metrics],
            name=gender,
            marker=dict(color=colour),
        )
        for gender, colour in gender_colours
    ]

    # Fall back to the model name so charts of different models can be told apart.
    heading = title or model_name
    layout = go.Layout(
        title=f'{heading} - Metrics Comparison by Gender',
        xaxis=dict(title='Metrics'),
        yaxis=dict(title='Metric Value'),
        barmode='group',
        width=800,
        height=500
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()

# Per-gender metrics of the baseline model on the test split, then the chart.
original = evaluate_gender_performance(test_df)
model_summary("RandomForest_no_debias", "", original)


In [None]:


def evaluate_gender_performance1(results_df, print_stats=False):
    """Compute accuracy, recall, precision and F1 overall / per gender.

    ``results_df`` needs the columns ``sex`` (0 = Female, 1 = Male),
    ``salary`` (true label), ``pred`` (predicted label) and ``accurate``.
    ``print_stats`` is accepted but has no effect.

    Returns a dict with ``accuracy_overall`` and, for each gender label ``G``,
    the keys ``accuracy_G``, ``recall_G``, ``precision_G`` and ``f1_score_G``.
    """
    summaries = {'accuracy_overall': results_df.accurate.mean()}

    # Key prefix paired with the scoring function that produces it.
    scorers = (
        ('recall', recall_score),
        ('precision', precision_score),
        ('f1_score', f1_score),
    )

    for code, label in ((0, "Female"), (1, "Male")):
        subset = results_df[results_df.sex == code]
        summaries[f'accuracy_{label}'] = subset.accurate.mean()

        truth, guess = subset['salary'], subset['pred']
        for prefix, scorer in scorers:
            summaries[f'{prefix}_{label}'] = scorer(truth, guess)

    return summaries


In [None]:

def model_summary1(model_name, title, summary):
    """Show a grouped bar chart of accuracy, recall, precision and F1 by gender.

    Parameters
    ----------
    model_name : str
        Label of the evaluated model; used as the chart heading when
        ``title`` is empty.
    title : str
        Chart heading. When non-empty it takes precedence over ``model_name``.
    summary : dict
        Must provide ``<key>_Male`` and ``<key>_Female`` for each of the keys
        ``accuracy``, ``recall``, ``precision`` and ``f1_score``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    metrics = ["Accuracy", "Recall", "Precision", "F1 Score"]
    # Display name -> key prefix used in `summary` ("F1 Score" -> "f1_score").
    metric_keys = [metric.lower().replace(" ", "_") for metric in metrics]
    gender_colours = [
        ("Male", 'rgba(31, 119, 180, 0.5)'),
        ("Female", 'rgba(255, 127, 14, 0.5)'),
    ]

    # One trace per gender spanning all metrics. With barmode='group' every
    # trace reserves its own slot in each category, so emitting one trace per
    # individual bar would squeeze and offset the bars relative to their tick.
    traces = [
        go.Bar(
            x=metrics,
            y=[summary[f"{key}_{gender}"] for key in metric_keys],
            name=gender,
            marker=dict(color=colour),
        )
        for gender, colour in gender_colours
    ]

    # Fall back to the model name so charts of different models can be told apart.
    heading = title or model_name
    layout = go.Layout(
        title=f'{heading} - Metrics Comparison by Gender',
        xaxis=dict(title='Metrics'),
        yaxis=dict(title='Metric Value'),
        barmode='group',
        width=800,
        height=500
    )

    fig = go.Figure(data=traces, layout=layout)
    fig.show()


# Recall / precision / F1 view of the baseline model on the test split.
# Note: this rebinds `original`, replacing the summary from the earlier cell.
original = evaluate_gender_performance1(test_df)
model_summary1("RandomForest_no_debias", "", original)


#Mitigation through dataset balancing


###Equal number of datapoints per demographic

In [None]:
# Snapshot of the preprocessed frame; the balancing experiments below sample from it.
datav3 = df.copy()
datav3.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0.034201,5,-1.062283,1.12874,4,0,1,4,1,1,0,-0.078119,0,0
1,0.866407,4,-1.007427,1.12874,2,3,0,4,1,0,0,-2.326712,0,0
2,-0.041455,2,0.245281,-0.438117,0,5,1,4,1,0,0,-0.078119,0,0
3,1.093373,2,0.425848,-1.221545,2,5,0,2,1,0,0,-0.078119,0,0
4,-0.798006,2,1.407378,1.12874,2,9,5,2,0,0,0,-0.078119,1,0


Get gender balanced dataset

In [None]:


# Split the preprocessed snapshot by gender (sex: 1 = Male, 0 = Female).
males = datav3[datav3.sex == 1]
females = datav3[datav3.sex == 0]

# Down-sample whichever gender is larger so both contribute the same number of
# rows. Sampling is seeded so the balanced dataset is reproducible.
n_per_gender = min(females.shape[0], males.shape[0])
sampled_males = males.sample(n=n_per_gender, random_state=1).reset_index(drop=True)
sampled_females = females.sample(n=n_per_gender, random_state=1).reset_index(drop=True)
combined = pd.concat([sampled_males, sampled_females]).sample(frac=1, random_state=1).reset_index(drop=True)

X = combined.drop(["salary"], axis=1)
Y = combined["salary"]

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1)


In [None]:
# Sanity check: (rows, columns) of the training and test feature matrices.
x_train.shape, x_test.shape

((22042, 13), (7348, 13))

In [None]:
def evaluate_predictor_performance(predictions, x_test, y_test):
    """Attach labels and predictions to the test features and score them by gender.

    ``predictions`` is matched to ``x_test`` row by row, ``y_test`` supplies
    the true ``salary`` labels. The assembled frame is passed to
    ``evaluate_gender_performance`` and its summary dict is returned.
    ``x_test`` itself is not modified.
    """
    scored = x_test.assign(
        salary=y_test,
        pred=pd.Series(predictions, index=x_test.index),
    )
    scored['accurate'] = scored['pred'].eq(scored['salary'])
    return evaluate_gender_performance(scored)

In [None]:
def evaluate_predictor_performance1(predictions, x_test, y_test):
    """Attach labels and predictions to the test features and score them by gender.

    ``predictions`` is matched to ``x_test`` row by row, ``y_test`` supplies
    the true ``salary`` labels. The assembled frame is passed to
    ``evaluate_gender_performance1`` and its summary dict is returned.
    ``x_test`` itself is not modified.
    """
    scored = x_test.assign(
        salary=y_test,
        pred=pd.Series(predictions, index=x_test.index),
    )
    scored['accurate'] = scored['pred'].eq(scored['salary'])
    return evaluate_gender_performance1(scored)

In [None]:
# Gender-aware random forest trained on the gender-balanced split.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train, y_train)
approach_2 = evaluate_predictor_performance(predictor.predict(x_test), x_test, y_test)
# The model is a random forest, not an MLP, so it is labelled accordingly.
model_summary("Random Forest, equal_datapoints", "", approach_2)

In [None]:
# Same gender-aware random forest, reported as recall / precision / F1.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train, y_train)
approach_2 = evaluate_predictor_performance1(predictor.predict(x_test), x_test, y_test)
# The model is a random forest, not an MLP, so it is labelled accordingly.
model_summary1("Random Forest, equal_datapoints", "", approach_2)

Gender Unaware

In [None]:
# Gender-unaware variant: 'sex' is removed from the model inputs, but the full
# x_test (including 'sex') is still passed on so results can be split by gender.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train.drop(['sex'], axis=1), y_train)
approach_2_blind = evaluate_predictor_performance(predictor.predict(x_test.drop(['sex'], axis=1)), x_test, y_test)
# The model is a random forest, not an MLP, so it is labelled accordingly.
model_summary("Random Forest, equal_datapoints_blind", "", approach_2_blind)

In [None]:
# Gender-unaware variant, reported as recall / precision / F1. 'sex' is removed
# from the model inputs only; evaluation still groups rows by it.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train.drop(['sex'], axis=1), y_train)
approach_2_blind = evaluate_predictor_performance1(predictor.predict(x_test.drop(['sex'], axis=1)), x_test, y_test)
# The model is a random forest, not an MLP, so it is labelled accordingly.
model_summary1("Random Forest, equal_datapoints_blind", "", approach_2_blind)

## Equal number of datapoints per demographic in each category

In [None]:

# Four gender x income cells (sex: 1 = Male, 0 = Female; salary: 1 = high, 0 = low).
males = datav3[(datav3.sex == 1)]
females = datav3[(datav3.sex == 0)]
male_high = males[(males.salary == 1)]
male_low = males[(males.salary == 0)]
female_high = females[(females.salary == 1)]
female_low = females[(females.salary == 0)]

# Every cell is down-sampled to the size of the smallest one.
smallest = min((x.shape[0] for x in [male_high, male_low, female_high, female_low]))

# Sampling, shuffling and splitting are seeded so the experiment is reproducible.
_male_high = male_high.sample(n=smallest, random_state=1).reset_index(drop=True)
_male_low = male_low.sample(n=smallest, random_state=1).reset_index(drop=True)
_female_high = female_high.sample(n=smallest, random_state=1).reset_index(drop=True)
_female_low = female_low.sample(n=smallest, random_state=1).reset_index(drop=True)
_combined = pd.concat([_male_high, _male_low, _female_high, _female_low]).sample(frac=1, random_state=1).reset_index(drop=True)

Xvals = _combined.drop(["salary"], axis=1)
Yvals = _combined["salary"]

x_train, x_test, y_train, y_test = train_test_split(Xvals, Yvals, test_size=0.25, random_state=1)


In [None]:
# Gender-aware random forest (50 trees, fixed seed) fitted on the current
# train split; per-gender rates on the test split are then charted.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train, y_train)
predictions = predictor.predict(x_test)
approach_3 = evaluate_predictor_performance(predictions, x_test, y_test)
model_summary("Random Forest, equal_datapoints_per_category", "", approach_3)

In [None]:
# Same model configuration refitted on the current train split, reported as
# recall / precision / F1. Note: this rebinds `approach_3`.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train, y_train)
predictions = predictor.predict(x_test)
approach_3 = evaluate_predictor_performance1(predictions, x_test, y_test)
model_summary1("Random Forest, equal_datapoints_per_category", "", approach_3)

Equal datapoints, gender_unaware

In [None]:
# Gender-unaware variant. The hyperparameters match the gender-aware model
# (50 trees, fixed seed) so that dropping 'sex' is the only difference between
# the two and the run is reproducible.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train.drop(['sex'], axis=1), y_train)
predictions = predictor.predict(x_test.drop(['sex'], axis=1))
approach_3_blind = evaluate_predictor_performance(predictions, x_test, y_test)
model_summary("Random Forest, equal_datapoints_per_category_blind", "", approach_3_blind)

In [None]:
# Gender-unaware variant, reported as recall / precision / F1. The
# hyperparameters match the gender-aware model (50 trees, fixed seed) so that
# dropping 'sex' is the only difference and the run is reproducible.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train.drop(['sex'], axis=1), y_train)
predictions = predictor.predict(x_test.drop(['sex'], axis=1))
approach_3_blind = evaluate_predictor_performance1(predictions, x_test, y_test)
model_summary1("Random Forest, equal_datapoints_per_category_blind", "", approach_3_blind)

##Equal ratios instead of equal number of datapoints

In [None]:

# Four gender x income cells (sex: 1 = Male, 0 = Female; salary: 1 = high, 0 = low).
males = datav3[(datav3.sex == 1)]
females = datav3[(datav3.sex == 0)]

male_high = males[(males.salary == 1)]
male_low = males[(males.salary == 0)]

female_high = females[(females.salary == 1)]
female_low = females[(females.salary == 0)]

print("Shapes mh, ml, fh, fl:", [x.shape[0] for x in [male_high, male_low, female_high, female_low]])

# High-to-low income ratio among males; females are resampled to match it.
ratio = float(male_high.shape[0]) / float(male_low.shape[0])

print("Ratio is", ratio)
n_female_high = female_high.shape[0]
# Number of low-income females that gives females the male ratio. It is capped
# at the rows actually available, because sample() raises when asked for more.
n_female_low = min(int(n_female_high / ratio), female_low.shape[0])

_male_low = male_low.copy()
_male_high = male_high.copy()
_female_high = female_high.copy()
# Sampling, shuffling and splitting are seeded so the experiment is reproducible.
_female_low = female_low.sample(n=n_female_low, random_state=1).reset_index(drop=True)
_combined = pd.concat([_male_high, _male_low, _female_high, _female_low]).sample(frac=1, random_state=1).reset_index(drop=True)

Xvals = _combined.drop(["salary"], axis=1)
Yvals = _combined["salary"]

x_train, x_test, y_train, y_test = train_test_split(Xvals, Yvals, test_size=0.25, random_state=1)


Shapes mh, ml, fh, fl: [9539, 20988, 1669, 13026]
Ratio is 0.4544978082713932


In [None]:
# Gender-aware random forest on the equal-ratio split. The stray
# `datav3.shape` expression was removed: it was not the last line of the cell,
# so it neither displayed anything nor had any effect.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train, y_train)
predictions = predictor.predict(x_test)

In [None]:
# Per-gender rates for the `predictions` currently in scope, then the chart.
approach_4 = evaluate_predictor_performance(predictions, x_test, y_test)
model_summary("Random Forest equal_ratios", "", approach_4)

In [None]:
# Recall / precision / F1 for the `predictions` currently in scope.
# Note: this rebinds `approach_4`.
approach_4 = evaluate_predictor_performance1(predictions, x_test, y_test)
model_summary1("Random Forest equal_ratios", "", approach_4)

Gender Unaware

In [None]:
# Gender-unaware variant. The hyperparameters match the gender-aware model
# (50 trees, fixed seed) so that dropping 'sex' is the only difference between
# the two and the run is reproducible.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train.drop(['sex'], axis=1), y_train)
predictions = predictor.predict(x_test.drop(['sex'], axis=1))
approach_4_blind = evaluate_predictor_performance(predictions, x_test, y_test)
model_summary("Random Forest, equal_ratios_blind", "", approach_4_blind)

In [None]:
# Gender-unaware variant, reported as recall / precision / F1. The
# hyperparameters match the gender-aware model (50 trees, fixed seed) so that
# dropping 'sex' is the only difference and the run is reproducible.
predictor = RandomForestClassifier(n_estimators=50, random_state=1)
predictor.fit(x_train.drop(['sex'], axis=1), y_train)
predictions = predictor.predict(x_test.drop(['sex'], axis=1))
approach_4_blind = evaluate_predictor_performance1(predictions, x_test, y_test)
model_summary1("Random Forest, equal_ratios_blind", "", approach_4_blind)