In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Silence every warning category for the remainder of the session.
import warnings
warnings.filterwarnings("ignore")

# Protected attributes that the fairness metrics iterate over.
sensitive_attr = ['Gender', 'Veteran status', 'Work authorization', 'Disability', 'Ethnicity']

# Read the results table, then re-encode two columns in a single step:
#   * Gender: 'M' -> 1, 'F' -> 0 (values outside the mapping become NaN)
#   * score:  cast to float
result = pd.read_csv('data/result.csv')
result = result.assign(
    Gender=result['Gender'].map({'M': 1, 'F': 0}),
    score=result['score'].astype(float),
)

# Preview the first rows as the cell's displayed value.
result.head()


Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,...,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3,score,prediction
0,0,Providence University,3.81,Bachelors,Providence,1.0,0,0,0.0,0.0,...,5/20,,,,,,,,4.35,1
1,1,Providence University,3.81,Bachelors,Providence,1.0,0,0,0.0,1.0,...,5/20,,,,,,,,7.17,0
2,2,Providence University,3.81,Bachelors,Providence,1.0,0,0,0.0,,...,5/20,,,,,,,,7.37,1
3,3,Providence University,3.81,Bachelors,Providence,1.0,0,0,1.0,0.0,...,5/20,,,,,,,,3.2,1
4,4,Providence University,3.81,Bachelors,Providence,1.0,0,0,1.0,1.0,...,5/20,,,,,,,,0.56,0


In [2]:
def _baseline_errors(scores, baseline):
    """Return (MAE, R-squared) of predicting ``baseline`` for every value in ``scores``.

    An empty group yields ``(nan, nan)`` so the scikit-learn metric functions
    are never called with zero samples (which raises ``ValueError``).
    """
    if len(scores) == 0:
        return np.nan, np.nan
    constant_prediction = np.full(len(scores), baseline)
    return (mean_absolute_error(scores, constant_prediction),
            r2_score(scores, constant_prediction))


def score_metrics(data, attr):
    """
    Calculate fairness metrics for the 'score' column based on a specified attribute.

    Rows with ``data[attr] == 1`` form the privileged group and rows with
    ``data[attr] == 0`` the unprivileged group; rows holding any other value
    (including NaN) belong to neither group but still contribute to the
    overall mean score used as the baseline prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by ``attr`` and a numeric 'score' column.
    attr : str
        Name of the group-membership column.

    Returns
    -------
    None
        All metrics are printed. If one of the groups is empty, the metrics
        that depend on it are printed as ``nan`` instead of raising.
    """
    # Select each group's scores once instead of re-filtering for every metric.
    privileged_scores = data.loc[data[attr] == 1, 'score']
    unprivileged_scores = data.loc[data[attr] == 0, 'score']

    # Disparate Impact: ratio of mean scores (unprivileged / privileged)
    average_score_privileged = privileged_scores.mean()
    average_score_unprivileged = unprivileged_scores.mean()
    disparate_impact = average_score_unprivileged / average_score_privileged
    print(f"Disparate Impact based on {attr}:", disparate_impact)

    # Statistical Parity Difference: difference of mean scores
    statistical_parity_difference = average_score_unprivileged - average_score_privileged
    print(f"Statistical Parity Difference based on {attr}:", statistical_parity_difference)

    # Per-group MAE and R-squared of a constant predictor that always outputs
    # the overall mean score (this is MAE, not RMSE).
    overall_mean_score = data['score'].mean()
    mae_privileged, r2_privileged = _baseline_errors(privileged_scores, overall_mean_score)
    mae_unprivileged, r2_unprivileged = _baseline_errors(unprivileged_scores, overall_mean_score)
    print("Mean Absolute Error (Privileged):", mae_privileged)
    print("Mean Absolute Error (Unprivileged):", mae_unprivileged)
    print("R-squared (Privileged):", r2_privileged)
    print("R-squared (Unprivileged):", r2_unprivileged)

    # Average Odds Difference based on residuals.
    # Both residual means subtract the same constant, so this equals the
    # absolute value of the Statistical Parity Difference (up to rounding).
    residuals_privileged = privileged_scores - overall_mean_score
    residuals_unprivileged = unprivileged_scores - overall_mean_score
    average_odds_difference = abs(residuals_unprivileged.mean() - residuals_privileged.mean())
    print(f"Average Odds Difference (Regression) based on {attr}:", average_odds_difference)


In [3]:
def eval_metrics(data, attr):
    """
    Print group-fairness metrics for the 'prediction' column.

    The privileged group is the set of rows where ``data[attr] == 1``; the
    unprivileged group is the set where ``data[attr] == 0``. Each group's
    selection rate is the mean of its 'prediction' values.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by ``attr`` and a 'prediction' column.
    attr : str
        Name of the group-membership column.

    Returns
    -------
    None
        Both metrics are printed rather than returned.
    """
    is_privileged = data[attr] == 1
    is_unprivileged = data[attr] == 0

    rate_priv = data.loc[is_privileged, 'prediction'].mean()
    rate_unpriv = data.loc[is_unprivileged, 'prediction'].mean()

    # Disparate Impact: ratio of selection rates (unprivileged / privileged)
    ratio = rate_unpriv / rate_priv
    print(f"Disparate Impact based on {attr}:", ratio)

    # Statistical Parity Difference: gap between the two selection rates
    gap = rate_unpriv - rate_priv
    print(f"Statistical Parity Difference based on {attr}:", gap)


In [4]:
# Print the score-based fairness metrics once per sensitive attribute,
# with a header line before and blank lines after each report.
for attr in sensitive_attr:
    print(f"Metrics for {attr}")
    score_metrics(result, attr)
    print("\n")

Metrics for Gender
Disparate Impact based on Gender: 0.9848074875798438
Statistical Parity Difference based on Gender: -0.07611111111111146
Mean Absolute Error (Privileged): 2.572015020576132
Mean Absolute Error (Unprivileged): 2.490090946502058
R-squared (Privileged): -0.00013386002675597197
R-squared (Unprivileged): -0.00021342060485673997
Average Odds Difference (Regression) based on Gender: 0.0761111111111111


Metrics for Veteran status
Disparate Impact based on Veteran status: 1.0194429413179413
Statistical Parity Difference based on Veteran status: 0.09581481481481458
Mean Absolute Error (Privileged): 2.578026886145404
Mean Absolute Error (Unprivileged): 2.512111111111111
R-squared (Privileged): -0.0002594992913607097
R-squared (Unprivileged): -0.00027743276929159677
Average Odds Difference (Regression) based on Veteran status: 0.09581481481481482


Metrics for Work authorization
Disparate Impact based on Work authorization: 1.047854584254249
Statistical Parity Difference based 

In [5]:
# Print the prediction-based fairness metrics once per sensitive attribute,
# with a header line before and blank lines after each report.
for attr in sensitive_attr:
    print(f"Metrics for {attr}")
    eval_metrics(result, attr)
    print("\n")

Metrics for Gender
Disparate Impact based on Gender: 0.6507936507936508
Statistical Parity Difference based on Gender: -0.1222222222222222


Metrics for Veteran status
Disparate Impact based on Veteran status: 0.8571428571428572
Statistical Parity Difference based on Veteran status: -0.029629629629629617


Metrics for Work authorization
Disparate Impact based on Work authorization: 0.9622641509433961
Statistical Parity Difference based on Work authorization: -0.007407407407407418


Metrics for Disability
Disparate Impact based on Disability: 0.9411764705882354
Statistical Parity Difference based on Disability: -0.0111111111111111


Metrics for Ethnicity
Disparate Impact based on Ethnicity: 1.090909090909091
Statistical Parity Difference based on Ethnicity: 0.01666666666666669




## Regression analysis

In [6]:
cate_cols = ['School Name', "Degree", "Location", 'Role 1', 'Start 1', 'End 1', 'Role 2', 'Start 2', 'End 2', 'Role 3',
       'Start 3', 'End 3']
num_cols = ["GPA", "Gender", "Veteran status", "Work authorization", "Disability", "Ethnicity"]

# Fill missing categorical values with the column's most frequent value.
# The filled column is assigned back explicitly: calling
# `result[col].fillna(..., inplace=True)` acts on an intermediate Series,
# which pandas deprecates and which leaves `result` unchanged under
# Copy-on-Write.
for col in cate_cols:
    modes = result[col].mode()
    # An all-NaN column has no mode; leave it untouched instead of raising.
    if not modes.empty:
        result[col] = result[col].fillna(modes.iloc[0])

# Fill missing numeric values with the column mean.
for col in num_cols:
    result[col] = result[col].fillna(result[col].mean())


In [7]:
# Target: the raw score. Features: the sensitive attributes only.
y = result['score']
X = result.drop(columns=['score', "prediction", 'Applicant ID'])[sensitive_attr]

# One-hot encoding pipeline; the linear model defined here is fitted on X
# directly and does not go through it.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

model = LinearRegression()

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit on the training split, predict the held-out split, report its MSE.
y_pred = model.fit(X_train, y_train).predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 8.635572198301835


In [9]:
# Print each feature name next to its fitted linear coefficient.
for feature_name, weight in zip(X.columns, model.coef_):
    print(f"{feature_name}: {weight}")

Gender: 0.2902313299911564
Veteran status: -0.23038445093969992
Work authorization: -0.17185752868660079
Disability: -0.17041185503105022
Ethnicity: -0.007419267085687381


## Model Audit

In [10]:
# Audit target is the stored 'prediction' column; every other column stays in X.
# Only num_cols and cate_cols reach the classifier, because the
# ColumnTransformer below lists just those (its default remainder is 'drop').
y = result['prediction']
X = result.drop(columns='prediction')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric branch: mean-impute missing values, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal 'missing',
# then one-hot encode (categories unseen during fit are ignored).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cate_cols),
])

# Preprocessing and classifier in one estimator, so each CV fold refits both.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# 3 * 4 * 3 * 3 = 108 hyper-parameter combinations.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 10, 20],
    'classifier__min_samples_leaf': [1, 5, 10],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test set accuracy: {:.2f}%".format(test_accuracy * 100))




Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Test set accuracy: 81.48%
