In [1]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.model_selection import train_test_split


from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot

%matplotlib inline

## Incorporating fairness into cross validation

Read in the data.

In [16]:
### load data
# Read the Broward CSV from the repository's data folder.
data = pd.read_csv("../data/broward_data.csv")

# Features: label-based slice of every column up to and including 'five_year'.
X = data.loc[:, :"five_year"]
# Target: the 'recid_two_year' column.
Y = data.loc[:, "recid_two_year"]

In [21]:
### Implementing fairness as scoring method 


def compute_fairness(df: pd.DataFrame, 
                     decoders: dict, 
                     sensitive_attrs: list,
                     ref_groups_dict: dict,
                     alpha: float = 0.05) -> "pd.io.formats.style.Styler":
    """
    Run an Aequitas group / bias / fairness audit and return a styled report.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to audit. A `person_id` column, if present, is renamed to
        `entity_id`. It must contain the columns named in `sensitive_attrs`.
        NOTE(review): Aequitas presumably also requires `score` and
        `label_value` columns, as supplied by the caller -- confirm.
    decoders : dict
        Dictionary of dictionaries: column name -> {encoded value: label}.
        Each mapping is applied to its column with `DataFrame.replace`.
    sensitive_attrs : list
        Attribute columns passed to `Group.get_crosstabs` as `attr_cols`.
    ref_groups_dict : dict
        Reference group per attribute, forwarded to
        `Bias.get_disparity_predefined_groups`.
    alpha : float, default 0.05
        Forwarded to `Bias.get_disparity_predefined_groups`.

    Returns
    -------
    pandas Styler
        `.style` accessor of the fairness frame restricted to the attribute
        name/value, absolute metrics, disparities and parity determinations.
        Use `.data` on the result to get the underlying DataFrame.
    """
    audit_df = df.rename(columns={"person_id": "entity_id"})
    # decode numeric encodings for categorical variables, one column at a time
    for column, mapping in decoders.items():
        audit_df = audit_df.replace({column: mapping})

    g = Group()
    xtab, _ = g.get_crosstabs(audit_df, attr_cols=sensitive_attrs)

    # compute bias; check_significance / mask_significance are not passed,
    # so the library defaults apply
    b = Bias()
    bdf = b.get_disparity_predefined_groups(xtab,
                                            original_df=audit_df,
                                            ref_groups_dict=ref_groups_dict,
                                            alpha=alpha)
    f = Fairness()
    fdf = f.get_group_value_fairness(bdf)

    # assemble the report columns from the three result families
    parity_determinations = f.list_parities(fdf)
    absolute_metrics = g.list_absolute_metrics(xtab)
    disparities = b.list_disparities(fdf)
    report_cols = (['attribute_name', 'attribute_value']
                   + absolute_metrics
                   + disparities
                   + parity_determinations)
    return fdf[report_cols].style

In [22]:
# prepare data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.3, shuffle=True, random_state=5829)

# ID columns stay in X for bookkeeping but must not be used as model features.
id_cols = ['person_id', 'screening_date']

clf = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=0)
clf.fit(X_train.drop(columns=id_cols), Y_train)
preds = clf.predict(X_test.drop(columns=id_cols))

# Build the audit frame as a new object instead of writing columns into X_test:
# X_test derives from a slice of `data`, so in-place column assignment triggers
# pandas' SettingWithCopyWarning. `label_value` is aligned on the index, exactly
# as the former `.loc` assignment was.
df = (
    X_test[["person_id", "screening_date", "sex"]]
    .assign(score=preds, label_value=Y_test)
    .rename(columns={"person_id": "entity_id"})
)

# numeric encoding -> readable label for each categorical column
decoders = {"sex": {0: "male",
                    1: "female"}
           }

sensitive_attrs = ['sex']

ref_groups_dict = {'sex': 'male'}
res = compute_fairness(df,
                       decoders=decoders,
                       sensitive_attrs=sensitive_attrs,
                       ref_groups_dict=ref_groups_dict)

res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


model_id, score_thresholds 1 {'rank_abs': [307]}
get_disparity_predefined_group()


Unnamed: 0,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev,ppr_disparity,pprev_disparity,precision_disparity,fdr_disparity,for_disparity,fpr_disparity,fnr_disparity,tpr_disparity,tnr_disparity,npv_disparity,TNR Parity,FNR Parity,Statistical Parity,FPR Parity,Supervised Fairness,TypeII Parity,FDR Parity,TypeI Parity,Impact Parity,Precision Parity,NPV Parity,FOR Parity,Equalized Odds,TPR Parity,Unsupervised Fairness
0,sex,female,0.470588,0.9,0.285714,0.238095,0.1,0.529412,0.714286,0.761905,0.0684039,0.25,0.404762,0.0734266,0.439685,1.28938,0.582011,0.925373,0.228205,1.86479,0.657153,1.602,1.03333,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False
1,sex,male,0.716102,0.561798,0.308756,0.409091,0.438202,0.283898,0.691244,0.590909,0.931596,0.568588,0.469185,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


### Create Custom Scorer

Reference: the scikit-learn model evaluation guide, section on defining a scoring strategy: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring

In [7]:
def my_custom_loss_func(y_true, y_pred):
    """Return log(1 + max |y_true - y_pred|), a toy worst-case-error loss.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with matching shapes
        Lists, tuples, NumPy arrays or pandas objects. Both are converted to
        NumPy arrays first, so the difference is element-wise by position
        (plain lists would otherwise raise TypeError on subtraction).

    Returns
    -------
    numpy floating scalar
        `np.log1p` of the largest absolute difference.

    Raises
    ------
    ValueError
        If the inputs are empty (the maximum of an empty array is undefined).
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    diff = np.abs(true_arr - pred_arr).max()
    return np.log1p(diff)

 # `score` negates the return value of my_custom_loss_func (greater_is_better=False).
 # For the toy inputs defined below the loss is np.log(2), about 0.693,
 # so the scorer returns about -0.693.
score = make_scorer(my_custom_loss_func, greater_is_better=False)

# The toy inputs get their own names so the dataset-level `X` (still needed by
# the cross-validation cells further down) and the fitted `clf` are not
# overwritten when the notebook is run top to bottom.
from sklearn.dummy import DummyClassifier
X_demo = [[1], [1]]
y_demo = [0, 1]
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=0)
dummy_clf = dummy_clf.fit(X_demo, y_demo)
print(my_custom_loss_func(y_demo, dummy_clf.predict(X_demo)))

print(score(dummy_clf, X_demo, y_demo))


0.6931471805599453
-0.6931471805599453


### Adapt current logistic prediction method 

#### Original

In [17]:
def Logistic(x, y, C,seed):
    """Nested 5-fold cross-validation of a class-balanced logistic regression.

    Parameters
    ----------
    x : feature matrix containing model features only (the caller drops ID
        columns before passing it in).
    y : target values.
    C : list of candidate values for LogisticRegression's `C`, searched by the
        inner loop.
    seed : int used as `random_state` for both KFold splitters and the model.

    Returns
    -------
    (train_score, test_score) : the per-outer-fold ROC AUC arrays returned by
        `cross_validate` under the 'train_score' and 'test_score' keys.
    """
    # cross validation set up -- inner and outer splitters share the same seed
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    # model & parameters
    lr = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=seed)
    c_grid = {"C": C}

    # nested cross validation: the inner search only has to pick the best C, so
    # it does not compute train scores (they were never read); the outer loop
    # still reports train and test AUC.
    clf = GridSearchCV(estimator=lr, param_grid=c_grid, scoring='roc_auc', cv=inner_cv)
    nested_score = cross_validate(clf, X=x, y=y, scoring='roc_auc', cv=outer_cv, return_train_score=True)

    return nested_score['train_score'], nested_score['test_score']

In [18]:
# Candidate values for the logistic regression `C` grid.
c = [1e-5, 1e-4, 1e-3]

# ID columns are removed before the features are handed to the model.
logistic_train, logistic_test = Logistic(
    X.drop(columns=['person_id', 'screening_date']), Y, c, 816
)

# [mean train AUC minus mean test AUC, mean test AUC]
res = [logistic_train.mean() - logistic_test.mean(), logistic_test.mean()]
res

[0.011141768594555845, 0.6651818167782302]

#### New

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, cross_validate

def Logistic(X, y, C,seed):
    """Nested 5-fold cross-validation of a class-balanced logistic regression.

    Parameters
    ----------
    X : pandas DataFrame including the ID columns `person_id` and
        `screening_date`; both are dropped before fitting.
    y : target values aligned with the rows of `X`.
    C : list of candidate values for LogisticRegression's `C`, searched by the
        inner loop.
    seed : int used as `random_state` for both KFold splitters and the model.

    Returns
    -------
    (train_score, test_score) : the per-outer-fold ROC AUC arrays returned by
        `cross_validate` under the 'train_score' and 'test_score' keys.
    """
    X_no_id = X.drop(columns=["person_id", "screening_date"])

    # cross validation set up -- inner and outer splitters share the same seed
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    # model & parameters
    lr = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=seed)
    c_grid = {"C": C}

    # nested cross validation: the inner search only has to pick the best C, so
    # it does not compute train scores (they were never read); the outer loop
    # still reports train and test AUC.
    clf = GridSearchCV(estimator=lr, param_grid=c_grid, scoring='roc_auc', cv=inner_cv)
    nested_score = cross_validate(clf, X=X_no_id, y=y, scoring='roc_auc', cv=outer_cv, return_train_score=True)
    train_score, test_score = nested_score['train_score'], nested_score['test_score']

    # TODO: prepare for fairness analysis. cross_validate fits clones of `clf`,
    # so `clf` itself is not fitted here and cannot be used to predict.
    # Per-fold predictions are needed to build a frame with an entity id
    # (from person_id / screening_date), `sex`, `score` and `label_value`
    # that can then be passed to compute_fairness.
    return train_score, test_score

In [23]:
# Preview only the first rows rather than rendering the whole feature frame.
X.head(10)

Unnamed: 0,person_id,screening_date,sex,p_current_age,p_age_first_offense,p_charge,p_jail30,p_prison,p_probation,p_juv_fel_count,...,p_stalking,p_voyeurism,p_fraud,p_stealing,p_trespass,years_since_last_crime,six_month,one_year,three_year,five_year
0,1001,2014-02-03,0,45,19,39,0,2,3,0,...,0,0,0,2,0,4.140828,0,0,0,1
1,101,2013-01-13,0,42,26,9,1,3,0,0,...,0,0,0,1,2,8.597023,0,0,0,0
2,101,2014-02-02,0,43,26,15,2,3,0,0,...,0,0,0,3,3,0.065708,1,1,1,1
3,1015,2014-01-22,0,35,19,47,0,1,3,0,...,0,0,0,6,0,7.415469,0,0,0,0
4,1016,2013-04-15,0,27,18,11,0,3,2,0,...,0,0,0,4,1,4.685832,0,0,0,1
5,1016,2013-05-11,0,27,18,12,0,3,2,0,...,0,0,0,4,1,0.073922,1,1,1,1
6,102,2013-05-25,0,19,18,4,0,0,0,0,...,0,0,0,1,2,0.804757,0,1,1,1
7,1027,2013-04-04,0,19,18,5,0,0,0,0,...,0,0,0,0,0,0.758214,0,1,1,1
8,1032,2013-09-23,0,26,19,15,0,1,3,0,...,0,0,0,0,0,0.685832,0,1,1,1
9,1034,2013-01-14,0,19,19,7,0,0,0,0,...,0,0,0,0,0,0.124401,1,1,1,1
