In [43]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import scipy as sp 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from data_loader import CompasDataset, ACSEmploymentDataset
from utils import predict_ensemble, predict_CIID
from utils import partition_by_group_intersectional, partition_by_group_binary, set_protected_groups

In [44]:
!ls

COMPAS.csv                 [34mVirny[m[m
Flow1.ipynb                [34m__pycache__[m[m
Flow4-group-specific.ipynb config.py
Inprocessing-Flow.ipynb    data_loader.py
LICENSE                    utils.py
README.md


In [45]:
SEED = 42

In [46]:
dataset = CompasDataset()

train, test_, = train_test_split(dataset.dataset, test_size=0.2, random_state=42)
val, test, = train_test_split(test_, test_size=0.5, random_state=42)

train.shape, val.shape, test.shape


((4222, 13), (528, 13), (528, 13))

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from config import MODELS_CONFIG

In [48]:
clf = RandomForestClassifier(max_depth= 4, max_features = 0.6, min_samples_leaf= 1, n_estimators=500, random_state=SEED)
clf.fit(train[dataset.features], train[dataset.target])
clf.score(val[dataset.features], val[dataset.target])

0.7102272727272727

In [49]:
from fairlearn.preprocessing import CorrelationRemover

In [50]:
train.columns

Index(['age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count',
       'priors_count', 'age_cat_25 - 45', 'age_cat_Greater than 45',
       'age_cat_Less than 25', 'c_charge_degree_F', 'c_charge_degree_M',
       'race', 'sex', 'recidivism'],
      dtype='object')

In [51]:
# Race 1 = Caucasian, 0 = African-American
# don't run several times, all values will be set to zero then
train.loc[:, 'race'] = train['race'].apply(lambda x: 1.0 if x == 'Caucasian' else 0.0)
test.loc[:, 'race'] = test['race'].apply(lambda x: 1.0 if x == 'Caucasian' else 0.0)

  train.loc[:, 'race'] = train['race'].apply(lambda x: 1.0 if x == 'Caucasian' else 0.0)
  test.loc[:, 'race'] = test['race'].apply(lambda x: 1.0 if x == 'Caucasian' else 0.0)


In [52]:
#sensitive_feature_names_compas = ['race', 'sex']
#sensitive_feature_names_compas = ['sex']
sensitive_feature_names_compas = ['race']

In [53]:
train

Unnamed: 0,age,juv_fel_count,juv_misd_count,juv_other_count,priors_count,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,race,sex,recidivism
4552,63.223029,0.0,0.0,0.0,3.867031,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1158,59.000000,0.0,0.0,0.0,3.863501,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2517,44.000000,0.0,0.0,0.0,4.203198,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2960,39.343556,0.0,0.0,0.0,10.000000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
1501,22.187717,0.0,0.0,0.0,8.000000,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,26.000000,0.0,0.0,0.0,5.000000,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
3772,22.330164,0.0,0.0,0.0,0.455417,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
5191,40.000000,0.0,0.0,0.0,2.000000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5226,36.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [54]:
preprocessor = CorrelationRemover(sensitive_feature_ids=sensitive_feature_names_compas, alpha=0.1)
preprocessor.fit(train)

fair_train = preprocessor.transform(train)
fair_test = preprocessor.transform(test)

In [55]:
fair_train.shape, train.shape

((4222, 12), (4222, 13))

In [56]:
clf = RandomForestClassifier(max_depth= 4, max_features = 0.6, min_samples_leaf= 1, n_estimators=500, random_state=SEED)
clf.fit(train[dataset.features], train[dataset.target])
clf.score(test[dataset.features], test[dataset.target])
clf_preds = clf.predict(test[dataset.features])

In [57]:
clf_fair = RandomForestClassifier(max_depth= 4, max_features = 0.6, min_samples_leaf= 1, n_estimators=500, random_state=SEED)
clf_fair.fit(fair_train, train[dataset.target])
clf_fair.score(fair_test, test[dataset.target])
clf_fair_preds = clf_fair.predict(fair_test)

In [58]:
from Virny.virny.analyzers.subgroup_statistical_bias_analyzer import SubgroupStatisticalBiasAnalyzer
from Virny.virny.analyzers.subgroup_variance_analyzer import SubgroupVarianceAnalyzer
from Virny.virny.configs.constants import ModelSetting

In [59]:
compas_dict = dict({
    'sex': 0.0, 
    'race': 1.0, 
    'sex&race': None
    })

test_protected_groups = set_protected_groups(test, ['sex', 'race'], [0.0, 1.0])

In [61]:
test[dataset.target].value_counts()

0.0    280
1.0    248
Name: recidivism, dtype: int64

In [62]:
error_analyser = SubgroupStatisticalBiasAnalyzer(test[dataset.features], test[dataset.target], compas_dict, test_protected_groups)
error_metrics = error_analyser._compute_metrics(test[dataset.target], clf_preds)
pd.DataFrame(error_metrics, index=[1])

Unnamed: 0,TPR,TNR,PPV,FNR,FPR,Accuracy,F1,Selection-Rate,Positive-Rate
1,0.641129,0.732143,0.679487,0.358871,0.267857,0.689394,0.659751,0.443182,0.943548


In [63]:
error_analyser_fair = SubgroupStatisticalBiasAnalyzer(fair_test, test[dataset.target], compas_dict, test_protected_groups)
error_metrics_fair = error_analyser_fair._compute_metrics(test[dataset.target], clf_fair_preds)
pd.DataFrame(error_metrics_fair, index=[1])

Unnamed: 0,TPR,TNR,PPV,FNR,FPR,Accuracy,F1,Selection-Rate,Positive-Rate
1,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.469697,1.0


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range, 
               'svc__kernel': ['linear']},
              {'svc__C': param_range, 
               'svc__gamma': param_range, 
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  refit=True,
                  cv=10)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [64]:

variance_analyser = SubgroupVarianceAnalyzer(n_estimators=200, base_model=clf, base_model_name='unconstrained', 
                                             bootstrap_fraction=0.8, X_train=train[dataset.features], y_train=train[dataset.target],
                                             X_test=test[dataset.features], y_test=test[dataset.target], target_name=dataset.target,
                                             dataset_name='compas', sensitive_attributes_dct=compas_dict, 
                                             test_protected_groups=test_protected_groups, sensitive_features=None)
variance_metrics = variance_analyser.compute_metrics(save_results=True, result_filename='compas_testing.png')





2023-04-24 17:10:13 abstract_overall_variance_analyzer.py INFO    : Start classifiers testing by bootstrap


Classifiers testing by bootstrap:   0%|          | 0/200 [00:00<?, ?it/s]

  return classifier.fit(X_train, y_train)


NameError: name 'sesitive_features' is not defined