In [136]:
import numpy as np
import pandas as pd

In [137]:
# Load the discretized COMPAS dataset straight from the DivExplorer repository.
COMPAS_CSV_URL = 'https://raw.githubusercontent.com/divexplorer/divexplorer/main/datasets/compas_discretized.csv'
compas_df = pd.read_csv(COMPAS_CSV_URL)

In [138]:
from divexplorer.outcomes import get_false_positive_rate_outcome, get_false_negative_rate_outcome
from divexplorer import DivergenceExplorer

# Ground-truth labels and model predictions used to derive the outcome columns.
y_trues = compas_df["class"]
y_preds = compas_df["predicted"]

# Add one outcome column per error type ('fp' first, then 'fn'), computed by
# the corresponding DivExplorer helper.
outcome_builders = {
    'fp': get_false_positive_rate_outcome,
    'fn': get_false_negative_rate_outcome,
}
for outcome_name, build_outcome in outcome_builders.items():
    compas_df[outcome_name] = build_outcome(y_trues, y_preds)

In [139]:
fp_diver = DivergenceExplorer(compas_df)

# Attributes used to build the itemsets.
# Full alternative: ['age', 'charge', 'race', 'sex', '#prior', 'stay']
attributes = ['race', '#prior', 'sex', 'age']

# Patterns with support >= 0.1 and their divergence on the 'fp' outcome.
FP_fm = fp_diver.get_pattern_divergence(
    min_support=0.1,
    attributes=attributes,
    boolean_outcomes=['fp'],
)

# Show the ten patterns with the largest false-positive divergence.
fp_patterns_by_divergence = FP_fm.sort_values(by="fp_div", ascending=False)
fp_patterns_by_divergence.head(10)

Unnamed: 0,support,itemset,fp,fp_div,fp_t,length,support_count
37,0.128645,"(race=Afr-Am, sex=Male, #prior=>3, age=25-45)",0.308036,0.219722,7.116857,4,794.0
31,0.14501,"(race=Afr-Am, #prior=>3, age=25-45)",0.299242,0.210928,7.44826,3,895.0
26,0.175308,"(race=Afr-Am, sex=Male, #prior=>3)",0.266871,0.178557,7.214382,3,1082.0
20,0.196695,"(race=Afr-Am, #prior=>3)",0.261097,0.172783,7.582039,2,1214.0
23,0.180655,"(sex=Male, #prior=>3, age=25-45)",0.252941,0.164627,6.903692,3,1115.0
18,0.207226,"(#prior=>3, age=25-45)",0.251889,0.163575,7.384973,2,1279.0
13,0.256643,"(sex=Male, #prior=>3)",0.222868,0.134554,7.1467,2,1584.0
41,0.107583,"(race=Afr-Am, age=<25, sex=Male)",0.219409,0.131095,4.886362,3,664.0
11,0.293422,(#prior=>3),0.219269,0.130955,7.503321,1,1811.0
36,0.131076,"(race=Afr-Am, age=<25)",0.207547,0.119233,5.200687,2,809.0


In [140]:
from divexplorer import DivergencePatternProcessor
fp_details = DivergencePatternProcessor(FP_fm, 'fp')

# Select the itemset with the largest false-positive divergence by value
# instead of by a hard-coded row position: a fixed position silently points at
# a different pattern as soon as min_support, the attribute list or the data
# change.
most_divergent_label = FP_fm['fp_div'].idxmax()
pattern = FP_fm.loc[most_divergent_label, 'itemset']

# Per-item Shapley contributions to the divergence of that pattern.
fp_details.shapley_value(pattern)

{frozenset({'race=Afr-Am'}): 0.05115538154975479,
 frozenset({'sex=Male'}): 0.007220590601649036,
 frozenset({'#prior=>3'}): 0.14226087371897317,
 frozenset({'age=25-45'}): 0.019084863062973356}

In [141]:
# Manual cross-check of a divergence value: restrict to rows with class == 0
# and compare the misclassification rate inside a subgroup with the overall one.
df = compas_df.copy()
retain = df['class'].eq(0)
feature_columns = ['age', 'charge', 'race', 'sex', '#prior', 'stay']
X = df.loc[retain, feature_columns]
y = df['class'].ne(df['predicted'])[retain]

# Subgroup: #prior > 3, race = Afr-Am, age = 25-45.
c1 = X['#prior'].eq('>3')
c2 = X['race'].eq('Afr-Am')
c3 = X['age'].eq('25-45')
in_subgroup = c1 & c2 & c3
y[in_subgroup].mean() - y.mean()

0.2109284188900603

In [142]:
# Offline alternative: df = pd.read_csv("compas_discretized.csv")

# Work on a renamed copy so `compas_df` keeps its original column names.
# Only '#prior' changes (to 'prior'). Renaming by name, rather than overwriting
# `df.columns` positionally, cannot mislabel columns if the CSV column order
# changes.
df = compas_df.rename(columns={'#prior': 'prior'})

# Fail early, with a readable message, if a column used further down is absent
# (for example when the 'fp' / 'fn' outcome columns have not been added yet).
expected_columns = ['age', 'charge', 'race', 'sex', 'prior', 'stay',
                    'class', 'predicted', 'fp', 'fn']
missing_columns = [name for name in expected_columns if name not in df.columns]
assert not missing_columns, "missing expected columns: %s" % missing_columns

df.head()

Unnamed: 0,age,charge,race,sex,prior,stay,class,predicted,fp,fn
0,>45,F,Other,Male,0,<week,0,0,0.0,
1,25-45,F,Afr-Am,Male,0,1w-3M,1,0,,1.0
2,<25,F,Afr-Am,Male,>3,<week,1,0,,1.0
3,25-45,M,Other,Male,0,<week,0,0,0.0,
4,25-45,F,Cauc,Male,>3,<week,1,0,,1.0


In [143]:
# Features and accuracy target (True where the prediction matches the class).
feature_columns = ['age', 'charge', 'race', 'sex', 'prior', 'stay']
X = df[feature_columns]
y = df['class'].eq(df['predicted'])

# Accuracy inside the subgroup prior = 0, charge = M, minus overall accuracy.
c1 = X['prior'].eq('0')
c2 = X['charge'].eq('M')
in_subgroup = c1 & c2
y[in_subgroup].mean() - y.mean()

0.1285077770576798

In [144]:
# Accuracy as the target: run the nominal PRIM peeling on (X, y).

from PRIM import PRIM_nominal_pat

pr = PRIM_nominal_pat()
pr.fit(X, y)

# The cells below read tmp[0] as the boxes, tmp[1] as the per-box quality and
# tmp[2] as the per-box support.
tmp = pr.get_res()

# Quality relative to the first (full) box, next to the supports.
quality_gain = tmp[1] - tmp[1][0]
(quality_gain, tmp[2])


(array([0.        , 0.03191009, 0.07099349, 0.07472337, 0.08671465,
        0.15085626, 0.18198   , 0.18113383, 0.17850778, 0.17797359]),
 [1.0,
  0.7065780946208684,
  0.5102073882047958,
  0.4944912508101102,
  0.42174335709656513,
  0.21386908619572262,
  0.13998703823720027,
  0.11568373298768632,
  0.11406351263771873,
  0.11373946856772521])

In [145]:
# Inspect one box of the peeling trajectory stored in `tmp`.
boxn = 6
accuracy_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_accuracy = %s' % accuracy_gain)
print('Support = %s' % tmp[2][boxn])

# Selected box next to the initial (unrestricted) box.
(tmp[0][boxn], tmp[0][0])

Delta_accuracy = 0.18197999927990205
Support = 0.13998703823720027


([array(['>45', '25-45'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0'], dtype=object),
  array(['<week'], dtype=object)],
 [array(['>45', '25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0', '>3', '[1,3]'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)])

In [146]:
# Same inspection for the preceding box, which has larger support.
boxn = 5
accuracy_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_accuracy = %s' % accuracy_gain)
print('Support = %s' % tmp[2][boxn])

# Selected box next to the initial (unrestricted) box.
(tmp[0][boxn], tmp[0][0])

Delta_accuracy = 0.15085626190616463
Support = 0.21386908619572262


([array(['>45', '25-45'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0'], dtype=object),
  array(['<week'], dtype=object)],
 [array(['>45', '25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0', '>3', '[1,3]'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)])

In [147]:
# Notebook setup: load the autoreload extension, switch it to mode 2, and set
# the application log level to INFO.
%load_ext autoreload
%autoreload 2
%config Application.log_level='INFO'

# Fit the PRIM implementation from the `primnew` module on the current X / y.
# When the notebook is run top to bottom, y is the accuracy target
# (class == predicted) defined a few cells above.
from primnew import PRIM
sd = PRIM(patience=0.4)
sd.fit(X, y)

# Per-box quality and size of the fitted trajectory, shown as arrays.
np.array(sd.get_qual()), np.array(sd.get_size())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


(array([ 0.03191009,  0.09262648,  0.14257453,  0.17098703,  0.17921962,
         0.18198   ,  0.18155302,  0.17938806,  0.17797359, -0.08786328,
        -0.11870065, -0.22640523, -0.25319512]),
 array([0.70657809, 0.33781594, 0.23784835, 0.15618924, 0.14225535,
        0.13998704, 0.13966299, 0.13804277, 0.11373947, 0.07080363,
        0.03337654, 0.0066429 , 0.00502268]))

In [148]:
# Box number 2 of the `sd` trajectory together with its quality.
n = 2
box_n = sd.get_box(n)
qual_n = sd.get_qual()[n]
(box_n, qual_n)

([['<25'], [], [], [], ['>3', '[1,3]'], []], 0.14257453455086788)

In [149]:
# Box number 3 of the `sd` trajectory together with its quality.
n = 3
box_n = sd.get_box(n)
qual_n = sd.get_qual()[n]
(box_n, qual_n)

([['<25'], [], ['Afr-Am'], [], ['>3', '[1,3]'], []], 0.17098703016971306)

In [150]:
# Error rate as the target (original note: "more patient - fails.").
# Reuses the `pr` object created in an earlier cell.
y = df['class'].ne(df['predicted'])
pr.fit(X, y)
tmp = pr.get_res()

# Quality relative to the first box, next to the per-box supports.
error_gain = tmp[1] - tmp[1][0]
(error_gain, tmp[2])

(array([0.        , 0.00070774, 0.0008764 , 0.04752236, 0.0624306 ,
        0.06606644, 0.07146741, 0.07873139, 0.07750184, 0.07663561,
        0.07404672, 0.06952231]),
 [1.0,
  0.9949773169151005,
  0.9931950745301361,
  0.6586195722618277,
  0.5218729747245625,
  0.48898250162022033,
  0.40813350615683736,
  0.18324692158133507,
  0.13480233311730394,
  0.12994167206740118,
  0.11892417368762152,
  0.11309138042773817])

In [152]:
# Inspect box 7 of the error-rate trajectory stored in `tmp`.
boxn = 7
error_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_error = %s' % error_gain)
print('Support = %s' % tmp[2][boxn])

# Selected box next to the initial (unrestricted) box.
(tmp[0][boxn], tmp[0][0])

Delta_error = 0.07873139181942007
Support = 0.18324692158133507


([array(['25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic'], dtype=object),
  array(['Male'], dtype=object),
  array(['>3'], dtype=object),
  array(['<week', '1w-3M'], dtype=object)],
 [array(['>45', '25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0', '>3', '[1,3]'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)])

In [154]:
# Inspect box 8 of the error-rate trajectory stored in `tmp`.
boxn = 8
error_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_error = %s' % error_gain)
print('Support = %s' % tmp[2][boxn])

# Selected box next to the initial (unrestricted) box.
(tmp[0][boxn], tmp[0][0])

Delta_error = 0.07750183832693552
Support = 0.13480233311730394


([array(['25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic'], dtype=object),
  array(['Male'], dtype=object),
  array(['>3'], dtype=object),
  array(['<week'], dtype=object)],
 [array(['>45', '25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0', '>3', '[1,3]'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)])

In [49]:
# Fit primnew's PRIM (patience 0.4) on the current X / y.
sd = PRIM(patience=0.4)
sd.fit(X, y)

# Per-box quality and size of the fitted trajectory, shown as arrays.
box_qualities = np.array(sd.get_qual())
box_sizes = np.array(sd.get_size())
(box_qualities, box_sizes)

(array([ 0.04725378,  0.06168073,  0.06534536,  0.06587053,  0.12059937,
         0.12132456,  0.12312266, -0.13232446, -0.18946457]),
 array([0.66218406, 0.52430331, 0.49092677, 0.48995463, 0.10887881,
        0.10871679, 0.10434219, 0.02543746, 0.01036941]))

In [56]:
# Box number 4: description, size and quality (column order printed first).
n = 4
print(X.columns)
box_n = sd.get_box(n)
size_n = sd.get_size()[n]
qual_n = sd.get_qual()[n]
(box_n, size_n, qual_n)

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')


([['>45', '25-45'], [], ['Asian'], [], ['0'], ['>3Months']],
 0.10887880751782242,
 0.120599365799463)

In [55]:
# Box number 6: description, size and quality (column order printed first).
n = 6
print(X.columns)
box_n = sd.get_box(n)
size_n = sd.get_size()[n]
qual_n = sd.get_qual()[n]
(box_n, size_n, qual_n)

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')


([['>45', '25-45'],
  [],
  ['Asian', 'Native American', 'Other'],
  [],
  ['0'],
  ['>3Months']],
 0.10434219053791316,
 0.12312265772492886)

In [58]:
# Refit with patience 0 and a minimum mass of 0.095.
sd = PRIM(patience=0, mass_min=0.095)
sd.fit(X, y)

# Per-box quality and size of the fitted trajectory, shown as arrays.
box_qualities = np.array(sd.get_qual())
box_sizes = np.array(sd.get_size())
(box_qualities, box_sizes)

(array([ 0.04725378,  0.07684148,  0.13482974,  0.13483539,  0.13314333,
        -1.        , -1.        ]),
 array([0.66218406, 0.29342191, 0.09672715, 0.09607907, 0.09543098,
        0.03386261, 0.00826312]))

In [59]:
# Box number 3 of the refitted `sd` trajectory together with its quality.
n = 3
box_n = sd.get_box(n)
qual_n = sd.get_qual()[n]
(box_n, qual_n)

([[], [], ['Afr-Am', 'Native American'], [], ['0', '[1,3]'], []],
 0.13483539326272487)

In [60]:
# FPR: peel on the false-positive rate with the nominal PRIM variant.
from PRIM import PRIM_nominal_fpr

feature_columns = ['age', 'charge', 'race', 'sex', 'prior', 'stay']
y = df['class'].to_numpy()
ypred = df['predicted'].to_numpy()
X = df[feature_columns]

pr = PRIM_nominal_fpr(mass_min = 0.1)
pr.fit(X, y, ypred)
tmp = pr.get_res()

# Quality relative to the first box, next to the per-box supports.
fpr_gain = tmp[1] - tmp[1][0]
(fpr_gain, tmp[2])

(array([0.        , 0.00519745, 0.02796506, 0.02833821, 0.06499878,
        0.19109776, 0.21963301, 0.23075214, 0.23509025, 0.23289812,
        0.22875917]),
 [1.0,
  0.9444264419961115,
  0.7470836033700583,
  0.7438431626701232,
  0.5014581983149708,
  0.2214841218405703,
  0.17012313674659754,
  0.15084251458198314,
  0.14095917044718081,
  0.10612443292287752,
  0.10596241088788075])

In [64]:
# Inspect box 8 of the FPR trajectory stored in `tmp`.
boxn = 8
print(X.columns)
print('Support = %s' % tmp[2][boxn])
fpr_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_FPR = %s' % fpr_gain)
# Selected box next to the initial (unrestricted) box.
(tmp[0][boxn], tmp[0][0])

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')
Support = 0.14095917044718081
Delta_FPR = 0.23509024996678501


([array(['25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Afr-Am', 'Native American'], dtype=object),
  array(['Male'], dtype=object),
  array(['>3'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)],
 [array(['>45', '25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0', '>3', '[1,3]'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)])

In [65]:
# Inspect box 6 of the FPR trajectory stored in `tmp`.
boxn = 6
print(X.columns)
print('Support = %s' % tmp[2][boxn])
fpr_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_FPR = %s' % fpr_gain)
# Selected box next to the initial (unrestricted) box.
(tmp[0][boxn], tmp[0][0])

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')
Support = 0.17012313674659754
Delta_FPR = 0.21963301451518574


([array(['25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Afr-Am', 'Hispanic', 'Native American'], dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['>3'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)],
 [array(['>45', '25-45', '<25'], dtype=object),
  array(['F', 'M'], dtype=object),
  array(['Other', 'Afr-Am', 'Cauc', 'Hispanic', 'Asian', 'Native American'],
        dtype=object),
  array(['Male', 'Female'], dtype=object),
  array(['0', '>3', '[1,3]'], dtype=object),
  array(['<week', '1w-3M', '>3Months'], dtype=object)])

In [66]:
# Preview the first five rows, including the 'fp' / 'fn' outcome columns.
df.head(n=5)

Unnamed: 0,age,charge,race,sex,prior,stay,class,predicted,fp,fn
0,>45,F,Other,Male,0,<week,0,0,0.0,
1,25-45,F,Afr-Am,Male,0,1w-3M,1,0,,1.0
2,<25,F,Afr-Am,Male,>3,<week,1,0,,1.0
3,25-45,M,Other,Male,0,<week,0,0,0.0,
4,25-45,F,Cauc,Male,>3,<week,1,0,,1.0


In [93]:
# Fit primnew's PRIM directly on the 'fp' outcome column, with the fprfnr flag
# switched on and patience 1.
sd = PRIM(patience=1, fprfnr = True)
sd.fit(X, df['fp'])

# Per-box quality and size of the fitted trajectory, shown as arrays.
box_qualities = np.array(sd.get_qual())
box_sizes = np.array(sd.get_size())
(box_qualities, box_sizes)

(array([ 0.03168599,  0.1309551 ,  0.18591767,  0.1921738 ,  0.19109776,
         0.18932727,  0.19105936,  0.19563661,  0.23219882,  0.21972171,
         0.20154107, -0.00497512, -0.0212766 ]),
 array([0.66218406, 0.29342191, 0.22909916, 0.22180817, 0.22148412,
        0.22116008, 0.20998056, 0.1830849 , 0.14079715, 0.1286455 ,
        0.11924822, 0.03256643, 0.00761504]))

In [103]:
# Box number 8: description, size and quality (column order printed first).
n = 8
print(X.columns)
box_n = sd.get_box(n)
size_n = sd.get_size()[n]
qual_n = sd.get_qual()[n]
(box_n, size_n, qual_n)

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')


([['>45'],
  [],
  ['Other', 'Asian', 'Native American', 'Hispanic', 'Cauc'],
  ['Female'],
  ['0', '[1,3]'],
  []],
 0.14079714841218405,
 0.2321988151604566)

In [102]:
# Box number 9: description, size and quality (column order printed first).
n = 9
print(X.columns)
box_n = sd.get_box(n)
size_n = sd.get_size()[n]
qual_n = sd.get_qual()[n]
(box_n, size_n, qual_n)

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')


([['>45', '<25'],
  [],
  ['Other', 'Asian', 'Native American', 'Hispanic', 'Cauc'],
  ['Female'],
  ['0', '[1,3]'],
  []],
 0.1286454957874271,
 0.21972170893335038)

In [104]:
# FNR: peel on the false-negative rate with the nominal PRIM variant.
from PRIM import PRIM_nominal_fnr

feature_columns = ['age', 'charge', 'race', 'sex', 'prior', 'stay']
y = df['class'].to_numpy()
ypred = df['predicted'].to_numpy()
X = df[feature_columns]

pr = PRIM_nominal_fnr(mass_min = 0.1)
pr.fit(X, y, ypred)
tmp = pr.get_res()

# Quality relative to the first box, next to the per-box supports.
# Author's note: in fact these are richer subgroup descriptions!
fnr_gain = tmp[1] - tmp[1][0]
(fnr_gain, tmp[2])

(array([0.        , 0.00053222, 0.01630743, 0.11840457, 0.19718297,
        0.22598134, 0.25541429, 0.29094878, 0.29094878, 0.2899701 ]),
 [1.0,
  0.9982177576150356,
  0.9479909267660401,
  0.681950745301361,
  0.49368114063512636,
  0.4210952689565781,
  0.2551847051198963,
  0.131399870382372,
  0.13026571613739468,
  0.11633182112767336])

In [106]:
# Inspect box 7 of the FNR trajectory stored in `tmp` (an earlier choice was 6).
boxn = 7
print(X.columns)
fnr_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_FNR = %s' % fnr_gain)
print('Support = %s' % tmp[2][boxn])
selected_box = tmp[0][boxn]
selected_box

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')
Delta_FNR = 0.2909487832948139
Support = 0.131399870382372


[array(['>45', '25-45'], dtype=object),
 array(['M'], dtype=object),
 array(['Other', 'Cauc', 'Hispanic', 'Asian'], dtype=object),
 array(['Male', 'Female'], dtype=object),
 array(['0', '[1,3]'], dtype=object),
 array(['<week'], dtype=object)]

In [108]:
# Inspect box 6 of the FNR trajectory stored in `tmp` (an earlier choice was 5).
boxn = 6
print(X.columns)
fnr_gain = tmp[1][boxn] - tmp[1][0]
print('Delta_FNR = %s' % fnr_gain)
print('Support = %s' % tmp[2][boxn])
selected_box = tmp[0][boxn]
selected_box

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')
Delta_FNR = 0.25541428902245555
Support = 0.2551847051198963


[array(['>45', '25-45'], dtype=object),
 array(['F', 'M'], dtype=object),
 array(['Other', 'Cauc', 'Hispanic', 'Asian'], dtype=object),
 array(['Male', 'Female'], dtype=object),
 array(['0', '[1,3]'], dtype=object),
 array(['<week'], dtype=object)]

In [128]:

# Fit primnew's PRIM directly on the 'fn' outcome column, with the fprfnr flag
# switched on and patience 0.
sd = PRIM(patience=0, fprfnr = True)
feature_columns = ['age', 'charge', 'race', 'sex', 'prior', 'stay']
X = df[feature_columns]
sd.fit(X, df['fn'])

# Per-box quality and size of the fitted trajectory, shown as arrays.
box_qualities = np.array(sd.get_qual())
box_sizes = np.array(sd.get_size())
(box_qualities, box_sizes)

(array([ 0.11599072,  0.19654392,  0.24232027,  0.27462496,  0.28590579,
         0.29094878,  0.29094878,  0.29094878,  0.2899701 , -1.        ,
        -1.        ]),
 array([0.48558004, 0.19815295, 0.16931303, 0.1463059 , 0.13350616,
        0.13188594, 0.13139987, 0.13026572, 0.11633182, 0.04617628,
        0.00745301]))

In [134]:
# Box number 5: description, size and quality (column order printed first).
n = 5
print(X.columns)
box_n = sd.get_box(n)
size_n = sd.get_size()[n]
qual_n = sd.get_qual()[n]
(box_n, size_n, qual_n)

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')


([['<25'], ['F'], ['Afr-Am'], [], ['>3'], ['1w-3M', '>3Months']],
 0.13188593648736227,
 0.2909487832948139)

In [132]:
# Box number 2: description, size and quality (column order printed first).
n = 2
print(X.columns)
box_n = sd.get_box(n)
size_n = sd.get_size()[n]
qual_n = sd.get_qual()[n]
(box_n, size_n, qual_n)

Index(['age', 'charge', 'race', 'sex', 'prior', 'stay'], dtype='object')


([['<25'], ['F'], ['Afr-Am'], [], [], []],
 0.16931302657161373,
 0.24232026756103497)

In [329]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,age,charge,race,sex,prior,stay
0,>45,F,Other,Male,0,<week
1,25-45,F,Afr-Am,Male,0,1w-3M
2,<25,F,Afr-Am,Male,>3,<week
3,25-45,M,Other,Male,0,<week
4,25-45,F,Cauc,Male,>3,<week
