## MDSS SCAN ON AUTOSTRAT MODE 

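Import pandas, timing utilities, and the project's `get_metrics` helper. The MDSS scanner and its Berk-Jones scoring function are imported in the MDSS section below.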

In [1]:
import pandas as pd

import warnings
from time import perf_counter

from comp_metrics import get_metrics
from pandas.api.types import is_numeric_dtype

warnings.filterwarnings('ignore')

### Data

In [2]:
# Load the COMPAS two-year violent scores CSV and keep only the columns analysed below.
cols = [
    'sex', 'age_cat', 'race',
    'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count',
    'c_charge_degree', 'v_decile_score',
]
compas = pd.read_csv('../datasets/compas-scores-two-years-violent.csv').loc[:, cols]
compas

Unnamed: 0,sex,age_cat,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,v_decile_score
0,Male,Greater than 45,Other,0,0,0,0,F,1
1,Male,25 - 45,African-American,0,0,0,0,F,1
2,Male,Less than 25,African-American,0,1,0,1,F,6
3,Male,25 - 45,Other,0,0,0,2,F,1
4,Male,25 - 45,Other,0,0,0,0,M,1
...,...,...,...,...,...,...,...,...,...
4738,Male,Less than 25,African-American,0,0,0,0,F,9
4739,Male,Less than 25,African-American,0,0,0,0,F,5
4740,Male,Less than 25,African-American,0,0,0,0,F,5
4741,Male,Greater than 45,Other,0,0,0,0,F,1


In [3]:
# The count-valued features are the columns whose name contains 'count'.
numeric_cols = list(compas.filter(like='count').columns)
numeric_cols

['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count']

In [4]:
# Number of distinct values in each count column, largest first.
compas[numeric_cols].nunique().sort_values(ascending=False)

priors_count       33
juv_fel_count       9
juv_other_count     9
juv_misd_count      8
dtype: int64

In [5]:
compas.head()

Unnamed: 0,sex,age_cat,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,v_decile_score
0,Male,Greater than 45,Other,0,0,0,0,F,1
1,Male,25 - 45,African-American,0,0,0,0,F,1
2,Male,Less than 25,African-American,0,1,0,1,F,6
3,Male,25 - 45,Other,0,0,0,2,F,1
4,Male,25 - 45,Other,0,0,0,0,M,1


In [6]:
compas.shape

(4743, 9)

In [7]:
compas.columns

Index(['sex', 'age_cat', 'race', 'juv_fel_count', 'juv_misd_count',
       'juv_other_count', 'priors_count', 'c_charge_degree', 'v_decile_score'],
      dtype='object')

In [8]:
# Missing-value count per column.
compas.isna().sum()

sex                0
age_cat            0
race               0
juv_fel_count      0
juv_misd_count     0
juv_other_count    0
priors_count       0
c_charge_degree    0
v_decile_score     0
dtype: int64

In [9]:
# Build the analysis frame. The explicit .copy() guarantees the column
# assignments below modify an independent frame; warnings are globally
# silenced at the top of the notebook, so a chained-assignment problem would
# otherwise go unnoticed.
dff = compas.dropna().copy()

target_col = 'v_decile_score'
# Every column except the target is a candidate feature for the scans.
search_space = list(dff.drop(columns=[target_col]).columns)

# Discretise each count column into at most 10 quantile bins (duplicate bin
# edges are dropped) and label every bin as "left - right".
for col in numeric_cols:
    binned = pd.qcut(dff[col], 10, duplicates='drop')
    dff[col] = binned.apply(lambda x: str(round(x.left, 2)) + ' - ' + str(round(x.right, 2)))

# All features are handled as categorical strings from here on.
for col in search_space:
    dff[col] = dff[col].astype(str)

# Binarise the target: 1 when the decile score is above 5, else 0.
dff[target_col] = (dff[target_col] > 5).astype(int)

# The overall positive rate is used as the constant expectation for every row.
expected_prob = dff[target_col].mean()
dff['expected'] = expected_prob

# One metrics row per method is appended to this list by the cells below.
results = []


In [10]:
expected_prob

0.20430107526881722

### MDSS

In [11]:
from aif360.detectors.mdss.ScoringFunctions.BerkJones import BerkJones
from aif360.detectors.mdss.MDSS import MDSS

In [12]:
# Scan settings.
direction = 'positive'
penalty = 10
num_iters = 10

# Berk-Jones scoring function, with alpha set to the overall positive rate
# computed during preprocessing.
scoring_function = BerkJones(direction=direction, alpha=expected_prob)
scanner = MDSS(scoring_function)

# Only the scan itself is timed.
start = perf_counter()
subset, score = scanner.parallel_scan(
    coordinates=dff[search_space],
    outcomes=dff[target_col],
    expectations=dff['expected'],
    penalty=penalty,
    num_iters=num_iters,
)
end = perf_counter()

subset, score


({'age_cat': ['Less than 25']}, 264.0578)

In [13]:
# Preview the metrics row for the MDSS subset; the same call is appended to `results` further down.
get_metrics('mdss', dff, subset, target_col, start, end)

['mdss',
 {'age_cat': ['Less than 25']},
 2.7690225563909774,
 0.18448239510858105,
 9.327281812125252,
 274.0578,
 0.06667437468686167,
 3.0317575639999994]

In [14]:
# Remove the helper column that only the MDSS scan needed, so the later
# methods (which build their search space from all non-target columns) do not
# pick it up as a feature. errors='ignore' keeps this cell safe to re-run
# after the column has already been dropped.
dff = dff.drop(columns='expected', errors='ignore')

In [15]:
# Record the MDSS row for the final comparison table.
# NOTE: re-running this cell appends a duplicate row.
results.append(get_metrics('mdss', dff, subset, target_col, start, end))

In [16]:
results

[['mdss',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  3.0317575639999994]]

## PYSUBGROUP ON COMPAS DATASET

In [17]:
import pysubgroup as ps
import ast

In [18]:
# Subgroup-discovery target: rows whose binarised target column equals True (1).
target = ps.BinaryTarget(target_col, True)

# NOTE: this rebinds `search_space` from a list of column names to pysubgroup
# selector objects; it is rebuilt as a column list before the DivExplorer section.
search_space = ps.create_selectors(dff, ignore=[target_col])

# The same task object is reused by the Apriori search later on.
task = ps.SubgroupDiscoveryTask(
    data=dff,
    target=target,
    search_space=search_space,
    qf=ps.WRAccQF(),
    depth=5,
    result_set_size=1,
)

# Only the search itself is timed.
start = perf_counter()
result = ps.BeamSearch().execute(task)
end = perf_counter()

In [19]:
# Convert pysubgroup's textual subgroup description (selectors of the form
# attribute==value joined by " AND ") into the {attribute: [value]} mapping
# that is passed to get_metrics.
desc = result.to_dataframe()

# Fail loudly instead of silently reusing `subset` from a previous method.
if desc.empty:
    raise ValueError('pysubgroup beam search returned no subgroups')

# With result_set_size=1 there is a single row; if more rows are ever
# requested, the last one wins (same as before).
for _, row in desc.iterrows():
    # Split on the " AND " separator first, then on the first '==' only, so an
    # '==' or the letters "AND" inside a value cannot corrupt the parse.
    subset = {
        attribute.strip(): [ast.literal_eval(value.strip())]
        for attribute, value in (
            selector.split('==', 1) for selector in row['subgroup'].split(' AND ')
        )
    }

subset

{'age_cat': ['Less than 25']}

In [20]:
# Record the beam-search row for the final comparison table.
results.append(get_metrics('pysubgroup - beam search', dff, subset, target_col, start, end))

In [21]:
results

[['mdss',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  3.0317575639999994],
 ['pysubgroup - beam search',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  0.0140874610000008]]

### Pysubgroup with Apriori search

In [22]:
# Run the same pysubgroup task with the Apriori search; only the search is timed.
start = perf_counter()   
result = ps.Apriori().execute(task)
end = perf_counter()

Apriori: Using numba for speedup
9
26
35
24


In [23]:
# Convert pysubgroup's textual subgroup description (selectors of the form
# attribute==value joined by " AND ") into the {attribute: [value]} mapping
# that is passed to get_metrics.
desc = result.to_dataframe()

# Fail loudly instead of silently reusing `subset` from a previous method.
if desc.empty:
    raise ValueError('pysubgroup Apriori search returned no subgroups')

# With result_set_size=1 there is a single row; if more rows are ever
# requested, the last one wins (same as before).
for _, row in desc.iterrows():
    # Split on the " AND " separator first, then on the first '==' only, so an
    # '==' or the letters "AND" inside a value cannot corrupt the parse.
    subset = {
        attribute.strip(): [ast.literal_eval(value.strip())]
        for attribute, value in (
            selector.split('==', 1) for selector in row['subgroup'].split(' AND ')
        )
    }

subset

{'age_cat': ['Less than 25']}

In [24]:
# Record the Apriori row for the final comparison table.
results.append(get_metrics('pysubgroup - apriori', dff, subset, target_col, start, end))

In [25]:
results

[['mdss',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  3.0317575639999994],
 ['pysubgroup - beam search',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  0.0140874610000008],
 ['pysubgroup - apriori',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  1.3025313599999997]]

## DIVEXPLORER ON COMPAS DATASET

In [26]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence
from slicefinder.SliceLogisticRegression import MyFakeLR

In [27]:
# `search_space` was rebound to pysubgroup selector objects in the pysubgroup
# section, so rebuild it as the plain list of non-target column names.
search_space = list(dff.drop(columns=[target_col]).columns)

In [28]:
# Feature frame and binary target.
X, y = dff[search_space], dff[target_col]

# Model object supplied by the project's MyFakeLR wrapper.
model = MyFakeLR().getModel()
model.fit(X, y)

# Hard 0/1 labels: column 1 of predict_proba thresholded at 0.5.
predictions = (model.predict_proba(X)[:, 1] > 0.5).astype(int)

In [29]:
# DivExplorer reads the predicted class from a column of the frame, so attach
# the predictions to `dff` (the column is dropped again after this section).
dff['predictions'] = predictions

#### DivExplorer - FpGrowth

In [30]:
# Minimum pattern support and number of patterns to keep.
min_sup = 0.1
K = 1

# The whole pipeline is timed: explorer construction, pattern mining,
# divergence computation and ranking.
start = perf_counter()
fp_diver = FP_DivergenceExplorer(
    dff,
    true_class_name=target_col,
    predicted_class_name="predictions",
    class_map={"P": 1, "N": 0},
)
FP_fm = fp_diver.getFrequentPatternDivergence(
    min_support=min_sup,
    metrics=["d_accuracy"],
    FPM_type='fpgrowth',
)
fp_divergence = FP_Divergence(FP_fm, "d_accuracy")
# An ascending sort puts the lowest d_accuracy values first; keep the first K.
FP_sorted = (
    fp_divergence.getDivergence(th_redundancy=0)
    .sort_values(by="d_accuracy")
    .head(K)
)
end = perf_counter()

In [31]:
FP_sorted

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,accuracy,d_accuracy,t_value_tp_tn
567,0.104575,"(age_cat=Less than 25, race=African-American)",185,0,311,0,2,496.0,0.372984,-0.422715,18.815923


In [32]:
# Translate the selected DivExplorer itemset into the {attribute: [value]}
# mapping that is passed to get_metrics.

# Fail loudly instead of silently reusing `subset` from a previous method.
if FP_sorted.empty:
    raise ValueError('DivExplorer returned no itemsets')

# With K=1 there is a single itemset; if more are kept, the last one wins
# (same as before).
for itemset in FP_sorted['itemsets']:
    subset = {}
    for item in itemset:
        # Items look like "attribute=value". Splitting on the first '=' only
        # keeps values that themselves contain '=' or quote characters intact,
        # which the previous string-building + literal_eval approach did not.
        attribute, value = item.split('=', 1)
        subset[attribute] = [value]

In [33]:
# Preview the metrics row for the DivExplorer subset; the same call is appended to `results` in the next cell.
get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end)

['divexplorer - fpgrowth',
 {'age_cat': ['Less than 25'], 'race': ['African-American']},
 3.0690789473684212,
 0.10457516339869281,
 9.169300911854105,
 208.5765,
 0.04420549581839905,
 0.36785375000000187]

In [34]:
# Record the DivExplorer row for the final comparison table.
results.append(get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end))

In [35]:
results

[['mdss',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  3.0317575639999994],
 ['pysubgroup - beam search',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  0.0140874610000008],
 ['pysubgroup - apriori',
  {'age_cat': ['Less than 25']},
  2.7690225563909774,
  0.18448239510858105,
  9.327281812125252,
  274.0578,
  0.06667437468686167,
  1.3025313599999997],
 ['divexplorer - fpgrowth',
  {'age_cat': ['Less than 25'], 'race': ['African-American']},
  3.0690789473684212,
  0.10457516339869281,
  9.169300911854105,
  208.5765,
  0.04420549581839905,
  0.36785375000000187]]

In [36]:
# Remove the predictions column that only DivExplorer needed.
# errors='ignore' keeps this cell safe to re-run after the column is gone.
dff = dff.drop(columns='predictions', errors='ignore')

## SLICEFINDER ON COMPAS DATASET

In [37]:
from slicefinder.slice_finder import SliceFinder
# from slicefinder.decision_tree import DecisionTree
import pickle
from sklearn.preprocessing import LabelEncoder

In [38]:
# SliceFinder is run on a 10% sample of the rows. The fixed random_state makes
# the sample, and therefore the recommended slice, reproducible across runs.
dff_sample = dff.sample(frac=0.1, random_state=0)

# Label-encode every non-int64 feature in a single pass and keep the fitted
# encoders so the recommended slice can be decoded back to original labels.
encoders = {}
for column in search_space:
    if dff_sample[column].dtype != "int64":
        dff_sample[column] = dff_sample[column].astype('category')
    if dff_sample.dtypes[column] == "object" or dff_sample[column].dtypes.name == "category":
        le = LabelEncoder()
        dff_sample[column] = le.fit_transform(dff_sample[column])
        encoders[column] = le

# Persist the encoders; the context manager guarantees the file is closed.
with open("encoders.pkl", "wb") as encoder_file:
    pickle.dump(encoders, encoder_file, protocol=2)

# NOTE(review): the model is fitted on the un-encoded full frame while
# SliceFinder below is given the label-encoded sample; confirm this is intended.
model = MyFakeLR().getModel()
model.fit(dff[search_space], dff[target_col])

In [39]:
# Hand the model and the label-encoded sample (features, target) to SliceFinder.
sf = SliceFinder(model, (dff_sample[search_space], dff_sample[target_col]))

# Only the slice search itself is timed.
start = perf_counter()
recommendations = sf.find_slice(k=1, epsilon=0.5, degree=5,max_workers=4, max_time=300)
end = perf_counter()

# Rebuild a {feature: [value]} subset from the filters of the recommended slice(s).
# If several slices are returned, filters on the same feature overwrite each other.
subset = {}
for s in recommendations:
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            # Label-encoded feature: map each encoded entry back to its original label.
            # presumably each v_ is a one-element sequence of encoded values — TODO confirm
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            # Feature without an encoder: entries with more than one element are
            # rendered as "first ~ second", single-element entries as the value itself.
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
        # NOTE(review): every value of a filter is concatenated into ONE string, so a
        # filter with several values becomes a single-element list holding the joined text.
        temp_v = values.strip()
        subset[k] = [temp_v]

degree 1
crossing
effect size filtering
sorting
age_cat:Less than 25 


In [40]:
# Record the SliceFinder row for the final comparison table.
results.append(get_metrics('slice finder - lattice', dff, subset, target_col, start, end))

### Store Results

In [41]:
# Assemble the per-method rows into the final comparison table.
results_df = pd.DataFrame(
    data=results,
    columns=[
        'method', 'subset', 'lift', 'support',
        'odds_ratio', 'mdss_score', 'quality_score', 'time',
    ],
)
results_df

Unnamed: 0,method,subset,lift,support,odds_ratio,mdss_score,quality_score,time
0,mdss,{'age_cat': ['Less than 25']},2.769023,0.184482,9.327282,274.0578,0.066674,3.031758
1,pysubgroup - beam search,{'age_cat': ['Less than 25']},2.769023,0.184482,9.327282,274.0578,0.066674,0.014087
2,pysubgroup - apriori,{'age_cat': ['Less than 25']},2.769023,0.184482,9.327282,274.0578,0.066674,1.302531
3,divexplorer - fpgrowth,"{'age_cat': ['Less than 25'], 'race': ['Africa...",3.069079,0.104575,9.169301,208.5765,0.044205,0.367854
4,slice finder - lattice,{'age_cat': ['Less than 25']},2.769023,0.184482,9.327282,274.0578,0.066674,3.480051


In [42]:
# Persist the comparison table without the index column.
results_df.to_csv('../results/compas.csv', index = False)

In [43]:
# Product over all features of (2**k - 1), where k is the number of distinct
# values of the feature, i.e. the number of non-empty value subsets per feature
# multiplied across features.
exp_total = 1

for col in search_space:
    exp_total *= (2**dff[col].nunique()) - 1

exp_total

250047

In [44]:
# Product over all features of the number of distinct values, i.e. the number
# of combinations that fix exactly one value per feature.
poly_total = 1 

for col in search_space:
    poly_total *= dff[col].nunique()

poly_total

432

# NOTE(review): this cell and the ones after it have no execution prompt in this
# export, so they appear not to have been run. As written they reference names
# that are not imported in the visible source (DecisionTree, lift, support,
# odds_ratio) and would raise NameError on a fresh Restart & Run All unless
# those names are defined elsewhere.
print(recommendations[0])

# Project-defined method on the slice object; its output is not shown in this export.
recommendations[0].__ancestry__()

# NOTE(review): the only import of DecisionTree in this notebook is commented out
# in the SliceFinder import cell.
tree_finder = DecisionTree((dff_sample[search_space], dff_sample[target_col]), model)

# Only fitting the tree and recommending slices is timed.
start = perf_counter()
tree_finder.fit(max_depth = 5)
recommendations = tree_finder.recommend_slices(k = 1, min_effect_size = 0.5)
end = perf_counter()

# Same decoding logic as in the lattice-search cell: rebuild a {feature: [value]}
# subset from the filters of the recommended slice(s).
subset = {}
for s in recommendations:
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            # Label-encoded feature: map each encoded entry back to its original label.
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            # Entries with more than one element are rendered as "first ~ second".
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
        # NOTE(review): all values of a filter are joined into one string here.
        temp_v = values.strip()
        subset[k] = [temp_v]

# NOTE(review): lift, support and odds_ratio are not imported in the visible
# source; the other methods obtain their metrics through get_metrics instead.
['slice finder - tree', subset, \
    lift(dff, subset, target_col), \
    support(dff, subset), \
    odds_ratio(dff, subset, target_col, expected_prob), \
    end - start]