## MDSS SCAN ON AUTOSTRAT MODE 

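Import the shared dependencies (pandas, timing and the metrics helper). The MDSS scanner and its Berk-Jones scoring function are imported further below, just before the scan.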

In [1]:
import pandas as pd

import warnings
from time import perf_counter

from comp_metrics import get_metrics
from pandas.api.types import is_numeric_dtype

warnings.filterwarnings('ignore')

### Data

In [2]:
compas = pd.read_csv('../datasets/default_of_credit_card_clients.csv')
compas

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,150000,1,3,2,43,-1,-1,-1,-1,0,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,30000,1,2,2,37,4,3,2,-1,0,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,80000,1,3,1,41,1,-1,0,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [3]:
# Columns treated as continuous: numeric dtype AND more than 11 distinct values.
# `and` short-circuits, so `nunique()` is only computed for numeric columns.
numeric_cols = [
    col
    for col in compas.columns
    if is_numeric_dtype(compas[col]) and compas[col].nunique() > 11
]
numeric_cols

['LIMIT_BAL',
 'AGE',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

In [4]:
compas[numeric_cols].nunique().sort_values(ascending=False)

BILL_AMT1    22723
BILL_AMT2    22346
BILL_AMT3    22026
BILL_AMT4    21548
BILL_AMT5    21010
BILL_AMT6    20604
PAY_AMT1      7943
PAY_AMT2      7899
PAY_AMT3      7518
PAY_AMT6      6939
PAY_AMT4      6937
PAY_AMT5      6897
LIMIT_BAL       81
AGE             56
dtype: int64

In [5]:
compas.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [6]:
compas.shape

(30000, 24)

In [7]:
compas.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [8]:
compas.isna().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [9]:
# Names shared by every method cell below.
target_col = 'default payment next month'
search_space = ['SEX', 'EDUCATION', 'MARRIAGE']

# Work on the rows that have no missing values.
dff = compas.dropna()


def _interval_label(interval):
    """Render a pandas Interval as 'left - right' with both bounds rounded to 2 decimals."""
    return str(round(interval.left, 2)) + ' - ' + str(round(interval.right, 2))


# Discretise each continuous column into (at most) 10 quantile bins and replace
# the Interval objects by readable string labels.
for col in numeric_cols:
    dff[col] = pd.qcut(dff[col], 10, duplicates='drop')
    dff[col] = dff[col].apply(_interval_label)

# The scanned features are handled as categorical strings.
for col in search_space:
    dff[col] = dff[col].astype(str)

dff[target_col] = dff[target_col].astype(int)

# Every row gets the same expectation: the overall mean of the target column.
expected_prob = dff[target_col].mean()
dff['expected'] = expected_prob

# Accumulates one get_metrics(...) row per method.
results = []


In [10]:
dff.shape, expected_prob

((30000, 25), 0.2212)

### MDSS

In [11]:
from aif360.detectors.mdss.ScoringFunctions.BerkJones import BerkJones
from aif360.detectors.mdss.MDSS import MDSS

In [12]:
# Scan configuration.
direction = 'positive'
penalty = 1e-3
num_iters = 10

# Berk-Jones scoring function with alpha set to the overall mean of the target.
scoring_function = BerkJones(direction=direction, alpha=expected_prob)
scanner = MDSS(scoring_function)

# Only the scan call itself is timed.
start = perf_counter()
subset, score = scanner.parallel_scan(
    coordinates=dff[search_space],
    outcomes=dff[target_col],
    expectations=dff['expected'],
    penalty=penalty,
    num_iters=num_iters,
)
end = perf_counter()

subset, score


({'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'EDUCATION': ['2', '3']}, 39.8974)

In [13]:
dff = dff.drop(columns = 'expected')

In [14]:
get_metrics('mdss', dff, subset, target_col, start, end)

['mdss',
 {'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'EDUCATION': ['2', '3']},
 1.199846596752899,
 0.24503333333333333,
 1.3853418712202066,
 39.9033,
 0.010831959999999998,
 1.7545045709999982]

In [15]:
results.append(get_metrics('mdss', dff, subset, target_col, start, end))

In [16]:
results

[['mdss',
  {'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'EDUCATION': ['2', '3']},
  1.199846596752899,
  0.24503333333333333,
  1.3853418712202066,
  39.9033,
  0.010831959999999998,
  1.7545045709999982]]

## PYSUBGROUP ON CREDIT CARD DEFAULT DATASET

In [17]:
import pysubgroup as ps
import ast

In [18]:
# Scanned features plus the outcome column; the outcome is excluded from the selectors.
space = search_space + [target_col]

target = ps.BinaryTarget(target_col, True)
searchspace = ps.create_selectors(dff[space], ignore=[target_col])

# Single best subgroup, conjunctions of up to 5 selectors, scored by WRAcc.
task = ps.SubgroupDiscoveryTask(
    data=dff[space],
    target=target,
    search_space=searchspace,
    result_set_size=1,
    depth=5,
    qf=ps.WRAccQF(),
)

# Only the search itself is timed; `task` is reused by later search cells.
start = perf_counter()
result = ps.BeamSearch().execute(task)
end = perf_counter()

In [19]:
desc = result.to_dataframe()

# Start from an empty description: if the search returned no subgroup, the
# metrics must not be computed on the `subset` left behind by an earlier cell.
subset = {}

for description in desc['subgroup']:
    # The replacements assume a description of the form "A=='x' AND B=='y'";
    # they rewrite it into the dict literal "{'A':['x' ],'B':['y']}", which
    # ast.literal_eval then parses into {'A': ['x'], 'B': ['y']}.
    dict_literal = "{'" + description.replace("AND ", "],'").replace("==", "':[") + "]}"
    subset = ast.literal_eval(dict_literal)

subset

{'SEX': ['1']}

In [20]:
get_metrics('pysubgroup - beam search', dff, subset, target_col, start, end)

['pysubgroup - beam search',
 {'SEX': ['1']},
 1.092550969993599,
 0.39626666666666666,
 1.2152267259413068,
 14.1572,
 0.008112479999999993,
 0.024930590000000308]

In [21]:
results.append(get_metrics('pysubgroup - beam search', dff, subset, target_col, start, end))

In [22]:
results

[['mdss',
  {'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'EDUCATION': ['2', '3']},
  1.199846596752899,
  0.24503333333333333,
  1.3853418712202066,
  39.9033,
  0.010831959999999998,
  1.7545045709999982],
 ['pysubgroup - beam search',
  {'SEX': ['1']},
  1.092550969993599,
  0.39626666666666666,
  1.2152267259413068,
  14.1572,
  0.008112479999999993,
  0.024930590000000308]]

### Pysubgroup with Apriori search

In [23]:
# Time only the Apriori search; it runs on the same `task` object as the beam search.
start = perf_counter()   
result = ps.Apriori().execute(task)
end = perf_counter()

Apriori: Using numba for speedup
7
16
9


In [24]:
desc = result.to_dataframe()

# Start from an empty description: if the search returned no subgroup, the
# metrics must not be computed on the `subset` left behind by an earlier cell.
subset = {}

for description in desc['subgroup']:
    # The replacements assume a description of the form "A=='x' AND B=='y'";
    # they rewrite it into the dict literal "{'A':['x' ],'B':['y']}", which
    # ast.literal_eval then parses into {'A': ['x'], 'B': ['y']}.
    dict_literal = "{'" + description.replace("AND ", "],'").replace("==", "':[") + "]}"
    subset = ast.literal_eval(dict_literal)

subset

{'SEX': ['1']}

In [25]:
results.append(get_metrics('pysubgroup - apriori', dff, subset, target_col, start, end))

In [26]:
results

[['mdss',
  {'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'EDUCATION': ['2', '3']},
  1.199846596752899,
  0.24503333333333333,
  1.3853418712202066,
  39.9033,
  0.010831959999999998,
  1.7545045709999982],
 ['pysubgroup - beam search',
  {'SEX': ['1']},
  1.092550969993599,
  0.39626666666666666,
  1.2152267259413068,
  14.1572,
  0.008112479999999993,
  0.024930590000000308],
 ['pysubgroup - apriori',
  {'SEX': ['1']},
  1.092550969993599,
  0.39626666666666666,
  1.2152267259413068,
  14.1572,
  0.008112479999999993,
  0.8141182040000032]]

## DIVEXPLORER ON CREDIT CARD DEFAULT DATASET

In [27]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence
from slicefinder.SliceLogisticRegression import MyFakeLR

In [28]:
# Features and labels for the stand-in classifier.
X, y = dff[search_space], dff[target_col]

model = MyFakeLR().getModel()
model.fit(X, y)

# Threshold column 1 of predict_proba at 0.5 to get hard 0/1 predictions
# (presumably the positive-class probability -- confirm against MyFakeLR).
positive_proba = model.predict_proba(X)[:, 1]
predictions = (positive_proba > 0.5).astype(int)

In [29]:
dff['predictions'] = predictions

In [30]:
space = search_space + [target_col, 'predictions']
space

['SEX', 'EDUCATION', 'MARRIAGE', 'default payment next month', 'predictions']

#### DivExplorer - FpGrowth

In [31]:
min_sup = 0.1  # minimum support passed to the frequent-pattern miner
K = 1          # number of patterns kept

# The timed region covers explorer construction, mining, and ranking.
start = perf_counter()
fp_diver = FP_DivergenceExplorer(
    dff[space],
    true_class_name=target_col,
    predicted_class_name="predictions",
    class_map={"P": 1, "N": 0},
)
FP_fm = fp_diver.getFrequentPatternDivergence(
    min_support=min_sup,
    metrics=["d_accuracy"],
    FPM_type='fpgrowth',
)
fp_divergence = FP_Divergence(FP_fm, "d_accuracy")
# Ascending sort: the K patterns with the lowest d_accuracy come first.
FP_sorted = (
    fp_divergence.getDivergence(th_redundancy=0)
    .sort_values(by="d_accuracy")
    .head(K)
)
end = perf_counter()

In [32]:
FP_sorted

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,accuracy,d_accuracy,t_value_tp_tn
15,0.179133,"(EDUCATION=2, SEX=1)",3966,0,1408,0,2,5374.0,0.737998,-0.040802,6.328603


In [33]:
# Start from an empty description so that an empty pattern table is not
# reported with the `subset` left behind by an earlier method's cell.
subset = {}

for itemset in FP_sorted['itemsets']:
    # Each item is an "ATTRIBUTE=value" string. Build the {attribute: [value]}
    # mapping directly instead of assembling a dict literal and evaluating it,
    # and split on the first "=" only so values containing "=" or quotes survive.
    subset = {}
    for item in itemset:
        attribute, _, value = item.partition("=")
        subset[attribute] = [value]

In [34]:
get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end)

['divexplorer - fpgrowth',
 {'EDUCATION': ['2'], 'SEX': ['1']},
 1.1844585577467293,
 0.17913333333333334,
 1.317259444374351,
 24.9301,
 0.007309040000000002,
 0.37273551300000207]

In [35]:
results.append(get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end))

In [36]:
results

[['mdss',
  {'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'EDUCATION': ['2', '3']},
  1.199846596752899,
  0.24503333333333333,
  1.3853418712202066,
  39.9033,
  0.010831959999999998,
  1.7545045709999982],
 ['pysubgroup - beam search',
  {'SEX': ['1']},
  1.092550969993599,
  0.39626666666666666,
  1.2152267259413068,
  14.1572,
  0.008112479999999993,
  0.024930590000000308],
 ['pysubgroup - apriori',
  {'SEX': ['1']},
  1.092550969993599,
  0.39626666666666666,
  1.2152267259413068,
  14.1572,
  0.008112479999999993,
  0.8141182040000032],
 ['divexplorer - fpgrowth',
  {'EDUCATION': ['2'], 'SEX': ['1']},
  1.1844585577467293,
  0.17913333333333334,
  1.317259444374351,
  24.9301,
  0.007309040000000002,
  0.37273551300000207]]

In [37]:
dff = dff.drop(columns = 'predictions')

## SLICEFINDER ON CREDIT CARD DEFAULT DATASET

In [38]:
from slicefinder.slice_finder import SliceFinder
# from slicefinder.decision_tree import DecisionTree
import pickle
from sklearn.preprocessing import LabelEncoder

In [39]:
# Work on a copy so the label encoding below does not touch `dff`
# (swap in dff.sample(frac=0.1) here to run on a subsample).
dff_sample = dff.copy()

# Anything that is not already int64 is turned into a categorical column ...
for column in search_space:
    if dff_sample[column].dtype != "int64":
        dff_sample[column] = dff_sample[column].astype('category')

# ... and every object/categorical column is label-encoded; the fitted encoders
# are kept so the integer codes can be mapped back to the original values.
encoders = {}
for column in search_space:
    if dff_sample.dtypes[column] == "object" or dff_sample[column].dtypes.name == "category":
        le = LabelEncoder()
        dff_sample[column] = le.fit_transform(dff_sample[column])
        encoders[column] = le

# Persist the encoders; the context manager guarantees the file is flushed and
# closed (the bare open() call previously leaked the handle).
with open("encoders.pkl", "wb") as encoders_file:
    pickle.dump(encoders, encoders_file, protocol=2)

model = MyFakeLR().getModel()
model.fit(dff_sample[search_space], dff_sample[target_col])

In [40]:
search_space

['SEX', 'EDUCATION', 'MARRIAGE']

In [41]:
sf = SliceFinder(model, (dff_sample[search_space], dff_sample[target_col]))

# Only the lattice search itself is timed.
start = perf_counter()
recommendations = sf.find_slice(k=1, epsilon=0.5, degree=5, max_workers=4, max_time=60)
end = perf_counter()

# Translate the recommended slices into the {feature: [value]} format used by
# get_metrics; stays empty when no slice is recommended.
# NOTE(review): all values of one feature end up in a single space-separated
# string inside a one-element list -- confirm this is what get_metrics expects.
subset = {}
for s in recommendations:
    for k, v in list(s.filters.items()):
        if k in encoders:
            # Label-encoded feature: map each code back to its original value.
            le = encoders[k]
            pieces = ['%s ' % (le.inverse_transform(v_)[0]) for v_ in v]
        else:
            # Unencoded feature: entries with two elements are shown as "a ~ b".
            pieces = [
                '%s ~ %s' % (v_[0], v_[1]) if len(v_) > 1 else '%s ' % (v_[0])
                for v_ in sorted(v, key=lambda x: x[0])
            ]
        values = ''.join(pieces)
        print('%s:%s' % (k, values))
        subset[k] = [values.strip()]

degree 1
crossing
effect size filtering
degree 2
crossing
effect size filtering
degree 3
crossing
effect size filtering
sorting


In [42]:
get_metrics('slice finder - lattice', dff, subset, target_col, start, end)

['slice finder - lattice', {}, '-', '-', '-', '-', '-', 74.06958659400001]

In [43]:
results.append(get_metrics('slice finder - lattice', dff, subset, target_col, start, end))

### Store Results

In [44]:
# One row per method, in the order returned by get_metrics.
result_columns = [
    'method',
    'subset',
    'lift',
    'support',
    'odds_ratio',
    'mdss_score',
    'quality_score',
    'time',
]
results_df = pd.DataFrame(results, columns=result_columns)
results_df

Unnamed: 0,method,subset,lift,support,odds_ratio,mdss_score,quality_score,time
0,mdss,"{'MARRIAGE': ['1', '2', '3'], 'SEX': ['1'], 'E...",1.199847,0.245033,1.385342,39.9033,0.010832,1.754505
1,pysubgroup - beam search,{'SEX': ['1']},1.092551,0.396267,1.215227,14.1572,0.008112,0.024931
2,pysubgroup - apriori,{'SEX': ['1']},1.092551,0.396267,1.215227,14.1572,0.008112,0.814118
3,divexplorer - fpgrowth,"{'EDUCATION': ['2'], 'SEX': ['1']}",1.184459,0.179133,1.317259,24.9301,0.007309,0.372736
4,slice finder - lattice,{},-,-,-,-,-,74.069587


In [45]:
results_df.to_csv('../results/credit_card_restricted.csv', index = False)

In [46]:
# Product over the scanned features of (2**n_values - 1), i.e. the number of
# non-empty value combinations per feature, multiplied across features.
exp_total = 1

for n_values in (dff[col].nunique() for col in search_space):
    exp_total *= 2 ** n_values - 1

exp_total

5715

In [47]:
# Product over the scanned features of their number of distinct values,
# i.e. the number of descriptions that fix exactly one value per feature.
poly_total = 1

for n_values in (dff[col].nunique() for col in search_space):
    poly_total *= n_values

poly_total

56