## MDSS SCAN ON AUTOSTRAT MODE 

Import the MDSS module and Bernoulli modules

In [1]:
import pandas as pd

import warnings
from time import perf_counter

from comp_metrics import get_metrics
from pandas.api.types import is_numeric_dtype

warnings.filterwarnings('ignore')

### Data

In [2]:
compas = pd.read_csv('../datasets/studentinfo.csv').drop(columns = ['id_student', 'code_module', 'code_presentation'])
compas['final_result'] = compas['final_result'].isin(['Withdrawn', 'Failed']).astype(int)
compas

Unnamed: 0,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,0
1,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,0
2,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,1
3,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,0
4,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,0
...,...,...,...,...,...,...,...,...,...
32588,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,0
32589,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,0
32590,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,0
32591,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,1


In [3]:
numeric_cols = [col for col in compas.columns if (is_numeric_dtype(compas[col]) == True) & (compas[col].nunique() > 11)]
numeric_cols

['studied_credits']

In [4]:
compas.head()

Unnamed: 0,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,0
1,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,0
2,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,1
3,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,0
4,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,0


In [5]:
compas.shape

(32593, 9)

In [6]:
compas.columns

Index(['gender', 'region', 'highest_education', 'imd_band', 'age_band',
       'num_of_prev_attempts', 'studied_credits', 'disability',
       'final_result'],
      dtype='object')

In [7]:
compas.isna().sum()

gender                     0
region                     0
highest_education          0
imd_band                1111
age_band                   0
num_of_prev_attempts       0
studied_credits            0
disability                 0
final_result               0
dtype: int64

In [8]:
dff = compas.dropna()

target_col = 'final_result'
search_space = ['gender', 'disability']

for col in numeric_cols:
    dff[col] = pd.qcut(dff[col], 10, duplicates='drop')
    dff[col] = dff[col].apply(lambda x: str(round(x.left, 2)) + ' - ' + str(round(x.right,2)))

for col in search_space:
    dff[col] = dff[col].astype(str)

dff[target_col] = dff[target_col].astype(int)
expected_prob = dff[target_col].mean()
dff['expected'] = expected_prob
results = []


In [9]:
expected_prob

0.31510069245918304

### MDSS

In [10]:
from aif360.detectors.mdss.ScoringFunctions.BerkJones import BerkJones
from aif360.detectors.mdss.MDSS import MDSS

In [11]:
direction = 'positive'
penalty = 1e-3
num_iters = 10


scoring_function = BerkJones(direction=direction, alpha = expected_prob)
scanner = MDSS(scoring_function)

start = perf_counter()
subset, score = scanner.parallel_scan(coordinates = dff[search_space], outcomes = dff[target_col], expectations = dff['expected'], penalty = penalty, num_iters = num_iters)
end = perf_counter()

subset, score


({'disability': ['Y']}, 43.9168)

In [12]:
dff = dff.drop(columns = 'expected')

In [13]:
results.append(get_metrics('mdss', dff, subset, target_col, start, end))

In [14]:
results

[['mdss',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  1.8731465529999998]]

## PYSUBGROUP ON POLICE DATASET

In [15]:
import pysubgroup as ps
import ast

In [16]:
target = ps.BinaryTarget (target_col, True)

space = search_space + [target_col]
searchspace = ps.create_selectors(dff[space], ignore=[target_col])

task = ps.SubgroupDiscoveryTask (
    data = dff[space], 
    target = target, 
    search_space = searchspace, 
    result_set_size=1, 
    depth=5, 
    qf=ps.WRAccQF())

start = perf_counter()   
result = ps.BeamSearch().execute(task)
end = perf_counter()

In [17]:
desc = result.to_dataframe()

final_desc_format = []

for index, row in desc.iterrows():
    final_dict = {}
    sub_desc_str = "'"+row['subgroup'].replace("AND ", "],'").replace("==", "':[")+"]"
    sub_desc_list = '{'+sub_desc_str+'}'
    subset = ast.literal_eval(sub_desc_list)

subset

{'disability': ['Y']}

In [18]:
results.append(get_metrics('pysubgroup - beam search', dff, subset, target_col, start, end))

In [19]:
results

[['mdss',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  1.8731465529999998],
 ['pysubgroup - beam search',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  0.006191786999998783]]

### Pysubgroup with Apriori search

In [20]:
start = perf_counter()   
result = ps.Apriori().execute(task)
end = perf_counter()

Apriori: Using numba for speedup
4
4


In [21]:
desc = result.to_dataframe()

final_desc_format = []

for index, row in desc.iterrows():
    final_dict = {}
    sub_desc_str = "'"+row['subgroup'].replace("AND ", "],'").replace("==", "':[")+"]"
    sub_desc_list = '{'+sub_desc_str+'}'
    subset = ast.literal_eval(sub_desc_list)

subset

{'disability': ['Y']}

In [22]:
results.append(get_metrics('pysubgroup - apriori', dff, subset, target_col, start, end))

In [23]:
results

[['mdss',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  1.8731465529999998],
 ['pysubgroup - beam search',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  0.006191786999998783],
 ['pysubgroup - apriori',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  0.9094027929999999]]

## DIVEXPLORER ON POLICE DATASET

In [24]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence
from slicefinder.SliceLogisticRegression import MyFakeLR

In [25]:
X = dff[search_space]
y = dff[target_col]

model = MyFakeLR().getModel()
model.fit(X, y)

predictions = (model.predict_proba(X)[:,1]>0.5).astype(int)

In [26]:
dff['predictions'] = predictions

In [27]:
space = search_space + [target_col, 'predictions']
space

['gender', 'disability', 'final_result', 'predictions']

#### DivExplorer - FpGrowth

In [28]:
min_sup=0.1
K = 1
 
start = perf_counter()
fp_diver=FP_DivergenceExplorer(dff[space], true_class_name = target_col, predicted_class_name = "predictions", class_map={"P":1, "N":0})
FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_accuracy"],FPM_type='fpgrowth')
fp_divergence=FP_Divergence(FP_fm, "d_accuracy")
FP_sorted=fp_divergence.getDivergence(th_redundancy=0)
FP_sorted = FP_sorted.sort_values(by = "d_accuracy").head(K)
end = perf_counter()

In [29]:
FP_sorted

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,accuracy,d_accuracy,t_value_tp_tn
2,0.542119,(gender=M),11568,0,5499,0,1,17067.0,0.677799,-0.0071,1.6038


In [30]:
itemsets_to_mdss = []

for itemset in FP_sorted['itemsets']:
    sub = {}
    divStr = ""
    for i in itemset:
        conj = '"'+i.replace("=", '":["') +'"],'
        divStr += conj
    divStr = "{"+divStr.removesuffix(",")+"}"
    subset =ast.literal_eval(divStr)

In [31]:
get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end)

['divexplorer - fpgrowth',
 {'gender': ['M']},
 1.022532625213343,
 0.5421193062702496,
 1.0745936500725977,
 1.9853,
 0.003849071907728949,
 0.35638318800000235]

In [32]:
results.append(get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end))

In [33]:
results

[['mdss',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  1.8731465529999998],
 ['pysubgroup - beam search',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  0.006191786999998783],
 ['pysubgroup - apriori',
  {'disability': ['Y']},
  1.2517988043601447,
  0.09945365605742965,
  1.4749416400068984,
  43.9178,
  0.00789084975256648,
  0.9094027929999999],
 ['divexplorer - fpgrowth',
  {'gender': ['M']},
  1.022532625213343,
  0.5421193062702496,
  1.0745936500725977,
  1.9853,
  0.003849071907728949,
  0.35638318800000235]]

In [34]:
dff = dff.drop(columns = 'predictions')

## SLICEFINDER ON POLICE DATASET

In [35]:
from slicefinder.slice_finder import SliceFinder
# from slicefinder.decision_tree import DecisionTree
import pickle
from sklearn.preprocessing import LabelEncoder

In [36]:
dff_sample = dff.copy() #.sample(frac = 0.1)

for column in search_space:
    if dff_sample[column].dtype != "int64":
        dff_sample[column] = dff_sample[column].astype('category')

encoders = {}
for column in search_space:
    if dff_sample.dtypes[column] == "object" or dff_sample[column].dtypes.name == "category":
        le = LabelEncoder()
        dff_sample[column] = le.fit_transform(dff_sample[column])
        encoders[column] = le

pickle.dump(encoders, open("encoders.pkl", "wb"), protocol=2)

model = MyFakeLR().getModel()
model.fit(dff_sample[search_space], dff_sample[target_col])

In [37]:
search_space

['gender', 'disability']

In [38]:
sf = SliceFinder(model, (dff_sample[search_space], dff_sample[target_col]))

start = perf_counter()
recommendations = sf.find_slice(k=1, epsilon=0.5, degree=5,max_workers=4, max_time=180)
end = perf_counter()

subset = {}
for s in recommendations:
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
        temp_v = values.strip()
        subset[k] = [temp_v]

degree 1
crossing
effect size filtering
degree 2
crossing
effect size filtering
degree 3
crossing
effect size filtering
degree 4
crossing
effect size filtering
degree 5
crossing
effect size filtering
sorting


In [39]:
get_metrics('slice finder - lattice', dff, subset, target_col, start, end)

['slice finder - lattice', {}, '-', '-', '-', '-', '-', 80.936307392]

In [40]:
results.append(get_metrics('slice finder - lattice', dff, subset, target_col, start, end))

### Store Results

In [41]:
results_df = pd.DataFrame(results, columns = ['method', 'subset', 'lift', 'support', 'odds_ratio', 'mdss_score', 'quality_score', 'time'])
results_df

Unnamed: 0,method,subset,lift,support,odds_ratio,mdss_score,quality_score,time
0,mdss,{'disability': ['Y']},1.251799,0.099454,1.474942,43.9178,0.007891,1.873147
1,pysubgroup - beam search,{'disability': ['Y']},1.251799,0.099454,1.474942,43.9178,0.007891,0.006192
2,pysubgroup - apriori,{'disability': ['Y']},1.251799,0.099454,1.474942,43.9178,0.007891,0.909403
3,divexplorer - fpgrowth,{'gender': ['M']},1.022533,0.542119,1.074594,1.9853,0.003849,0.356383
4,slice finder - lattice,{},-,-,-,-,-,80.936307


In [42]:
results_df.to_csv('../results/education_restricted.csv', index = False)

In [43]:
exp_total = 1

for col in search_space:
    exp_total *= (2**dff[col].nunique()) - 1

exp_total

9

In [44]:
poly_total = 1 

for col in search_space:
    poly_total *= dff[col].nunique()

poly_total

4