## MDSS SCAN ON AUTOSTRAT MODE 

Import the MDSS module and Bernoulli modules

In [1]:
import pandas as pd

import warnings
from time import perf_counter

from comp_metrics import get_metrics
from pandas.api.types import is_numeric_dtype

warnings.filterwarnings('ignore')

### Data

In [2]:
compas = pd.read_csv('../datasets/studentinfo.csv').drop(columns = ['id_student', 'code_module', 'code_presentation'])
compas['final_result'] = compas['final_result'].isin(['Withdrawn', 'Failed']).astype(int)
compas

Unnamed: 0,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,0
1,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,0
2,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,1
3,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,0
4,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,0
...,...,...,...,...,...,...,...,...,...
32588,F,Wales,Lower Than A Level,10-20,0-35,0,30,N,0
32589,F,East Anglian Region,Lower Than A Level,40-50%,35-55,0,30,N,0
32590,F,South Region,A Level or Equivalent,20-30%,0-35,0,30,Y,0
32591,F,South East Region,Lower Than A Level,90-100%,35-55,0,30,N,1


In [3]:
numeric_cols = [col for col in compas.columns if (is_numeric_dtype(compas[col]) == True) & (compas[col].nunique() > 11)]
numeric_cols

['studied_credits']

In [4]:
compas[numeric_cols].nunique().sort_values(ascending=False)

studied_credits    61
dtype: int64

In [5]:
compas.head()

Unnamed: 0,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,0
1,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,0
2,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,1
3,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,0
4,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,0


In [6]:
compas.shape

(32593, 9)

In [7]:
compas.columns

Index(['gender', 'region', 'highest_education', 'imd_band', 'age_band',
       'num_of_prev_attempts', 'studied_credits', 'disability',
       'final_result'],
      dtype='object')

In [8]:
compas.isna().sum()

gender                     0
region                     0
highest_education          0
imd_band                1111
age_band                   0
num_of_prev_attempts       0
studied_credits            0
disability                 0
final_result               0
dtype: int64

In [9]:
dff = compas.dropna()

target_col = 'final_result'
search_space = list(dff.drop(columns=[target_col]).columns)

for col in numeric_cols:
    dff[col] = pd.qcut(dff[col], 10, duplicates='drop')
    dff[col] = dff[col].apply(lambda x: str(round(x.left, 2)) + ' - ' + str(round(x.right,2)))

for col in search_space:
    dff[col] = dff[col].astype(str)

dff[target_col] =  dff[target_col].astype(int)
expected_prob = dff[target_col].mean()
dff['expected'] = expected_prob
results = []


In [10]:
expected_prob

0.31510069245918304

### MDSS

In [11]:
from aif360.detectors.mdss.ScoringFunctions.BerkJones import BerkJones
from aif360.detectors.mdss.MDSS import MDSS

In [12]:
direction = 'positive'
penalty = 1
num_iters = 10


scoring_function = BerkJones(direction=direction, alpha = expected_prob)
scanner = MDSS(scoring_function)

start = perf_counter()
subset, score = scanner.parallel_scan(coordinates = dff[search_space], outcomes = dff[target_col], expectations = dff['expected'], penalty = penalty, num_iters = num_iters)
end = perf_counter()

subset, score


({'imd_band': ['0-10%',
   '10-20',
   '20-30%',
   '30-40%',
   '40-50%',
   '50-60%',
   '60-70%',
   '80-90%'],
  'studied_credits': ['120.0 - 655.0', '90.0 - 120.0'],
  'region': ['East Anglian Region',
   'East Midlands Region',
   'London Region',
   'North Region',
   'North Western Region',
   'Scotland',
   'South East Region',
   'South Region',
   'South West Region',
   'West Midlands Region',
   'Yorkshire Region']},
 287.7639)

In [14]:
get_metrics('mdss', dff, subset, target_col, start, end)

['mdss',
 {'imd_band': ['0-10%',
   '10-20',
   '20-30%',
   '30-40%',
   '40-50%',
   '50-60%',
   '60-70%',
   '80-90%'],
  'studied_credits': ['120.0 - 655.0', '90.0 - 120.0'],
  'region': ['East Anglian Region',
   'East Midlands Region',
   'London Region',
   'North Region',
   'North Western Region',
   'Scotland',
   'South East Region',
   'South Region',
   'South West Region',
   'West Midlands Region',
   'Yorkshire Region']},
 1.4724620104944357,
 0.20367193952099613,
 2.2589842507100446,
 308.7639,
 0.0303212743774766,
 25.437177081]

In [15]:
dff = dff.drop(columns = 'expected')

In [16]:
results.append(get_metrics('mdss', dff, subset, target_col, start, end))

In [17]:
results

[['mdss',
  {'imd_band': ['0-10%',
    '10-20',
    '20-30%',
    '30-40%',
    '40-50%',
    '50-60%',
    '60-70%',
    '80-90%'],
   'studied_credits': ['120.0 - 655.0', '90.0 - 120.0'],
   'region': ['East Anglian Region',
    'East Midlands Region',
    'London Region',
    'North Region',
    'North Western Region',
    'Scotland',
    'South East Region',
    'South Region',
    'South West Region',
    'West Midlands Region',
    'Yorkshire Region']},
  1.4724620104944357,
  0.20367193952099613,
  2.2589842507100446,
  308.7639,
  0.0303212743774766,
  25.437177081]]

## PYSUBGROUP ON POLICE DATASET

In [18]:
import pysubgroup as ps
import ast

In [19]:
target = ps.BinaryTarget (target_col, True)
search_space = ps.create_selectors(dff, ignore=[target_col])

task = ps.SubgroupDiscoveryTask (
    data = dff, 
    target = target, 
    search_space = search_space, 
    result_set_size=1, 
    depth=5, 
    qf=ps.WRAccQF())

start = perf_counter()   
result = ps.BeamSearch().execute(task)
end = perf_counter()

In [20]:
desc = result.to_dataframe()

final_desc_format = []

for index, row in desc.iterrows():
    final_dict = {}
    sub_desc_str = "'"+row['subgroup'].replace("AND ", "],'").replace("==", "':[")+"]"
    sub_desc_list = '{'+sub_desc_str+'}'
    subset = ast.literal_eval(sub_desc_list)

subset

{'studied_credits': ['90.0 - 120.0']}

In [21]:
results.append(get_metrics('pysubgroup - beam search', dff, subset, target_col, start, end))

In [22]:
results

[['mdss',
  {'imd_band': ['0-10%',
    '10-20',
    '20-30%',
    '30-40%',
    '40-50%',
    '50-60%',
    '60-70%',
    '80-90%'],
   'studied_credits': ['120.0 - 655.0', '90.0 - 120.0'],
   'region': ['East Anglian Region',
    'East Midlands Region',
    'London Region',
    'North Region',
    'North Western Region',
    'Scotland',
    'South East Region',
    'South Region',
    'South West Region',
    'West Midlands Region',
    'Yorkshire Region']},
  1.4724620104944357,
  0.20367193952099613,
  2.2589842507100446,
  308.7639,
  0.0303212743774766,
  25.437177081],
 ['pysubgroup - beam search',
  {'studied_credits': ['90.0 - 120.0']},
  1.294611324650737,
  0.19700146115240455,
  1.6679527772618592,
  118.4388,
  0.018288085425581177,
  0.7203770329999912]]

### Pysubgroup with Apriori search

In [23]:
start = perf_counter()   
result = ps.Apriori().execute(task)
end = perf_counter()

Apriori: Using numba for speedup
28
69
76
43


In [24]:
desc = result.to_dataframe()

final_desc_format = []

for index, row in desc.iterrows():
    final_dict = {}
    sub_desc_str = "'"+row['subgroup'].replace("AND ", "],'").replace("==", "':[")+"]"
    sub_desc_list = '{'+sub_desc_str+'}'
    subset = ast.literal_eval(sub_desc_list)

subset

{'studied_credits': ['90.0 - 120.0']}

In [25]:
results.append(get_metrics('pysubgroup - apriori', dff, subset, target_col, start, end))

In [26]:
results

[['mdss',
  {'imd_band': ['0-10%',
    '10-20',
    '20-30%',
    '30-40%',
    '40-50%',
    '50-60%',
    '60-70%',
    '80-90%'],
   'studied_credits': ['120.0 - 655.0', '90.0 - 120.0'],
   'region': ['East Anglian Region',
    'East Midlands Region',
    'London Region',
    'North Region',
    'North Western Region',
    'Scotland',
    'South East Region',
    'South Region',
    'South West Region',
    'West Midlands Region',
    'Yorkshire Region']},
  1.4724620104944357,
  0.20367193952099613,
  2.2589842507100446,
  308.7639,
  0.0303212743774766,
  25.437177081],
 ['pysubgroup - beam search',
  {'studied_credits': ['90.0 - 120.0']},
  1.294611324650737,
  0.19700146115240455,
  1.6679527772618592,
  118.4388,
  0.018288085425581177,
  0.7203770329999912],
 ['pysubgroup - apriori',
  {'studied_credits': ['90.0 - 120.0']},
  1.294611324650737,
  0.19700146115240455,
  1.6679527772618592,
  118.4388,
  0.018288085425581177,
  6.7160856770000095]]

## DIVEXPLORER ON POLICE DATASET

In [27]:
from divexplorer.FP_DivergenceExplorer import FP_DivergenceExplorer
from divexplorer.FP_Divergence import FP_Divergence
from slicefinder.SliceLogisticRegression import MyFakeLR

In [28]:
search_space = list(dff.drop(columns=[target_col]).columns)

In [29]:
X = dff[search_space]
y = dff[target_col]

model = MyFakeLR().getModel()
model.fit(X, y)

predictions = (model.predict_proba(X)[:,1]>0.5).astype(int)

In [30]:
dff['predictions'] = predictions

#### DivExplorer - FpGrowth

In [31]:
min_sup=0.1
K = 1

start = perf_counter()
fp_diver=FP_DivergenceExplorer(dff, true_class_name = target_col, predicted_class_name = "predictions", class_map={"P":1, "N":0})
FP_fm=fp_diver.getFrequentPatternDivergence(min_support=min_sup, metrics=["d_accuracy"],FPM_type='fpgrowth')
fp_divergence=FP_Divergence(FP_fm, "d_accuracy")
FP_sorted=fp_divergence.getDivergence(th_redundancy=0)
FP_sorted = FP_sorted.sort_values(by = "d_accuracy").head(K)
end = perf_counter()

In [32]:
FP_sorted

Unnamed: 0,support,itemsets,tn,fp,fn,tp,length,support_count,accuracy,d_accuracy,t_value_tp_tn
106,0.158853,"(num_of_prev_attempts=0, studied_credits=90.0 ...",2929,0,2072,0,2,5001.0,0.585683,-0.099216,13.339165


In [33]:
itemsets_to_mdss = []

for itemset in FP_sorted['itemsets']:
    sub = {}
    divStr = ""
    for i in itemset:
        conj = '"'+i.replace("=", '":["') +'"],'
        divStr += conj
    divStr = "{"+divStr.removesuffix(",")+"}"
    subset =ast.literal_eval(divStr)

In [34]:
results.append(get_metrics('divexplorer - fpgrowth', dff, subset, target_col, start, end))

In [35]:
results

[['mdss',
  {'imd_band': ['0-10%',
    '10-20',
    '20-30%',
    '30-40%',
    '40-50%',
    '50-60%',
    '60-70%',
    '80-90%'],
   'studied_credits': ['120.0 - 655.0', '90.0 - 120.0'],
   'region': ['East Anglian Region',
    'East Midlands Region',
    'London Region',
    'North Region',
    'North Western Region',
    'Scotland',
    'South East Region',
    'South Region',
    'South West Region',
    'West Midlands Region',
    'Yorkshire Region']},
  1.4724620104944357,
  0.20367193952099613,
  2.2589842507100446,
  308.7639,
  0.0303212743774766,
  25.437177081],
 ['pysubgroup - beam search',
  {'studied_credits': ['90.0 - 120.0']},
  1.294611324650737,
  0.19700146115240455,
  1.6679527772618592,
  118.4388,
  0.018288085425581177,
  0.7203770329999912],
 ['pysubgroup - apriori',
  {'studied_credits': ['90.0 - 120.0']},
  1.294611324650737,
  0.19700146115240455,
  1.6679527772618592,
  118.4388,
  0.018288085425581177,
  6.7160856770000095],
 ['divexplorer - fpgrowth',
  

In [36]:
dff = dff.drop(columns = 'predictions')

## SLICEFINDER ON POLICE DATASET

In [37]:
from slicefinder.slice_finder import SliceFinder
# from slicefinder.decision_tree import DecisionTree
import pickle
from sklearn.preprocessing import LabelEncoder

In [38]:
dff_sample = dff.sample(frac = 0.1)

for column in search_space:
    if dff_sample[column].dtype != "int64":
        dff_sample[column] = dff_sample[column].astype('category')

encoders = {}
for column in search_space:
    if dff_sample.dtypes[column] == "object" or dff_sample[column].dtypes.name == "category":
        le = LabelEncoder()
        dff_sample[column] = le.fit_transform(dff_sample[column])
        encoders[column] = le

pickle.dump(encoders, open("encoders.pkl", "wb"), protocol=2)

model = MyFakeLR().getModel()
model.fit(dff[search_space], dff[target_col])

In [39]:
sf = SliceFinder(model, (dff_sample[search_space], dff_sample[target_col]))

start = perf_counter()
recommendations = sf.find_slice(k=1, epsilon=0.5, degree=5,max_workers=4, max_time=300)
end = perf_counter()

subset = {}
for s in recommendations:
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
        temp_v = values.strip()
        subset[k] = [temp_v]

degree 1
crossing
effect size filtering
degree 2
crossing
effect size filtering
sorting
region:North Western Region 
num_of_prev_attempts:2 


In [40]:
get_metrics('slice finder - lattice', dff, subset, target_col, start, end)

['slice finder - lattice',
 {'region': ['North Western Region'], 'num_of_prev_attempts': ['2']},
 1.8512600806451613,
 0.001905850962454736,
 3.0502579666160856,
 9.1307,
 0.000511211436771775,
 260.772388852]

In [41]:
results.append(get_metrics('slice finder - lattice', dff, subset, target_col, start, end))

### Store Results

In [42]:
results_df = pd.DataFrame(results, columns = ['method', 'subset', 'lift', 'support', 'odds_ratio', 'mdss_score', 'quality_score', 'time'])
results_df

Unnamed: 0,method,subset,lift,support,odds_ratio,mdss_score,quality_score,time
0,mdss,"{'imd_band': ['0-10%', '10-20', '20-30%', '30-...",1.472462,0.203672,2.258984,308.7639,0.030321,25.437177
1,pysubgroup - beam search,{'studied_credits': ['90.0 - 120.0']},1.294611,0.197001,1.667953,118.4388,0.018288,0.720377
2,pysubgroup - apriori,{'studied_credits': ['90.0 - 120.0']},1.294611,0.197001,1.667953,118.4388,0.018288,6.716086
3,divexplorer - fpgrowth,"{'num_of_prev_attempts': ['0'], 'studied_credi...",1.314872,0.158853,1.679555,108.8191,0.015761,11.732667
4,slice finder - lattice,"{'region': ['North Western Region'], 'num_of_p...",1.85126,0.001906,3.050258,9.1307,0.000511,260.772389


In [43]:
results_df.to_csv('../results/education.csv', index = False)

In [44]:
exp_total = 1

for col in search_space:
    exp_total *= (2**dff[col].nunique()) - 1

exp_total

31175238377745

In [45]:
poly_total = 1 

for col in search_space:
    poly_total *= dff[col].nunique()

poly_total

218400