In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Widen the classic-notebook container to 90% of the browser window.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import sys
# Project directories, resolved relative to the parent of the working directory.
BASE_PATH = os.path.realpath('..')
DATASETS_DIR = os.path.join(BASE_PATH, 'datasets')
LIB_DIR = os.path.join(BASE_PATH, 'lib')
# `from lib import ...` needs the directory that *contains* lib/ on sys.path,
# which is BASE_PATH itself. Use it directly instead of slicing 'lib' off
# LIB_DIR: the slice left a trailing separator and silently depended on the
# folder name being exactly three characters long.
if BASE_PATH not in sys.path:
    sys.path.append(BASE_PATH)

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from importlib import reload

In [2]:
# Directory for experiment outputs; a sibling of the datasets/ and lib/ folders under BASE_PATH.
RESULTS_DIR = os.path.join(BASE_PATH, 'results')

In [3]:
from lib import fca_interp as fcai
from lib.utils_ import powerset
from importlib import reload

In [4]:
from copy import copy, deepcopy

# Load Data

In [5]:
# Registry of loaded datasets, keyed by dataset name (the 'adult' entry is added further down).
data_dict = {}

## Adult DS

In [6]:
# The Adult data file has no header row, so the column names are assigned explicitly.
adult_ds = pd.read_csv(DATASETS_DIR+'/adult/adult.data', header=None)
fs = ['age','workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
      'relationship', 'race', 'sex', 'capital-gain', 'capital-loss','hours-per-week','native-country',
      'y']
adult_ds.columns = fs
# Boolean target. The literal carries a leading space; presumably the raw file
# separates fields with ", " and read_csv keeps that space -- TODO confirm.
adult_ds['y_bin'] = adult_ds['y']==' >50K'
cat_feats = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
y_feat = 'y_bin'
# Columns handed to the models; note that 'capital-gain' and 'hours-per-week' are not in this list.
train_feats = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-loss','native-country']
# The former `adult_ds.drop(train_feats,1)` call was removed: its result was
# discarded (so it never changed adult_ds), and passing `axis` positionally
# raises TypeError on pandas >= 2.0.

print(adult_ds.shape)
adult_ds.head(2)

(32561, 16)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,y,y_bin
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,False
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,False


In [7]:
# Register the Adult frame together with its feature metadata under the 'adult' key.
data_dict['adult'] = dict(
    ds=adult_ds,
    train_feats=train_feats,
    cat_feats=cat_feats,
    y_feat=y_feat,
)

# Test

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

In [9]:
from sklearn.metrics import f1_score, accuracy_score, log_loss

In [10]:
from lib import fca_interp as fcai

In [11]:
from datetime import datetime

In [12]:
# Pick the dataset to experiment on and unpack its frame and feature metadata.
data_name = 'adult'
data = data_dict[data_name]
ds, train_feats, cat_feats, y_feat = (
    data[key] for key in ('ds', 'train_feats', 'cat_feats', 'y_feat')
)
# Add an integer-coded '<name>_le' column for every categorical feature.
# `ds` is the very frame stored in data_dict, so the new columns appear there too;
# train_feats is not modified here.
for f in cat_feats:
    ds[f'{f}_le'] = LabelEncoder().fit_transform(ds[f])

In [13]:
# Positional hold-out split: the first N_TRAIN rows are used for training and
# the remaining rows for testing (no shuffling). The split point is named once
# instead of repeating the literal in both slices.
N_TRAIN = 23000
train_idxs = ds.index[:N_TRAIN]
test_idxs = ds.index[N_TRAIN:]

In [14]:
# Re-import the fca_interp module so that edits made to it on disk are picked up
# without restarting the kernel.
fcai = reload(fcai)

In [15]:
# Wrap the train and test rows (restricted to train_feats) in multi-valued
# contexts, each with its matching slice of the target column.
cntx_train = fcai.MultiValuedContext(
    ds.loc[train_idxs, train_feats],
    y_true=ds.loc[train_idxs, y_feat],
    cat_attrs=cat_feats,
)
cntx_test = fcai.MultiValuedContext(
    ds.loc[test_idxs, train_feats],
    y_true=ds.loc[test_idxs, y_feat],
    cat_attrs=cat_feats,
)

In [16]:
%%time
# Create a manager over the training context and build its concepts with the
# 'RandomForest' option; rf_params asks for 10 estimators, depth <= 5 and a fixed seed.
fm = fcai.FormalManager(cntx_train)
fm.construct_concepts(algo='RandomForest', rf_params={"n_estimators":10, "max_depth":5, "random_state":42})

HBox(children=(FloatProgress(value=0.0, description='Postprocessing', max=565.0, style=ProgressStyle(descripti…


CPU times: user 14.2 s, sys: 136 ms, total: 14.4 s
Wall time: 14.4 s


In [17]:
# Build the spanning tree through the manager's private helper, with a progress bar.
fm._construct_spanning_tree(use_tqdm=True)

HBox(children=(FloatProgress(value=0.0, description='construct spanning tree', max=565.0, style=ProgressStyle(…




In [18]:
%%timeit
# Timing of the library's connection-building routine (progress bar disabled for the benchmark).
fm._construct_lattice_connections(use_tqdm=False)

19.5 s ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit
# Timing of the library's spanning-tree-based lattice construction (progress bar disabled).
fm._construct_lattice_from_spanning_tree(use_tqdm=False)

51.1 s ± 5.01 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [69]:
# Number of concepts currently held by the manager.
len(fm.get_concepts())

565

In [86]:
def _construct_lattice_from_spanning_tree(self, use_tqdm=True):
    """Unfinished prototype of a chain-based lattice construction.

    In its current state the function only
    (1) normalises and orders the chains returned by ``self._get_chains()`` and
    (2) resets ``_up_neighbs`` and ``_low_neighbs`` of every concept to empty sets.
    The nested loops over concepts and chains are placeholders that do no work yet.

    Parameters
    ----------
    self : object exposing ``_get_chains()`` and ``get_concepts()``
        Passed explicitly because this is a free function in the notebook.
    use_tqdm : bool, default True
        Show progress bars for the (placeholder) loops.

    Returns
    -------
    list of numpy.ndarray
        One ascending integer array per chain, ordered by chain length and
        then by the zero-padded, comma-joined ids.
    """
    # Keep the chains in a plain list. They may differ in length, and wrapping
    # ragged arrays in np.array(...) raises ValueError on NumPy >= 1.24.
    chains = [np.array(sorted(ch)).astype(int) for ch in self._get_chains()]
    # Each chain is already sorted, so the key can join its elements directly.
    chains = sorted(chains, key=lambda ch: (len(ch), ','.join([f"{x:05.0f}" for x in ch])))

    cncpts_dict = {}
    for c in self.get_concepts():
        cncpts_dict[c.get_id()] = c
        # Start from a clean slate: drop any previously computed neighbours.
        c._up_neighbs = set()
        c._low_neighbs = set()

    # TODO: placeholder iteration -- the per-chain logic has not been written yet.
    # The lookup assumes that concept ids are exactly 0..n-1.
    for c_cur_id in tqdm(range(len(cncpts_dict)), total=len(cncpts_dict), disable=not use_tqdm):
        c_cur = cncpts_dict[c_cur_id]

        for ch_id in tqdm(range(len(chains)), total=len(chains), disable=not use_tqdm):
            pass
    return chains

In [102]:
# Inspect the _up_neighb_st attribute of concept 100. The prototype below uses this
# value as a concept id; presumably it is the parent in the spanning tree -- TODO confirm.
fm.get_concept_by_id(100)._up_neighb_st

3

In [107]:
# Same attribute for concept 0. The cell shows no output, i.e. the value is None.
fm.get_concept_by_id(0)._up_neighb_st

In [111]:
# Scratch check of set difference, written with set literals.
{0, 1} - {0}

{1}

In [162]:
# Scratch check: a negating key makes sorted() return the numbers in descending order.
sorted([1,2,3], key=lambda x: -x)

[3, 2, 1]

In [240]:
def _construct_lattice_from_spanning_tree2(self, use_tqdm=True):
    """Derive the upper/lower neighbour sets of every concept from the spanning tree.

    For each concept id in ``0..n-1`` the function

    1. seeds the concept's set of known super-concepts with the set already
       computed for ``_up_neighb_st`` (skipped for id 0),
    2. walks the spanning tree from concept 0 through ``_low_neighbs_st``,
       descending only below concepts that are already known super-concepts
       or pass ``is_subconcept_of``,
    3. reduces that set to the direct upper neighbours by discarding every
       super-concept of another member, and
    4. registers the concept in ``_low_neighbs`` of each direct upper neighbour.

    ``_up_neighbs`` and ``_low_neighbs`` of all concepts are reset first.
    The results are written onto the concept objects; nothing is returned.

    Assumes concept ids are exactly ``0..n-1``. Presumably ``_up_neighb_st``
    always refers to an id smaller than the concept's own, so its set is
    complete when it is reused -- TODO confirm.
    """
    concepts_by_id = {}
    ancestors = {}
    for concept in self.get_concepts():
        concept_id = concept.get_id()
        concepts_by_id[concept_id] = concept
        ancestors[concept_id] = set()
        concept._up_neighbs = set()
        concept._low_neighbs = set()

    n_concepts = len(concepts_by_id)
    for cur_id in tqdm(range(n_concepts), total=n_concepts, disable=not use_tqdm):
        cur = concepts_by_id[cur_id]
        is_start = cur_id == 0
        known = ancestors[cur_id]  # alias: mutated in place below
        if not is_start:
            known.update(ancestors[cur._up_neighb_st])

        # FIFO walk over the spanning tree, starting at concept 0.
        pending = [0]
        while pending:
            cand_id = pending.pop(0)
            cand = concepts_by_id[cand_id]

            if cand_id not in known:
                # Unknown candidate: it only qualifies if it is a different
                # concept that `cur` is a subconcept of.
                if cand_id == cur_id or not cur.is_subconcept_of(cand):
                    continue
            known.add(cand_id)
            pending.extend(cand._low_neighbs_st)

        if is_start:
            continue

        direct = copy(known)
        cur._up_neighbs = direct
        # Highest ids first; anything lying above a surviving member is not a direct neighbour.
        for anc_id in sorted(direct, reverse=True):
            if anc_id in direct:
                direct.difference_update(ancestors[anc_id])
        for parent_id in direct:
            concepts_by_id[parent_id]._low_neighbs.add(cur_id)

    return None

In [241]:
%%time
# Run the prototype on the current manager; it writes the neighbour sets onto the concepts.
# NOTE(review): the function ends with a bare `return`, so `aun` is None.
aun = _construct_lattice_from_spanning_tree2(fm,)

HBox(children=(FloatProgress(value=0.0, max=565.0), HTML(value='')))


CPU times: user 5.78 s, sys: 62.6 ms, total: 5.85 s
Wall time: 5.79 s


In [242]:
# Snapshot the neighbour sets produced by the prototype, keyed by concept id,
# so they survive the reference run that overwrites them.
low_neighbs1, up_neighbs1 = {}, {}
for c in fm.get_concepts():
    c_id = c.get_id()
    low_neighbs1[c_id] = copy(c._low_neighbs)
    up_neighbs1[c_id] = copy(c._up_neighbs)

In [243]:
%%time
# Reference run: rebuild the connections with the library routine on the same manager.
fm._construct_lattice_connections()

HBox(children=(FloatProgress(value=0.0, description='construct lattice connections', max=565.0, style=Progress…


CPU times: user 22 s, sys: 94.6 ms, total: 22.1 s
Wall time: 22 s


In [244]:
# Snapshot the neighbour sets left on the concepts by the reference routine, keyed by concept id.
low_neighbs2, up_neighbs2 = {}, {}
for c in fm.get_concepts():
    c_id = c.get_id()
    low_neighbs2[c_id] = copy(c._low_neighbs)
    up_neighbs2[c_id] = copy(c._up_neighbs)

In [247]:
# Share of concepts whose lower-neighbour sets are equal in both snapshots (1.0 means all agree).
# NOTE(review): up_neighbs1 / up_neighbs2 are collected as well but are not compared here.
np.mean([low_neighbs1[c.get_id()]==low_neighbs2[c.get_id()] for c in fm.get_concepts()])

1.0

In [256]:
# Reload the module again so the following cells run against its current on-disk version.
fcai = reload(fcai)

In [257]:
# Rebuild both contexts after the reload so they are instances of the freshly loaded class.
cntx_train = fcai.MultiValuedContext(
    ds.loc[train_idxs, train_feats],
    y_true=ds.loc[train_idxs, y_feat],
    cat_attrs=cat_feats,
)
cntx_test = fcai.MultiValuedContext(
    ds.loc[test_idxs, train_feats],
    y_true=ds.loc[test_idxs, y_feat],
    cat_attrs=cat_feats,
)

In [258]:
%%time
# Fresh manager on the rebuilt training context, with the same 'RandomForest'
# settings as before (10 estimators, depth <= 5, fixed seed).
fm = fcai.FormalManager(cntx_train)
fm.construct_concepts(algo='RandomForest', rf_params={"n_estimators":10, "max_depth":5, "random_state":42})

HBox(children=(FloatProgress(value=0.0, description='Postprocessing', max=565.0, style=ProgressStyle(descripti…


CPU times: user 16.2 s, sys: 29.7 ms, total: 16.2 s
Wall time: 16.2 s


In [259]:
# Replace each concept's extent with the matching ds index labels, as strings.
# The extent entries are cast to int and used as positions into ds.index.
# NOTE(review): this presumably lines up with the training context only because
# the training rows are the first rows of ds -- confirm.
for c in fm.get_concepts():
    positions = [int(g) for g in c.get_extent()]
    labels = ds.index[positions].astype(str)
    c._extent = list(labels)

In [260]:
%%time
# Build the lattice through the public entry point. With only_spanning_tree=False the
# progress output below shows a spanning-tree stage followed by a lattice-from-tree stage.
fm.construct_lattice(use_tqdm=True, only_spanning_tree=False)

HBox(children=(FloatProgress(value=0.0, description='construct spanning tree', max=565.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='construct lattice from tree', max=565.0, style=ProgressSt…


CPU times: user 15 s, sys: 222 ms, total: 15.3 s
Wall time: 15.1 s


In [264]:
# Build the lattice figure (presumably a Plotly figure, going by the method name);
# the bare `fig` on the last line displays it as the cell output.
fig = fm.get_plotly_fig(new_objs_lim=0, new_attrs_lim=0)
fig