# Preliminary
* Import libraries
* Load data
* Train BlackBox model

In [1]:
import numpy as np
import pandas as pd
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# `display` and `HTML` are imported from the public `IPython.display` module;
# pulling `display` out of `IPython.core.display` is a deprecated import path.
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class `container` to 90% width
# (presumably to give wide DataFrame outputs more horizontal room).
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import sys

# Absolute path of the parent of the current working directory; the two
# sub-directories below are resolved relative to it.
BASE_PATH = os.path.realpath('..')
LIB_DIR = os.path.join(BASE_PATH, 'lib')
DATASETS_DIR = os.path.join(BASE_PATH, 'datasets')
# Put the local `lib` directory on the import path. The membership guard keeps
# a re-run of this cell from appending the same entry a second time.
if not any(entry == LIB_DIR for entry in sys.path):
    sys.path.append(LIB_DIR)

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook

from importlib import reload

In [2]:
import scipy as sp

In [3]:
from itertools import combinations, islice

In [4]:
from datetime import datetime

# Use the basic Mango dataset

In [5]:
# Read the mango table from the datasets directory; the first CSV column is
# used as the row index.
mango_path = os.path.join(DATASETS_DIR, 'mango.csv')
mango_ds = pd.read_csv(mango_path, index_col=0)
mango_ds

Unnamed: 0_level_0,color,firm,smooth,form,fruit
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
apple,yellow,False,True,round,True
grapefruit,yellow,False,False,round,True
kiwi,green,False,False,oval,True
plum,blue,False,True,oval,True
toy cube,green,True,True,cubic,False
egg,white,True,True,oval,False
tennis ball,white,False,False,round,False
mango,green,False,True,oval,True


In [6]:
# Binarize the mango table: every column becomes one or more boolean columns.
#   * boolean columns are copied as they are;
#   * text columns get one indicator column per distinct value, named
#     "<column>_<value>";
#   * any other dtype is rejected with a TypeError.
# The columns are collected in a dict and the DataFrame is assembled once at the
# end instead of inserting columns into a growing frame one at a time.
mango_bin_columns = {}
for f in mango_ds.columns:
    column = mango_ds[f]
    if pd.api.types.is_bool_dtype(column):
        mango_bin_columns[f] = column
    elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # Accept pandas' dedicated string dtype as well as plain `object`:
        # comparing the dtype to np.dtype('O') alone rejects text columns that
        # pandas stores with a string dtype, although `str` is listed as supported.
        for v in column.unique():
            mango_bin_columns[f"{f}_{v}"] = column == v
    else:
        raise TypeError(f'Column DataType {mango_ds[f].dtype} is not supported. Possible variants are [bool, str]')
mango_bin_ds = pd.DataFrame(mango_bin_columns)

In [7]:
import fca_interp as fcai

In [8]:
# Re-execute the already imported `fca_interp` module via importlib's `reload`
# and rebind `fcai` to the result (presumably so that edits to the library are
# picked up without restarting the kernel).
fcai = reload(fcai)

In [9]:
%%timeit
cntx = fcai.Context(mango_bin_ds)
fm = fcai.FormalManager(cntx)
fm.construct_concepts(use_tqdm=False)
fm.construct_lattice_connections(use_tqdm=False)

4.37 ms ± 41.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Trying a more realistic dataset: the bank dataset

In [10]:
# Read the bank table from the datasets directory; the file uses ';' as the
# field separator. Only the first rows are displayed.
bank_path = os.path.join(DATASETS_DIR, 'bank.csv')
bank_ds = pd.read_csv(bank_path, sep=';')
bank_ds.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [11]:
# Binarize the bank table: every column becomes one or more boolean columns.
#   * boolean columns are copied as they are;
#   * text columns get one indicator column per distinct value, named
#     "<column>_<value>";
#   * numeric columns get one threshold column per quantile, named
#     "<column><<quantile>" and holding `value < quantile`;
#   * any other dtype is rejected with a TypeError instead of being passed to
#     np.quantile.
# The columns are collected in a dict and the DataFrame is assembled once at the
# end instead of inserting columns into a growing frame one at a time.

# Quantile levels at which numeric columns are thresholded.
QUANTILE_LEVELS = [0.25, 0.50, 0.75]

bank_bin_columns = {}
for f in bank_ds.columns:
    column = bank_ds[f]
    if pd.api.types.is_bool_dtype(column):
        bank_bin_columns[f] = column
    elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # Accept pandas' dedicated string dtype as well as plain `object`, so
        # text columns are one-hot encoded rather than sent to np.quantile.
        for v in column.unique():
            bank_bin_columns[f"{f}_{v}"] = column == v
    elif pd.api.types.is_numeric_dtype(column):
        # Several quantile levels can share one value; np.unique makes the
        # de-duplication explicit (equal thresholds would map to the same column
        # name anyway).
        # NOTE(review): a threshold equal to the column minimum yields an
        # all-False column (this looks like the case for `pdays<-1.0` in the
        # displayed output) — consider `<=` or dropping constant columns.
        for q in np.unique(np.quantile(column, QUANTILE_LEVELS)):
            bank_bin_columns[f"{f}<{q}"] = column < q
    else:
        raise TypeError(f'Column DataType {bank_ds[f].dtype} is not supported. Possible variants are [bool, str, numeric]')
bank_bin_ds = pd.DataFrame(bank_bin_columns)
bank_bin_ds.head()

Unnamed: 0,age<33.0,age<39.0,age<49.0,job_unemployed,job_services,job_management,job_blue-collar,job_self-employed,job_technician,job_entrepreneur,...,campaign<2.0,campaign<3.0,pdays<-1.0,previous<0.0,poutcome_unknown,poutcome_failure,poutcome_other,poutcome_success,y_no,y_yes
0,True,True,True,True,False,False,False,False,False,False,...,True,True,False,False,True,False,False,False,True,False
1,False,True,True,False,True,False,False,False,False,False,...,True,True,False,False,False,True,False,False,True,False
2,False,True,True,False,False,True,False,False,False,False,...,True,True,False,False,False,True,False,False,True,False
3,True,True,True,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
4,False,False,False,False,False,False,True,False,False,False,...,True,True,False,False,True,False,False,False,True,False


In [12]:
# Benchmark slice: the first 10 rows restricted to the first 10 columns of the
# binarized bank table. The shape is printed as a sanity check.
cds = bank_bin_ds.head(10).iloc[:, :10]
print(cds.shape)

(10, 10)


In [13]:
%%timeit
cntx = fcai.Context(cds)
fm = fcai.FormalManager(cntx)
fm.construct_concepts(use_tqdm=False)
fm.construct_lattice_connections(use_tqdm=False)

2.37 ms ± 45.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Benchmark slice: the first 100 rows restricted to the first 10 columns of the
# binarized bank table. The shape is printed as a sanity check.
cds = bank_bin_ds.head(100).iloc[:, :10]
print(cds.shape)

(100, 10)


In [15]:
%%timeit
cntx = fcai.Context(cds)
fm = fcai.FormalManager(cntx)
fm.construct_concepts(use_tqdm=False)
fm.construct_lattice_connections(use_tqdm=False)

44.5 ms ± 494 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Benchmark slice: the first 100 rows of the binarized bank table with every
# column kept. The shape is printed as a sanity check.
cds = bank_bin_ds.head(100)
print(cds.shape)

(100, 63)


In [17]:
%%timeit
cntx = fcai.Context(cds)
fm = fcai.FormalManager(cntx)
fm.construct_concepts(use_tqdm=False)
fm.construct_lattice_connections(use_tqdm=False)

KeyboardInterrupt: 