In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import sys
BASE_PATH = os.path.realpath('..')
DATASETS_DIR = os.path.join(BASE_PATH, 'datasets')
LIB_DIR = os.path.join(BASE_PATH,'lib')
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from importlib import reload

In [2]:
import fca_interp as fcai

In [3]:
from utils_ import powerset

In [4]:
from importlib import reload

In [5]:
from sklearn.datasets import load_boston

# Attribute Information:

Input variables:

__bank client data:__
1. age (numeric)
2. job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3. marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4. education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5. default: has credit in default? (categorical: 'no','yes','unknown')
6. housing: has housing loan? (categorical: 'no','yes','unknown')
7. loan: has personal loan? (categorical: 'no','yes','unknown')

__related with the last contact of the current campaign:__
8. contact: contact communication type (categorical: 'cellular','telephone')
9. month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10. day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11. duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

__other attributes:__
12. campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13. pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14. previous: number of contacts performed before this campaign and for this client (numeric)
15. poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

__social and economic context attributes__
16. emp.var.rate: employment variation rate - quarterly indicator (numeric)
17. cons.price.idx: consumer price index - monthly indicator (numeric)
18. cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19. euribor3m: euribor 3 month rate - daily indicator (numeric)
20. nr.employed: number of employees - quarterly indicator (numeric)

__Output variable (desired target):__
21. y - has the client subscribed a term deposit? (binary: 'yes','no')



In [6]:
real_feats = ['age', 'default', 'housing', 'loan', 'campaign', 'pdays', 'previous', 'balance',]
cat_feats  = ['job', 'marital', 'education', 'contact', 'month', 'poutcome',]

In [7]:
bank_ds = pd.read_csv(os.path.join(DATASETS_DIR, 'bank.csv'), sep=';')

for f in ['default', 'housing','loan', 'y']:
    bank_ds[f] = bank_ds[f].apply(lambda x: {'no':0, 'yes':1, 'unknown':None}[x])
bank_ds.loc[bank_ds['pdays']==-1, 'pdays'] = None

bank_ds.index = bank_ds.index.astype(str)
bank_ds

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,0,1787,0,0,cellular,19,oct,79,1,,0,unknown,0
1,33,services,married,secondary,0,4789,1,1,cellular,11,may,220,1,339.0,4,failure,0
2,35,management,single,tertiary,0,1350,1,0,cellular,16,apr,185,1,330.0,1,failure,0
3,30,management,married,tertiary,0,1476,1,1,unknown,3,jun,199,4,,0,unknown,0
4,59,blue-collar,married,secondary,0,0,1,0,unknown,5,may,226,1,,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,0,-333,1,0,cellular,30,jul,329,5,,0,unknown,0
4517,57,self-employed,married,tertiary,1,-3313,1,1,unknown,9,may,153,1,,0,unknown,0
4518,57,technician,married,secondary,0,295,0,0,cellular,19,aug,151,11,,0,unknown,0
4519,28,blue-collar,married,secondary,0,1137,0,0,cellular,6,feb,129,4,211.0,3,other,0


In [8]:
bb = CatBoostClassifier(cat_features=cat_feats)
bb.fit(bank_ds[cat_feats+real_feats], bank_ds['y'], verbose=False, plot=False)
bank_ds['preds'] = bb.predict(bank_ds[cat_feats+real_feats])

In [9]:
bank_ds['preds'].sum()

149

In [10]:
bank_ds.to_csv('bank_ds_new.csv')
bank_ds = pd.read_csv('bank_ds_new.csv', index_col=0)

In [11]:
bank_ds.index = bank_ds.index.astype(str)

# Анализируем положительные примеры

In [12]:
import fca_interp as fcai

In [13]:
fcai = reload(fcai)

In [14]:
flg = bank_ds['preds']==1

## 1. Создаём формальный контекст
Каждая строка в нём должна соответствовать сильной гипотизе. И кол-во признаков должно быть минимальным для более быстрого расчёта

In [15]:
def get_negative_support(bin_ds, flg, fs=None, drop_negative_duplicates=False):
    fs = fs if fs is not None else bin_ds.columns
    fs = list(fs)
    pos_examples = bin_ds.loc[flg, fs].astype(int)
    neg_examples = bin_ds.loc[~flg, fs].astype(int)
    
    pos_examples = pos_examples.drop_duplicates()            
    if drop_negative_duplicates:
        neg_examples = neg_examples.drop_duplicates()
        coefs = [1]*len(neg_examples)
    else:
        neg_examples = neg_examples.pivot_table(index=fs, aggfunc='size').reset_index()
        coefs = neg_examples[0]
        neg_examples = neg_examples.drop(0, 1)
        
    cross = pos_examples.dot(neg_examples.T)    
    #cross_same = (cross==neg_examples.sum(1))&((cross.T==pos_examples.sum(1)).T)
    cross_same = (cross.T==pos_examples.sum(1)).T # if g_+` 
    neg_support = (cross_same.any(0) * coefs).sum()
    return neg_support

In [16]:
def is_feats_strong(bin_ds, flg, fs=None):
    fs = fs if fs is not None else bin_ds.columns
    neg_support = get_negative_support(bin_ds, flg, fs, drop_negative_duplicates=True)
    return neg_support==0

In [17]:
def greedy_search_of_strong_hyps_attrs(bin_ds, flg, max_neg_supp_lim=0):
    feat_pos_supp = bin_ds.loc[flg].sum().sort_values(ascending=False)
    feat_strongness = bin_ds.loc[flg].sum()/bin_ds.sum()
    feat_strongness = feat_strongness.sort_values(ascending=False)
    
    selected_feats = []
    min_neg_supps = []
    for i in tqdm(range(len(bin_ds.columns))):
        neg_supps = {}
        for f in tqdm(bin_ds.drop(selected_feats,1).columns, leave=False):
            neg_supps[f] = get_negative_support(bin_ds, flg, fs=selected_feats+[f])
        neg_supps = pd.Series(neg_supps).sort_values()
        
        smallest_neg_supps = neg_supps[neg_supps==neg_supps.min()].index    
        most_strong_feats = feat_strongness[feat_strongness == feat_strongness[smallest_neg_supps].max()].index
        most_strong_feats = [f for f in most_strong_feats if f not in selected_feats]
        
        f = feat_pos_supp[most_strong_feats].idxmax()    
        selected_feats.append(f)
        min_neg_supps.append(neg_supps.min())
        print(f'{i}) min neg support: {min_neg_supps[-1]}')
        if min_neg_supps[-1]<=max_neg_supp_lim:
            break
    return selected_feats

In [18]:
def squeeze_selected_feats(bin_ds, flg, selected_feats):
    selected_feats = selected_feats.copy()
    for i in tqdm(range(len(selected_feats))):
        for f in selected_feats:
            is_strong = is_feats_strong(bin_ds[selected_feats].drop(f,1), flg)
            if is_strong:
                selected_feats = [f_ for f_ in selected_feats if f_!=f]
            break
        else:
            break
    return selected_feats

### Получаем все возможные бинарные разбиения

In [19]:
binarizer = fcai.Binarizer()

In [140]:
ths = {}
for f in real_feats:
    ths[f] = sorted(bank_ds.loc[flg, f].unique())
cases = {}

In [141]:
bin_ds, forder = binarizer.binarize_ds(bank_ds[cat_feats+real_feats], cat_feats, ths, cases)
bin_ds.shape

(4521, 644)

In [142]:
bin_ds[flg].drop_duplicates().T.drop_duplicates().T.shape

(149, 626)

In [143]:
is_feats_strong(bin_ds, flg)

True

Отлично: на данном этапе все строки соответствуют сильным гипотезам. Попробуем сократить контекст

Сначала уберём описания, которые встречаются всегда/никогда

In [144]:
for i in [0,1]:
    s = bin_ds.columns[bin_ds.mean()==i]
    print(f'mean = {i}, n = {len(s)}')
    bin_ds = bin_ds.drop(s, 1)

mean = 0, n = 2
mean = 1, n = 9


In [145]:
is_feats_strong(bin_ds, flg)

True

Далее попробуем убрать описания, слишком редко стречающиеся в контексте

In [146]:
bin_ds_pos = bin_ds.loc[flg]
bin_ds_neg = bin_ds.loc[~flg]

In [147]:
feat_pos_supp = bin_ds_pos.sum().sort_values(ascending=False)
feat_pos_supp

balance__leq__14220     149
balance__geq__-1206     149
default__leq__0         149
campaign__leq__9        149
previous__leq__14       149
                       ... 
campaign__geq__9          1
contact__is__unknown      1
age__geq__87              1
balance__geq__14220       1
pdays__geq__761.0         1
Length: 633, dtype: int64

In [148]:
best_general_feats = feat_pos_supp[feat_pos_supp>=20].index
len(best_general_feats), get_negative_support(bin_ds[best_general_feats], flg)

(523, 0)

Кол-во столбцов уменьшилось почти на 200

In [149]:
bin_ds_short = bin_ds[best_general_feats]
bin_ds_short.shape

(4521, 523)

Попробуем убрать наименее сильные описания (т.е. они встречаются в отрицательном классе чаще, чем в положительном)

In [150]:
feat_strongness = bin_ds_short.loc[flg].sum()/bin_ds_short.sum()
feat_strongness = feat_strongness.sort_values(ascending=False)
feat_strongness.head()

poutcome__is__success    0.736434
pdays__leq__98.0         0.380952
pdays__leq__97.0         0.377049
pdays__leq__99.0         0.373134
pdays__leq__96.0         0.364407
dtype: float64

In [151]:
best_strongest_feats = feat_strongness[feat_strongness>=0.02].index
len(best_strongest_feats), get_negative_support(bin_ds_short, flg, best_strongest_feats)

(516, 0)

In [152]:
bin_ds_short = bin_ds[best_strongest_feats]

Уже не так успешно

In [153]:
feat_strongness = bin_ds_short.loc[flg].sum()/bin_ds_short.sum()
feat_strongness = feat_strongness.sort_values(ascending=False)
feat_strongness.head()

poutcome__is__success    0.736434
pdays__leq__98.0         0.380952
pdays__leq__97.0         0.377049
pdays__leq__99.0         0.373134
pdays__leq__96.0         0.364407
dtype: float64

Жадным образом подберём минимальное кол-во аттрибутов, составляющиъ сильные гипотезы

In [154]:
selected_feats = greedy_search_of_strong_hyps_attrs(bin_ds_short, flg, 0)

HBox(children=(FloatProgress(value=0.0, max=516.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=516.0), HTML(value='')))

0) min neg support: 4215


HBox(children=(FloatProgress(value=0.0, max=515.0), HTML(value='')))

1) min neg support: 4139


HBox(children=(FloatProgress(value=0.0, max=514.0), HTML(value='')))

2) min neg support: 4095


HBox(children=(FloatProgress(value=0.0, max=513.0), HTML(value='')))

3) min neg support: 4086


HBox(children=(FloatProgress(value=0.0, max=512.0), HTML(value='')))

4) min neg support: 4082


HBox(children=(FloatProgress(value=0.0, max=511.0), HTML(value='')))

5) min neg support: 4082


HBox(children=(FloatProgress(value=0.0, max=510.0), HTML(value='')))

6) min neg support: 3944


HBox(children=(FloatProgress(value=0.0, max=509.0), HTML(value='')))

7) min neg support: 3943


HBox(children=(FloatProgress(value=0.0, max=508.0), HTML(value='')))

8) min neg support: 3093


HBox(children=(FloatProgress(value=0.0, max=507.0), HTML(value='')))

9) min neg support: 2748


HBox(children=(FloatProgress(value=0.0, max=506.0), HTML(value='')))

10) min neg support: 2559


HBox(children=(FloatProgress(value=0.0, max=505.0), HTML(value='')))

11) min neg support: 2449


HBox(children=(FloatProgress(value=0.0, max=504.0), HTML(value='')))

KeyboardInterrupt: 

In [74]:
selected_feats_short = squeeze_selected_feats(bin_ds, flg, selected_feats)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [75]:
len(selected_feats), len(selected_feats_short)

(3, 3)

In [78]:
with open('short_feats_pos_samples_refined.txt', 'w') as f:
    f.write(','.join(selected_feats_short))
with open('short_feats_pos_samples_refined.txt', 'r') as f:
    short_feats = f.read().split(',')

In [111]:
bin_ds['y'] = flg
bin_ds[selected_feats_short+['y']].to_csv('Bank_bin_ds_pos.csv')

In [79]:
bin_ds_pos = bin_ds.loc[flg]
bin_ds_neg = bin_ds.loc[~flg]

# Анализируем отрицательные примеры

In [20]:
flg = bank_ds['preds']==0

### Получаем все возможные бинарные разбиения

In [21]:
binarizer = fcai.Binarizer()

In [38]:
ths = {}
for f in real_feats:
    s = bank_ds.loc[flg, f].value_counts()
    s /= s.sum()
    ths[f] =  np.random.choice(s.index, size=10, p=s.values)
cases = {}

In [39]:
bin_ds, forder = binarizer.binarize_ds(bank_ds[cat_feats+real_feats], cat_feats, ths, cases)
bin_ds.shape

(4521, 154)

In [40]:
bin_ds[flg].drop_duplicates().T.drop_duplicates().T.shape

(4224, 147)

In [41]:
is_feats_strong(bin_ds, flg)

True

Отлично: на данном этапе все строки соответствуют сильным гипотезам. Попробуем сократить контекст

Сначала уберём описания, которые встречаются всегда/никогда

In [42]:
for i in [0,1]:
    s = bin_ds.columns[bin_ds.mean()==i]
    print(f'mean = {i}, n = {len(s)}')
    bin_ds = bin_ds.drop(s, 1)

mean = 0, n = 0
mean = 1, n = 7


In [43]:
is_feats_strong(bin_ds, flg)

True

Далее попробуем убрать описания, слишком редко стречающиеся в контексте

In [44]:
bin_ds_pos = bin_ds.loc[flg]
bin_ds_neg = bin_ds.loc[~flg]

In [45]:
feat_pos_supp = bin_ds_pos.sum().sort_values(ascending=False)
feat_pos_supp

month__not__dec           4359
poutcome__not__success    4338
job__not__unknown         4337
month__not__mar           4334
month__not__sep           4332
                          ... 
pdays__geq__369.0           38
job__is__unknown            35
poutcome__is__success       34
month__is__dec              13
pdays__geq__435.0           13
Length: 147, dtype: int64

In [51]:
best_general_feats = feat_pos_supp[feat_pos_supp>=200].index
len(best_general_feats), get_negative_support(bin_ds[best_general_feats], flg)

(129, 0)

In [52]:
bin_ds_short = bin_ds[best_general_feats]
bin_ds_short.shape

(4521, 129)

Попробуем убрать наименее сильные описания (т.е. они встречаются в отрицательном классе чаще, чем в положительном)

In [53]:
feat_strongness = bin_ds_short.loc[flg].sum()/bin_ds_short.sum()
feat_strongness = feat_strongness.sort_values(ascending=False)
feat_strongness.head()

contact__is__unknown     0.999245
poutcome__is__unknown    0.993252
previous__leq__0         0.993252
month__is__may           0.992847
job__is__blue-collar     0.989429
dtype: float64

In [56]:
best_strongest_feats = feat_strongness[feat_strongness>=0.5].index
len(best_strongest_feats), get_negative_support(bin_ds_short, flg, best_strongest_feats)

(129, 0)

In [57]:
bin_ds_short = bin_ds[best_strongest_feats]

Уже не так успешно

In [58]:
feat_strongness = bin_ds_short.loc[flg].sum()/bin_ds_short.sum()
feat_strongness = feat_strongness.sort_values(ascending=False)
feat_strongness.head()

contact__is__unknown     0.999245
poutcome__is__unknown    0.993252
previous__leq__0         0.993252
month__is__may           0.992847
job__is__blue-collar     0.989429
dtype: float64

Жадным образом подберём минимальное кол-во аттрибутов, составляющиъ сильные гипотезы

In [60]:
selected_feats = squeeze_selected_feats(bin_ds, flg, best_strongest_feats)

HBox(children=(FloatProgress(value=0.0, max=129.0), HTML(value='')))




In [62]:
selected_feats = squeeze_selected_feats(bin_ds, flg, selected_feats)

HBox(children=(FloatProgress(value=0.0, max=123.0), HTML(value='')))




In [61]:
len(selected_feats)

123

In [64]:
selected_feats_short = selected_feats.copy()

In [66]:
with open('short_feats_neg_samples_refined.txt', 'w') as f:
    f.write(','.join(selected_feats_short))
with open('short_feats_neg_samples_refined.txt', 'r') as f:
    short_feats = f.read().split(',')

In [67]:
bin_ds['y'] = flg
bin_ds[selected_feats_short+['y']].to_csv('Bank_bin_ds_neg.csv')

In [68]:
bin_ds_pos = bin_ds.loc[flg]
bin_ds_neg = bin_ds.loc[~flg]