In [1]:
import pandas as pd
import pickle
import logging
from collections import Counter
from contrxt.trace import Trace
from contrxt.explain import Explain

# Run-wide switches, consumed by the Trace / Explain constructors further down.
# Passed to Trace as `hyperparameters_selection`.
HYPER = False
# Passed to Trace as `save_surrogates`.
SAVE_SUR = True
# Passed to both Trace and Explain as `save_csvs`.
SAVE_CSV = True

# Load Data

In [2]:
def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes (the bare ``pickle.load(open(...))`` form leaves it open
    until garbage collection).

    NOTE(security): unpickling can execute arbitrary code. Only load artifacts
    that were produced by this project / a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Read the training split of each time period and drop rows with no text.
df_time_1 = pd.read_csv('../data/20newsgroups/df_time_1_train.csv')
df_time_2 = pd.read_csv('../data/20newsgroups/df_time_2_train.csv')

df_time_1 = df_time_1[~df_time_1['corpus'].isnull()]
df_time_2 = df_time_2[~df_time_2['corpus'].isnull()]

# Load vectorizer (one fitted vectorizer per time period)
vectorizer_time_1 = _load_pickle('../model/20newsgroups/vectorizer_time_1.pickle')
vectorizer_time_2 = _load_pickle('../model/20newsgroups/vectorizer_time_2.pickle')

# Load data: documents (X) and their true category labels (Y)
X_t1, Y_t1 = df_time_1['corpus'], df_time_1['category']
X_t2, Y_t2 = df_time_2['corpus'], df_time_2['category']

print('Classes for time 1: ', Counter(Y_t1), flush=True)
print('Classes for time 2: ', Counter(Y_t2), flush=True)

# Load model (one classifier per time period)
model_time_1 = _load_pickle('../model/20newsgroups/model_NB_time_1.pickle')
model_time_2 = _load_pickle('../model/20newsgroups/model_NB_time_2.pickle')

# Predict: each model only sees features from its own period's vectorizer.
# (Variable spelling "tranformed" is kept as-is so any cell that refers to
# these names keeps working.)
tranformed_corpus_t1 = vectorizer_time_1.transform(X_t1)
tranformed_corpus_t2 = vectorizer_time_2.transform(X_t2)
predicted_labels_t1 = model_time_1.predict(tranformed_corpus_t1)
predicted_labels_t2 = model_time_2.predict(tranformed_corpus_t2)

# Sanity check: documents, true labels and predictions must line up 1:1.
assert len(X_t1) == len(Y_t1) == len(predicted_labels_t1), f'{len(X_t1)}, {len(Y_t1)}, {len(predicted_labels_t1)}'
assert len(X_t2) == len(Y_t2) == len(predicted_labels_t2), f'{len(X_t2)}, {len(Y_t2)}, {len(predicted_labels_t2)}'

Classes for time 1:  Counter({'sci.crypt': 464, 'sci.electronics': 456, 'rec.sport.baseball': 455, 'soc.religion.christian': 449, 'sci.space': 448, 'comp.graphics': 446, 'sci.med': 444, 'comp.windows.x': 444, 'rec.sport.hockey': 444, 'rec.motorcycles': 444, 'comp.sys.mac.hardware': 443, 'comp.sys.ibm.pc.hardware': 440, 'rec.autos': 439, 'misc.forsale': 437, 'comp.os.ms-windows.misc': 429, 'talk.politics.mideast': 425, 'talk.politics.guns': 404, 'alt.atheism': 352, 'talk.politics.misc': 350, 'talk.religion.misc': 272})
Classes for time 2:  Counter({'rec.sport.hockey': 249, 'rec.motorcycles': 248, 'soc.religion.christian': 248, 'rec.sport.baseball': 247, 'rec.autos': 246, 'sci.crypt': 246, 'sci.med': 246, 'comp.windows.x': 245, 'comp.os.ms-windows.misc': 244, 'sci.space': 244, 'sci.electronics': 243, 'comp.sys.ibm.pc.hardware': 242, 'misc.forsale': 240, 'comp.graphics': 239, 'comp.sys.mac.hardware': 235, 'talk.politics.mideast': 226, 'talk.politics.guns': 214, 'alt.atheism': 169, 'talk.p



# Trace

In [3]:
# Dataset / model identifiers; together they select the results sub-directory.
DATASET_DIR = '20newsgroups'
MODEL = 'NB'

# Keyword options for Trace, gathered in one place so the constructor call
# below only has to list the data for the two time periods.
trace_options = dict(
    data_type='text',
    log_level=logging.INFO,
    hyperparameters_selection=HYPER,
    save_path=f'../results/{DATASET_DIR}/{MODEL}',
    save_surrogates=SAVE_SUR,
    save_csvs=SAVE_CSV,
)
trace = Trace(
    X_t1, Y_t1, predicted_labels_t1,
    X_t2, Y_t2, predicted_labels_t2,
    **trace_options
)

# A fixed value of 1 is used here. The disabled alternative is
#   percent_dataset = trace.run_montecarlo(threshold=0.9)
# which presumably estimates the fraction instead -- verify against contrxt.
percent_dataset = 1
print(f'Running trace with percent of dataset to use: {percent_dataset}', flush=True)
trace.run_trace(percent_dataset)

2021-09-22 11:58:58 - contrxt.trace - INFO - List of common classes: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


Running trace with percent of dataset to use: 1


2021-09-22 11:58:58 - contrxt.data.data_manager - INFO - Sampling dataset with percent: 1 and saving labels...
2021-09-22 11:58:58 - contrxt.data.data_manager - INFO - N. Samples time_1: 8485
2021-09-22 11:59:01 - contrxt.data.data_manager - INFO - Finished predicting time_1
2021-09-22 11:59:01 - contrxt.trace - INFO - Starting explanation in time_1 for class_id alt.atheism
2021-09-22 11:59:01 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.708, 'f1_macro': 0.754, 'f1_weighted': 0.754, 'recall_weighted': 0.763, 'precision_weighted': 0.805, 'balanced_accuracy': 0.762}
2021-09-22 11:59:01 - contrxt.surrogate.generic_surrogate - INFO -               precision    recall  f1-score   support

           0       0.69      0.95      0.80       353
           1       0.92      0.58      0.71       351

    accuracy                           0.76       704
   macro avg       0.81      0.76      0.75       704
weighted avg       0.80      0.76      0.75  

2021-09-22 11:59:03 - contrxt.trace - INFO - Starting explanation in time_1 for class_id comp.windows.x
2021-09-22 11:59:03 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.643, 'f1_macro': 0.715, 'f1_weighted': 0.715, 'recall_weighted': 0.733, 'precision_weighted': 0.814, 'balanced_accuracy': 0.733}
2021-09-22 11:59:03 - contrxt.surrogate.generic_surrogate - INFO -               precision    recall  f1-score   support

           0       0.65      0.99      0.79       444
           1       0.97      0.48      0.64       444

    accuracy                           0.73       888
   macro avg       0.81      0.73      0.71       888
weighted avg       0.81      0.73      0.71       888

2021-09-22 11:59:03 - contrxt.surrogate.generic_surrogate - INFO - Time for fitting surrogate: 0.112
2021-09-22 11:59:03 - contrxt.surrogate.generic_surrogate - INFO - Transforming surrogate to BDD string...
2021-09-22 11:59:03 - contrxt.surrogate.generic_surroga

2021-09-22 11:59:05 - contrxt.surrogate.generic_surrogate - INFO - Transforming surrogate to BDD string...
2021-09-22 11:59:05 - contrxt.surrogate.generic_surrogate - INFO - BDD String for class rec.sport.hockey: ~team & ~game & ~hockey & ~nhl & playoff | ~team & ~game & ~hockey & nhl | ~team & ~game & hockey | ~team & game & ~problem & ~really & ~time | ~team & game & ~problem & ~really & time | ~team & game & ~problem & really | ~team & game & problem | team & ~problem & ~shot & ~hes & ~way | team & ~problem & ~shot & ~hes & way | team & ~problem & ~shot & hes | team & ~problem & shot | team & problem
2021-09-22 11:59:05 - contrxt.surrogate.generic_surrogate - INFO - Saving ../results/20newsgroups/NB/surrogate_tree/rec.sport.hockey_time_1.png to disk
2021-09-22 11:59:05 - contrxt.trace - INFO - Starting explanation in time_1 for class_id sci.crypt
2021-09-22 11:59:05 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.778, 'f1_macro': 0.81, 'f1_w

2021-09-22 11:59:07 - contrxt.trace - INFO - Starting explanation in time_1 for class_id talk.politics.guns
2021-09-22 11:59:07 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.741, 'f1_macro': 0.784, 'f1_weighted': 0.784, 'recall_weighted': 0.792, 'precision_weighted': 0.845, 'balanced_accuracy': 0.792}
2021-09-22 11:59:07 - contrxt.surrogate.generic_surrogate - INFO -               precision    recall  f1-score   support

           0       0.71      0.99      0.83       405
           1       0.98      0.60      0.74       403

    accuracy                           0.79       808
   macro avg       0.85      0.79      0.78       808
weighted avg       0.84      0.79      0.78       808

2021-09-22 11:59:07 - contrxt.surrogate.generic_surrogate - INFO - Time for fitting surrogate: 0.106
2021-09-22 11:59:07 - contrxt.surrogate.generic_surrogate - INFO - Transforming surrogate to BDD string...
2021-09-22 11:59:07 - contrxt.surrogate.generic_sur

2021-09-22 11:59:10 - contrxt.surrogate.generic_surrogate - INFO - Time for fitting surrogate: 0.049
2021-09-22 11:59:10 - contrxt.surrogate.generic_surrogate - INFO - Transforming surrogate to BDD string...
2021-09-22 11:59:10 - contrxt.surrogate.generic_surrogate - INFO - BDD String for class comp.graphics: ~graphics & ~format & ~computer_graphics & ~time & image | ~graphics & ~format & computer_graphics | ~graphics & format & ~based | ~graphics & format & based | graphics & ~utility & ~size | graphics & ~utility & size | graphics & utility
2021-09-22 11:59:10 - contrxt.surrogate.generic_surrogate - INFO - Saving ../results/20newsgroups/NB/surrogate_tree/comp.graphics_time_2.png to disk
2021-09-22 11:59:10 - contrxt.trace - INFO - Starting explanation in time_2 for class_id comp.os.ms-windows.misc
2021-09-22 11:59:10 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.815, 'f1_macro': 0.837, 'f1_weighted': 0.837, 'recall_weighted': 0.84, 'precisi

2021-09-22 11:59:12 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.812, 'f1_macro': 0.836, 'f1_weighted': 0.836, 'recall_weighted': 0.839, 'precision_weighted': 0.87, 'balanced_accuracy': 0.839}
2021-09-22 11:59:12 - contrxt.surrogate.generic_surrogate - INFO -               precision    recall  f1-score   support

           0       0.76      0.98      0.86       246
           1       0.98      0.70      0.81       246

    accuracy                           0.84       492
   macro avg       0.87      0.84      0.84       492
weighted avg       0.87      0.84      0.84       492

2021-09-22 11:59:12 - contrxt.surrogate.generic_surrogate - INFO - Time for fitting surrogate: 0.042
2021-09-22 11:59:12 - contrxt.surrogate.generic_surrogate - INFO - Transforming surrogate to BDD string...
2021-09-22 11:59:12 - contrxt.surrogate.generic_surrogate - INFO - BDD String for class rec.autos: ~car & ~cars & ~v_v & ~ford & engine | ~car & ~cars & ~v_v & 

2021-09-22 11:59:13 - contrxt.surrogate.generic_surrogate - INFO - Saving ../results/20newsgroups/NB/surrogate_tree/sci.electronics_time_2.png to disk
2021-09-22 11:59:13 - contrxt.trace - INFO - Starting explanation in time_2 for class_id sci.med
2021-09-22 11:59:14 - contrxt.surrogate.generic_surrogate - INFO - Fidelity of the surrogate: {'f1_binary': 0.649, 'f1_macro': 0.718, 'f1_weighted': 0.718, 'recall_weighted': 0.736, 'precision_weighted': 0.813, 'balanced_accuracy': 0.736}
2021-09-22 11:59:14 - contrxt.surrogate.generic_surrogate - INFO -               precision    recall  f1-score   support

           0       0.66      0.98      0.79       246
           1       0.97      0.49      0.65       246

    accuracy                           0.74       492
   macro avg       0.81      0.74      0.72       492
weighted avg       0.81      0.74      0.72       492

2021-09-22 11:59:14 - contrxt.surrogate.generic_surrogate - INFO - Time for fitting surrogate: 0.054
2021-09-22 11:59:1

2021-09-22 11:59:15 - contrxt.surrogate.generic_surrogate - INFO - Time for fitting surrogate: 0.037
2021-09-22 11:59:15 - contrxt.surrogate.generic_surrogate - INFO - Transforming surrogate to BDD string...
2021-09-22 11:59:15 - contrxt.surrogate.generic_surrogate - INFO - BDD String for class talk.politics.misc: ~gay_percentage & ~government & ~state_usa & ~opinions_mine & children | ~gay_percentage & ~government & ~state_usa & opinions_mine | ~gay_percentage & ~government & state_usa | ~gay_percentage & government & ~provided & ~laws & ~citizens | ~gay_percentage & government & ~provided & ~laws & citizens | ~gay_percentage & government & ~provided & laws & example | gay_percentage
2021-09-22 11:59:15 - contrxt.surrogate.generic_surrogate - INFO - Saving ../results/20newsgroups/NB/surrogate_tree/talk.politics.misc_time_2.png to disk
2021-09-22 11:59:15 - contrxt.trace - INFO - Starting explanation in time_2 for class_id talk.religion.misc
2021-09-22 11:59:15 - contrxt.surrogate.gene

In [4]:
# Read trace.csv from the results directory of the configured run. The path is
# built from DATASET_DIR / MODEL (same pattern as Trace's save_path) so this
# cell cannot silently show results of a different dataset or model.
pd.read_csv(f'../results/{DATASET_DIR}/{MODEL}/trace.csv', delimiter=';')

Unnamed: 0,time_label,class_id,bdd_string,runtime,percent_dataset,max_depth,min_samples_split,criterion,min_samples_leaf,f1_binary,f1_macro,f1_weighted,recall_weighted,precision_weighted,balanced_accuracy
0,time_1,alt.atheism,~god & ~political_atheists & ~religion & ~keit...,0.57,1,5,0.02,gini,0.01,0.708,0.754,0.754,0.763,0.805,0.762
1,time_1,comp.graphics,~graphics & ~people & ~images & ~files & algor...,0.459,1,5,0.02,gini,0.01,0.52,0.633,0.633,0.667,0.759,0.665
2,time_1,comp.os.ms-windows.misc,~windows & ~driver & ~file & ~dos & microsoft ...,0.43,1,5,0.02,gini,0.01,0.752,0.786,0.786,0.791,0.823,0.791
3,time_1,comp.sys.ibm.pc.hardware,~card & ~scsi & ~pc & ~motherboard & drive | ~...,0.399,1,5,0.02,gini,0.01,0.639,0.702,0.702,0.715,0.761,0.715
4,time_1,comp.sys.mac.hardware,~mac & ~apple & ~centris & ~quadra & se | ~mac...,0.387,1,5,0.02,gini,0.01,0.686,0.743,0.743,0.755,0.816,0.755
5,time_1,comp.windows.x,~window & ~motif & ~organization_internet & ~x...,0.301,1,5,0.02,gini,0.01,0.643,0.715,0.715,0.733,0.814,0.733
6,time_1,misc.forsale,~sale & ~writes & ~would & ~shipping & forsale...,0.462,1,5,0.02,gini,0.01,0.632,0.708,0.708,0.728,0.813,0.728
7,time_1,rec.autos,~car & ~cars & ~warningplease_read & ~dumbest_...,0.306,1,5,0.02,gini,0.01,0.786,0.813,0.813,0.817,0.844,0.816
8,time_1,rec.motorcycles,~bike & ~dod & ~motorcycle & ~bikes & ride | ~...,0.26,1,5,0.02,gini,0.01,0.785,0.817,0.818,0.823,0.868,0.823
9,time_1,rec.sport.baseball,~baseball & ~team & ~year & ~jewish_baseball &...,0.325,1,5,0.02,gini,0.01,0.697,0.746,0.746,0.755,0.798,0.755


In [5]:
# First 10 rows of surrogate_paths.csv for the configured run. The path is
# built from DATASET_DIR / MODEL (same pattern as Trace's save_path) instead
# of a hard-coded copy that could go stale.
pd.read_csv(f'../results/{DATASET_DIR}/{MODEL}/surrogate_paths.csv', delimiter=';').head(10)

Unnamed: 0,time_label,class_id,bdd_string,n
0,time_1,alt.atheism,~god & ~political_atheists & ~religion & ~keit...,17
1,time_1,alt.atheism,~god & ~political_atheists & ~religion & keith...,11
2,time_1,alt.atheism,~god & ~political_atheists & ~religion & keith...,10
3,time_1,alt.atheism,~god & ~political_atheists & religion & ~point,30
4,time_1,alt.atheism,~god & ~political_atheists & religion & point,12
5,time_1,alt.atheism,~god & political_atheists,43
6,time_1,alt.atheism,god & ~father & ~born & ~existence & ~true,67
7,time_1,alt.atheism,god & ~father & ~born & ~existence & true,12
8,time_1,alt.atheism,god & ~father & ~born & existence,10
9,time_1,alt.atheism,god & ~father & born,8


# Explain

In [8]:
# Explain is pointed at the same results directory that Trace was given as
# its save_path; BDD saving is always on, CSV saving follows SAVE_CSV.
explain = Explain(
    save_path=f'../results/{DATASET_DIR}/{MODEL}',
    save_bdds=True,
    save_csvs=SAVE_CSV
)
explain.run_explain()

2021-09-22 11:59:23 - contrxt.explain - INFO - Starting computation for class alt.atheism
2021-09-22 11:59:23 - contrxt.explain - INFO - Printing alt.atheism_add pdf file
2021-09-22 11:59:23 - contrxt.explain - INFO - Printing alt.atheism_del pdf file
2021-09-22 11:59:23 - contrxt.explain - INFO - {'sat_add': 4, 'sat_del': 5, 'add': 0.4444444444444444, 'del': 0.5555555555555556, 'j': 0.8333333333333334, 's1': '11', 's2': '10', 'union': '18', 'runtime': 0.152}
2021-09-22 11:59:23 - contrxt.explain - INFO - Finishing class alt.atheism, time: 0.152
2021-09-22 11:59:24 - contrxt.explain - INFO - Exiting class alt.atheism
2021-09-22 11:59:24 - contrxt.explain - INFO - Starting computation for class comp.graphics
2021-09-22 11:59:24 - contrxt.explain - INFO - Printing comp.graphics_add pdf file
2021-09-22 11:59:24 - contrxt.explain - INFO - Printing comp.graphics_del pdf file
2021-09-22 11:59:24 - contrxt.explain - INFO - {'sat_add': 3, 'sat_del': 4, 'add': 0.42857142857142855, 'del': 0.5714

2021-09-22 11:59:25 - contrxt.explain - INFO - Printing sci.med_add pdf file
2021-09-22 11:59:26 - contrxt.explain - INFO - Printing sci.med_del pdf file
2021-09-22 11:59:26 - contrxt.explain - INFO - {'sat_add': 4, 'sat_del': 4, 'add': 0.5, 'del': 0.5, 'j': 0.9230769230769231, 's1': '6', 's2': '8', 'union': '13', 'runtime': 0.156}
2021-09-22 11:59:26 - contrxt.explain - INFO - Finishing class sci.med, time: 0.156
2021-09-22 11:59:26 - contrxt.explain - INFO - Exiting class sci.med
2021-09-22 11:59:26 - contrxt.explain - INFO - Starting computation for class sci.space
2021-09-22 11:59:26 - contrxt.explain - INFO - Printing sci.space_add pdf file
2021-09-22 11:59:26 - contrxt.explain - INFO - Printing sci.space_del pdf file
2021-09-22 11:59:26 - contrxt.explain - INFO - {'sat_add': 3, 'sat_del': 3, 'add': 0.5, 'del': 0.5, 'j': 0.8666666666666667, 's1': '9', 's2': '8', 'union': '15', 'runtime': 0.135}
2021-09-22 11:59:26 - contrxt.explain - INFO - Finishing class sci.space, time: 0.135
2

In [9]:
# First 10 rows of paths_add_del.csv (read without a header row) for the
# configured run. The path is built from DATASET_DIR / MODEL (same pattern as
# Explain's save_path) instead of a hard-coded copy that could go stale.
pd.read_csv(f'../results/{DATASET_DIR}/{MODEL}/paths_add_del.csv', delimiter=';', header=None).head(10)

In [9]:
# Read explain.csv for the configured run. The path is built from
# DATASET_DIR / MODEL (same pattern as Explain's save_path) so this cell
# cannot silently show results of a different dataset or model.
pd.read_csv(f'../results/{DATASET_DIR}/{MODEL}/explain.csv', delimiter=';')

Unnamed: 0,class_id,add,del,add_global,del_global,sat_add,sat_del,j,s1,s2,union,runtime
0,alt.atheism,0.444,0.556,0.667,0.25,4,5,0.833,11,10,18,0.136
1,comp.graphics,0.429,0.571,0.5,0.2,3,4,0.944,11,8,18,0.131
2,comp.os.ms-windows.misc,0.429,0.571,0.5,0.2,3,4,0.905,13,10,21,0.14
3,comp.sys.ibm.pc.hardware,0.5,0.5,0.667,0.2,4,4,0.947,10,10,19,0.158
4,comp.sys.mac.hardware,0.571,0.429,0.667,0.15,4,3,0.8,8,10,15,0.133
5,comp.windows.x,0.429,0.571,0.5,0.2,3,4,0.786,6,11,14,0.146
6,misc.forsale,0.857,0.143,1.0,0.05,6,1,0.583,9,8,12,0.139
7,rec.autos,0.5,0.5,0.5,0.15,3,3,0.867,10,7,15,0.143
8,rec.motorcycles,0.5,0.5,0.167,0.05,1,1,0.429,6,5,7,0.141
9,rec.sport.baseball,0.3,0.7,0.5,0.35,3,7,0.8,10,8,15,0.15


# BDD2Text

In [11]:
# Print the textual report for one class: the classification rules that were
# added and the ones no longer used (see the output below).
explain.BDD2Text('alt.atheism')

alt.atheism


[48;5;155mThe model now uses the following classification rules for this class:[0m
This class has 4 added classification rules.

 - [38;5;10mHaving[0m [1m[38;5;10mgod[0m but [4m[38;5;9mnot[0m [1m[38;5;9msense[0m.
 - [38;5;10mHaving[0m [1m[38;5;10msay[0m but [4m[38;5;9mnot[0m [1m[38;5;9mgod[0m, [1m[38;5;9matheists[0m, and [1m[38;5;9mmedia[0m.
 - [38;5;10mHaving[0m [1m[38;5;10mgulf_war[0m but [4m[38;5;9mnot[0m [1m[38;5;9mgod[0m, [1m[38;5;9matheists[0m, and [1m[38;5;9msay[0m.
 - [38;5;10mHaving[0m [1m[38;5;10mfree_moral[0m but [4m[38;5;9mnot[0m [1m[38;5;9mgod[0m, [1m[38;5;9matheists[0m, [1m[38;5;9msay[0m, and [1m[38;5;9mgulf_war[0m.

[48;5;1mThe model is not using the following classification rules anymore:[0m
This class has 5 deleted classification rules.


Out of these 5 classification rules, 3 share the following criteria:
the document must [4m[38;5;9mnot[0m contain [1m[38;5;9mpolitical_atheists[0m