In [2]:
!pip install --quiet -r requirements.txt

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
%matplotlib inline
from ExKMC.Tree import Tree
from IPython.display import Image

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1)
import sys

import sklearn
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import shap
import joblib as jbl

import lime
import lime.lime_tabular

from anchor import utils
from anchor import anchor_tabular

In [5]:
# Read the dataset and use its "id_" column as the row index.
data = pd.read_csv("21092023_Dataset_40k.csv").set_index("id_")

In [6]:
# Train/test split: "y" is the target column, every other column is a feature.
y = data['y']
X = data.drop(columns=['y'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit the random forest on the training portion (fit returns the estimator itself).
forest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Feature names in training-column order.
feature_names = X_train.columns.tolist()

In [7]:
# Load the cached SHAP explainer if possible; otherwise build it from the fitted
# forest and write it to the cache file.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load a cache file that this notebook produced itself.
# NOTE(review): the cached explainer is not checked against the current `forest`;
# delete the cache file after changing the data or the model.
try:
    with open("./" + 'shap_explainer', 'rb') as f:
        explainer_shap = jbl.load(f)
except Exception:
    # `except Exception` (instead of a bare `except:`) keeps the rebuild fallback for
    # a missing or unreadable cache while letting KeyboardInterrupt/SystemExit propagate.
    explainer_shap = shap.TreeExplainer(forest)
    with open("./" + 'shap_explainer', 'wb') as f:
        jbl.dump(explainer_shap, f)

In [8]:
# A single LIME explainer, built from the training data, is used for every model.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    mode='classification',
    feature_names=X_train.columns.values.tolist(),
    class_names=[0, 1],
    discretize_continuous=True,
    verbose=True,
)

In [9]:
# Anchors explainer built from, in positional order: the class labels, the training
# column names, the training matrix, and an empty dict
# (presumably the categorical-names mapping -- TODO confirm).
explainer_anchors = anchor_tabular.AnchorTabularExplainer(
    [0, 1], X_train.columns.tolist(), X_train.values, dict()
)

In [10]:
# Load the sample instances that will be explained.
amostras = pd.read_csv("Amostra-Dataset.csv")

# Keep the ids as a plain list before "id_" becomes the index.
indices_positivos = amostras['id_'].tolist()
print(indices_positivos)

# Index the samples by id and show the resulting table.
amostras = amostras.set_index(keys="id_")
amostras

[60621, 63330, 84054, 99548, 42623, 62913, 64822, 833560, 45080, 45343]


Unnamed: 0_level_0,methodAnonymousClassesQty,methodAssignmentsQty,methodCbo,methodComparisonsQty,methodLambdasQty,methodLoc,methodLoopQty,methodMathOperationsQty,methodMaxNestedBlocks,methodNumbersQty,...,methodReturnQty,methodRfc,methodStringLiteralsQty,methodSubClassesQty,methodTryCatchQty,methodUniqueWordsQty,methodVariablesQty,methodWmc,bugFixCount,refactoringsInvolved
id_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60621,0,9,1,0,0,77,0,64,1,59,...,0,32,0,0,0,22,8,4,4,4
63330,1,4,13,0,0,41,3,0,1,0,...,1,12,6,0,0,36,3,6,3,11
84054,0,13,10,4,0,44,0,0,1,0,...,5,17,0,0,0,32,10,9,13,42
99548,1,12,10,0,0,40,0,0,1,0,...,1,17,0,0,1,52,8,4,51,30
42623,0,0,9,0,0,44,0,0,1,0,...,14,11,2,0,0,35,0,14,7,13
62913,1,13,9,2,0,41,0,0,1,0,...,0,17,3,0,0,48,2,7,172,67
64822,0,14,22,1,0,18,0,0,1,0,...,1,4,0,0,0,52,14,1,0,8
833560,1,19,14,3,0,48,0,4,1,6,...,3,21,3,0,0,56,17,6,13,24
45080,0,22,12,0,2,50,0,0,1,2,...,0,26,0,0,0,48,12,4,2,8
45343,0,4,12,0,0,22,0,0,1,1,...,2,23,0,0,0,24,4,4,60,25


In [21]:
def export_shap_exp(feature_names, shap_values):
    """Package the SHAP weights stored at index 1 of ``shap_values`` as a dictionary.

    Args:
        feature_names: Sequence of feature names; the name at position ``p`` is
            paired with ``shap_values[1][p]``.
        shap_values: Indexable whose element ``1`` holds one weight per feature
            (presumably the weights for class 1 -- TODO confirm).

    Returns:
        ``{'features': [...]}`` with one entry per feature name, each holding
        ``nome_feature``, ``peso_feature`` and ``feature_ranges`` (always ``None``).
    """
    class_one_weights = shap_values[1]
    return {
        'features': [
            {
                'nome_feature': name,
                'peso_feature': class_one_weights[position],
                'feature_ranges': None,
            }
            for position, name in enumerate(feature_names)
        ]
    }

def export_lime_exp(feature_names, exp_lime):
    """Package a LIME explanation as a dictionary.

    Args:
        feature_names: Iterable of feature-name strings used to recognise which
            feature each LIME description refers to.
        exp_lime: Explanation object exposing ``intercept`` (indexed with ``1``),
            ``local_pred`` (indexed with ``0``) and ``as_list()``, which yields
            ``(description, weight)`` pairs.

    Returns:
        A dict with ``intercept``, ``local_prediction`` and ``features``. Each
        feature entry holds ``nome_feature`` (the matched feature name, or
        ``None`` when no name occurs in the description), ``peso_feature`` (the
        weight) and ``feature_ranges`` (the description).
    """
    lime_output = {
        'intercept': exp_lime.intercept[1],
        'local_prediction': exp_lime.local_pred[0],
    }

    features_exp = []
    for value in exp_lime.as_list():
        description, weight = value[0], value[1]
        # Take the longest feature name contained in the description: a plain
        # "first substring hit" picks the wrong feature when one name is a prefix
        # of another, and reporting some arbitrary name when nothing matches would
        # be silently wrong -- so that case yields None instead.
        candidates = [name for name in feature_names if name in description]
        features_exp.append({
            'nome_feature': max(candidates, key=len, default=None),
            'peso_feature': weight,
            'feature_ranges': description,
        })

    lime_output['features'] = features_exp
    return lime_output

def export_anchors_exp(feature_names, anchors_exp):
    """Package an Anchors explanation as a dictionary.

    Args:
        feature_names: Iterable of feature-name strings used to recognise which
            feature each anchor rule refers to.
        anchors_exp: Explanation object exposing ``precision()``, ``coverage()``
            and ``names()`` (the anchor rules).

    Returns:
        A dict with ``precision``, ``coverage`` and ``features``. Each feature
        entry holds ``nome_feature`` (the matched feature name, or ``None`` when
        no name occurs in the rule), ``peso_feature`` (always ``None``) and
        ``feature_ranges`` (the rule as returned by ``names()``).
    """
    anchors_output = {
        'precision': anchors_exp.precision(),
        'coverage': anchors_exp.coverage(),
    }

    features_exp = []
    for rule in anchors_exp.names():
        rule_text = str(rule)
        # Take the longest feature name contained in the rule: a plain "first
        # substring hit" picks the wrong feature when one name is a prefix of
        # another, and reporting some arbitrary name when nothing matches would be
        # silently wrong -- so that case yields None instead.
        candidates = [name for name in feature_names if name in rule_text]
        features_exp.append({
            'nome_feature': max(candidates, key=len, default=None),
            'peso_feature': None,
            'feature_ranges': rule,
        })

    anchors_output['features'] = features_exp
    return anchors_output

In [22]:
def roda_exporta_explicadores(amostras, i):
    """Run SHAP, LIME and Anchors on row ``i`` of ``amostras`` and export the results.

    Uses the notebook-level ``forest``, ``explainer_shap``, ``explainer_lime`` and
    ``explainer_anchors`` objects.

    Args:
        amostras: DataFrame of instances; its columns supply the feature names.
        i: Positional index of the row to explain.

    Returns:
        ``[shap_output, lime_output, anchors_output]``, the dictionaries produced
        by ``export_shap_exp``, ``export_lime_exp`` and ``export_anchors_exp``.
    """
    feature_names = list(amostras.columns)

    # SHAP (the forest prediction that used to be computed here was never used,
    # so that extra predict_proba call has been removed)
    shap_values = explainer_shap.shap_values(amostras.iloc[i])
    shap_output = export_shap_exp(feature_names, shap_values)

    # LIME
    exp_lime = explainer_lime.explain_instance(amostras.values[i], forest.predict_proba, num_features=8)
    lime_output = export_lime_exp(feature_names, exp_lime)

    # ANCHORS
    exp_anchors = explainer_anchors.explain_instance(amostras.values[i], forest.predict, threshold=0.95)
    anchors_output = export_anchors_exp(feature_names, exp_anchors)

    return [shap_output, lime_output, anchors_output]

In [26]:
import json

def exporta_json_explicacoes(amostras, indices):
    """Collect the forest prediction and all three explanations for every sample.

    Args:
        amostras: DataFrame of instances to explain.
        indices: Sequence giving, for each row position of ``amostras``, the key
            under which that row's result is stored.

    Returns:
        A dict mapping ``indices[position]`` to a dict with the keys
        ``forest_predict``, ``shap``, ``lime`` and ``anchors``.
    """
    explicacoes = {}
    for position in range(len(amostras)):
        # class probabilities predicted by the random forest for this row
        probabilities = forest.predict_proba(amostras.iloc[[position]])[0]
        # run and export the SHAP, LIME and Anchors explanations
        shap_exp, lime_exp, anchors_exp = roda_exporta_explicadores(amostras, position)
        # store everything under the id that corresponds to this row
        explicacoes[indices[position]] = {
            'forest_predict': {'nao refatorar': probabilities[0], 'refatorar': probabilities[1]},
            'shap': shap_exp,
            'lime': lime_exp,
            'anchors': anchors_exp,
        }
    return explicacoes

# Build the explanations for every sample, keyed by the sample ids.
json_output_exp = exporta_json_explicacoes(amostras=amostras, indices=indices_positivos)

X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names


Intercept 0.36697079934708743
Prediction_local [0.7620009]
Right: 0.72


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.4750223184078203
Prediction_local [0.73752678]
Right: 0.81


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.4367949495552883
Prediction_local [0.68666112]
Right: 0.97


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.5019894809389651
Prediction_local [0.5031356]
Right: 0.51


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.4696863069652361
Prediction_local [0.60497507]
Right: 0.9


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.5547940102138454
Prediction_local [0.46643612]
Right: 0.49


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.32916772742395256
Prediction_local [0.71287075]
Right: 0.8


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.530872689884069
Prediction_local [0.64315535]
Right: 0.84


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.3763627684303318
Prediction_local [0.7781124]
Right: 0.98


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

Intercept 0.5611195780742984
Prediction_local [0.43943439]
Right: 0.85


X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassifier was fitted with feature names
X does not have valid feature names, but RandomForestClassif

In [27]:
# Pretty-print the explanations as indented JSON.
print(json.dumps(obj=json_output_exp, indent=4))

# Write the same explanations (compact form) to disk.
with open("explicacoes_samples.json", "w") as outfile:
    json.dump(obj=json_output_exp, fp=outfile)

{
    "60621": {
        "forest_predict": {
            "nao refatorar": 0.28,
            "refatorar": 0.72
        },
        "shap": {
            "features": [
                {
                    "nome_feature": "methodAnonymousClassesQty",
                    "peso_feature": 0.0003608535201939072,
                    "feature_ranges": null
                },
                {
                    "nome_feature": "methodAssignmentsQty",
                    "peso_feature": 0.026450348447629835,
                    "feature_ranges": null
                },
                {
                    "nome_feature": "methodCbo",
                    "peso_feature": -0.014071973055749535,
                    "feature_ranges": null
                },
                {
                    "nome_feature": "methodComparisonsQty",
                    "peso_feature": -0.0015157261237991099,
                    "feature_ranges": null
                },
                {
                    "nome_f

In [None]:
def instancia_explicabilidade_local(i, amostras, shap_exps, lime_exps):
    """Show the SHAP, LIME and Anchors explanations for row ``i`` and record them.

    Prints the row and the random forest prediction, renders each explainer's
    visualisation in the notebook, and appends one row to the running SHAP and
    LIME tables. Uses the notebook-level ``forest``, ``explainer_shap``,
    ``explainer_lime``, ``explainer_anchors`` and ``indices_positivos``.

    Args:
        i: Positional index of the row in ``amostras``; also used to look up the
            instance id in ``indices_positivos``.
        amostras: DataFrame of instances to explain.
        shap_exps: DataFrame with the SHAP rows accumulated so far (not modified).
        lime_exps: DataFrame with the LIME rows accumulated so far (not modified).

    Returns:
        ``(shap_result, lime_result)``: copies of the input tables with one row
        appended for this instance.
    """
    feature_names = list(amostras.columns)
    row = amostras.iloc[i]

    # Show the instance and the model prediction.
    print("Instância nº: ", indices_positivos[i])
    print(row)
    probabilities = forest.predict_proba(amostras.iloc[[i]])
    print("Predição do Random Forest: ", probabilities)
    refactor_probability = probabilities[0][1]

    #SHAP
    print("SHAP")
    shap_values = explainer_shap.shap_values(row)
    shap.initjs()
    display(shap.force_plot(explainer_shap.expected_value[1], shap_values[1], row))
    # export_shap_exp takes (feature_names, shap_values) and returns a dictionary,
    # so its output is flattened into a one-row table here. The bookkeeping column
    # names are the ones the later clean-up cell drops.
    shap_record = {
        'row': i,
        'forest prediction to refactorating': refactor_probability,
        'base value': explainer_shap.expected_value[1],
    }
    for feature in export_shap_exp(feature_names, shap_values)['features']:
        shap_record[feature['nome_feature']] = feature['peso_feature']
    shap_result = pd.concat([shap_exps, pd.DataFrame([shap_record])], ignore_index=True)

    #LIME
    print("LIME")
    exp_lime = explainer_lime.explain_instance(amostras.values[i], forest.predict_proba, num_features=8)
    exp_lime.show_in_notebook(show_table=True)
    # export_lime_exp takes (feature_names, explanation) and returns a dictionary;
    # flatten it the same way. Each feature gets a weight column plus a
    # "<name> description" column, which the clean-up cell later filters out.
    lime_export = export_lime_exp(feature_names, exp_lime)
    lime_record = {
        'row': i,
        'forest prediction to refactorating': refactor_probability,
        'intercept': lime_export['intercept'],
        'local prediction': lime_export['local_prediction'],
    }
    for feature in lime_export['features']:
        # Fall back to the LIME description if no feature name was recognised.
        label = feature['nome_feature'] or str(feature['feature_ranges'])
        lime_record[label] = feature['peso_feature']
        lime_record[label + ' description'] = feature['feature_ranges']
    lime_result = pd.concat([lime_exps, pd.DataFrame([lime_record])], ignore_index=True)

    #ANCHORS
    print("ANCHORS")
    exp_anchors = explainer_anchors.explain_instance(amostras.values[i], forest.predict, threshold=0.95)
    print('Anchor: %s' % (' AND '.join(exp_anchors.names())))
    print('Precision: %.2f' % exp_anchors.precision())
    print('Coverage: %.2f' % exp_anchors.coverage())
    exp_anchors.show_in_notebook()

    print("-----------------------------------------------------------------------------------------------")

    return (shap_result, lime_result)

In [None]:
import warnings
# Silence every warning from here on. A single call is enough: with no message or
# module pattern, filterwarnings('ignore') installs the same filter that
# simplefilter('ignore') would.
# NOTE(review): this also hides warnings that may matter; consider filtering only
# the specific messages that flood the output.
warnings.filterwarnings('ignore')

In [None]:
# Start from empty tables and add one row per sample. Iterating over
# len(amostras) instead of a hard-coded 10 keeps the loop correct if the sample
# file changes size.
shap_output = pd.DataFrame()
lime_output = pd.DataFrame()
for i in range(len(amostras)):
    shap_output, lime_output = instancia_explicabilidade_local(i, amostras, shap_output, lime_output)

In [None]:
# Write both explanation tables to CSV files in the working directory.
shap_output.to_csv(path_or_buf='shap_explanations.csv')
lime_output.to_csv(path_or_buf='lime_explanations.csv')

In [None]:
# Show the SHAP explanations table as the cell output.
shap_output

In [None]:
# Show the LIME explanations table.
display(lime_output)

In [None]:
# Keep only the per-feature weight columns in both tables.
# errors='ignore' makes this cell safe to re-run: without it, a second run raises
# KeyError because the bookkeeping columns are already gone.
shap_output = shap_output.drop(
    columns=['forest prediction to refactorating', 'base value', 'row'],
    errors='ignore',
)

lime_output = lime_output.drop(
    columns=['forest prediction to refactorating', 'intercept', 'local prediction', 'row'],
    errors='ignore',
)
# Drop the textual "... description" columns, then make every remaining column
# numeric, with 0 for features LIME did not report for a given row.
lime_output = lime_output.loc[:, ~lime_output.columns.str.endswith('description')]
lime_output = lime_output.fillna(0)
lime_output = lime_output.apply(pd.to_numeric)

In [None]:
def display_top_5_features(output, i):
    """Display the five largest values of row ``i`` of ``output`` as a one-row table.

    Args:
        output: DataFrame whose row ``i`` is ranked.
        i: Positional index of the row.
    """
    selected_row = output.iloc[i]
    display(selected_row.nlargest(5).to_frame().T)

# Show the top-5 table of each explainer for every explained sample. The row
# count comes from the table itself instead of a hard-coded 10.
for f in range(len(shap_output)):
    display_top_5_features(shap_output, f)
    display_top_5_features(lime_output, f)