# Primeiros Passos

## Importação de bibliotecas

In [61]:
#!pip install --quiet -r requirements.txt

In [62]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
%matplotlib inline
from ExKMC.Tree import Tree
from IPython.display import Image

np.random.seed(1)
import sys

import sklearn
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import shap
import joblib as jbl

import lime
import lime.lime_tabular

from anchor import utils
from anchor import anchor_tabular

import json
import warnings

## Importando dataset e rodando Random Forest

In [63]:
# Load the full dataset and use its "id_" column as the row index.
data = pd.read_csv("21092023_Dataset_40k.csv").set_index("id_")

In [64]:
# Read the sample instances that will be explained.
amostras = pd.read_csv("Amostra-Dataset.csv")

# Keep the ids as a plain list (printed for reference) before "id_" becomes the index.
indices_positivos = amostras["id_"].tolist()
print(indices_positivos)

amostras = amostras.set_index("id_")
amostras

[60621, 63330, 84054, 99548, 42623, 62913, 64822, 833560, 45080, 45343]


Unnamed: 0_level_0,methodAnonymousClassesQty,methodAssignmentsQty,methodCbo,methodComparisonsQty,methodLambdasQty,methodLoc,methodLoopQty,methodMathOperationsQty,methodMaxNestedBlocks,methodNumbersQty,...,methodReturnQty,methodRfc,methodStringLiteralsQty,methodSubClassesQty,methodTryCatchQty,methodUniqueWordsQty,methodVariablesQty,methodWmc,bugFixCount,refactoringsInvolved
id_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60621,0,9,1,0,0,77,0,64,1,59,...,0,32,0,0,0,22,8,4,4,4
63330,1,4,13,0,0,41,3,0,1,0,...,1,12,6,0,0,36,3,6,3,11
84054,0,13,10,4,0,44,0,0,1,0,...,5,17,0,0,0,32,10,9,13,42
99548,1,12,10,0,0,40,0,0,1,0,...,1,17,0,0,1,52,8,4,51,30
42623,0,0,9,0,0,44,0,0,1,0,...,14,11,2,0,0,35,0,14,7,13
62913,1,13,9,2,0,41,0,0,1,0,...,0,17,3,0,0,48,2,7,172,67
64822,0,14,22,1,0,18,0,0,1,0,...,1,4,0,0,0,52,14,1,0,8
833560,1,19,14,3,0,48,0,4,1,6,...,3,21,3,0,0,56,17,6,13,24
45080,0,22,12,0,2,50,0,0,1,2,...,0,26,0,0,0,48,12,4,2,8
45343,0,4,12,0,0,22,0,0,1,1,...,2,23,0,0,0,24,4,4,60,25


In [65]:
# Train/test split: "y" is the target, every other column is a feature.
y = data["y"]
X = data.drop(columns=["y"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit the random forest that all explainers below will inspect.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

feature_names = X_train.columns.tolist()

## Inicializando explicadores

In [66]:
# Load the cached SHAP explainer when possible; otherwise build it from the
# fitted forest and cache it for the next run.
# NOTE(review): the cache file is not tied to the current `forest`; delete
# "shap_explainer" after retraining on different data or settings.
# NOTE(security): joblib.load unpickles the file, so only load caches you created.
try:
    with open("./shap_explainer", "rb") as f:
        explainer_shap = jbl.load(f)
except Exception as err:
    # `Exception` instead of a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate; the reason for the rebuild is reported, not hidden.
    print(f"Could not load cached SHAP explainer ({err!r}); rebuilding it.")
    explainer_shap = shap.TreeExplainer(forest)
    with open("./shap_explainer", "wb") as f:
        jbl.dump(explainer_shap, f)

In [67]:
# A single LIME explainer, built from the training matrix, serves every model.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns.values.tolist(),
    class_names=[0, 1],
    mode='classification',
    discretize_continuous=True,
    verbose=True,
)

In [68]:
# Anchors explainer. Positional arguments, in order: the class labels, the
# feature names, the training matrix and an empty dict
# (presumably the categorical-names mapping; confirm against the anchor API).
explainer_anchors = anchor_tabular.AnchorTabularExplainer(
    [0, 1], X_train.columns.values.tolist(), X_train.values, {}
)

# Rodando instâncias e mostrando gráficos

In [69]:
# Show the SHAP, LIME and Anchors explanations of a single sample.
def instancia_explicabilidade_local(i, amostras):
    """Print and plot the local explanations of the sample at position ``i``.

    Relies on the module-level ``forest``, ``explainer_shap``,
    ``explainer_lime`` and ``explainer_anchors`` objects and on the
    notebook-provided ``display`` function.

    Args:
        i: Positional (0-based) row number inside ``amostras``.
        amostras: DataFrame holding the samples. The printed instance label is
            read from its index, so the frame is expected to be indexed by
            instance id (as done with ``set_index("id_")`` when it is loaded).

    Returns:
        None. Everything is printed or rendered in the notebook.
    """
    row = amostras.iloc[i]
    row_values = amostras.values[i]

    # Label the output with the id stored in the frame itself rather than with
    # a separate global list, so the label always matches the row being shown.
    print("Instância nº: ", amostras.index[i])
    print(row)
    print("Predição do Random Forest: ", forest.predict_proba(amostras.iloc[[i]]))

    # SHAP: force plot built from index 1 of the expected values and SHAP values.
    print("SHAP")
    shap_values = explainer_shap.shap_values(row)
    shap.initjs()
    display(shap.force_plot(explainer_shap.expected_value[1], shap_values[1], row))

    # LIME: the eight most relevant features.
    print("LIME")
    exp_lime = explainer_lime.explain_instance(row_values, forest.predict_proba, num_features=8)
    exp_lime.show_in_notebook(show_table=True)

    # ANCHORS: rule explanation requested with threshold 0.95.
    print("ANCHORS")
    exp_anchors = explainer_anchors.explain_instance(row_values, forest.predict, threshold=0.95)
    print('Anchor: %s' % (' AND '.join(exp_anchors.names())))
    print('Precision: %.2f' % exp_anchors.precision())
    print('Coverage: %.2f' % exp_anchors.coverage())
    exp_anchors.show_in_notebook()

    print("-----------------------------------------------------------------------------------------------")

# Exportando as explicações

## Formato json

### Exporta features e dados gerais de cada explicador

In [70]:
# Sort key for SHAP feature entries.
def compare_shap_feature_weights(feature_value):
    """Return the negated ``peso_feature`` so an ascending sort puts the largest weight first."""
    peso = feature_value['peso_feature']
    return -peso

# Build the exportable summary of a SHAP explanation.
def export_shap_exp(feature_names, shap_values):
    """Collect the features whose SHAP value at index 1 is positive.

    Args:
        feature_names: Feature names, aligned by position with the SHAP values.
        shap_values: Indexable whose element ``[1]`` holds one value per
            feature (the values pointing towards "refactor"; negative values
            point away from it and are dropped).

    Returns:
        ``{'features': [...]}`` where each entry has ``nome_feature``,
        ``peso_feature``, ``feature_ranges`` (always ``None``) and
        ``prioridade`` (1 = largest weight).
    """
    pesos_refatora = shap_values[1]
    positivos = [
        {
            'nome_feature': feature_names[pos],
            'peso_feature': pesos_refatora[pos],
            'feature_ranges': None,
        }
        for pos in range(len(feature_names))
        if pesos_refatora[pos] > 0
    ]
    # Largest weight first; the priority is assigned only after sorting.
    ordenados = sorted(positivos, key=compare_shap_feature_weights)
    for prioridade, entrada in enumerate(ordenados, start=1):
        entrada['prioridade'] = prioridade
    return {'features': ordenados}

# Build the exportable summary of a LIME explanation.
def export_lime_exp(feature_names, exp_lime):
    """Summarise a LIME explanation as a plain dictionary.

    Args:
        feature_names: Known feature names, used to recover which feature each
            LIME rule string talks about.
        exp_lime: LIME explanation exposing ``intercept``, ``local_pred`` and
            ``as_list()``; the latter yields ``(rule_string, weight)`` entries.

    Returns:
        Dict with ``intercept`` (``exp_lime.intercept[1]``),
        ``local_prediction`` (``exp_lime.local_pred[0]``) and ``features``, a
        list of entries holding ``nome_feature``, ``peso_feature``,
        ``feature_ranges`` and ``prioridade`` (1-based position in
        ``as_list()``). ``nome_feature`` is ``None`` when no known feature name
        occurs in the rule string.
    """
    features_exp = []
    for prioridade, item in enumerate(exp_lime.as_list(), start=1):
        rule, weight = item[0], item[1]
        # Several names can be substrings of the rule (e.g. "methodLoc" inside
        # "methodLocMax <= 4"); the longest one is the most specific match.
        candidates = [name for name in feature_names if name in rule]
        features_exp.append({
            'nome_feature': max(candidates, key=len) if candidates else None,
            'peso_feature': weight,
            'feature_ranges': rule,
            'prioridade': prioridade,  # priority order of the feature
        })
    return {
        'intercept': exp_lime.intercept[1],
        'local_prediction': exp_lime.local_pred[0],
        'features': features_exp,
    }

# Build the exportable summary of an Anchors explanation.
def export_anchors_exp(feature_names, anchors_exp):
    """Summarise an Anchors explanation as a plain dictionary.

    Args:
        feature_names: Known feature names, used to recover which feature each
            anchor rule talks about.
        anchors_exp: Anchors explanation exposing ``precision()``,
            ``coverage()`` and ``names()`` (the rules of the anchor).

    Returns:
        Dict with ``precision``, ``coverage`` and ``features``, a list of
        entries holding ``nome_feature``, ``peso_feature`` (always ``None``),
        ``feature_ranges`` (the rule) and ``prioridade`` (1-based position in
        ``names()``). ``nome_feature`` is ``None`` when no known feature name
        occurs in the rule.
    """
    anchors_output = {
        'precision': anchors_exp.precision(),
        'coverage': anchors_exp.coverage(),
    }
    features_exp = []
    for prioridade, rule in enumerate(anchors_exp.names(), start=1):
        rule_text = str(rule)
        # Several names can be substrings of the rule; the longest one is the
        # most specific match (e.g. "methodLocMax" beats "methodLoc").
        candidates = [name for name in feature_names if name in rule_text]
        features_exp.append({
            'nome_feature': max(candidates, key=len) if candidates else None,
            'peso_feature': None,
            'feature_ranges': rule,
            'prioridade': prioridade,  # priority order of the feature
        })
    anchors_output['features'] = features_exp
    return anchors_output

### Roda os explicadores para um conjunto de amostras e exporta cada um

In [71]:
def roda_exporta_explicadores(amostras, i, num_features=8, threshold=0.95):
    """Run SHAP, LIME and Anchors on one sample and export each explanation.

    Relies on the module-level ``forest``, ``explainer_shap``,
    ``explainer_lime`` and ``explainer_anchors`` objects and on the
    ``export_*_exp`` helpers.

    Args:
        amostras: DataFrame holding the samples; its columns give the feature
            names passed to the export helpers.
        i: Positional (0-based) row number of the sample to explain.
        num_features: Number of features requested from LIME.
        threshold: Threshold passed to the Anchors explainer.

    Returns:
        ``[shap_output, lime_output, anchors_output]``, in that order.
    """
    feature_names = list(amostras.columns)
    row_values = amostras.values[i]

    # SHAP receives the row as a Series; LIME and Anchors the raw value array.
    shap_values = explainer_shap.shap_values(amostras.iloc[i])
    shap_output = export_shap_exp(feature_names, shap_values)

    exp_lime = explainer_lime.explain_instance(
        row_values, forest.predict_proba, num_features=num_features
    )
    lime_output = export_lime_exp(feature_names, exp_lime)

    exp_anchors = explainer_anchors.explain_instance(
        row_values, forest.predict, threshold=threshold
    )
    anchors_output = export_anchors_exp(feature_names, exp_anchors)

    return [shap_output, lime_output, anchors_output]

### Junta as explicações em um json com dados da instância

In [72]:
# Silence every warning from this point on.
# NOTE(review): both calls install a blanket "ignore" filter, so the second one
# looks redundant; blanket suppression also hides deprecation warnings.
# Consider narrowing it by category or scoping it with warnings.catch_warnings().
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [73]:
def exporta_json_explicacoes(amostras, indices, output_path="explicacoes_samples.json"):
    """Explain every sample and export all explanations to one JSON file.

    Relies on the module-level ``forest`` and on ``roda_exporta_explicadores``.

    Args:
        amostras: DataFrame holding the samples to explain.
        indices: Instance ids, aligned by position with the rows of
            ``amostras``; used as keys of the result.
        output_path: File that receives the JSON export.

    Returns:
        Dict mapping each id to ``{'forest_predict', 'shap', 'lime',
        'anchors'}``. The same mapping is written to ``output_path``.
    """
    explicacoes = dict()
    for i in range(len(amostras)):
        # Random forest class probabilities for this sample.
        predict = forest.predict_proba(amostras.iloc[[i]])[0]
        json_output = {
            'forest_predict': {
                'nao refatorar': float(predict[0]),
                'refatorar': float(predict[1]),
            }
        }
        # Run and export the SHAP, LIME and Anchors explanations.
        json_output['shap'], json_output['lime'], json_output['anchors'] = roda_exporta_explicadores(amostras, i)
        explicacoes[indices[i]] = json_output

    # Write the complete mapping once, after the loop. Dumping the per-sample
    # dict inside the loop left only the last sample in the file, without its id.
    with open(output_path, "w") as outfile:
        json.dump(explicacoes, outfile)
    return explicacoes

# Build the explanations of every sample, then pretty-print them as JSON.
json_output_exp = exporta_json_explicacoes(amostras=amostras, indices=indices_positivos)
print(json.dumps(json_output_exp, indent=4))

Intercept 0.4349535690065091
Prediction_local [0.77731915]
Right: 0.72
Intercept 0.3607798134139092
Prediction_local [0.74519327]
Right: 0.81
Intercept 0.45712981731856434
Prediction_local [0.70629246]
Right: 0.97
Intercept 0.5311779229377486
Prediction_local [0.51066776]
Right: 0.51
Intercept 0.5498197315722846
Prediction_local [0.58631919]
Right: 0.9
Intercept 0.5326000693851246
Prediction_local [0.44820913]
Right: 0.49
Intercept 0.42847172418630164
Prediction_local [0.68704775]
Right: 0.8
Intercept 0.48427608166607006
Prediction_local [0.68380042]
Right: 0.84
Intercept 0.40773193467482605
Prediction_local [0.7648461]
Right: 0.98
Intercept 0.5364074963320494
Prediction_local [0.4575002]
Right: 0.85
{
    "60621": {
        "forest_predict": {
            "nao refatorar": 0.28,
            "refatorar": 0.72
        },
        "shap": {
            "features": [
                {
                    "nome_feature": "bugFixCount",
                    "peso_feature": 0.08669926474690005,

## Formato csv

In [74]:
# shap_output = pd.DataFrame()
# lime_output = pd.DataFrame()
# for i in range(0,10):
#     (shap_output, lime_output) = instancia_explicabilidade_local(i, amostras, shap_output, lime_output)
# shap_output.to_csv('shap_explanations.csv')
# lime_output.to_csv('lime_explanations.csv')

# Juntando explicações

In [103]:
# Pretty-print the collected explanations (the same dict exported above) for reference.
print(json.dumps(json_output_exp, indent=4))

{
    "60621": {
        "forest_predict": {
            "nao refatorar": 0.28,
            "refatorar": 0.72
        },
        "shap": {
            "features": [
                {
                    "nome_feature": "bugFixCount",
                    "peso_feature": 0.08669926474690005,
                    "feature_ranges": null,
                    "prioridade": 1
                },
                {
                    "nome_feature": "methodLoc",
                    "peso_feature": 0.058553218067354224,
                    "feature_ranges": null,
                    "prioridade": 2
                },
                {
                    "nome_feature": "methodRfc",
                    "peso_feature": 0.05268806497404086,
                    "feature_ranges": null,
                    "prioridade": 3
                },
                {
                    "nome_feature": "methodVariablesQty",
                    "peso_feature": 0.04076783238029673,
                    "feature_r

In [107]:
def extract_feature_names(json_features):
    """Return the ``nome_feature`` of every entry, in the order given."""
    return [entrada['nome_feature'] for entrada in json_features]

def get_info_by_feature_name(exp_f, feature_name):
    """Return ``(peso_feature, feature_ranges, prioridade)`` of the first entry named ``feature_name``.

    Raises:
        StopIteration: If no entry of ``exp_f`` has that ``nome_feature``.
    """
    matching = filter(lambda entrada: entrada["nome_feature"] == feature_name, exp_f)
    found = next(matching)
    peso = found['peso_feature']
    faixa = found['feature_ranges']
    prioridade = found['prioridade']
    return peso, faixa, prioridade

def compare_feature_weights(feature_value):
    """Sort key built from the ranks present in a merged feature entry.

    The key lists the available ranks in the fixed order anchors, shap, lime.
    Entries carrying fewer than two of the three ranks yield ``None``.
    """
    ordem = ('rank_anchors', 'rank_shap', 'rank_lime')
    ranks = tuple(feature_value[chave] for chave in ordem if chave in feature_value)
    return ranks if len(ranks) >= 2 else None

def intersect_feature_explanations(feature_names, instancia):
    """Group the features that at least two explainers agree on.

    Args:
        feature_names: Candidate feature names, checked in the order given.
        instancia: Explanations of one instance, with ``shap``, ``lime`` and
            ``anchors`` keys, each holding a ``features`` list.

    Returns:
        Dict with four lists: ``intersec_all`` (feature present in the three
        explainers), ``intersec_shap_lime``, ``intersec_shap_anchors`` and
        ``intersec_lime_anchors`` (feature present in exactly that pair).
        Every entry carries the feature name plus the rank, weight and range
        each contributing explainer provides, and each list is sorted with
        ``compare_feature_weights``.
    """
    shap_f, lime_f, anchors_f = (
        instancia[explicador]['features'] for explicador in ('shap', 'lime', 'anchors')
    )
    shap_names = extract_feature_names(shap_f)
    lime_names = extract_feature_names(lime_f)
    anchors_names = extract_feature_names(anchors_f)

    grupos = {
        'intersec_all': [],
        'intersec_shap_lime': [],
        'intersec_shap_anchors': [],
        'intersec_lime_anchors': [],
    }
    for nome in feature_names:
        em_shap = nome in shap_names
        em_lime = nome in lime_names
        em_anchors = nome in anchors_names
        # A feature is only kept when at least two explainers list it.
        if em_shap + em_lime + em_anchors < 2:
            continue

        # Query an explainer only when it actually lists the feature.
        if em_anchors:
            _, faixa_anchors, rank_anchors = get_info_by_feature_name(anchors_f, nome)
        if em_shap:
            peso_shap, _, rank_shap = get_info_by_feature_name(shap_f, nome)
        if em_lime:
            peso_lime, faixa_lime, rank_lime = get_info_by_feature_name(lime_f, nome)

        if em_shap and em_lime and em_anchors:
            grupos['intersec_all'].append({
                'nome_feature': nome,
                'rank_anchors': rank_anchors,
                'rank_shap': rank_shap,
                'rank_lime': rank_lime,
                'peso_shap': peso_shap,
                'peso_lime': peso_lime,
                'range_anchors': faixa_anchors,
                'range_lime': faixa_lime,
            })
        elif em_shap and em_lime:
            grupos['intersec_shap_lime'].append({
                'nome_feature': nome,
                'rank_shap': rank_shap,
                'rank_lime': rank_lime,
                'peso_shap': peso_shap,
                'peso_lime': peso_lime,
                'range_lime': faixa_lime,
            })
        elif em_shap and em_anchors:
            grupos['intersec_shap_anchors'].append({
                'nome_feature': nome,
                'rank_anchors': rank_anchors,
                'rank_shap': rank_shap,
                'range_anchors': faixa_anchors,
                'peso_shap': peso_shap,
            })
        else:
            # Only the lime + anchors pair is left at this point.
            grupos['intersec_lime_anchors'].append({
                'nome_feature': nome,
                'rank_anchors': rank_anchors,
                'rank_lime': rank_lime,
                'peso_lime': peso_lime,
                'range_anchors': faixa_anchors,
                'range_lime': faixa_lime,
            })

    # Order each list by importance (1st anchors, 2nd shap, 3rd lime).
    return {
        chave: sorted(itens, key=compare_feature_weights)
        for chave, itens in grupos.items()
    }

In [108]:
# Feature names taken from the columns of the sample frame.
feature_names = amostras.columns.tolist()

# Step 1: take the features found by all three explainers.
# Step 2: if fewer than n_f were found, complete the list with features found
#         by exactly two explainers, ordered with compare_feature_weights
#         (stated priority between explainers: 1 - anchors, 2 - shap, 3 - lime).
def generate_top_n_features_v1(feature_names, json_out, n_f):
    """Pick up to ``n_f`` features per instance from the explainer intersections.

    Args:
        feature_names: Candidate feature names, forwarded to
            ``intersect_feature_explanations``.
        json_out: Mapping from instance key to its exported explanations.
        n_f: Number of features wanted per instance.

    Returns:
        Dict mapping each instance key to its list of selected feature entries.
    """
    top_por_instancia = dict()
    for chave in json_out:
        inter = intersect_feature_explanations(feature_names, json_out[chave])

        # Step 1: features that all explainers agree on come first.
        escolhidas = inter['intersec_all'][0:n_f]
        faltam = n_f - len(escolhidas)

        complemento = []
        if faltam:
            # Step 2: each pairwise intersection contributes at most `faltam`
            # candidates; the merged candidates are then sorted together.
            candidatas = []
            for grupo in ('intersec_shap_lime', 'intersec_shap_anchors', 'intersec_lime_anchors'):
                candidatas = candidatas + inter[grupo][0:faltam]
            complemento = sorted(candidatas, key=compare_feature_weights)

        top_por_instancia[chave] = escolhidas + complemento[0:faltam]
    return top_por_instancia

# Keep the five best-ranked features of each instance and pretty-print the result.
final_explanations = generate_top_n_features_v1(feature_names, json_output_exp, n_f=5)
print(json.dumps(final_explanations, indent=4))

{
    "60621": [
        {
            "nome_feature": "bugFixCount",
            "rank_anchors": 1,
            "rank_shap": 1,
            "rank_lime": 1,
            "peso_shap": 0.08669926474690005,
            "peso_lime": 0.1367958122599585,
            "range_anchors": "bugFixCount <= 4.00",
            "range_lime": "0.00 < bugFixCount <= 4.00"
        },
        {
            "nome_feature": "methodRfc",
            "rank_anchors": 2,
            "rank_shap": 3,
            "rank_lime": 3,
            "peso_shap": 0.05268806497404086,
            "peso_lime": 0.053828724236807686,
            "range_anchors": "methodRfc > 10.00",
            "range_lime": "methodRfc > 10.00"
        },
        {
            "nome_feature": "methodLoc",
            "rank_shap": 2,
            "rank_lime": 2,
            "peso_shap": 0.058553218067354224,
            "peso_lime": 0.05520834638214641,
            "range_lime": "methodLoc > 25.00"
        },
        {
            "nome_feature": "