# First Steps

## Importing Libraries

In [1]:
#!pip install --quiet -r requirements.txt

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
%matplotlib inline
from ExKMC.Tree import Tree
from IPython.display import Image

# Fix the seed of NumPy's global random state.
# NOTE(review): presumably meant to make the sampling-based explainers below
# repeatable — confirm that they actually draw from this global state.
np.random.seed(1)
import sys

import sklearn
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import shap
import joblib as jbl

import lime
import lime.lime_tabular

from anchor import utils
from anchor import anchor_tabular

import json
import warnings

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


## Importing dataset and running Random Forest

In [3]:
# Load the training dataset and use its "id_" column as the row index.
data = pd.read_csv("21092023_Dataset_40k.csv").set_index("id_")

In [4]:
# Load the instances that will be explained.
samples = pd.read_csv("Amostra-Dataset.csv")
# Keep the ids as a plain Python list; this must happen before "id_" is moved
# into the index below, because the column is no longer addressable afterwards.
samples_indexes = samples['id_'].to_list()
print(samples_indexes)

# Re-index the samples by "id_" and show the resulting frame.
samples = samples.set_index("id_")
display(samples)

[60621, 63330, 84054, 99548, 42623, 62913, 64822, 833560, 45080, 45343]


Unnamed: 0_level_0,methodAnonymousClassesQty,methodAssignmentsQty,methodCbo,methodComparisonsQty,methodLambdasQty,methodLoc,methodLoopQty,methodMathOperationsQty,methodMaxNestedBlocks,methodNumbersQty,...,methodReturnQty,methodRfc,methodStringLiteralsQty,methodSubClassesQty,methodTryCatchQty,methodUniqueWordsQty,methodVariablesQty,methodWmc,bugFixCount,refactoringsInvolved
id_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60621,0,9,1,0,0,77,0,64,1,59,...,0,32,0,0,0,22,8,4,4,4
63330,1,4,13,0,0,41,3,0,1,0,...,1,12,6,0,0,36,3,6,3,11
84054,0,13,10,4,0,44,0,0,1,0,...,5,17,0,0,0,32,10,9,13,42
99548,1,12,10,0,0,40,0,0,1,0,...,1,17,0,0,1,52,8,4,51,30
42623,0,0,9,0,0,44,0,0,1,0,...,14,11,2,0,0,35,0,14,7,13
62913,1,13,9,2,0,41,0,0,1,0,...,0,17,3,0,0,48,2,7,172,67
64822,0,14,22,1,0,18,0,0,1,0,...,1,4,0,0,0,52,14,1,0,8
833560,1,19,14,3,0,48,0,4,1,6,...,3,21,3,0,0,56,17,6,13,24
45080,0,22,12,0,2,50,0,0,1,2,...,0,26,0,0,0,48,12,4,2,8
45343,0,4,12,0,0,22,0,0,1,1,...,2,23,0,0,0,24,4,4,60,25


In [5]:
# Separate the target column "y" from the predictor columns.
y = data['y']
X = data.drop(columns=['y'])

# Hold out 33% of the rows as a test set; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit the random forest classifier on the training portion.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

# Column names of the training predictors, as a plain list.
feature_names = X_train.columns.tolist()

## Setting up explainers

In [6]:
# Load the cached SHAP explainer if one exists; otherwise build it from the
# fitted forest and cache it for the next run.
# NOTE(review): a cached explainer is reused as-is — nothing here checks that
# it still matches `forest`, so delete "shap_explainer" after retraining.
# NOTE(review): joblib files are pickle-based; only load caches you created.
try:
    with open("./" + 'shap_explainer', 'rb') as f:
        explainer_shap = jbl.load(f)
except Exception:
    # `Exception` instead of a bare `except` lets KeyboardInterrupt and
    # SystemExit propagate, while a missing or unreadable cache still falls
    # back to rebuilding the explainer.
    explainer_shap = shap.TreeExplainer(forest)
    with open("./" + 'shap_explainer', 'wb') as f:
        jbl.dump(explainer_shap, f)

In [7]:
# A single LIME explainer, built from the training data, serves every model.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=X_train.columns.values.tolist(),
    class_names=[0, 1],
    verbose=True,
    mode='classification',
    discretize_continuous=True,
)

In [8]:
# Build the Anchors explainer from the training data.
# The arguments are positional: presumably class names, feature names, the
# training matrix and the categorical-names mapping (empty here) — TODO confirm
# against the anchor package's AnchorTabularExplainer signature.
explainer_anchors = anchor_tabular.AnchorTabularExplainer(
    [0, 1],
    X_train.columns.values.tolist(),
    X_train.values,
    {})

# Running instances and showing graphs

In [9]:
def instance_local_explanations(i, samples):
    """Print and render the SHAP, LIME and Anchors explanations of one sample.

    Uses the notebook-level ``forest``, ``explainer_shap``, ``explainer_lime``,
    ``explainer_anchors`` and ``samples_indexes`` objects.

    Args:
        i: Positional index of the instance inside ``samples``.
        samples: DataFrame holding the instances to explain.

    Returns:
        None. Everything is printed or displayed in the notebook.
    """
    # show row content
    print("Instance n°: ", samples_indexes[i])
    print(samples.iloc[i])
    print("Random Forest Prediction: ", forest.predict_proba(samples.iloc[[i]]))

    # SHAP explainer
    print("SHAP")
    row = samples.iloc[i]
    shap_values = explainer_shap.shap_values(row)
    shap.initjs()
    # NOTE(review): index 1 presumably selects the "refactor" class — confirm
    # against the shape returned by the installed shap version.
    display(shap.force_plot(explainer_shap.expected_value[1], shap_values[1], row))

    # LIME explainer
    print("LIME")
    exp_lime = explainer_lime.explain_instance(samples.values[i], forest.predict_proba, num_features=8)
    exp_lime.show_in_notebook(show_table=True)

    # ANCHORS explainer
    print("ANCHORS")
    exp_anchors = explainer_anchors.explain_instance(samples.values[i], forest.predict, threshold=0.95)
    print('Anchor: %s' % (' AND '.join(exp_anchors.names())))
    print('Precision: %.2f' % exp_anchors.precision())
    print('Coverage: %.2f' % exp_anchors.coverage())
    exp_anchors.show_in_notebook()

    print("-----------------------------------------------------------------------------------------------")

# Exporting explanations

## File format .json

### Export features and general data of each explainer

In [10]:
def compare_shap_feature_weights(feature_value):
    """Sort key that places the largest ``'feature_weight'`` first.

    Args:
        feature_value: Mapping with a numeric ``'feature_weight'`` entry.

    Returns:
        The negated weight, so an ascending sort yields descending weights.
    """
    weight = feature_value['feature_weight']
    return -weight

def export_shap_exp(row, feature_names, shap_values):
    """Collect the positively-weighted SHAP features of one instance.

    Args:
        row: Mapping-like instance (e.g. a pandas Series) indexed by feature name.
        feature_names: Feature names, in the same order as the SHAP values.
        shap_values: Indexable whose element ``1`` holds the per-feature values
            used here (the original comments call it the "to refactor" side).

    Returns:
        ``{'features': [...]}`` where each entry has ``feature_name``,
        ``feature_value``, ``feature_weight``, ``feature_ranges`` (always None)
        and ``feature_rank`` (1 = highest weight).
    """
    refactor_contributions = shap_values[1]

    positive_features = []
    for position, name in enumerate(feature_names):
        weight = refactor_contributions[position]
        # only positive contributions (pushing towards "refactor") are exported
        if not weight > 0:
            continue
        positive_features.append({
            'feature_name': name,
            'feature_value': int(row[name]),
            'feature_weight': weight,
            'feature_ranges': None,
        })

    # highest weight first, then number the entries in that order
    ranked = sorted(positive_features, key=compare_shap_feature_weights)
    for rank, entry in enumerate(ranked, start=1):
        entry['feature_rank'] = rank

    return {'features': ranked}

def export_lime_exp(row, feature_names, exp_lime):
    """Collect the positively-weighted LIME features of one instance.

    Args:
        row: Mapping-like instance (e.g. a pandas Series) indexed by feature name.
        feature_names: Names of all features that may appear in the explanation.
        exp_lime: Explanation object exposing ``intercept``, ``local_pred`` and
            ``as_list()`` (pairs of range description and weight).

    Returns:
        Dict with ``intercept``, ``local_prediction`` and ``features``; each
        feature entry has ``feature_name``, ``feature_value``,
        ``feature_weight``, ``feature_ranges`` and ``feature_rank``.

    Raises:
        ValueError: If a positively-weighted description mentions none of
            ``feature_names``.
    """
    lime_output = dict()
    # general instance indices
    lime_output['intercept'] = exp_lime.intercept[1]
    lime_output['local_prediction'] = exp_lime.local_pred[0]

    # features' values
    features_exp = []
    rank = 1
    for description, weight in exp_lime.as_list():
        # positive weights push towards "refactor"; negative ones are skipped
        if weight > 0:
            # The description is a range string such as "0.00 < name <= 4.00".
            # Take the LONGEST feature name it contains, so that a name which
            # is a substring of another (e.g. "Loc" vs "methodLoc") cannot
            # steal the match merely by appearing earlier in feature_names.
            candidates = [name for name in feature_names if name in description]
            if not candidates:
                raise ValueError(
                    "no known feature name found in LIME description %r" % (description,)
                )
            feature_name = max(candidates, key=len)

            f = dict()
            f['feature_name'] = feature_name
            f['feature_value'] = int(row[feature_name])
            f['feature_weight'] = weight
            f['feature_ranges'] = description
            f['feature_rank'] = rank  # feature's order of priority in explainer's result
            features_exp.append(f)
            rank = rank + 1  # ranks the positive features only
    lime_output['features'] = features_exp
    return lime_output

def export_anchors_exp(row, feature_names, anchors_exp):
    """Collect the features that make up the Anchors rule of one instance.

    Args:
        row: Mapping-like instance (e.g. a pandas Series) indexed by feature name.
        feature_names: Names of all features that may appear in the anchor.
        anchors_exp: Explanation object exposing ``precision()``,
            ``coverage()`` and ``names()`` (one condition per anchor feature).

    Returns:
        Dict with ``precision``, ``coverage`` and ``features``; each feature
        entry has ``feature_name``, ``feature_value``, ``feature_weight``
        (always None), ``feature_ranges`` and ``feature_rank``.

    Raises:
        ValueError: If an anchor condition mentions none of ``feature_names``.
    """
    anchors_output = dict()
    # general instance indices
    anchors_output['precision'] = anchors_exp.precision()
    anchors_output['coverage'] = anchors_exp.coverage()

    # features' values
    features_exp = []
    rank = 1
    for name in anchors_exp.names():
        condition = str(name)
        # Take the LONGEST feature name contained in the condition, so that a
        # name which is a substring of another (e.g. "Loc" vs "methodLoc")
        # cannot steal the match merely by appearing earlier in feature_names.
        candidates = [candidate for candidate in feature_names if candidate in condition]
        if not candidates:
            raise ValueError(
                "no known feature name found in anchor condition %r" % (condition,)
            )
        feature_name = max(candidates, key=len)

        f = dict()
        f['feature_name'] = feature_name
        f['feature_value'] = int(row[feature_name])
        f['feature_weight'] = None
        f['feature_ranges'] = name
        f['feature_rank'] = rank  # feature's order of priority in explainer's result
        features_exp.append(f)
        rank = rank + 1
    anchors_output['features'] = features_exp
    return anchors_output

### Run explainers for a samples dataset and export each one

In [11]:
def run_and_export_explainers(samples, i):
    """Run SHAP, LIME and Anchors on one sample and export each explanation.

    Uses the notebook-level ``forest``, ``explainer_shap``, ``explainer_lime``
    and ``explainer_anchors`` objects.

    Args:
        samples: DataFrame holding the instances to explain.
        i: Positional index of the instance inside ``samples``.

    Returns:
        A three-element list: the exported SHAP, LIME and Anchors dicts, in
        that order.
    """
    column_names = list(samples.columns)
    instance = samples.iloc[i]

    # SHAP
    shap_part = export_shap_exp(
        instance,
        column_names,
        explainer_shap.shap_values(instance),
    )

    # LIME
    lime_explanation = explainer_lime.explain_instance(
        samples.values[i], forest.predict_proba, num_features=8
    )
    lime_part = export_lime_exp(instance, column_names, lime_explanation)

    # ANCHORS
    anchors_explanation = explainer_anchors.explain_instance(
        samples.values[i], forest.predict, threshold=0.95
    )
    anchors_part = export_anchors_exp(instance, column_names, anchors_explanation)

    return [shap_part, lime_part, anchors_part]

### Combine the explanations into a single JSON together with the instance data

In [12]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): both calls install a catch-all "ignore" filter, so the second
# one looks redundant; a blanket ignore also hides deprecation and
# feature-name warnings from the libraries used below.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [18]:
def export_json_explanations(samples, indexes):
    """Explain every sample and dump the result to ``explanations_samples.json``.

    Uses the notebook-level ``forest`` and ``run_and_export_explainers``.

    Args:
        samples: DataFrame holding the instances to explain.
        indexes: Sequence of instance ids; ``indexes[k]`` is the key under
            which the explanation of row ``k`` is stored.

    Returns:
        Dict mapping each instance id to its forest prediction and its SHAP,
        LIME and Anchors explanations (the same content written to the file).
    """
    explanations = {}
    for position in range(len(samples)):
        # class probabilities predicted by the random forest for this row
        class_probs = forest.predict_proba(samples.iloc[[position]])[0]
        # run and export the three explainers for this row
        shap_part, lime_part, anchors_part = run_and_export_explainers(samples, position)
        # store everything under the instance id
        explanations[indexes[position]] = {
            'forest_prediction': {
                'not to refactor': class_probs[0],
                'refactor': class_probs[1],
            },
            'shap': shap_part,
            'lime': lime_part,
            'anchors': anchors_part,
        }

    # export explanations to json file
    with open("explanations_samples.json", "w") as outfile:
        json.dump(explanations, outfile)
    return explanations

# Explain every sample; this also writes explanations_samples.json and keeps
# the resulting dict in memory for the analysis cells below.
json_output_exp = export_json_explanations(samples, samples_indexes)
# Pretty-print the explanations as indented JSON.
print(json.dumps(json_output_exp, indent=4))

Intercept 0.3983520684376731
Prediction_local [0.75947097]
Right: 0.72
Intercept 0.3736842713337413
Prediction_local [0.7365066]
Right: 0.81
Intercept 0.3891187709623613
Prediction_local [0.69890339]
Right: 0.97
Intercept 0.44525973239188626
Prediction_local [0.51345533]
Right: 0.51
Intercept 0.48349241309973656
Prediction_local [0.59197496]
Right: 0.9
Intercept 0.5326300635890906
Prediction_local [0.49365503]
Right: 0.49
Intercept 0.43409841612312483
Prediction_local [0.67863806]
Right: 0.8
Intercept 0.43709530683567177
Prediction_local [0.6940152]
Right: 0.84
Intercept 0.4650824269670306
Prediction_local [0.7842747]
Right: 0.98
Intercept 0.43887803221331567
Prediction_local [0.43054244]
Right: 0.85
{
    "60621": {
        "forest_prediction": {
            "not to refactor": 0.28,
            "refactor": 0.72
        },
        "shap": {
            "features": [
                {
                    "feature_name": "bugFixCount",
                    "feature_value": 4,
            

## File format .csv

In [14]:
# shap_output = pd.DataFrame()
# lime_output = pd.DataFrame()
# for i in range(0,10):
#     (shap_output, lime_output) = instance_local_explanations(i, samples, shap_output, lime_output)
# shap_output.to_csv('shap_explanations.csv')
# lime_output.to_csv('lime_explanations.csv')

# Analysing explanations

In [15]:
# Show the exported explanations again as indented JSON for inspection.
print(json.dumps(json_output_exp, indent=4))

{
    "60621": {
        "forest_prediction": {
            "not to refactor": 0.28,
            "refactor": 0.72
        },
        "shap": {
            "features": [
                {
                    "feature_name": "bugFixCount",
                    "feature_value": 4,
                    "feature_weight": 0.08669926474690005,
                    "feature_ranges": null,
                    "feature_rank": 1
                },
                {
                    "feature_name": "methodLoc",
                    "feature_value": 77,
                    "feature_weight": 0.058553218067354224,
                    "feature_ranges": null,
                    "feature_rank": 2
                },
                {
                    "feature_name": "methodRfc",
                    "feature_value": 32,
                    "feature_weight": 0.05268806497404086,
                    "feature_ranges": null,
                    "feature_rank": 3
                },
                {
       

In [19]:
def extract_feature_names(json_features):
    """Return the ``'feature_name'`` of every entry, in the given order.

    Args:
        json_features: Iterable of dicts that each carry a ``'feature_name'``.

    Returns:
        List of feature names.
    """
    return [entry['feature_name'] for entry in json_features]

def get_info_by_feature_name(exp_f, feature_name):
    """Look up one feature entry by name and return its main fields.

    Args:
        exp_f: Iterable of exported feature dicts.
        feature_name: Name to search for; the first matching entry is used.

    Returns:
        Tuple ``(feature_value, feature_weight, feature_ranges, feature_rank)``.

    Raises:
        StopIteration: If no entry has the requested name.
    """
    matching_entries = (entry for entry in exp_f if entry["feature_name"] == feature_name)
    found = next(matching_entries)
    wanted_fields = ('feature_value', 'feature_weight', 'feature_ranges', 'feature_rank')
    return tuple(found[field] for field in wanted_fields)

def compare_feature_weights(feature_value):
    """Sort key built from the explainer ranks present in a feature summary.

    The ranks are taken in priority order anchors, shap, lime; only the keys
    that exist in ``feature_value`` contribute. For every combination of two
    or three rank keys this yields the same tuple as the previous if/elif
    chain. Summaries with fewer than two rank keys now get a (shorter) tuple
    instead of falling through to ``None``, which made ``sorted()`` raise
    ``TypeError`` as soon as two such entries were compared.

    Args:
        feature_value: Dict that may hold ``'rank_anchors'``, ``'rank_shap'``
            and/or ``'rank_lime'``.

    Returns:
        Tuple of the available ranks in priority order (possibly empty).
    """
    priority = ('rank_anchors', 'rank_shap', 'rank_lime')
    return tuple(feature_value[key] for key in priority if key in feature_value)

def intersect_feature_explanations(feature_names, instance):
    """Group the features of one instance by which explainers agree on them.

    A feature lands in exactly one bucket: all three explainers, shap+lime,
    shap+anchors or lime+anchors. Features reported by fewer than two
    explainers are left out.

    Args:
        feature_names: Feature names to examine, in iteration order.
        instance: Exported explanations of one instance, with a ``'features'``
            list under each of ``'shap'``, ``'lime'`` and ``'anchors'``.

    Returns:
        Dict with keys ``intersec_all``, ``intersec_shap_lime``,
        ``intersec_shap_anchors`` and ``intersec_lime_anchors``; each value is
        a list of feature summaries sorted with ``compare_feature_weights``.
    """
    shap_entries = instance['shap']['features']
    lime_entries = instance['lime']['features']
    anchors_entries = instance['anchors']['features']

    names_in_shap = extract_feature_names(shap_entries)
    names_in_lime = extract_feature_names(lime_entries)
    names_in_anchors = extract_feature_names(anchors_entries)

    buckets = {
        'intersec_all': [],
        'intersec_shap_lime': [],
        'intersec_shap_anchors': [],
        'intersec_lime_anchors': [],
    }

    for name in feature_names:
        has_shap = name in names_in_shap
        has_lime = name in names_in_lime
        has_anchors = name in names_in_anchors

        # pick the bucket; a feature seen by fewer than two explainers is skipped
        if has_shap and has_lime and has_anchors:
            bucket = 'intersec_all'
        elif has_shap and has_lime:
            bucket = 'intersec_shap_lime'
        elif has_shap and has_anchors:
            bucket = 'intersec_shap_anchors'
        elif has_lime and has_anchors:
            bucket = 'intersec_lime_anchors'
        else:
            continue

        # each info tuple is (value, weight, ranges, rank)
        anchors_info = get_info_by_feature_name(anchors_entries, name) if has_anchors else None
        shap_info = get_info_by_feature_name(shap_entries, name) if has_shap else None
        lime_info = get_info_by_feature_name(lime_entries, name) if has_lime else None

        summary = {'feature_name': name}
        # the value comes from anchors when available, otherwise from shap
        value_source = anchors_info if has_anchors else shap_info
        summary['feature_value'] = value_source[0]

        # key order matters for the exported JSON: ranks, then weights, then ranges
        if has_anchors:
            summary['rank_anchors'] = anchors_info[3]
        if has_shap:
            summary['rank_shap'] = shap_info[3]
        if has_lime:
            summary['rank_lime'] = lime_info[3]
        if has_shap:
            summary['weight_shap'] = shap_info[1]
        if has_lime:
            summary['weight_lime'] = lime_info[1]
        if has_anchors:
            summary['range_anchors'] = anchors_info[2]
        if has_lime:
            summary['range_lime'] = lime_info[2]

        buckets[bucket].append(summary)

    # order each list by importance (1st anchors, 2nd shap, 3rd lime)
    compiled_intersec = dict()
    for bucket_name, summaries in buckets.items():
        compiled_intersec[bucket_name] = sorted(summaries, key=compare_feature_weights)
    return compiled_intersec

In [20]:
def generate_top_n_features_v1(feature_names, json_out, n_f):
    """Pick the top ``n_f`` features per instance and export them to JSON.

    Algorithm:
        1. Take the features on which all three explainers agree.
        2. If fewer than ``n_f`` were found, fill up from the features shared
           by two explainers, ordered with ``compare_feature_weights``
           (priority: 1 - anchors, 2 - shap, 3 - lime).

    The result is also written to ``top_<n_f>_features_explanations.json``.

    Args:
        feature_names: Feature names passed on to
            ``intersect_feature_explanations``.
        json_out: Dict mapping instance ids to their exported explanations.
        n_f: Number of features to keep per instance.

    Returns:
        Dict mapping each instance id to its ``forest_prediction_to_refactor``
        and its ``top_<n_f>_features`` list.
    """
    top_key = 'top_'+str(n_f)+'_features'
    pairwise_buckets = ('intersec_shap_lime', 'intersec_shap_anchors', 'intersec_lime_anchors')

    final_explanations = dict()
    for instance_id, instance in json_out.items():
        intersections = intersect_feature_explanations(feature_names, instance)

        # step 1: features confirmed by all three explainers
        selected = intersections['intersec_all'][0:n_f]
        still_needed = n_f - len(selected)

        if still_needed:
            # step 2: gather up to `still_needed` candidates from every pairwise
            # intersection, order them together and keep the best ones
            candidates = []
            for bucket in pairwise_buckets:
                candidates.extend(intersections[bucket][0:still_needed])
            candidates.sort(key=compare_feature_weights)
            selected = selected + candidates[0:still_needed]

        final_explanations[instance_id] = {
            'forest_prediction_to_refactor': instance['forest_prediction']['refactor'],
            top_key: selected,
        }

    # export explanations to json file
    file_name = 'top_'+str(n_f)+'_features_explanations.json'
    with open(file_name, "w") as outfile:
        json.dump(final_explanations, outfile)

    return final_explanations

# Select the top 5 features of every explained instance (this call also writes
# top_5_features_explanations.json) and pretty-print the result.
final_explanations = generate_top_n_features_v1(feature_names, json_output_exp, 5)
print(json.dumps(final_explanations, indent=4))

{
    "60621": {
        "forest_prediction_to_refactor": 0.72,
        "top_5_features": [
            {
                "feature_name": "bugFixCount",
                "feature_value": 4,
                "rank_anchors": 1,
                "rank_shap": 1,
                "rank_lime": 1,
                "weight_shap": 0.08669926474690005,
                "weight_lime": 0.1481879078667768,
                "range_anchors": "bugFixCount <= 4.00",
                "range_lime": "0.00 < bugFixCount <= 4.00"
            },
            {
                "feature_name": "methodRfc",
                "feature_value": 32,
                "rank_anchors": 2,
                "rank_shap": 3,
                "rank_lime": 3,
                "weight_shap": 0.05268806497404086,
                "weight_lime": 0.05023540236417073,
                "range_anchors": "methodRfc > 10.00",
                "range_lime": "methodRfc > 10.00"
            },
            {
                "feature_name": "methodLoc",
  