##### Imports

In [1]:
from  asvFormula.classesSizes.recursiveFormula import *
from asvFormula.bayesianNetworks.bayesianNetwork import *
from asvFormula.bayesianNetworks import networkSamplesPath
from asvFormula.datasetManipulation import *
import pandas as pd
from typing import List
import numpy as np
import random 
from pgmpy.readwrite import BIFReader
from pgmpy.inference import VariableElimination
import random
import sys,os

from sklearn.preprocessing import LabelEncoder

### Auxiliary Functions

In [2]:
# Module-level flag: True while sys.stdout is the real stream, False while it is redirected.
printEnabled = True
def disablePrint():
    """Silence ``print`` output by redirecting ``sys.stdout`` to ``os.devnull``.

    The current stream is stashed on ``sys._jupyter_stdout`` so that
    ``enablePrint`` can put it back. Calling this while output is already
    disabled is a no-op, so the stashed stream is never overwritten with the
    devnull handle.
    """
    global printEnabled
    if not printEnabled:
        return
    # Remember the live stream before swapping it out.
    sys._jupyter_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')
    printEnabled = False

def enablePrint():
    """Restore the stdout stream that ``disablePrint`` replaced.

    Does nothing when printing is already enabled. Without this guard, calling
    the function with no preceding ``disablePrint()`` (or twice in a row) would
    close the live ``sys.stdout`` and could fail on a missing
    ``sys._jupyter_stdout`` attribute.
    """
    global printEnabled
    if printEnabled:
        return
    # sys.stdout is the devnull handle opened by disablePrint(); release it
    # before putting the saved stream back.
    sys.stdout.close()
    sys.stdout = sys._jupyter_stdout
    printEnabled = True

def convertDictToCsv(dict, filename):
    """Write a dictionary to ``filename`` as a CSV file.

    Args:
        dict: Mapping to export; each key becomes a row label of the table.
        filename: Destination path (or buffer) handed to ``DataFrame.to_csv``.
    """
    # orient='index' turns every dictionary key into a row rather than a column.
    pd.DataFrame.from_dict(dict, orient='index').to_csv(filename)

## Running ASV in Python

#### Train the decision tree model

In [9]:
# Seed the random number generators so the sampled dataset, and therefore the
# fitted tree, can be reproduced after a kernel restart.
random.seed(10)
# random.seed() does not affect NumPy. The dataset sampler presumably draws
# from NumPy's global RNG — TODO confirm against datasetFromBayesianNetwork.
np.random.seed(10)

cancerNetworkPath = networkSamplesPath + "/cancer.bif"

# Load the Bayesian network and build an exact-inference engine over it.
BNmodel = BIFReader(cancerNetworkPath).get_model()
BNInference = VariableElimination(BNmodel)

variableToPredict = "Cancer"
treeMaxDepth = 2
# Generate a dataset from the network (the second argument is presumably the
# number of sampled rows — confirm in datasetFromBayesianNetwork).
dataFromBN = datasetFromBayesianNetwork(BNmodel, 2000)
# NOTE(review): these columns are taken before encoding and include
# variableToPredict; confirm obtainNetworkXTreeStructure expects that.
featureColumns = dataFromBN.columns
encodingDict, encodedDataset = encodeCategoricalColumns(dataFromBN)
# Fit the depth-limited tree on the encoded data, then convert it to a graph.
dtTreeClassifier = decisionTreeFromDataset(encodedDataset, variableToPredict , treeMaxDepth)
dtAsNetwork = obtainNetworkXTreeStructure(dtTreeClassifier, featureColumns)

  0%|          | 0/5 [00:00<?, ?it/s]

#### Compute the Shapley Values

In [5]:
import shap

# Build a SHAP TreeExplainer around the fitted decision tree classifier.
explainer = shap.TreeExplainer(dtTreeClassifier)

# TODO: the Shapley-value computation below is still disabled. `X_test` is not
# defined in the cells shown here, so a feature matrix has to be chosen before
# re-enabling it.
#shap_values = explainer.shap_values(X_test)

# NOTE(review): the earlier remark here referred to an "income >50K" positive
# class, which looks copied from another example; this notebook predicts
# "Cancer". Whether shap_values is a per-class list (so that [1] selects the
# positive class) presumably depends on the installed shap version — confirm.
#shap_values = shap_values[1]


#### ASV Computation

In [8]:
def asvForFeature(dag : nx.DiGraph, feature : str, instance : pd.Series, model, dataset : pd.DataFrame, feature_distributions : VariableElimination) -> float:
    """Accumulate the ASV contribution of ``feature`` over its equivalence classes.

    Args:
        dag: Graph handed to ``equivalenceClassesFor``.
        feature: Name of the feature whose value is being computed.
        instance: The instance being explained.
        model: Predictor forwarded to ``asvForEquivalenceClass``.
        dataset: Data forwarded to ``asvForEquivalenceClass``.
        feature_distributions: Inference engine forwarded to
            ``asvForEquivalenceClass``.

    Returns:
        The sum, over all equivalence classes, of the class size multiplied by
        the value computed for that class.

    NOTE(review): the weighted sum is returned as-is; whether it should be
    divided by the total number of orderings is not visible here — confirm.
    """
    # Use the requested feature. This was previously hard-coded to "age", which
    # ignored the `feature` argument.
    equivalenceClasses = equivalenceClassesFor(dag, feature)
    asvValue = 0
    for equivalenceClass in equivalenceClasses:
        # Each class is indexable: [0] is a feature ordering, [1] its size.
        classFeaturesOrder = equivalenceClass[0]
        classSize = equivalenceClass[1]
        asvValue += classSize * asvForEquivalenceClass(classFeaturesOrder, feature, instance, model, dataset, feature_distributions)

    return asvValue



def asvForEquivalenceClass(classFeaturesOrder : List[str], feature : str, instance : pd.Series, model, dataset : pd.DataFrame, feature_distributions : VariableElimination) -> float:
    """Sum prediction * probability over the dataset rows compatible with ``instance``.

    The features that precede ``feature`` in ``classFeaturesOrder`` are held at
    the values they take in ``instance``. Every dataset row agreeing with those
    values contributes the model's prediction for that row, weighted by
    ``probOfInstance``.

    Args:
        classFeaturesOrder: Feature ordering representing the equivalence class.
        feature: Feature being evaluated; must appear in ``classFeaturesOrder``.
        instance: The instance being explained.
        model: Object with a ``predict`` method that accepts a one-row
            DataFrame and returns an indexable result (assumed to be
            scikit-learn style — confirm).
        dataset: Rows to search for matching instances.
        feature_distributions: Inference engine forwarded to ``probOfInstance``.

    Returns:
        The accumulated weighted prediction (0 when no row matches).

    Raises:
        ValueError: If ``feature`` is not in ``classFeaturesOrder``.
    """
    asvValue = 0

    # Features placed before `feature` in this ordering are the fixed ones.
    realFeatures = classFeaturesOrder[:classFeaturesOrder.index(feature)]

    matches = matchingInstances(dataset, realFeatures, instance)
    # If the model recorded the column names it was fitted on, pass only those
    # columns so that extra dataset columns (e.g. the target) are not fed to it.
    modelColumns = getattr(model, "feature_names_in_", None)

    # Iterating a DataFrame directly yields column labels, not rows, so walk
    # the rows by position instead.
    for position in range(len(matches)):
        matchingInstance = matches.iloc[position]
        # predict() needs 2-D input; a one-row frame also keeps column dtypes.
        modelInput = matches.iloc[[position]]
        if modelColumns is not None:
            modelInput = modelInput[list(modelColumns)]
        prediction = model.predict(modelInput)[0]
        asvValue += prediction * probOfInstance(matchingInstance, instance, realFeatures, feature_distributions)
    return asvValue


def matchingInstances(dataset, realFeatures, instance):
    """Return the rows of ``dataset`` that agree with ``instance`` on ``realFeatures``.

    Args:
        dataset: DataFrame to filter; it is copied, never modified.
        realFeatures: Column names on which a row must equal ``instance``.
        instance: Mapping/Series providing the reference value per column.

    Returns:
        A new DataFrame with the matching rows (a full copy of ``dataset``
        when ``realFeatures`` is empty).
    """
    remaining = dataset.copy()
    for column in realFeatures:
        wanted = instance[column]
        agrees = remaining[column] == wanted
        # Narrow the candidate rows one feature at a time.
        remaining = remaining[agrees]
    return remaining

def probOfInstance(matchingInstance : pd.Series, instance : pd.Series, realFeatures : List[str], feature_distributions : VariableElimination) -> float:
    """Unfinished stub for the probability weight of ``matchingInstance``.

    As written, it returns whatever ``feature_distributions.query`` yields for
    the single variable ``instance.name``. ``matchingInstance`` is not used.

    NOTE(review): the evidence mapping is filled with a dummy string and is
    never passed to the query. ``instance.name`` is presumably the row label
    of the instance rather than a network variable — confirm. The ``float``
    return annotation likely does not match what ``query`` returns — confirm.
    """
    # Placeholder evidence: every fixed feature maps to the same dummy value.
    evidence = dict.fromkeys(realFeatures, 'What to dooooo')

    queryResult = feature_distributions.query(variables=[instance.name])
    return queryResult

#### Compute the ASV

In [None]:
# Compute the ASV of "Smoker" for the first row of the encoded dataset, using
# the tree graph, the fitted tree and the inference engine built over the network.
asvForFeature(dtAsNetwork, "Smoker", encodedDataset.iloc[0], dtTreeClassifier, encodedDataset, BNInference)