##### Imports

In [1]:
from  ASVformula.recursiveFormula import *
import pandas as pd
from typing import List
import numpy as np
import sys, os

In [2]:
from rpy2.robjects import pandas2ri
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri as numpy2ri
import rpy2.robjects as robjects

# Import necessary R packages through rpy2. Each importr() call loads the named
# R package, so it has to be installed in the R environment rpy2 is bound to.
# NOTE(review): none of these handles is referenced in the visible cells;
# presumably they back the plotting helpers -- confirm before removing.
ggplot2 = importr('ggplot2')
ggraph = importr('ggraph')
grid = importr('grid')

### Auxiliary Functions

In [3]:
# True while sys.stdout points at the real notebook stream, False while it is
# redirected to os.devnull by disablePrint().
printEnabled = True

def disablePrint():
    """Silence print() by redirecting sys.stdout to os.devnull.

    The current stream is stashed on sys._jupyter_stdout so enablePrint() can
    restore it. Calling this while output is already disabled is a no-op, so
    the stashed stream is never overwritten with the devnull handle.
    """
    global printEnabled
    if not printEnabled:
        return
    sys._jupyter_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')
    printEnabled = False

def enablePrint():
    """Restore the stream saved by disablePrint() and close the devnull handle.

    Calling this while output is already enabled is a no-op. Without that
    guard the real sys.stdout would be closed and the lookup of
    sys._jupyter_stdout could fail because nothing was ever stashed.
    """
    global printEnabled
    if printEnabled:
        return
    devnull_stream = sys.stdout
    # Restore first, then close, so sys.stdout never refers to a closed file.
    sys.stdout = sys._jupyter_stdout
    devnull_stream.close()
    printEnabled = True

def convertDictToCsv(dict, filename):
    """Write a dictionary to a CSV file, one row per dictionary key.

    Parameters:
    - dict: mapping to export; with orient='index' each key becomes a row label.
    - filename: destination handed to DataFrame.to_csv.
    """
    # Build the frame and write it out in a single expression.
    pd.DataFrame.from_dict(dict, orient='index').to_csv(filename)

## Running ASV in Python

#### Load the dataset and create the causal graph

In [4]:
import pandas as pd

# Adult Census Income dataset from the UCI repository. The raw file has no
# header row, so the column names are supplied here.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Every field of the raw file, in file order. All 15 must be named: when
# `names` is shorter than the number of fields, pandas turns the leading fields
# into the index and the remaining names shift onto the wrong columns.
raw_columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
               "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
               "hours_per_week", "native_country", "income"]

# Columns kept for the analysis (`fnlwgt` and the textual `education` field are
# read past but not loaded).
columns = ["age", "workclass", "education_num", "marital_status", 
           "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", 
           "hours_per_week", "native_country", "income"]

# Load data into a DataFrame. skipinitialspace strips the blank that follows
# each comma, so the missing-value marker reaches the NA check as "?".
data = pd.read_csv(url, header=None, names=raw_columns, usecols=columns,
                   na_values="?", skipinitialspace=True)

# Convert categorical variables to numerical using one-hot encoding.
# Missing categorical values become all-zero indicator rows.
categorical_features = ['workclass', 'marital_status', 'occupation', 
                        'relationship', 'race', 'sex', 'native_country']

data = pd.get_dummies(data, columns=categorical_features)

# Convert target variable to binary: 0 for '<=50K', 1 for anything else.
data['income'] = (data['income'] != '<=50K').astype(int)

X = data.drop('income', axis=1)
y = data['income']

# Causal graph over the original (pre-encoding) feature names.
# NOTE(review): after get_dummies the categorical nodes exist in `data` only as
# prefixed indicator columns -- confirm how the ASV code maps nodes to columns.
nodes = ["marital_status", "education_num", "relationship", "native_country", 
                   "age", "sex", "race", "hours_per_week"]

# Edge i goes from parents[i] to children[i].
parents = ["age", "age", "marital_status", "marital_status", "education_num", "education_num", "native_country"]
children = ["marital_status", "education_num", "relationship", "sex", "hours_per_week", "native_country", "race"]

# `nx` is not imported explicitly in this notebook; presumably it is provided
# by the star import from ASVformula.recursiveFormula.
causal_graph = nx.DiGraph()
causal_graph.add_nodes_from(nodes)
causal_graph.add_edges_from(zip(parents, children))

#drawGraph(causal_graph)

#### Calculate the distribution of each feature and define the helper that generates a modified instance

In [5]:
# Names of every column in `data` that has a numeric dtype.
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()

# Empirical distribution of each numerical feature: the raw array of observed
# values, keyed by column name.
feature_distributions = {
    column_name: data[column_name].values
    for column_name in numerical_features
}

def obtainModifiedInstance(instance, estimatedFeatures, feature_distributions, rng=None):
    """
    Returns a copy of the instance in which every feature listed in
    estimatedFeatures is replaced by a value drawn uniformly at random from
    that feature's observed values. All other features keep their value.

    Parameters:
    - instance: pd.Series representing a single data instance (not modified).
    - estimatedFeatures: list of feature names whose values are resampled.
    - feature_distributions: dictionary mapping each feature name to the array
      of values to sample from.
    - rng: optional numpy.random.Generator used for sampling. When omitted the
      global numpy random state is used, as before; pass a seeded generator to
      make the result reproducible.

    Returns:
    - modified_instance: pd.Series with modified feature values.

    Raises:
    - KeyError: if an estimated feature has no entry in feature_distributions.
    """
    # Create a copy of the instance to avoid modifying the original
    modified_instance = instance.copy()

    # Pick the sampler once: the caller's generator, or the legacy global state.
    draw = np.random.choice if rng is None else rng.choice

    # For estimated features, replace the values
    for feature in estimatedFeatures:
        modified_instance[feature] = draw(feature_distributions[feature])

    return modified_instance


#### Train the random forest model

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees and a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit on the training portion only.
rf_model.fit(X_train, y_train)

#### Compute the Shapley Values

In [7]:
import shap

# Build a SHAP TreeExplainer around the random forest trained in the previous
# cell. Only the explainer object is created here; the Shapley-value
# computation below is commented out and therefore does not run.
explainer = shap.TreeExplainer(rf_model)

# Disabled: compute Shapley values for the test set.
#shap_values = explainer.shap_values(X_test)

# Disabled: for binary classification shap_values is expected to be a list with
# two arrays (presumably one per class -- this depends on the shap version);
# index 1 would select the positive class (income >50K).
#shap_values = shap_values[1]


  from .autonotebook import tqdm as notebook_tqdm


#### Compute the ASV

In [8]:
def asvForFeature(dag : nx.DiGraph, feature : str, instance : pd.Series, model, dataset : pd.DataFrame, feature_distributions : dict) -> float:
    """
    Accumulates the ASV contribution of `feature` over the equivalence classes
    of feature orderings derived from `dag`.

    Parameters:
    - dag: causal graph handed to equivalenceClassesFor.
    - feature: name of the feature being evaluated.
    - instance: pd.Series representing the instance being explained.
    - model: fitted model exposing predict().
    - dataset: pd.DataFrame forwarded to asvForEquivalenceClass.
    - feature_distributions: dictionary of per-feature value arrays.

    Returns:
    - Sum over the classes of classSize * asvForEquivalenceClass(...).
    """
    # Use the graph and feature passed in, not the notebook globals.
    equivalenceClasses = equivalenceClassesFor(dag, feature)

    # NOTE(review): the cell output suggests equivalenceClassesFor returns a
    # dict of {index: [featureOrder, classSize]}; iterating a dict directly
    # would yield its keys, so take the values in that case. A plain sequence
    # of [featureOrder, classSize] pairs is still accepted -- confirm the
    # actual return type.
    if isinstance(equivalenceClasses, dict):
        equivalenceClasses = equivalenceClasses.values()

    asvValue = 0
    for classFeaturesOrder, classSize in equivalenceClasses:
        asvValue += classSize * asvForEquivalenceClass(classFeaturesOrder, feature, instance, model, dataset, feature_distributions)

    return asvValue



def asvForEquivalenceClass(classFeaturesOrder : List[str], feature : str, instance : pd.Series, model, dataset : pd.DataFrame, feature_distributions : dict) -> float:
    """
    Probability-weighted sum of model predictions over the dataset rows that
    agree with `instance` on every feature preceding `feature` in the ordering.

    Parameters:
    - classFeaturesOrder: feature ordering representing the equivalence class.
    - feature: name of the feature being evaluated; must be in the ordering.
    - instance: pd.Series representing the instance being explained.
    - model: fitted model exposing predict(); it is called once on the frame of
      matching rows, so `dataset` must hold the columns the model expects.
    - dataset: pd.DataFrame searched for matching rows.
    - feature_distributions: dictionary forwarded to probOfInstance.

    Returns:
    - The weighted sum as a float (0.0 when no row matches).

    Raises:
    - ValueError: if `feature` is not in classFeaturesOrder.
    """
    # Features placed before `feature` are held at the instance's values.
    realFeatures = classFeaturesOrder[:classFeaturesOrder.index(feature)]

    matches = matchingInstances(dataset, realFeatures, instance)
    if len(matches) == 0:
        return 0.0

    # Predict for all matching rows in one call. Iterating a DataFrame directly
    # yields column labels, not rows, so rows are walked with iterrows() below.
    predictions = model.predict(matches)

    asvValue = 0.0
    for prediction, (_, matchingInstance) in zip(predictions, matches.iterrows()):
        asvValue += float(prediction) * probOfInstance(matchingInstance, instance, realFeatures, feature_distributions)
    return asvValue


def matchingInstances(dataset, realFeatures, instance):
    """
    Returns the rows of dataset whose values equal the instance's values on
    every feature in realFeatures. With no features, all rows are returned.
    The result is a new DataFrame; dataset itself is left untouched.
    """
    # Start with every row selected and narrow the mask one feature at a time.
    keep = np.ones(len(dataset), dtype=bool)
    for name in realFeatures:
        keep &= (dataset[name] == instance[name]).to_numpy()
    return dataset[keep]

def probOfInstance(matchingInstance : pd.Series, instance : pd.Series, realFeatures : List[str], feature_distributions : dict) -> float:
    """
    Placeholder for P( matchingInstance | instance[realFeatures] ).

    Parameters:
    - matchingInstance: pd.Series for a dataset row that matches the instance.
    - instance: pd.Series representing the instance being explained.
    - realFeatures: list of feature names fixed to the instance's values.
    - feature_distributions: dictionary of per-feature value arrays.

    Returns:
    - Currently always 0.5; none of the arguments is used yet.
    """
    # TODO: calculate the probability of matchingInstance given the
    # realFeatures of the instance, instead of returning a constant.
    prob = 0.5

    return prob

{0: [['age',
   'marital_status',
   'education_num',
   'relationship',
   'sex',
   'hours_per_week',
   'native_country',
   'race'],
  210]}