## Analyze Counterfactual Data

In [4]:
import os
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import (
    StratifiedKFold,
    RandomizedSearchCV,
    train_test_split,
)
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
    
from sklearn.linear_model import LinearRegression

In [13]:
# Folder holding the CSV exports, relative to this notebook.
data_dir = os.path.join('..', '..', 'data', 'csvs')

# One path per input file; the query path gets its own name so that
# `queries` only ever refers to the loaded DataFrame.
cfs = os.path.join(data_dir, 'counterfactuals-1-to-2.csv')
perturbations = os.path.join(data_dir, 'counterfactual-perturbations-1-to-2.csv')
queries_csv = os.path.join(data_dir, 'query-instances-1-to-2.csv')

# Read the three files in a fixed order: counterfactuals, perturbations, queries.
df_cfs, df_perturbations, queries = (
    pd.read_csv(csv_path) for csv_path in (cfs, perturbations, queries_csv)
)

In [14]:
# Column-wise mean of the perturbation table (one value per column).
df_perturbations.mean(axis=0)

alpha-a          0.126120
alpha-b          0.130757
alpha-c          0.121163
beta-a           0.056856
beta-b           0.062923
beta-c           0.068596
gamma-b-a        0.137744
gamma-c-a        0.132972
delta-b-a        0.125730
delta-c-a        0.138966
change-in-bea    0.065016
dtype: float64

In [15]:
# Count, per query, how many of its counterfactuals changed each feature.
#
# Result: feature_changes[i][feature] -> number of counterfactuals of query i
# whose value for `feature` differs from the original query's value.

# Number of counterfactual rows that belong to each query. The slicing below
# relies on df_cfs storing them in consecutive blocks of this size, in the same
# row order as `queries` -- TODO confirm against the code that wrote the CSVs.
CFS_PER_QUERY = 10

# Convert each frame to a list of row dictionaries once, rather than
# re-slicing and re-converting on every loop iteration.
query_records = queries.to_dict('records')
cf_records = df_cfs.to_dict('records')

feature_changes = {}

for i, original_query in enumerate(query_records):
    # Rows [i * CFS_PER_QUERY, (i + 1) * CFS_PER_QUERY) are the counterfactuals
    # for query i. A short or empty slice (df_cfs has fewer rows than expected)
    # simply yields lower counts, exactly as plain slicing does.
    counterfactuals = cf_records[i * CFS_PER_QUERY:(i + 1) * CFS_PER_QUERY]

    # A feature counts as changed when the values are not exactly equal.
    # NOTE(review): exact `!=` means NaN in both rows is counted as a change.
    feature_changes[i] = {
        feature: sum(cf_row[feature] != value for cf_row in counterfactuals)
        for feature, value in original_query.items()
    }


In [16]:
# Sum the per-query change counts into one total per feature.
# Accumulating over every per-query dictionary (instead of taking the feature
# list from feature_changes[0]) also works when there are no queries at all:
# total_changes is then empty and nothing is printed.
total_changes = {}
for per_query_counts in feature_changes.values():
    for feature, n_changed in per_query_counts.items():
        total_changes[feature] = total_changes.get(feature, 0) + n_changed

# Print the total count of changes for each feature across all queries
for feature, count in total_changes.items():
    print(f"Total changes for feature '{feature}': {count}")

Total changes for feature 'alpha-a': 5860
Total changes for feature 'alpha-b': 5993
Total changes for feature 'alpha-c': 5238
Total changes for feature 'beta-a': 4331
Total changes for feature 'beta-b': 4622
Total changes for feature 'beta-c': 5120
Total changes for feature 'gamma-b-a': 6008
Total changes for feature 'gamma-c-a': 6039
Total changes for feature 'delta-b-a': 5018
Total changes for feature 'delta-c-a': 5251
Total changes for feature 'change-in-bea': 3370
