## Analyze Counterfactual Data

In [1]:
import os
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

import xgboost
from xgboost import XGBClassifier

from sklearn.model_selection import (
    StratifiedKFold,
    RandomizedSearchCV,
    train_test_split,
)
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
    
from sklearn.linear_model import LinearRegression

In [18]:
# Directory holding the counterfactual CSV exports. Defaults to the original
# cluster location; set the COUNTERFACTUAL_DATA_DIR environment variable to run
# the notebook against a copy of the data stored somewhere else.
data_dir = os.environ.get(
    'COUNTERFACTUAL_DATA_DIR',
    '/home/dauenha0/murp1677/Cyclic_Dynamics/Code/ML_repo/programmable-catalysis/dev/data/',
)
cfs = os.path.join(data_dir, 'counterfactuals-zero-to-pos.csv')
perturbations = os.path.join(data_dir, 'counterfactual-perturbations-zero-to-pos.csv')
# The path gets its own name so that `queries` only ever refers to the DataFrame.
queries_path = os.path.join(data_dir, 'query-instances-zero-to-pos.csv')

# Load the three CSV files into DataFrames.
df_cfs = pd.read_csv(cfs)
df_perturbations = pd.read_csv(perturbations)
queries = pd.read_csv(queries_path)

In [19]:
# Column-wise mean of the perturbation table (one value per column).
df_perturbations.mean(axis=0)

alpha-a          0.077911
alpha-b          0.081200
alpha-c          0.078822
beta-a           0.135744
beta-b           0.143900
beta-c           0.120111
gamma-b-a        0.135460
gamma-c-a        0.135356
delta-b-a        0.107787
delta-c-a        0.096787
change-in-bea    0.053977
dtype: float64

In [25]:
# Number of counterfactual rows that belong to each query instance. df_cfs is
# read as consecutive blocks of this size, in the same row order as `queries`.
CFS_PER_QUERY = 10

# Fail loudly if the two tables do not line up: with a mismatched row count the
# positional slicing below would silently pair counterfactuals with the wrong
# query, or leave trailing queries with no counterfactuals at all.
expected_cf_rows = len(queries) * CFS_PER_QUERY
if len(df_cfs) != expected_cf_rows:
    raise ValueError(
        f"df_cfs has {len(df_cfs)} rows, but {len(queries)} queries x "
        f"{CFS_PER_QUERY} counterfactuals per query = {expected_cf_rows} rows were expected"
    )

# feature_changes[i][feature] is the number of counterfactuals of query i whose
# value for `feature` differs from the value in the original query.
feature_changes = {}

# Convert the query table to records once instead of slicing it row by row.
for i, original_query in enumerate(queries.to_dict('records')):
    # Positional slice selecting the block of counterfactuals for query i
    block_start = i * CFS_PER_QUERY
    counterfactuals = df_cfs.iloc[block_start:block_start + CFS_PER_QUERY].to_dict('records')

    # Every feature of the query gets an entry, including those that never change (count 0)
    feature_changes[i] = {
        feature: sum(1 for counterfactual in counterfactuals if counterfactual[feature] != value)
        for feature, value in original_query.items()
    }


In [27]:
# Collapse the per-query tallies into one grand total per feature.
# The feature names are taken from the first query's tally.
total_changes = {}
for feature in feature_changes[0]:
    total_changes[feature] = sum(
        per_query_counts[feature] for per_query_counts in feature_changes.values()
    )

# Report the grand total for every feature, one line each
for feature, count in total_changes.items():
    print(f"Total changes for feature '{feature}': {count}")

Total changes for feature 'alpha-a': 324
Total changes for feature 'alpha-b': 330
Total changes for feature 'alpha-c': 321
Total changes for feature 'beta-a': 813
Total changes for feature 'beta-b': 905
Total changes for feature 'beta-c': 781
Total changes for feature 'gamma-b-a': 622
Total changes for feature 'gamma-c-a': 621
Total changes for feature 'delta-b-a': 533
Total changes for feature 'delta-c-a': 484
Total changes for feature 'change-in-bea': 319
