In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Import necessary modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics.pairwise import cosine_distances

In [None]:
# Preliminary load of the factuals; the following cell reads the same file again
# before preprocessing it.
f = pd.read_csv('F.csv')  # plain relative path: works on Windows and POSIX alike

In [None]:
# Load the factuals and the two counterfactual sets, strip the output class and
# mark the first four columns (the categorical features) as pandas categoricals.
print('FACTUALS')
f = pd.read_csv('F.csv')  # load factuals
f = f.drop(columns='sogliaFRS', errors='ignore')  # drop output class (if present)
print(f.shape)
print(f.head())

print('COUNTERFACTUALS-1')
c1 = pd.read_csv('CF_31.csv')  # load counterfactual explanations of class 1 (high->low transition)
print(c1.head())
c1 = c1.drop(columns='sogliaFRS', errors='ignore')  # drop output class (if present)
# Counterfactual values are continuous: snap the categorical columns back to
# non-negative integer codes.
c1.iloc[:, :4] = c1.iloc[:, :4].round(0).abs()


print('COUNTERFACTUALS-2')  # load counterfactual explanations of class 2 (high->moderate transition)
c2 = pd.read_csv('CF_32.csv')
print(c2.head())
c2 = c2.drop(columns='sogliaFRS', errors='ignore')  # drop output class (if present)
c2.iloc[:, :4] = c2.iloc[:, :4].round(0).abs()

for col in f.columns[:4]:  # convert categoricals
    f[col] = f[col].astype('category')
    c1[col] = c1[col].astype('category')
    c2[col] = c2[col].astype('category')

In [None]:
# Feature counts used to weight the two terms of the mixed distance.
m = f.shape[1]  # number of input features
h = f.select_dtypes(include=['category']).shape[1]  # number of categorical features

### Mixed distance

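$$d(x_i, x_j) = \frac{h}{m}\,\mathrm{HammingDistance}(x_i, x_j) + \frac{m-h}{m}\,\mathrm{CosineDistance}(x_i, x_j)$$

where $h$ is the number of categorical features and $m$ is the total number of input features.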

In [None]:
def computeMixedDistance(row1, row2,h,m):
    '''Compute the mixed distance between 2 observations with both categorical and numerical attributes.

    The first ``h`` entries of each row are treated as categorical and compared
    with a Hamming count (number of positions whose values differ); the
    remaining entries are treated as numerical and compared with the cosine
    distance. The two terms are weighted by h/m and (m-h)/m respectively.

    Parameters:
    row1: observation 1 (sequence/array, categorical entries first)
    row2: observation 2 (same layout as row1)
    h: number of categorical features
    m: total number of input features

    Returns:
    float: weighted sum of the Hamming and cosine distances
    '''
    row1_cat = row1[:h]
    row2_cat = row2[:h]
    # Raw count of mismatching categorical positions (not divided by h).
    hammingDist = sum(el1 != el2 for el1, el2 in zip(row1_cat, row2_cat))
    d1 = (h / m) * hammingDist

    if m == h:
        # No numerical part: the cosine term has zero weight, and
        # cosine_distances would reject an array with zero features.
        return float(d1)

    row1_num = np.array(row1[h:]).reshape(1, -1)
    row2_num = np.array(row2[h:]).reshape(1, -1)
    # cosine_distances returns a (1, 1) matrix here; keep only the scalar.
    cosineDist = cosine_distances(row1_num, row2_num)[0, 0]
    d2 = ((m - h) / m) * cosineDist

    return float(d1 + d2)


## PROXIMITY
Average distance between each factual x and its counterfactual x′ (the lower, the better).

In [None]:
# Proximity: mean mixed distance between each factual and the counterfactual found
# at the same row position.
# NOTE: zip() pairs rows positionally and stops at the shorter input, so f, c1 and
# c2 are expected to be row-aligned.

# Calculate the distance between each factual x and each corresponding counterfactual explanation of class 1
distances = [computeMixedDistance(row1, row2, h, m) for row1, row2 in zip(f.values, c1.values)]
# Calculate the average distance
average_distance = np.mean(distances)
print("Proximity F-c1:", average_distance)
print('-----------')
# Calculate the distance between each factual x and each corresponding counterfactual explanation of class 2
distances = [computeMixedDistance(row1, row2, h, m) for row1, row2 in zip(f.values, c2.values)]
average_distance = np.mean(distances)
print("Proximity F-c2:", average_distance)

## SPARSITY
Average number of features changed between a counterfactual x′ and x (the lower the better)

In [None]:
def computeSparsity(df1,df2,m):
    '''Compute the average fraction of features changed between factuals and counterfactuals.

    Rows are paired by position and columns by the labels of ``df1``. A feature
    counts as changed when the absolute difference exceeds 10% of the magnitude
    of the factual value.

    Parameters:
    df1: full set of factual observations
    df2: corresponding counterfactual explanations (at least as many rows as df1)
    m: number of input features

    Returns:
    float: number of changed feature values divided by (len(df1) * m);
           NaN when df1 is empty or m is 0

    Raises:
    ValueError: if df2 has fewer rows than df1
    '''
    n_rows = len(df1)
    if n_rows == 0 or m == 0:
        return float('nan')  # nothing to average over
    if len(df2) < n_rows:
        raise ValueError("df2 must contain one counterfactual per factual row")

    # Cast to float so categorical (numeric-coded) columns can be subtracted.
    factual = df1.astype(float).to_numpy()
    counterfactual = df2.iloc[:n_rows][df1.columns].astype(float).to_numpy()

    # Tolerance is 10% of the factual value's magnitude; abs() keeps it
    # non-negative so equal negative values are not reported as changed.
    tolerance = 0.1 * np.abs(factual)
    changed = np.abs(factual - counterfactual) > tolerance

    return float(changed.sum() / (n_rows * m))

# Report the sparsity of each counterfactual set against the factuals (the lower the better)
for tag, counterfactuals in (("Sparsity F-c1:", c1), ("Sparsity F-c2:", c2)):
    print(tag, computeSparsity(f, counterfactuals, m))

## ROBUSTNESS (IMPLAUSIBILITY)
It can be computed in 2 ways (the lower, the better):
<li> Distance of x' from the closest instance 
in the reference set X   (e.g., test set) </li>
<li> <b> OR </b> Distance of x' from the barycenter of the target class </li>

In [None]:
# Reference set for the implausibility metric; the cholesterol column is renamed to "totChol".
test_df = (
    pd.read_csv("Test.csv")
    .rename(columns={"tot_chol_mmol_L_": "totChol"})
)

def computeImplausibility(test_df, cf,h,m):
    '''Compute the average distance of each counterfactual x' from the closest instance in the reference set X.

    Parameters:
    test_df: reference set; its last column is discarded before computing distances
    cf: set of counterfactual explanations
    h: number of categorical features
    m: number of input features

    Returns:
    float: average, over the counterfactuals, of the minimum mixed distance to
           the reference rows; NaN when either set is empty
    '''
    # Materialise the reference rows once instead of once per counterfactual.
    reference_rows = test_df.iloc[:, :-1].values
    if len(cf) == 0 or len(reference_rows) == 0:
        return float('nan')  # no pair to measure

    min_distances = []
    for cf_row in cf.values:  # iterate over each counterfactual in cf
        # np.asarray(...).item() accepts both a plain scalar and a 1x1 array,
        # so the minimum is always taken over real numbers.
        row_distances = (
            np.asarray(computeMixedDistance(cf_row, ref_row, h, m)).item()
            for ref_row in reference_rows
        )
        min_distances.append(min(row_distances))

    # Average of the minimum distances across the counterfactuals
    return float(np.mean(min_distances))

# Nearest-neighbour implausibility of each counterfactual set (the lower the better)
for tag, cf_set in (("Implausibility F-c1:", c1), ("Implausibility F-c2:", c2)):
    print(tag, computeImplausibility(test_df, cf_set, h, m))
print('------------------------')

# Barycenter variant: round the first four columns to non-negative integers and
# mark them as categorical.
bar_df = pd.read_csv("Barycenters.csv") #load barycenters
bar_df.iloc[:, :4] = bar_df.iloc[:, :4].round(0).abs()
for col in bar_df.columns[:4]:
    bar_df[col] = bar_df[col].astype('category')

# Row 0 of bar_df is compared with c1, row 1 with c2.
for tag, cf_set, bar_idx in (
    ("Implausibility(BARYCENTER) F-c1:", c1, 0),
    ("Implausibility(BARYCENTER) F-c2:", c2, 1),
):
    barycenter = bar_df.iloc[bar_idx, :].values
    distances = [computeMixedDistance(row, barycenter, h, m) for row in cf_set.values]
    average_distance = np.mean(distances)
    print(tag, average_distance)

## DIVERSITY
Average pairwise distance within the set of counterfactuals found (the higher, the better).

In [None]:
def computeDiversity (cf, h,m ):
    '''Compute the average mixed distance between pairs of counterfactuals x' and x''.

    Every ordered pair (i, j) is included, self-pairs too, and the sum is
    divided by len(cf)**2.

    Parameters:
    cf: set of counterfactual explanations of a certain class (array of rows)
    h: number of categorical features
    m: number of input features

    Returns:
    float: average of the distances between all pairs of counterfactual
           explanations; NaN when cf is empty
    '''
    n_cf = len(cf)
    if n_cf == 0:
        return float('nan')  # no pair to average over

    total = 0.0
    for row1 in cf:
        for row2 in cf:
            # .item() accepts both a plain scalar and a 1x1 array, so the
            # accumulator stays a Python float.
            total += np.asarray(computeMixedDistance(row1, row2, h, m)).item()
    return total / (n_cf * n_cf)

# Report the diversity of each counterfactual set (the higher the better)
for tag, cf_rows in (("Diversity F-c1:", c1.values), ("Diversity F-c2:", c2.values)):
    print(tag, computeDiversity(cf_rows, h, m))


## DISCRIMINATIVE POWER
It measures the ability to distinguish the set of counterfactuals from the points in the factual class (e.g., points from the training set) using a binary classifier (the higher, the better).

In [None]:
# Read the training set and rename the cholesterol column to "totChol".
training_set = (
    pd.read_csv(r"Training.csv", sep=',')
    .rename(columns={'tot_chol_mmol_L_': 'totChol'})
)

In [None]:
#extract desired factual class
real_positives=training_set[training_set['sogliaFRS']==3]
real_positives

In [None]:
def discriminative_power(X,y,neighbors, n_cv):
    '''Estimate how well a k-NN classifier separates the classes in (X, y).

    An 80% stratified split of the data is standardised and a k-NN classifier
    is scored on it with cross-validated accuracy. The per-fold scores and
    their mean are printed, then the mean accuracy for k = 1..30 is plotted.

    Parameters:
    X: feature matrix
    y: class labels (also used to stratify the split)
    neighbors: number of neighbours of the reported k-NN classifier
    n_cv: number of cross-validation folds

    Returns:
    float: mean cross-validated accuracy for ``neighbors`` (the discriminative power)
    '''
    # Same 80/20 stratified split as before; the held-out 20% is not used by
    # this function, so it is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size = 0.2, random_state=42, stratify =y)

    # Standardise using statistics of the training portion only
    X_train = StandardScaler().fit_transform(X_train)

    knn = KNeighborsClassifier(n_neighbors = neighbors)
    scores = cross_val_score(knn, X_train, y_train, cv=n_cv, scoring='accuracy')

    print('Cross-validation_scores')
    print(scores)

    print('Discriminative Power:')
    discriminative_score = scores.mean()   # average of the fold scores
    print(discriminative_score)

    # Sensitivity to k: evaluate k from 1 to 30
    k_range = range(1, 31)
    k_scores = []
    for k in k_range:
        knn_k = KNeighborsClassifier(n_neighbors=k)
        fold_scores = cross_val_score(knn_k, X_train, y_train, cv=n_cv, scoring='accuracy')
        k_scores.append(fold_scores.mean())
    plt.plot(k_range, k_scores)
    plt.xlabel('Value of K for KNN')
    plt.ylabel('Cross-Validated Accuracy')
    plt.show()

    return discriminative_score

In [None]:
print('Class 1')
# The counterfactuals had their 'sogliaFRS' column dropped when they were loaded, so
# they need a label before being mixed with the factual-class rows (sogliaFRS == 3).
# NOTE(review): 1 is presumably the target class of the CF_31 counterfactuals — confirm.
c1_labelled = c1.assign(sogliaFRS=1)
df_1 = pd.concat([real_positives, c1_labelled], ignore_index=True)
df_1 = df_1.sample(frac=1, random_state=42).reset_index(drop=True)  # reproducible shuffle
y = df_1['sogliaFRS']
# Cast to float: concatenating categorical and numeric columns yields object dtype.
X = df_1.drop(columns=['sogliaFRS']).astype(float)
discriminative_power(X, y, neighbors=5, n_cv=5)

In [None]:
print('Class 2')
# The counterfactuals had their 'sogliaFRS' column dropped when they were loaded, so
# they need a label before being mixed with the factual-class rows (sogliaFRS == 3).
# NOTE(review): 2 is presumably the target class of the CF_32 counterfactuals — confirm.
c2_labelled = c2.assign(sogliaFRS=2)
df_2 = pd.concat([real_positives, c2_labelled], ignore_index=True)
df_2 = df_2.sample(frac=1, random_state=42).reset_index(drop=True)  # reproducible shuffle
y = df_2['sogliaFRS']
# Cast to float: concatenating categorical and numeric columns yields object dtype.
X = df_2.drop(columns=['sogliaFRS']).astype(float)
discriminative_power(X, y, neighbors=5, n_cv=5)