In [1]:
import dice_ml

import pandas as pd

import torch.nn as nn
import torch

import copy

import os
import numpy as np
from pickle import dump , load
import seaborn as sns

In [2]:
class TaiwaneseCreditClassifier(nn.Module):
    """Fully connected binary classifier: 23 -> 64 -> 128 -> 32 -> 16 -> 1.

    Each hidden layer is followed by ReLU and one shared dropout layer
    (p=0.1). The single output unit is passed through a sigmoid, so
    ``forward`` returns values in the open interval (0, 1).
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.hidden1 = nn.Linear(23, 64)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(64, 128)
        self.act2 = nn.ReLU()
        self.hidden3 = nn.Linear(128, 32)
        self.act3 = nn.ReLU()
        self.hidden4 = nn.Linear(32, 16)
        self.act4 = nn.ReLU()
        self.output = nn.Linear(16, 1)
        self.act_output = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Map a ``(batch, 23)`` tensor to a ``(batch, 1)`` tensor of sigmoid outputs."""
        hidden_stack = (
            (self.hidden1, self.act1),
            (self.hidden2, self.act2),
            (self.hidden3, self.act3),
            (self.hidden4, self.act4),
        )
        # linear -> ReLU -> dropout, repeated for every hidden layer in order.
        for linear, activation in hidden_stack:
            x = self.dropout(activation(linear(x)))
        logits = self.output(x)
        return self.act_output(logits)
    
    
def load_model(model_path='./../model_training/Taiwanese_credit_model'):
    """Restore the saved classifier from disk and put it in inference mode.

    Args:
        model_path: File that ``torch.load`` can restore to a module.
            Defaults to the location this notebook has always used, so
            ``load_model()`` keeps working unchanged.

    Returns:
        The restored object, after ``eval()`` has been called on it.
    """
    # SECURITY: restoring a pickled module executes code stored in the file;
    # only load model files you trust.
    try:
        # Newer PyTorch releases default to weights_only=True, which refuses
        # to unpickle a whole module, so ask for the full unpickler.
        model = torch.load(model_path, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not know the ``weights_only`` keyword.
        model = torch.load(model_path)
    model.eval()
    return model

def eval_model(model, input, scaler, columns_to_standardize):
    """Scale ``input`` and return the model output for its first row.

    Args:
        model: Callable torch module; it is switched to eval mode here.
        input: Array-like with a ``reshape`` method holding a multiple of 23 values.
        scaler: Object exposing ``transform`` (applied to the selected columns).
        columns_to_standardize: Column indices handed to ``scaler.transform``.

    Returns:
        The first element of the first output row as a Python float.
    """
    model.eval()
    # torch.tensor copies its argument, so the in-place column write below
    # never touches the caller's array.
    features = torch.tensor(input.reshape(-1, 23), dtype=torch.float32).numpy()
    standardized = scaler.transform(features[:, columns_to_standardize])
    features[:, columns_to_standardize] = standardized
    batch = torch.from_numpy(features).type(torch.float)
    with torch.no_grad():
        probabilities = model(batch)
    first_row = probabilities.tolist()[0]
    return first_row[0]

def load_scaler(scaler_loc):
    """Unpickle and return the object stored at ``scaler_loc``.

    Args:
        scaler_loc: Path of the pickle file to read.

    Returns:
        Whatever object the pickle file contains.
    """
    # SECURITY: pickle can run arbitrary code while loading; only point this
    # at files from a trusted source.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(scaler_loc, 'rb') as scaler_file:
        return load(scaler_file)

# Column indices that are passed through the scaler: every one of the 23 columns.
columns_to_standardize = list(range(23))

In [3]:
# Load the classifier once; later cells pass this object to the counterfactual helpers.
model = load_model()

In [4]:
def load_data(data):
    """Read the NumPy file at ``data`` and return what ``np.load`` yields."""
    loaded = np.load(data)
    return loaded


def create_dataframe_from_list_of_arrays(arrays, column_names=None):
    """Build a DataFrame with one row per array and one column per element.

    Args:
        arrays: Non-empty sequence of equal-length sequences/arrays.
        column_names: One label per array element. When omitted, the integer
            positions ``0 .. length - 1`` are used as column labels.

    Returns:
        pandas.DataFrame with ``len(arrays)`` rows.

    Raises:
        ValueError: If ``arrays`` is empty, the arrays differ in length, or
            ``column_names`` does not provide exactly one label per element.
    """
    # len() rather than truthiness, so a 2-D numpy array is accepted as well.
    if arrays is None or len(arrays) == 0:
        raise ValueError("The list of arrays is empty")

    length = len(arrays[0])
    if any(len(arr) != length for arr in arrays):
        raise ValueError("All arrays must have the same length")

    if column_names is None:
        # The declared default used to crash on len(None); fall back to
        # positional labels instead.
        column_names = list(range(length))
    elif len(column_names) != length:
        # The labels are matched against the elements of each array, not
        # against the number of arrays, so the message says so.
        raise ValueError(
            "Number of column names ({}) must match the length of each array ({})".format(
                len(column_names), length
            )
        )

    rows = [dict(zip(column_names, arr)) for arr in arrays]
    return pd.DataFrame(rows)


def create_data(folder_location, column_names):
    """Collect every ``input_data.npy`` under ``folder_location`` into a DataFrame.

    Each entry of ``folder_location`` other than '.DS_Store' is treated as a
    sub-folder containing ``input_data.npy``. Every loaded array is printed
    and becomes one row of the returned frame.
    """
    entries = (name for name in os.listdir(folder_location) if name != '.DS_Store')
    collected = []
    for name in entries:
        sample = load_data('/'.join((folder_location, name, 'input_data.npy')))
        print(sample)
        collected.append(sample)

    return create_dataframe_from_list_of_arrays(collected, column_names)


In [7]:
def scale_input(scaler, input, columns_to_standardize):
    """Return ``input`` as a float32 ``(-1, 23)`` array with the chosen columns scaled.

    The values are copied (via ``torch.tensor``) before ``scaler.transform``
    is written into the selected columns, so ``input`` itself is not modified.
    """
    as_float32 = torch.tensor(input.reshape(-1, 23), dtype=torch.float32).numpy()
    scaled_columns = scaler.transform(as_float32[:, columns_to_standardize])
    as_float32[:, columns_to_standardize] = scaled_columns
    return as_float32


def calculate_o_1(phenotype, input, model, scaler, columns_to_standardize):
    """Apply ``phenotype`` to a copy of ``input`` and return the model's score for it."""
    return eval_model(
        model,
        apply_phenotype(input, phenotype),
        scaler,
        columns_to_standardize,
    )

In [43]:
scaler = load_scaler('./../model_training/StandardScaler.pkl')

# (lower, upper) limit for each feature in unscaled units, ordered X1..X23.
# NOTE(review): the X16 lower bound (53007) is positive, whereas the lower
# bounds of X12-X15 and X17 are all negative; confirm it is intended.
bounds = [
    (10000, 1000000),
    (0, 1),
    (0, 6),
    (0, 3),
    (20, 80),
    (-2, 8),
    (-2, 8),
    (-2, 8),
    (-2, 8),
    (-2, 8),
    (-2, 8),
    (-154973, 964511),
    (-67526, 983931),
    (-61506, 855086),
    (-81334, 891586),
    (53007, 927171),
    (-339603, 961664),
    (0, 873552),
    (0, 1215471),
    (0, 846040),
    (0, 621000),
    (0, 417990),
    (0, 403500),
]
min_array = np.array([lower for lower, _ in bounds])
max_array = np.array([upper for _, upper in bounds])
scaled_min = scale_input(scaler, min_array, columns_to_standardize)
scaled_max = scale_input(scaler, max_array, columns_to_standardize)

# Feature labels 'X1' .. 'X23'.
feature = ['X{}'.format(position) for position in range(1, 24)]

# Feature label -> [scaled lower bound, scaled upper bound].
feature_dict = {
    name: [lower, upper]
    for name, lower, upper in zip(feature, scaled_min[0], scaled_max[0])
}

feature_dict

{'X1': [-1.1320454, 6.6168604],
 'X2': [-0.83086455, 1.4070086],
 'X3': [-2.4233167, 5.3331966],
 'X4': [-2.944624, 2.7857614],
 'X5': [-1.6543822, 4.725081],
 'X6': [-1.7442763, 6.0858283],
 'X7': [-1.5366384, 5.869949],
 'X8': [-1.5043159, 5.9613605],
 'X9': [-1.4739374, 6.0869617],
 'X10': [-1.4650515, 6.2978134],
 'X11': [-1.4285109, 6.285492],
 'X12': [-2.7501092, 12.224778],
 'X13': [-1.6045824, 12.884915],
 'X14': [-1.5517751, 11.600624],
 'X15': [-1.8997005, 12.944077],
 'X16': [0.2083898, 14.328156],
 'X17': [-6.233661, 15.194207],
 'X18': [-0.34022838, 60.5825],
 'X19': [-0.27402094, 67.93915],
 'X20': [-0.2800343, 52.843742],
 'X21': [-0.3147646, 47.895416],
 'X22': [-0.29640576, 28.63767],
 'X23': [-0.28059798, 24.657183]}

In [44]:
# Show the scaled upper bounds (the second entry of each feature_dict pair).
print(scaled_max)

[[ 6.6168604  1.4070086  5.3331966  2.7857614  4.725081   6.0858283
   5.869949   5.9613605  6.0869617  6.2978134  6.285492  12.224778
  12.884915  11.600624  12.944077  14.328156  15.194207  60.5825
  67.93915   52.843742  47.895416  28.63767   24.657183 ]]


In [45]:
# Show the scaled lower bounds (the first entry of each feature_dict pair).
print(scaled_min)

[[-1.1320454  -0.83086455 -2.4233167  -2.944624   -1.6543822  -1.7442763
  -1.5366384  -1.5043159  -1.4739374  -1.4650515  -1.4285109  -2.7501092
  -1.6045824  -1.5517751  -1.8997005   0.2083898  -6.233661   -0.34022838
  -0.27402094 -0.2800343  -0.3147646  -0.29640576 -0.28059798]]


In [53]:
def apply_phenotype(array, phenotype):
    """Execute the ';'-separated statements of ``phenotype`` and return the copy ``x``.

    Args:
        array: Object that is deep-copied; the copy is bound to the local
            name ``x`` before any statement runs.
        phenotype: String of statements separated by ';'. Empty pieces
            (after stripping whitespace) are skipped.

    Returns:
        The local ``x`` after all statements have been executed.

    SECURITY: every statement is run with ``exec``. The phenotype strings in
    this notebook are read from CSV files, so those files must be trusted.

    NOTE(review): ``exec`` runs with this function's local names in scope, so
    the statements presumably refer to ``x``; renaming locals here could
    change behaviour. In-place changes to ``x`` (e.g. item assignment) are
    visible in the returned object, whereas a statement that rebinds ``x``
    would presumably not affect the return value; confirm against the
    phenotype grammar.
    """
    # Split the phenotype string into individual operations
    operations = phenotype.split(';')
    # Deep copy bound to the name x
    x = copy.deepcopy(array)
    # Iterate over each operation and execute it
    for operation in operations:
        # Strip any leading/trailing whitespace from the operation
        operation = operation.strip()
        
        # Use the exec function to execute the operation (skips empty pieces)
        if operation:
            exec(operation)

    return x


def inverse(dice_df_scaled, scaler, columns_to_standardize):
    """Return a new DataFrame with the selected columns mapped back through the scaler.

    Args:
        dice_df_scaled: DataFrame whose ``columns_to_standardize`` columns
            hold scaled values. It is left unmodified.
        scaler: Object exposing ``inverse_transform``.
        columns_to_standardize: Positional column indices to convert.

    Returns:
        DataFrame with the same column labels as ``dice_df_scaled`` and a
        fresh default index. Columns not listed are carried over unchanged.
    """
    # Work on an explicit copy: ``DataFrame.values`` may hand back a view of
    # the frame's own buffer, in which case the in-place assignment below
    # would silently overwrite the caller's scaled data.
    df_values = dice_df_scaled.to_numpy(copy=True)
    print(df_values.shape)
    restored = scaler.inverse_transform(df_values[:, columns_to_standardize])
    df_values[:, columns_to_standardize] = restored
    return pd.DataFrame(df_values, columns=dice_df_scaled.columns)
    


def create_dice_cf(input_folder, scaler, model, columns_to_standardize):
    """Export DiCE and GE (phenotype-derived) counterfactuals for every input.

    For each entry of ``input_folder`` other than '.DS_Store' this reads
    ``input_data.npy`` and ``final_gen.csv``, scores every phenotype with
    ``calculate_o_1`` and keeps those scoring above 0.50. It then asks DiCE
    for the same number of counterfactuals and writes two files:

    * ``Dice_cf/<entry>.csv``: DiCE counterfactuals passed through ``inverse``.
    * ``Ge_cf/<entry>.csv``: the inputs produced by the kept phenotypes.

    Finally the mean number of kept phenotypes per input is printed.

    Args:
        input_folder: Folder holding one sub-folder per input.
        scaler: Scaler used by ``scale_input``, ``calculate_o_1`` and ``inverse``.
        model: Model used to score phenotypes. DiCE loads its own copy from
            the model path hard-coded below.
        columns_to_standardize: Column indices handed to the scaler.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    cf_counts = []

    features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13',
                'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23']
    backend = 'PYT'  # needs pytorch installed
    ML_modelpath = './../model_training/Taiwanese_credit_model'
    m = dice_ml.Model(model_path=ML_modelpath, backend=backend)

    # Per-feature [min, max] in scaled units; these are the same numbers the
    # feature_dict cell of this notebook displays.
    d = dice_ml.Data(features={'X1': [-1.1320454, 6.6168604],
                               'X2': [-0.83086455, 1.4070086],
                               'X3': [-2.4233167, 5.3331966],
                               'X4': [-2.944624, 2.7857614],
                               'X5': [-1.6543822, 4.725081],
                               'X6': [-1.7442763, 6.0858283],
                               'X7': [-1.5366384, 5.869949],
                               'X8': [-1.5043159, 5.9613605],
                               'X9': [-1.4739374, 6.0869617],
                               'X10': [-1.4650515, 6.2978134],
                               'X11': [-1.4285109, 6.285492],
                               'X12': [-2.7501092, 12.224778],
                               'X13': [-1.6045824, 12.884915],
                               'X14': [-1.5517751, 11.600624],
                               'X15': [-1.8997005, 12.944077],
                               'X16': [0.2083898, 14.328156],
                               'X17': [-6.233661, 15.194207],
                               'X18': [-0.34022838, 60.5825],
                               'X19': [-0.27402094, 67.93915],
                               'X20': [-0.2800343, 52.843742],
                               'X21': [-0.3147646, 47.895416],
                               'X22': [-0.29640576, 28.63767],
                               'X23': [-0.28059798, 24.657183]},
                     outcome_name='outcome')

    exp = dice_ml.Dice(d, m)
    path_to_DICE_cf = 'Dice_cf'
    path_to_GE_cf = 'Ge_cf'

    for i in os.listdir(input_folder):
        if i == '.DS_Store':
            continue
        filename = input_folder + '/' + i + '/' + "final_gen.csv"
        input_array_location = input_folder + '/' + i + '/' + 'input_data.npy'
        input_array = load_data(input_array_location)
        print(input_array)
        input_scaled = scale_input(scaler, input_array, columns_to_standardize)
        data = pd.read_csv(filename)

        # Only the 'Phenotype' column is needed, so iterate over it directly.
        # Unlike a row-wise DataFrame.apply, this also behaves predictably
        # when the CSV has no rows.
        data['o_1_mod'] = [
            calculate_o_1(phenotype, input_array, model, scaler, columns_to_standardize)
            for phenotype in data['Phenotype']
        ]
        data = data.sort_values(by=['o_1_mod'])
        data = data[data['o_1_mod'] > 0.50]

        number_of_counterfactuals = data.shape[0]
        cf_counts.append(number_of_counterfactuals)
        print(number_of_counterfactuals)
        if number_of_counterfactuals == 0:
            # No phenotype passed the threshold, so there is nothing to export
            # and no meaningful number of counterfactuals to request.
            # NOTE(review): DiCE presumably rejects total_CFs=0; confirm.
            print('No phenotype above 0.50 for ' + str(i) + '; skipping')
            continue

        input_dice = pd.DataFrame(data=[input_scaled[0]], columns=features)
        dice_exp = exp.generate_counterfactuals(input_dice, total_CFs=number_of_counterfactuals, desired_class="opposite")
        # exist_ok avoids the separate existence check (and its race).
        os.makedirs(path_to_DICE_cf, exist_ok=True)
        dice_df_scaled = dice_exp.cf_examples_list[0].final_cfs_df
        dice_df_original = inverse(dice_df_scaled, scaler, columns_to_standardize)
        dice_df_original.to_csv(path_or_buf=path_to_DICE_cf + "/" + str(i) + '.csv', index=False)

        # Re-apply every kept phenotype to the unscaled input.
        evolved_x = [apply_phenotype(input_array, phenotype) for phenotype in data['Phenotype'].tolist()]

        GE_df = pd.DataFrame(data=evolved_x, columns=features)
        os.makedirs(path_to_GE_cf, exist_ok=True)
        GE_df.to_csv(path_to_GE_cf + "/" + str(i) + '.csv', index=False)

    if cf_counts:
        print(sum(cf_counts) / len(cf_counts))
    else:
        # Previously this divided by zero when the folder held no inputs.
        print('No inputs found in ' + str(input_folder))
        
    
        
    

In [54]:
# Generate the Dice_cf/ and Ge_cf/ CSV files for every input under the NSGAIII_multi output folder.
input_folder = './../output/NSGAIII_multi/'
create_dice_cf(input_folder, scaler, model, columns_to_standardize)

[330000      1      1      1     34      0      0      0      0      0
      0 138009 141253 130567 128257 116988 105961   7015   7000  10007
   6500   6600   9000]
226


100%|██████████| 1/1 [00:00<00:00, 38.15it/s]


(226, 24)
[390000      0      1      1     38      0      0      0      0      0
      0 164418 167501 134282 128701 131529 135242   9000   7027   5000
   5000   6000   5000]
262


100%|██████████| 1/1 [00:00<00:00, 34.71it/s]

(262, 24)





[170000      0      1      2     28      0      0      0      0      0
      0  56766  59811  61987  63849  66276  68207   4000   3638   3500
   3500   3000   3000]
256


100%|██████████| 1/1 [00:00<00:00, 34.92it/s]


(256, 24)
[420000      0      2      1     29      0      0      0      0      0
      0  48455  34993  35340  54763  59037  60290   2011   3000  20000
   5000   2000   3000]
162


100%|██████████| 1/1 [00:00<00:00, 35.81it/s]

(162, 24)





[640000      0      2      2     39      0      0      0      0      0
      0 119887 123223 119211 118722 105197  93921  10000  10000  10535
  15000   5000  13627]
177


100%|██████████| 1/1 [00:00<00:00, 35.22it/s]


(177, 24)
[280000      0      2      1     30      0      0      0      0      0
      0  71770  74066  75907  77723  79550  81801   4000   3000   3000
   3000   3500   4000]
172


100%|██████████| 1/1 [00:00<00:00, 33.44it/s]

(172, 24)





[360000      0      1      2     27      0      0      0      0      0
      0 130640 120058 110795 101668  78730  66682   4500   4100   4208
   3000   2300   1800]
268


100%|██████████| 1/1 [00:00<00:00, 36.55it/s]


(268, 24)
[180000      0      2      1     28      0      0      0      0      0
      0  85557  73121  68650  67895  68442  70131   3200   2500   3000
   2500   3000   5000]
328


100%|██████████| 1/1 [00:00<00:00, 35.46it/s]

(328, 24)





[230000      0      1      1     27      0      0      0      0      0
      0 104001 106155 111244 116300 121346 130318   4000   6000   6000
   6000  10000  11058]
226


100%|██████████| 1/1 [00:00<00:00, 36.08it/s]


(226, 24)
[310000      0      2      2     26      0      0      0      0      0
      0  87717  93707  83632  81133  75499  73540   9156   2782   4013
   2688   2651   2652]
255


100%|██████████| 1/1 [00:00<00:00, 35.87it/s]

(255, 24)





[340000      1      2      1     31      0      0      0      0      0
      0  63098  64417  65752  69639  80057  83713   2338   2406   5000
  11610   5000   2971]
133


100%|██████████| 1/1 [00:00<00:00, 36.20it/s]


(133, 24)
[200000      1      1      2     33      0      0      0      2      0
      0  54713  58184  62286  61305  62846  42634   5000   5700   2000
   3000   2000   1000]
215


100%|██████████| 1/1 [00:00<00:00, 35.70it/s]

(215, 24)





[310000      1      1      2     32      0      0      0      0      0
      0  59901  62147  62102  65875  60387  43328  10020   6031  10057
   5028   5060   4223]
174


100%|██████████| 1/1 [00:00<00:00, 36.56it/s]


(174, 24)
[480000      0      1      2     32     -2     -2     -2     -2     -2
     -2  11872  38933  23479  52177  54005  53853  40000  23479  52209
  54005  54500  42321]
4


100%|██████████| 1/1 [00:00<00:00, 36.96it/s]


(4, 24)
[430000      0      2      1     42      0      0      0      0      0
      0  89395  90052  90604  91200  92134  92834   3243   3200   3185
   3500   3500   3420]
163


100%|██████████| 1/1 [00:00<00:00, 36.96it/s]


(163, 24)
[280000      0      2      2     38      0      0      0      0      0
      0  92579  94451  96041  98301  74624  75173   4300   4000   4033
   3000   3300   3500]
264


100%|██████████| 1/1 [00:00<00:00, 35.34it/s]


(264, 24)
[320000      0      1      2     31      0      0      0      0      0
      0  77052  65457  62680  62597  60080  55314   3000   3000   3017
   2100   3000   3000]
277


100%|██████████| 1/1 [00:00<00:00, 36.01it/s]

(277, 24)
[300000      1      1      2     45      0      0      0      0      0
      0  62296  64460  56439  53637  55981  58270   3000   3000   3000
   3000   3000   3000]





231


100%|██████████| 1/1 [00:00<00:00, 35.48it/s]

(231, 24)
[500000      0      2      1     40      0      0      0      0      0
      0 215508 214460 220047 217920 159393 149626  10004  10025  10294
   6046   5076   5000]





129


100%|██████████| 1/1 [00:00<00:00, 36.28it/s]

(129, 24)
[270000      1      2      1     35      0      0      0      0      0
      0  48465  45898  46765  55438  54285  54198   1737   2000  10000
   1461   1492    899]





184


100%|██████████| 1/1 [00:00<00:00, 35.53it/s]

(184, 24)
[500000      0      1      2     37      0      0      0      0      0
      0  80846  74776  73558  71055  61248  40737   3032   3200   1946
   1734   2000   4128]





241


100%|██████████| 1/1 [00:00<00:00, 36.59it/s]

(241, 24)
[300000      0      1      1     31      0      0      0      0      0
      0 147277 149893 152529 152882 113347 114554   7000   7000   6000
   4084   4056   4300]





227


100%|██████████| 1/1 [00:00<00:00, 35.59it/s]

(227, 24)
[470000      1      1      2     31      0      0      0      0      0
      0  99931  95096  80388  75631  71464  65915   5070   4019   2019
   2120   2096   1744]





171


100%|██████████| 1/1 [00:00<00:00, 35.76it/s]

(171, 24)
[390000      0      1      2     43      0      0      0      0      0
      0  90294  87292  88335  89396  90769  91825   3200   3200   3200
   3500   3300   4000]





175


100%|██████████| 1/1 [00:00<00:00, 35.99it/s]

(175, 24)
[500000      1      2      2     33      0      0      0      0      0
      0 134558 134503 114682 124240 131006 137911   5185  20031  20029
  20058  20127  10000]





142


100%|██████████| 1/1 [00:00<00:00, 36.82it/s]

(142, 24)
[340000      0      1      1     46      0      0      0      0      0
      0  94915  92955  89689  81996  83439  71239   3224   4000   3504
   4349   3000   3098]





206


100%|██████████| 1/1 [00:00<00:00, 35.39it/s]

(206, 24)
[500000      0      1      1     46      0      0      0      0      0
     -1 196606  64144  49722  67909  61613  16932  10000  10000  20025
  10000  20000  18000]





134


100%|██████████| 1/1 [00:00<00:00, 36.92it/s]

(134, 24)
[210000      0      1      1     34      0      0      0      0      0
      0  74261  75602  76062  76287  76971  77737   3500   3300   3000
   3000   3000   3000]





189


100%|██████████| 1/1 [00:00<00:00, 36.98it/s]

(189, 24)
[450000      1      1      2     26      0      0      0     -1     -1
     -1  20571  37283  43482   5626  73516  11854  20000  20006   5676
  74306  11889  13347]





154


100%|██████████| 1/1 [00:00<00:00, 36.11it/s]

(154, 24)
[180000      0      6      2     47      0      0      0      0      0
      0 167915 163279 166994 150812 123957  55778   6028   7758   5188
   4570   1876   1701]





154


100%|██████████| 1/1 [00:00<00:00, 36.42it/s]

(154, 24)
196.63333333333333





In [56]:
def create_NSGAII_cf(input_folder, model, scaler, columns_to_standardize):
    """Export the phenotype-derived counterfactuals for every input to CSV.

    For each entry of ``input_folder`` other than '.DS_Store' this reads
    ``input_data.npy`` and ``final_gen.csv``, scores every phenotype with
    ``calculate_o_1``, keeps those scoring above 0.50 (sorted by score) and
    writes the resulting inputs to ``Ge_NSGAII_cf/<entry>.csv``.

    Args:
        input_folder: Folder holding one sub-folder per input.
        model: Model used to score phenotypes.
        scaler: Scaler passed on to ``calculate_o_1``.
        columns_to_standardize: Column indices handed to the scaler.

    Returns:
        None. Results are written to disk.
    """
    features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13',
                'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23']

    path_to_NSGAII_cf = 'Ge_NSGAII_cf'

    for i in os.listdir(input_folder):
        if i == '.DS_Store':
            continue
        filename = input_folder + '/' + i + '/' + "final_gen.csv"
        input_array_location = input_folder + '/' + i + '/' + 'input_data.npy'
        input_array = load_data(input_array_location)
        data = pd.read_csv(filename)

        # Only the 'Phenotype' column is needed, so iterate over it directly.
        # Unlike a row-wise DataFrame.apply, this also behaves predictably
        # when the CSV has no rows.
        data['o_1_mod'] = [
            calculate_o_1(phenotype, input_array, model, scaler, columns_to_standardize)
            for phenotype in data['Phenotype']
        ]
        data = data.sort_values(by=['o_1_mod'])
        data = data[data['o_1_mod'] > 0.50]

        # Re-apply every kept phenotype to the unscaled input.
        evolved_x = [apply_phenotype(input_array, phenotype) for phenotype in data['Phenotype'].tolist()]

        GE_df = pd.DataFrame(data=evolved_x, columns=features)
        # exist_ok avoids the separate existence check (and its race).
        os.makedirs(path_to_NSGAII_cf, exist_ok=True)
        GE_df.to_csv(path_to_NSGAII_cf + "/" + str(i) + '.csv', index=False)
        
        
# Rebinds the module-level input_folder (an earlier cell set it to the NSGAIII_multi folder).
# Note the argument order here is (model, scaler), the reverse of create_dice_cf.
input_folder = './../output/NSGAII_multi/'
create_NSGAII_cf(input_folder, model, scaler, columns_to_standardize)
        