In [1]:
import dice_ml

import pandas as pd

import torch.nn as nn
import torch

import copy

import os
import numpy as np

import seaborn as sns

In [2]:
class RiskClassifier(nn.Module):
    """Fully connected binary classifier: 9 inputs -> 16 -> 32 -> 64 -> 1.

    Every hidden linear layer is followed by a ReLU and the single output unit
    is passed through a sigmoid. Attribute names are part of the module's
    ``state_dict`` keys, so they are kept stable.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in the same order in which forward() applies them.
        self.hidden1 = nn.Linear(9, 16)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(16, 32)
        self.act2 = nn.ReLU()
        self.hidden3 = nn.Linear(32, 64)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(64, 1)
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through each (linear, activation) stage in sequence.

        Args:
            x: Tensor whose last dimension has size 9.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1, holding the sigmoid of the final linear layer.
        """
        stages = (
            (self.hidden1, self.act1),
            (self.hidden2, self.act2),
            (self.hidden3, self.act3),
            (self.output, self.act_output),
        )
        out = x
        for linear, activation in stages:
            out = activation(linear(out))
        return out
    
    
def load_model(input_folder = 'model_training/credit_risk_model'):
    """Load a saved model object with ``torch.load`` and put it in eval mode.

    Args:
        input_folder: Path handed straight to ``torch.load``. Despite the
            name, this is the path of the saved model *file*, not a directory.

    Returns:
        The loaded object after ``.eval()`` has been called on it.

    Notes:
        The file is expected to hold a whole pickled module (``.eval()`` is
        called on the result), which requires full unpickling. Unpickling can
        execute arbitrary code, so only load files from a trusted source.
    """
    try:
        # Newer PyTorch releases default to weights_only=True, which refuses to
        # unpickle a complete nn.Module; request full unpickling explicitly.
        model = torch.load(input_folder, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not accept the weights_only keyword.
        model = torch.load(input_folder)
    model.eval()
    return model

In [3]:
# Load the saved model through load_model(); the path is relative to the
# notebook's working directory.
model = load_model(input_folder = './../model_training/credit_risk_model')

In [4]:
def load_data(data):
    """Read ``data`` with ``np.load`` and return whatever it yields.

    Args:
        data: File path (or file-like object) accepted by ``np.load``.
    """
    loaded = np.load(data)
    return loaded


def create_dataframe_from_list_of_arrays(arrays, column_names=None):
    """Build a DataFrame with one row per array and one column per element.

    Args:
        arrays: Non-empty sequence of equal-length indexable sequences
            (e.g. 1-D NumPy arrays); each one becomes a row.
        column_names: Column labels, one per element of an array. When
            omitted, the positional labels ``0 .. length - 1`` are used.

    Returns:
        A ``pd.DataFrame`` with ``len(arrays)`` rows.

    Raises:
        ValueError: If ``arrays`` is empty, the arrays differ in length, or
            the number of column names differs from the length of each array.
    """
    # len() is used instead of truthiness so an ndarray input is not ambiguous.
    if arrays is None or len(arrays) == 0:
        raise ValueError("The list of arrays is empty")

    # Check if all arrays have the same length
    length = len(arrays[0])
    if not all(len(arr) == length for arr in arrays):
        raise ValueError("All arrays must have the same length")

    if column_names is None:
        column_names = list(range(length))
    elif len(column_names) != length:
        # Each array is a row, so the names must match the row width.
        raise ValueError(
            "Number of column names must match the length of each array "
            "(got {} names for arrays of length {})".format(len(column_names), length)
        )

    rows = [{column_names[j]: arr[j] for j in range(length)} for arr in arrays]
    return pd.DataFrame(rows)


def create_data(folder_location, column_names):
    """Collect every ``input_data.npy`` under ``folder_location`` into a DataFrame.

    Each entry of ``folder_location`` other than ``.DS_Store`` is treated as a
    sub-folder containing ``input_data.npy``. Every loaded array is printed
    and becomes one row of the returned DataFrame.

    Args:
        folder_location: Directory whose entries are scanned.
        column_names: Column labels forwarded to
            ``create_dataframe_from_list_of_arrays``.

    Returns:
        The DataFrame built from all loaded arrays.
    """
    collected = []
    for entry in os.listdir(folder_location):
        if entry != '.DS_Store':
            sample = load_data('/'.join((folder_location, entry, 'input_data.npy')))
            print(sample)
            collected.append(sample)

    return create_dataframe_from_list_of_arrays(collected, column_names)


In [7]:
def apply_phenotype(array, phenotype):
    """Apply a ';'-separated list of Python statements to a copy of ``array``.

    Every statement is executed with the working copy bound to the name ``x``
    (for example ``"x[0] = x[0] + 1; x[3] = 2"``). Both in-place edits of ``x``
    and statements that rebind ``x`` (``"x = x * 2"``) are reflected in the
    returned value.

    Args:
        array: Object to transform; it is deep-copied before any statement runs.
        phenotype: String of statements separated by ';'. Blank statements
            are ignored.

    Returns:
        The object bound to ``x`` after all statements have run.
    """
    # SECURITY: exec runs arbitrary Python. Phenotype strings must come from a
    # trusted source (e.g. files produced by this project's own pipeline).
    #
    # An explicit namespace is used rather than the implicit locals() snapshot:
    # with the snapshot, a statement that rebinds `x` would be silently lost.
    scope = {'x': copy.deepcopy(array), 'array': array, 'phenotype': phenotype}

    for operation in phenotype.split(';'):
        # Strip any leading/trailing whitespace from the operation
        operation = operation.strip()
        if operation:
            exec(operation, globals(), scope)

    return scope['x']


def create_dice_cf(input_folder):
    """Export DiCE and GE counterfactuals for every run folder in ``input_folder``.

    For each sub-directory ``<input_folder>/<run>/`` this:

    1. loads ``input_data.npy`` (the query instance) and ``final_gen.csv``;
    2. keeps the rows of ``final_gen.csv`` with ``o_1 < 0.50``, sorted by
       ``o_1`` ascending;
    3. asks DiCE for as many counterfactuals as there are kept rows and writes
       them to ``Dice_cf/<run>.csv``;
    4. applies each kept row's ``Phenotype`` to the query instance with
       ``apply_phenotype`` and writes the results to ``Ge_cf/<run>.csv``.

    Entries of ``input_folder`` that are not directories (e.g. ``.DS_Store``)
    are skipped. Output directories are relative to the current working
    directory.

    Args:
        input_folder: Directory containing one sub-directory per run.
    """
    features = ['Age','Sex','Job','Housing','Saving accounts','Checking account','Credit amount','Duration','Purpose']
    backend = 'PYT'  # needs pytorch installed
    ML_modelpath = './../model_training/credit_risk_model'
    m = dice_ml.Model(model_path=ML_modelpath, backend=backend)

    # Feature name -> two-element list handed to dice_ml.Data.
    # NOTE(review): these look like [min, max] bounds per feature - confirm
    # against the DiCE documentation.
    d = dice_ml.Data(features={'Age': [18,80],
                           'Sex' : [0, 1],
                           'Job' : [0, 3],
                           'Housing' : [0, 2],
                           'Saving accounts' : [0, 3],
                           'Checking account' : [0, 2],
                           'Credit amount': [100.0, 50000.0],
                           'Duration' : [3, 100],
                           'Purpose' : [0, 7]  },
                 outcome_name='outcome')

    exp = dice_ml.Dice(d, m)
    path_to_DICE_cf = 'Dice_cf'
    path_to_GE_cf = 'Ge_cf'

    for run_name in os.listdir(input_folder):
        run_dir = os.path.join(input_folder, run_name)
        # Only run sub-directories hold the expected files; skipping every
        # non-directory covers '.DS_Store' and any other stray file.
        if not os.path.isdir(run_dir):
            continue

        query_instance = load_data(os.path.join(run_dir, 'input_data.npy'))
        data = pd.read_csv(os.path.join(run_dir, 'final_gen.csv'))
        data = data.sort_values(by=['o_1'])
        data = data[data['o_1'] < 0.50]
        number_of_counterfactuals = data.shape[0]

        input_dice = pd.DataFrame(data=[query_instance], columns=features)
        print(number_of_counterfactuals)
        # NOTE(review): if no row passes the o_1 filter, total_CFs is 0 here;
        # confirm how DiCE handles that case.
        dice_exp = exp.generate_counterfactuals(input_dice, total_CFs=number_of_counterfactuals, desired_class="opposite")
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path_to_DICE_cf, exist_ok=True)
        dice_exp.cf_examples_list[0].final_cfs_df.to_csv(
            path_or_buf=os.path.join(path_to_DICE_cf, str(run_name) + '.csv'),
            index=False,
        )

        evolved_x = [
            apply_phenotype(query_instance, phenotype)
            for phenotype in data['Phenotype'].tolist()
        ]

        GE_df = pd.DataFrame(data=evolved_x, columns=features)
        os.makedirs(path_to_GE_cf, exist_ok=True)
        GE_df.to_csv(os.path.join(path_to_GE_cf, str(run_name) + '.csv'), index=False)
        
    
        
    

In [8]:
# Run create_dice_cf over the './../output/NSGAIII_multi/' folder (path is
# relative to the notebook's working directory). The function prints the
# number of requested counterfactuals for each entry it processes.
input_folder = './../output/NSGAIII_multi/'
create_dice_cf(input_folder)

275


100%|██████████| 1/1 [00:00<00:00, 22.91it/s]


112


100%|██████████| 1/1 [00:00<00:00, 40.24it/s]


315


100%|██████████| 1/1 [00:00<00:00, 28.63it/s]


146


100%|██████████| 1/1 [00:00<00:00, 39.89it/s]


319


100%|██████████| 1/1 [00:00<00:00, 28.84it/s]


231


100%|██████████| 1/1 [00:00<00:00, 28.43it/s]


397


100%|██████████| 1/1 [00:00<00:00, 28.05it/s]


109


100%|██████████| 1/1 [00:00<00:00, 40.28it/s]


242


100%|██████████| 1/1 [00:00<00:00, 28.58it/s]


243


100%|██████████| 1/1 [00:00<00:00, 28.94it/s]


232


100%|██████████| 1/1 [00:00<00:00, 28.69it/s]


304


100%|██████████| 1/1 [00:00<00:00, 29.03it/s]


115


100%|██████████| 1/1 [00:00<00:00, 39.08it/s]


232


100%|██████████| 1/1 [00:00<00:00, 40.85it/s]


25


100%|██████████| 1/1 [00:00<00:00, 40.27it/s]


278


100%|██████████| 1/1 [00:00<00:00, 28.41it/s]


186


100%|██████████| 1/1 [00:00<00:00, 39.12it/s]


223


100%|██████████| 1/1 [00:00<00:00, 36.22it/s]


In [9]:
def create_NSGAII_cf(input_folder):
    """Export GE counterfactuals for every run folder in ``input_folder``.

    For each sub-directory ``<input_folder>/<run>/`` this loads
    ``input_data.npy`` and ``final_gen.csv``, keeps the rows with
    ``o_1 < 0.50`` (sorted by ``o_1`` ascending), applies each kept row's
    ``Phenotype`` to the loaded input with ``apply_phenotype`` and writes the
    results to ``Ge_NSGAII_cf/<run>.csv``.

    Entries of ``input_folder`` that are not directories (e.g. ``.DS_Store``)
    are skipped. The output directory is relative to the current working
    directory.

    Args:
        input_folder: Directory containing one sub-directory per run.
    """
    features = ['Age','Sex','Job','Housing','Saving accounts','Checking account','Credit amount','Duration','Purpose']

    path_to_NSGAII_cf = 'Ge_NSGAII_cf'

    for run_name in os.listdir(input_folder):
        run_dir = os.path.join(input_folder, run_name)
        # Only run sub-directories hold the expected files; skipping every
        # non-directory covers '.DS_Store' and any other stray file.
        if not os.path.isdir(run_dir):
            continue

        query_instance = load_data(os.path.join(run_dir, 'input_data.npy'))
        data = pd.read_csv(os.path.join(run_dir, 'final_gen.csv'))
        data = data.sort_values(by=['o_1'])
        data = data[data['o_1'] < 0.50]

        evolved_x = [
            apply_phenotype(query_instance, phenotype)
            for phenotype in data['Phenotype'].tolist()
        ]

        GE_df = pd.DataFrame(data=evolved_x, columns=features)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path_to_NSGAII_cf, exist_ok=True)
        GE_df.to_csv(os.path.join(path_to_NSGAII_cf, str(run_name) + '.csv'), index=False)
        
        
# Run create_NSGAII_cf over the './../output/NSGAII_multi/' folder (path is
# relative to the notebook's working directory). This rebinds the
# module-level name `input_folder`.
input_folder = './../output/NSGAII_multi/'
create_NSGAII_cf(input_folder)
        