In [1]:
import dice_ml

import pandas as pd

import torch.nn as nn
import torch

import copy

import os
import numpy as np
from pickle import dump , load
import seaborn as sns

In [2]:
class AdultClassifier(nn.Module):
    """Fully connected binary classifier: 13 -> 64 -> 128 -> 32 -> 16 -> 1.

    Each hidden layer is followed by a ReLU and a shared ``Dropout(0.1)``;
    the single output unit goes through a sigmoid, so ``forward`` returns
    values in (0, 1).
    """

    def __init__(self):
        super().__init__()
        # Attribute names and registration order are kept as
        # hidden1, act1, ..., hidden4, act4 so that state-dict keys and
        # pickled checkpoints keep matching this class.
        layer_widths = (13, 64, 128, 32, 16)
        for position, (fan_in, fan_out) in enumerate(zip(layer_widths, layer_widths[1:]), start=1):
            setattr(self, f"hidden{position}", nn.Linear(fan_in, fan_out))
            setattr(self, f"act{position}", nn.ReLU())
        self.output = nn.Linear(16, 1)
        self.act_output = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Run ``x`` (last dimension of size 13) through the network."""
        hidden_stack = (
            (self.hidden1, self.act1),
            (self.hidden2, self.act2),
            (self.hidden3, self.act3),
            (self.hidden4, self.act4),
        )
        for linear, activation in hidden_stack:
            x = self.dropout(activation(linear(x)))
        return self.act_output(self.output(x))
    
    
def load_model():
    """Load the serialized classifier from disk and return it in eval mode."""
    classifier = torch.load('./../model_training/adult_credit__model')
    # eval() is called so the network is in inference mode before it is returned.
    classifier.eval()
    return classifier

def eval_model(model, input, scaler, columns_to_standardize):
    """Return the model output for the first row of ``input``.

    ``input`` is reshaped to ``(-1, 13)`` and copied to float32, the columns
    listed in ``columns_to_standardize`` are replaced by
    ``scaler.transform`` of those columns, and the model is evaluated with
    gradients disabled. Only element ``[0][0]`` of the result is returned,
    as a Python float.
    """
    model.eval()
    # torch.tensor() copies, so the caller's array is never modified.
    features = torch.tensor(input.reshape(-1, 13), dtype=torch.float32).numpy()
    features[:, columns_to_standardize] = scaler.transform(features[:, columns_to_standardize])
    batch = torch.from_numpy(features).type(torch.float)
    with torch.no_grad():
        probabilities = model(batch)
    return probabilities.tolist()[0][0]

def load_scaler(scaler_loc):
    """Unpickle and return the object stored at ``scaler_loc``.

    Args:
        scaler_loc: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(security): unpickling can execute arbitrary code; only use this
    # on files produced by a trusted training run.
    # The context manager closes the file handle, which the bare open() leaked.
    with open(scaler_loc, 'rb') as scaler_file:
        return load(scaler_file)

# Positional indices (out of the 13 input columns) that are passed through
# the scaler; index 8 is the only column left untouched.
columns_to_standardize = [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]

In [3]:
# Load the trained classifier once; later cells pass it to the
# counterfactual-generation functions.
model = load_model()

In [4]:
def load_data(data):
    """Read the NumPy file at path ``data`` and return its contents."""
    loaded = np.load(data)
    return loaded


def create_dataframe_from_list_of_arrays(arrays, column_names=None):
    """Build a DataFrame with one row per array.

    Args:
        arrays: Non-empty sequence of equal-length 1-D sequences; each one
            becomes a row.
        column_names: Optional column labels, one per array element. When
            omitted, the columns are labelled ``0 .. length - 1``.

    Returns:
        A ``pandas.DataFrame`` with ``len(arrays)`` rows.

    Raises:
        ValueError: If ``arrays`` is empty, the arrays differ in length, or
            ``column_names`` does not have one entry per array element.
    """
    # len() rather than truthiness so a NumPy array of rows is accepted too.
    if arrays is None or len(arrays) == 0:
        raise ValueError("The list of arrays is empty")

    length = len(arrays[0])
    if not all(len(arr) == length for arr in arrays):
        raise ValueError("All arrays must have the same length")

    if column_names is None:
        # The documented default used to crash on len(None).
        column_names = list(range(length))
    elif len(column_names) != length:
        # The names must match the row width, not the number of rows.
        raise ValueError(
            "Number of column names (%d) must match the length of each array (%d)"
            % (len(column_names), length)
        )

    rows = [dict(zip(column_names, arr)) for arr in arrays]
    return pd.DataFrame(rows)


def create_data(folder_location, column_names):
    """Collect every ``input_data.npy`` under ``folder_location`` into a DataFrame.

    Each entry of ``folder_location`` (except ``.DS_Store``) is treated as a
    sub-folder containing an ``input_data.npy`` file; each loaded array
    becomes one row.

    Args:
        folder_location: Directory whose sub-folders hold the input arrays.
        column_names: Column labels for the resulting DataFrame.

    Returns:
        A ``pandas.DataFrame`` with one row per sub-folder, in sorted
        sub-folder order.
    """
    input_list = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order reproducible across machines and runs.
    for entry in sorted(os.listdir(folder_location)):
        if entry == '.DS_Store':
            continue
        input_array_location = os.path.join(folder_location, entry, 'input_data.npy')
        input_list.append(load_data(input_array_location))

    return create_dataframe_from_list_of_arrays(input_list, column_names)


In [5]:
def scale_input(scaler, input, columns_to_standardize):
    """Return a float32 ``(-1, 13)`` copy of ``input`` with selected columns scaled.

    The columns listed in ``columns_to_standardize`` are replaced by
    ``scaler.transform`` of those columns; all other columns are copied
    through unchanged. ``input`` itself is not modified.
    """
    # torch.tensor() makes the float32 copy that is then edited in place.
    features = torch.tensor(input.reshape(-1, 13), dtype=torch.float32).numpy()
    standardized = scaler.transform(features[:, columns_to_standardize])
    features[:, columns_to_standardize] = standardized
    return features


def calculate_o_1(phenotype, input, model, scaler, columns_to_standardize):
    """Apply ``phenotype`` to ``input`` and return the model output for the result."""
    return eval_model(
        model,
        apply_phenotype(input, phenotype),
        scaler,
        columns_to_standardize,
    )

In [6]:
# Scaler object unpickled from the model-training folder.
scaler = load_scaler('./../model_training/StandardScaler.pkl')

# (min, max) pair for each of the 13 input columns, in column order.
bounds = [
    (17, 90), (0, 8), (12285, 1455436), (1, 16), (0, 6), (0, 14), (0, 5),
    (0, 4), (0, 1), (0, 99999), (0, 4365), (1, 99), (0, 41),
]

# Derive the two bound vectors from `bounds` so the values live in one place.
min_array = np.array([pair[0] for pair in bounds])
max_array = np.array([pair[1] for pair in bounds])

# The same bound vectors after the scaler has been applied.
scaled_min = scale_input(scaler, min_array, columns_to_standardize)
scaled_max = scale_input(scaler, max_array, columns_to_standardize)

In [7]:
# Show the scaled upper bounds; the printed values are the ones hard-coded
# as range maxima in create_dice_cf.
print(scaled_max)

[[ 3.8279953   2.9207141  12.163467    2.05989     2.6726549   1.7228366
   2.3551893   0.3582251   1.          9.047189    8.810054    4.685987
   0.54877317]]


In [8]:
# Show the scaled lower bounds; the printed values are the ones hard-coded
# as range minima in create_dice_cf.
print(scaled_min)

[[-1.8245884  -2.8898754  -1.6928626  -3.6334774  -1.8149168  -1.6631873
  -0.72823167 -4.633004    0.         -0.1972487  -0.25733045 -3.4018977
  -5.0974035 ]]


In [9]:
def apply_phenotype(array, phenotype):
    """Run the ``;``-separated statements in ``phenotype`` against a copy of ``array``.

    The copy is exposed to the statements under the name ``x``. Statements
    may modify it in place (``x[0] = 3``) or rebind it (``x = x * 2``);
    either way the final value of ``x`` is returned. ``array`` itself is
    never modified.

    Args:
        array: Object to copy (deep copy) and expose as ``x``.
        phenotype: String of Python statements separated by ``;``. Empty or
            whitespace-only segments are skipped.

    Returns:
        The value bound to ``x`` after all statements have run.
    """
    # NOTE(security): exec() runs arbitrary Python. Phenotype strings must
    # come from a trusted source (e.g. this project's own search output),
    # never from untrusted files.
    #
    # An explicit namespace is used because exec() inside a function cannot
    # rebind the function's local variables: without it, a statement such as
    # `x = ...` was silently discarded. Module globals stay visible so
    # statements can still reference names such as `np`.
    scope = {'x': copy.deepcopy(array)}
    for operation in phenotype.split(';'):
        operation = operation.strip()
        if operation:
            exec(operation, globals(), scope)

    return scope['x']


def inverse(dice_df_scaled, scaler, columns_to_standardize):
    """Return a copy of ``dice_df_scaled`` with the scaled columns un-scaled.

    Args:
        dice_df_scaled: DataFrame whose columns at the positions in
            ``columns_to_standardize`` hold scaled values.
        scaler: Object exposing ``inverse_transform``.
        columns_to_standardize: Positional column indices to invert.

    Returns:
        A new DataFrame with the same column labels and a fresh default
        index. The input DataFrame is left unmodified.
    """
    # `.values` can return a view of the frame's data (read-only under pandas
    # copy-on-write), so assigning into it either mutated the caller's frame
    # or raised. Work on an explicit copy instead.
    df_values = dice_df_scaled.to_numpy(copy=True)
    print(df_values.shape)
    df_values[:, columns_to_standardize] = scaler.inverse_transform(df_values[:, columns_to_standardize])
    return pd.DataFrame(df_values, columns=dice_df_scaled.columns)
    


def create_dice_cf(input_folder, scaler, model, columns_to_standardize):
    """Write DiCE and evolved counterfactuals for every run in ``input_folder``.

    For each sub-folder ``<run>`` of ``input_folder`` (``.DS_Store`` is
    skipped) this reads ``input_data.npy`` and ``final_gen.csv``, scores every
    row's ``Phenotype`` with ``calculate_o_1``, and keeps the rows whose score
    exceeds 0.50. It then

    * asks DiCE for the same number of counterfactuals for the scaled input,
      un-scales them with ``inverse`` and writes them to ``Dice_cf/<run>.csv``;
    * applies each kept phenotype to the raw (unscaled) input and writes the
      results to ``Ge_cf/<run>.csv``.

    Args:
        input_folder: Directory containing one sub-folder per run.
        scaler: Scaler passed to ``scale_input``, ``calculate_o_1`` and
            ``inverse``.
        model: Model used by ``calculate_o_1`` to score phenotypes. DiCE does
            not use this object; it loads its own copy from ``ML_modelpath``.
        columns_to_standardize: Positional indices of the scaled columns.

    Returns:
        None. Output is written to CSV files relative to the working directory.
    """
    
    # Column order of the 13 model inputs.
    features = ['age','workclass','fnlwgt','education-num','marital-status','occupation','relationship','race','sex',
            'capital-gain','capital-loss','hours-per-week','native-country']
    backend = 'PYT'  # needs pytorch installed
    ML_modelpath = './../model_training/adult_credit__model'
    m = dice_ml.Model(model_path=ML_modelpath, backend=backend)
    
    # Per-feature [min, max] ranges in scaled space. They equal the values
    # printed for scaled_min / scaled_max earlier in the notebook and must be
    # updated by hand if the scaler or the raw bounds change.
    d = dice_ml.Data(features={'age': [-1.8245884,3.8279953],
                           'workclass' : [-2.8898754, 2.9207141],
                           'fnlwgt' : [-1.6928626, 12.163467],
                           'education-num' : [-3.6334774,   2.05989 ],
                           'marital-status' : [-1.8149168, 2.6726549],
                           'occupation' : [-1.6631873,  1.7228366],
                           'relationship': [-0.72823167, 2.3551893],
                           'race' : [-4.633004,  0.3582251],
                           'sex' : [0.0, 1.0],
                           'capital-gain' : [ -0.1972487,  9.047189 ],
                           'capital-loss': [  -0.25733045,    8.810054],
                           'hours-per-week' : [ -3.4018977,  4.685987],
                           'native-country' : [-5.0974035, 0.54877317]},
                 outcome_name='outcome')

    exp = dice_ml.Dice(d, m)
    # Output directories, created on demand relative to the working directory.
    path_to_DICE_cf = 'Dice_cf'
    path_to_GE_cf = 'Ge_cf'
    
    # NOTE(review): os.listdir order is arbitrary; every entry other than
    # '.DS_Store' is assumed to be a run folder.
    for i in os.listdir(input_folder):
        if i=='.DS_Store':
            continue
        filename = input_folder + '/' + i + '/' "final_gen.csv"
        input_array_location = input_folder + '/' + i + '/' + 'input_data.npy'
        input = load_data(input_array_location)
        # DiCE is queried in scaled space, so scale the raw input once here.
        input_scaled = scale_input(scaler, input, columns_to_standardize)
        data = pd.read_csv(filename)
        
        # Score each phenotype on the raw input (calculate_o_1 scales internally).
        data['o_1_mod'] = data.apply(lambda row: calculate_o_1(row['Phenotype'], input, model, scaler, columns_to_standardize), axis=1)
        data = data.sort_values(by=['o_1_mod'])
        # Keep only the phenotypes whose model output is above 0.50.
        data = data[data['o_1_mod'] > 0.50]
        
        # DiCE is asked for exactly as many counterfactuals as phenotypes were kept.
        # NOTE(review): this is 0 when nothing passes the threshold; how DiCE
        # handles total_CFs=0 is not verified here.
        number_of_counterfactuals = data.shape[0]
        input_dice = pd.DataFrame(data=[input_scaled[0]], columns=features)
        print(number_of_counterfactuals)
        dice_exp = exp.generate_counterfactuals(input_dice, total_CFs=number_of_counterfactuals, desired_class="opposite")
        isExist = os.path.exists(path_to_DICE_cf)
        if not isExist:
            os.makedirs(path_to_DICE_cf)
        # NOTE(review): the shapes printed by inverse() are (n, 14), so this
        # frame presumably holds the 13 features plus the outcome column.
        dice_df_scaled = dice_exp.cf_examples_list[0].final_cfs_df
        # Convert the DiCE results back to the original feature scale before saving.
        dice_df_original = inverse(dice_df_scaled, scaler, columns_to_standardize)
        dice_df_original.to_csv(path_or_buf= path_to_DICE_cf + "/" + str(i) + '.csv'  , index=False)
        
        evolved_x = []
    
        # Apply each kept phenotype to the raw input; these rows are already
        # in the original feature scale.
        for phenotype in data['Phenotype'].tolist():
            x_out = apply_phenotype(input, phenotype)
            evolved_x.append(x_out)
            
        GE_df = pd.DataFrame(data=evolved_x, columns=features)
        isExist = os.path.exists(path_to_GE_cf)
        if not isExist:
            os.makedirs(path_to_GE_cf)
        GE_df.to_csv(path_to_GE_cf + "/" + str(i) + '.csv'  , index=False)
        
    
        
    

In [10]:
# Generate DiCE and evolved counterfactual CSVs for every run stored under
# the NSGAIII_multi output folder.
input_folder = './../output/NSGAIII_multi/'
create_dice_cf(input_folder, scaler, model, columns_to_standardize)

3


100%|██████████| 1/1 [00:00<00:00, 26.59it/s]


(3, 14)
2


100%|██████████| 1/1 [00:00<00:00, 39.61it/s]

(2, 14)





5


100%|██████████| 1/1 [00:00<00:00, 37.89it/s]


(5, 14)
5


100%|██████████| 1/1 [00:00<00:00, 39.80it/s]


(5, 14)
5


100%|██████████| 1/1 [00:00<00:00, 39.58it/s]


(5, 14)
16


100%|██████████| 1/1 [00:00<00:00, 39.37it/s]


(16, 14)
5


100%|██████████| 1/1 [00:00<00:00, 40.42it/s]


(5, 14)
10


100%|██████████| 1/1 [00:00<00:00, 39.09it/s]


(10, 14)
16


100%|██████████| 1/1 [00:00<00:00, 40.04it/s]


(16, 14)
2


100%|██████████| 1/1 [00:00<00:00, 40.46it/s]


(2, 14)
10


100%|██████████| 1/1 [00:00<00:00, 39.53it/s]


(10, 14)
4


100%|██████████| 1/1 [00:00<00:00, 39.95it/s]


(4, 14)
2


100%|██████████| 1/1 [00:00<00:00, 40.43it/s]


(2, 14)
5


100%|██████████| 1/1 [00:00<00:00, 39.49it/s]


(5, 14)
4


100%|██████████| 1/1 [00:00<00:00, 39.87it/s]


(4, 14)
68


100%|██████████| 1/1 [00:00<00:00, 39.87it/s]


(68, 14)
4


100%|██████████| 1/1 [00:00<00:00, 40.66it/s]


(4, 14)
16


100%|██████████| 1/1 [00:00<00:00, 40.11it/s]


(16, 14)
24


100%|██████████| 1/1 [00:00<00:00, 40.59it/s]


(24, 14)
8


100%|██████████| 1/1 [00:00<00:00, 39.90it/s]


(8, 14)
7


100%|██████████| 1/1 [00:00<00:00, 39.98it/s]


(7, 14)
2


100%|██████████| 1/1 [00:00<00:00, 40.64it/s]


(2, 14)
5


100%|██████████| 1/1 [00:00<00:00, 40.55it/s]


(5, 14)
7


100%|██████████| 1/1 [00:00<00:00, 40.70it/s]


(7, 14)
3


100%|██████████| 1/1 [00:00<00:00, 40.28it/s]


(3, 14)
4


100%|██████████| 1/1 [00:00<00:00, 40.74it/s]


(4, 14)
1


100%|██████████| 1/1 [00:00<00:00, 40.18it/s]


(1, 14)
13


100%|██████████| 1/1 [00:00<00:00, 40.50it/s]


(13, 14)
2


100%|██████████| 1/1 [00:00<00:00, 40.55it/s]


(2, 14)
8


100%|██████████| 1/1 [00:00<00:00, 40.58it/s]

(8, 14)





In [11]:
def create_NSGAII_cf(input_folder, model, scaler, columns_to_standardize):
    """Write the evolved counterfactuals of every run in ``input_folder`` to CSV.

    For each sub-folder ``<run>`` of ``input_folder`` (``.DS_Store`` is
    skipped) this reads ``input_data.npy`` and ``final_gen.csv``, scores every
    ``Phenotype`` with ``calculate_o_1``, keeps those scoring above 0.50
    (ordered by ascending score), applies each kept phenotype to the raw
    input and writes the resulting rows to ``Ge_NSGAII_cf/<run>.csv``.

    Args:
        input_folder: Directory containing one sub-folder per run.
        model: Model used to score phenotypes.
        scaler: Scaler forwarded to ``calculate_o_1``.
        columns_to_standardize: Positional indices of the scaled columns.

    Returns:
        None. Output is written to CSV files relative to the working directory.
    """
    features = ['age','workclass','fnlwgt','education-num','marital-status','occupation','relationship','race','sex',
            'capital-gain','capital-loss','hours-per-week','native-country']
    path_to_NSGAII_cf = 'Ge_NSGAII_cf'

    for run_name in os.listdir(input_folder):
        if run_name == '.DS_Store':
            continue
        run_dir = os.path.join(input_folder, run_name)
        input_array = load_data(os.path.join(run_dir, 'input_data.npy'))
        data = pd.read_csv(os.path.join(run_dir, 'final_gen.csv'))

        # Series.map gives the same per-row scores as a row-wise
        # DataFrame.apply but also works when the CSV has no rows.
        data['o_1_mod'] = data['Phenotype'].map(
            lambda phenotype: calculate_o_1(phenotype, input_array, model, scaler, columns_to_standardize)
        )
        data = data.sort_values(by=['o_1_mod'])
        # Keep only the phenotypes whose model output is above 0.50.
        data = data[data['o_1_mod'] > 0.50]

        # Apply the kept phenotypes to the raw (unscaled) input.
        evolved_x = [apply_phenotype(input_array, phenotype) for phenotype in data['Phenotype'].tolist()]

        GE_df = pd.DataFrame(data=evolved_x, columns=features)
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(path_to_NSGAII_cf, exist_ok=True)
        GE_df.to_csv(os.path.join(path_to_NSGAII_cf, str(run_name) + '.csv'), index=False)
        
        
# Generate evolved counterfactual CSVs for every run stored under the
# NSGAII_multi output folder. Note that this rebinds `input_folder`.
input_folder = './../output/NSGAII_multi/'
create_NSGAII_cf(input_folder, model, scaler, columns_to_standardize)
        