In [1]:
import pandas as pd
import numpy as np
import imblearn
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import time
import sklearn
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from copy import deepcopy
from collections import Counter
from imblearn.over_sampling import SMOTE
from random import gauss
from scipy.spatial import distance_matrix

from scipy.optimize import minimize
from numpy.random import rand
from scipy.spatial import distance_matrix

from dataset import *

## NeMe

In [2]:
def get_actionable_feature_idxs(continuous_features, categorical_features):
    feature_names = continuous_feature_names + categorical_feature_names
    actionable_idxs = list() 
    for i, f in enumerate(feature_names):
        if action_meta[f]['actionable']:
            actionable_idxs.append( [i, action_meta[f]['can_increase'], action_meta[f]['can_decrease']] )
    return actionable_idxs


action_meta = actionability_constraints()

df_train = pd.read_csv('data/df_train.csv')
df_test = pd.read_csv('data/df_test.csv')

X_train = np.load('data/X_train.npy', )
X_test = np.load('data/X_test.npy', )
y_train = np.load('data/y_train.npy', )
y_test = np.load('data/y_test.npy', )

# ## Normalization
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

with open('data/enc.pkl', 'rb') as file:
    enc = pickle.load(file)

# ## Generate Training Column Label
#### Logistic Regression
with open('data/clf.pkl', 'rb') as file:
    clf = pickle.load(file)

test_preds = clf.predict(X_test)
train_preds = clf.predict(X_train)

test_probs = clf.predict_proba(X_test)
train_probs = clf.predict_proba(X_train)

df_test['preds'] = test_preds
df_test['probs'] = test_probs.T[1]

df_train['preds'] = train_preds
df_train['probs'] = train_probs.T[1]

In [3]:
def generate_cat_idxs():
    """
    Get indexes for all categorical features that are one hot encoded
    """

    cat_idxs = list()
    start_idx = len(continuous_feature_names)
    for cat in enc.categories_:
        cat_idxs.append([start_idx, start_idx + cat.shape[0]])
        start_idx = start_idx + cat.shape[0]
    return cat_idxs

In [4]:
action_meta = actionability_constraints()

In [5]:
cat_idxs = generate_cat_idxs()

In [6]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [7]:
def neme_bounds(x):
    
    bounds = list()

    for i in range(len(continuous_feature_names)):

        cat_name = continuous_feature_names[i]
        value = x[i]

        # If the continuous feature can take any value
        if action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
            f_range = (0,1)
            bounds.append(f_range)

        # If the continous feature can only go up
        elif action_meta[ cat_name ]['can_increase'] and not action_meta[ cat_name ]['can_decrease']:
            f_range = (value,1)
            bounds.append(f_range)

        # if the continuous features can only go down
        elif not action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
            f_range = (0, value)
            bounds.append(f_range)

        # If it's not actionable
        else:
            f_range = (value, value)
            bounds.append(f_range)
            
    for i in range(len(cat_idxs)):
                
        if action_meta[categorical_feature_names[i]]['actionable'] == False:
            for j in range(cat_idxs[i][1] - cat_idxs[i][0]):
                bounds.append((x[cat_idxs[i][0]+j], x[cat_idxs[i][0]+j]))
                
        else:
            for j in range(cat_idxs[i][1] - cat_idxs[i][0]):
                bounds.append((0,1))
                
    return tuple(bounds)

In [8]:
def round_neme_cats(x):
    for i in range(len(categorical_feature_names)):
        cat_values = x[cat_idxs[i][0]: cat_idxs[i][1]]
        max_value_idx = np.argmax(cat_values)
        cat_values *= 0.
        cat_values[max_value_idx] = 1.
    return x

In [27]:
cat_values = np.array([0,1,0.95, 0.9]) > 0.9

In [32]:
# df_test.iloc[test_idx]

In [25]:
def get_actionable_range(x):

    dice_action = {}

    for feature in action_meta.keys():

        # Only add actionable features for DiCE's constraints
        if action_meta[feature]['actionable']:

            if feature in continuous_feature_names:
                query_min_value = float(x[feature])
                query_max_value = float(x[feature])
                min_value = min([float(xxx) for xxx in pd.concat([x_train, x_test])[feature].values])
                max_value = max([float(xxx) for xxx in pd.concat([x_train, x_test])[feature].values])
            else:
                query_min_value = int(x[feature][0])
                query_max_value = int(x[feature][0])
                min_value = min([int(xxx[0]) for xxx in pd.concat([x_train, x_test])[feature].values])
                max_value = max([int(xxx[0]) for xxx in pd.concat([x_train, x_test])[feature].values])

            # Is it up or down mutable?
            if action_meta[feature]['can_increase']:
                query_max_value = max_value

            if action_meta[feature]['can_decrease']:
                query_min_value = min_value

            # If it is a continuous feature
            if feature in numerical:
                dice_action[feature] = [float(query_min_value), float(query_max_value)]
            else:
                dice_action[feature] = [str(x) + '-Cat' for x in list(range(query_min_value, query_max_value+1))]

    return dice_action

In [26]:
get_actionable_range(x)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [10]:
np.random.choice(np.flatnonzero(cat_values == cat_values.max()))

1

In [33]:
def get_actionable_range(original_x, cat_name):

    for i in range(len(cat_idxs)):

        if action_meta[categorical_feature_names[i]]['actionable'] == False:
            for j in range(cat_idxs[i][1] - cat_idxs[i][0]):
                bounds.append((x[cat_idxs[i][0]+j], x[cat_idxs[i][0]+j]))



In [36]:
original_x

array([0.07142857, 0.19924692, 0.35185185, 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 1.        , 1.        , 0.        , 1.        ,
       0.        ])

In [44]:
x = np.array([0,1,2,3,4,5])

x[3:] = 0
x

array([0, 1, 2, 0, 0, 0])

In [71]:
	def clip_cats_to_actionable(instance, original_x):
		
		for i in range(len(categorical_feature_names)):

			cat_name = categorical_feature_names[i]
			cat_values = instance[cat_idxs[i][0]: cat_idxs[i][1]] > 0.95
	#         value_idx = np.argmax(cat_values)
			value_idx = np.random.choice(np.flatnonzero(cat_values == cat_values.max()))
			
			
			org_value_idx = np.argmax(original_x[cat_idxs[i][0]: cat_idxs[i][1]])

			# If actionable
			if action_meta[categorical_feature_names[i]]['actionable'] == True:

				# If the feature can take any value
				if action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
					instance[cat_idxs[i][0]: cat_idxs[i][1]] = [0. for _ in range(len(cat_values))]
					instance[cat_idxs[i][0]: cat_idxs[i][1]][value_idx] = 1.

				# If the feature can only go up
				elif action_meta[ cat_name ]['can_increase'] and not action_meta[ cat_name ]['can_decrease']:
					if value_idx < org_value_idx:
						instance[cat_idxs[i][0]: cat_idxs[i][1]] = original_x[cat_idxs[i][0]: cat_idxs[i][1]]
					else:
						instance[cat_idxs[i][0]: cat_idxs[i][1]] = [0. for _ in range(len(cat_values))]
						instance[cat_idxs[i][0]: cat_idxs[i][1]][value_idx] = 1.

				# if the feature can only go down
				elif not action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
					if value_idx > org_value_idx:
						instance[cat_idxs[i][0]: cat_idxs[i][1]] = original_x[cat_idxs[i][0]: cat_idxs[i][1]]
					else:
						instance[cat_idxs[i][0]: cat_idxs[i][1]] = [0. for _ in range(len(cat_values))]
						instance[cat_idxs[i][0]: cat_idxs[i][1]][value_idx] = 1.
						
	#             print(instance)
						
		return instance

In [72]:
# x = np.array([0,0,1,1])

In [73]:
# np.random.choice(np.flatnonzero(x == x.max()))

In [74]:
def objective(x):
    """
    probability of semi-factual class
    l2 distance matrix of m samples
    minimize negative of both to maximize objective
    """
    
    sf_class = clf.predict(original_x.reshape(1,-1)).item()
    is_sf_loss = clf.predict(x.reshape(1,-1)).item() == sf_class
    
    similarity_orig_loss = -1. * C_reg * np.linalg.norm(x - original_x, 2)
    diversity_loss = (-C_diversity * sum([np.linalg.norm(x - np.array(sf), 2) for sf in CURRENT_SFS])) / m
    loss = similarity_orig_loss + diversity_loss 
    return loss * is_sf_loss


In [115]:
C_simple=.1 
C_reg=1. 
C_diversity=5.
C_feasibility=1.
C_sf=1.
sparsity_upper_bound=2.
solver="Nelder-Mead"
max_iter=None
non_zero_threshold_sparsity = 1e-5

In [116]:
X_test.shape

(500, 71)

In [165]:
ga_df = pd.read_csv('data/GA_Xps_diverse.csv')
# test_idxs = np.sort(np.array(ga_df.test_idx.value_counts().index.tolist()))
m = 10
final_data = list()
found_sfs = list()

for test_idx in [3]:

    # Compute diverse sfs
    CURRENT_SFS = list()
    original_x = deepcopy(X_test[test_idx])
    bnds = neme_bounds(original_x) 
    
    for i in range(m):
        x = deepcopy(original_x)
        result = minimize(objective, x, method='nelder-mead', bounds=bnds, options={'maxiter': 5000})
        result = clip_cats_to_actionable(result['x'].reshape(1, -1)[0], original_x)
        CURRENT_SFS.append(result.tolist())
#         CURRENT_SFS.append(result['x'].reshape(1, -1)[0])

    CURRENT_SFS = np.array(CURRENT_SFS)
    
    for i, pred in enumerate(clf.predict(CURRENT_SFS).tolist()):
        if pred == 0:
            found_sfs.append(0)
            final_data.append(original_x.tolist())
        else:
            found_sfs.append(1)
            final_data.append(CURRENT_SFS[i].tolist())

In [166]:
final_df = pd.DataFrame(final_data)

In [167]:
final_df['test_idx'] = [2]*10 #+ [2]*6 + [3]*6
final_df['sf_found'] = found_sfs

In [168]:
final_df.to_csv('data/neme_diverse.csv')

In [169]:
final_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,test_idx,sf_found
0,0.071429,0.199247,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,0
1,0.072159,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
2,0.562994,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
3,0.522915,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
4,0.071429,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
5,0.071429,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
6,0.60148,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
7,0.077023,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
8,1.0,0.199247,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1
9,0.071429,1.0,0.351852,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2,1


In [170]:
[sum(final_df.values[:, :-2][i] - original_x) for i in range(m)]

[0.0,
 0.8014832604836231,
 1.2923188870835571,
 1.2522391394224863,
 0.8007530911232972,
 0.8007530793286106,
 1.3308040422987149,
 0.8063475711925506,
 0.9285714285714286,
 0.8007530793286106]

In [171]:
bnds

((0.07142857142857142, 1),
 (0.19924692067138938, 1),
 (0.35185185185185186, 0.35185185185185186),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (0, 1),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0))