In [1]:
import pandas as pd
import numpy as np
import imblearn
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import time
import sklearn
import pickle

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from copy import deepcopy
from collections import Counter
from imblearn.over_sampling import SMOTE
from random import gauss
from scipy.spatial import distance_matrix

from scipy.optimize import minimize
from numpy.random import rand
from scipy.spatial import distance_matrix

from dataset import *

## NeMe

In [2]:
def generate_cat_idxs():
    """
    Get indexes for all categorical features that are one hot encoded
    """

    cat_idxs = list()
    start_idx = len(continuous_feature_names)
    for cat in enc.categories_:
        cat_idxs.append([start_idx, start_idx + cat.shape[0]])
        start_idx = start_idx + cat.shape[0]
    return cat_idxs

In [3]:
def neme_bounds(x):
    
    bounds = list()

    for i in range(len(continuous_feature_names)):

        cat_name = continuous_feature_names[i]
        value = x[i]

        # If the continuous feature can take any value
        if action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
            f_range = (0,1)
            bounds.append(f_range)

        # If the continous feature can only go up
        elif action_meta[ cat_name ]['can_increase'] and not action_meta[ cat_name ]['can_decrease']:
            f_range = (value,1)
            bounds.append(f_range)

        # if the continuous features can only go down
        elif not action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
            f_range = (0, value)
            bounds.append(f_range)

        # If it's not actionable
        else:
            f_range = (value, value)
            bounds.append(f_range)
            
    for i in range(len(cat_idxs)):
                
        if action_meta[categorical_feature_names[i]]['actionable'] == False:
            for j in range(cat_idxs[i][1] - cat_idxs[i][0]):
                bounds.append((x[cat_idxs[i][0]+j], x[cat_idxs[i][0]+j]))
                
        else:
            for j in range(cat_idxs[i][1] - cat_idxs[i][0]):
                bounds.append((0,1))
                
    return tuple(bounds)

In [4]:
def round_neme_cats(x):
    for i in range(len(categorical_feature_names)):
        cat_values = x[cat_idxs[i][0]: cat_idxs[i][1]]
        max_value_idx = np.argmax(cat_values)
        cat_values *= 0.
        cat_values[max_value_idx] = 1.
    return x

In [5]:
def clip_cats_to_actionable(result, original_x):

    for instance in result:
        
        for i in range(len(categorical_feature_names)):
            
            cat_name = categorical_feature_names[i]
            cat_values = instance[cat_idxs[i][0]: cat_idxs[i][1]]
            value_idx = np.argmax(cat_values)
            org_value_idx = np.argmax(original_x[cat_idxs[i][0]: cat_idxs[i][1]])
            
            # If actionable
            if action_meta[categorical_feature_names[i]]['actionable'] == True:
                
                # If the continuous feature can take any value
                if action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
                    continue

                # If the continous feature can only go up
                elif action_meta[ cat_name ]['can_increase'] and not action_meta[ cat_name ]['can_decrease']:
                    if value_idx < org_value_idx:
                        instance[cat_idxs[i][0]: cat_idxs[i][1]] = original_x[cat_idxs[i][0]: cat_idxs[i][1]]

                # if the continuous features can only go down
                elif not action_meta[ cat_name ]['can_increase'] and action_meta[ cat_name ]['can_decrease']:
                    if value_idx > org_value_idx:
                        instance[cat_idxs[i][0]: cat_idxs[i][1]] = original_x[cat_idxs[i][0]: cat_idxs[i][1]]
            
    return result

In [6]:
def objective(x):
    """
    probability of semi-factual class
    l2 distance matrix of m samples
    minimize negative of both to maximize objective
    """
            
    max_prob_of_sf = -clf.predict_proba(x.reshape(m,-1)).T[1].sum()
    max_div = -distance_matrix(x.reshape(m,-1), x.reshape(m,-1)).sum()
    dists, _ = knn.kneighbors(X=x.reshape(m, -1), n_neighbors=1, return_distance=True)
    dists = dists.sum()
    
    return max_prob_of_sf + max_div + dists

In [7]:
def get_actionable_feature_idxs(continuous_features, categorical_features):
    feature_names = continuous_feature_names + categorical_feature_names
    actionable_idxs = list() 
    for i, f in enumerate(feature_names):
        if action_meta[f]['actionable']:
            actionable_idxs.append( [i, action_meta[f]['can_increase'], action_meta[f]['can_decrease']] )
    return actionable_idxs


action_meta = actionability_constraints()

df_train = pd.read_csv('data/df_train.csv')
df_test = pd.read_csv('data/df_test.csv')

X_train = np.load('data/X_train.npy', )
X_test = np.load('data/X_test.npy', )
y_train = np.load('data/y_train.npy', )
y_test = np.load('data/y_test.npy', )

# ## Normalization
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

with open('data/enc.pkl', 'rb') as file:
    enc = pickle.load(file)

# ## Generate Training Column Label
#### Logistic Regression
with open('data/clf.pkl', 'rb') as file:
    clf = pickle.load(file)

test_preds = clf.predict(X_test)
train_preds = clf.predict(X_train)

test_probs = clf.predict_proba(X_test)
train_probs = clf.predict_proba(X_train)

df_test['preds'] = test_preds
df_test['probs'] = test_probs.T[1]

df_train['preds'] = train_preds
df_train['probs'] = train_probs.T[1]

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
action_meta = actionability_constraints()

In [9]:
cat_idxs = generate_cat_idxs()

In [10]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [13]:
ga_df = pd.read_csv('data/GA_Xps_diverse.csv')
test_idxs = np.sort(np.array(ga_df.test_idx.value_counts().index.tolist()))
m = 3
final_data = list()
found_sfs = list()

for test_idx in test_idxs[:3]:

    original_x = deepcopy(X_test[test_idx])
    x = deepcopy(X_test[test_idx])

    bnds = neme_bounds(x)
    bnds = np.array(list(bnds) * m).reshape(-1, 2).tolist()
    bnds = tuple([tuple(ele) for ele in bnds])

    x = np.tile(x, m).reshape(m, -1)
    print(x.shape)
    result = minimize(objective, x, method='nelder-mead', bounds=bnds)

    #### Clip categories to actionable
    result = clip_cats_to_actionable(result['x'].reshape(m, -1), original_x)
            
    for i, pred in enumerate(clf.predict(result).tolist()):
        if pred == 0:
            found_sfs.append(1)
            final_data.append(original_x.tolist())
        else:
            found_sfs.append(1)
            final_data.append(result[i].tolist())

(3, 71)


  result = minimize(objective, x, method='nelder-mead', bounds=bnds)


(3, 71)


  result = minimize(objective, x, method='nelder-mead', bounds=bnds)


(3, 71)


  result = minimize(objective, x, method='nelder-mead', bounds=bnds)


In [11]:
# final_df = pd.DataFrame(final_data)

In [12]:
# final_df['test_idx'] = ga_df.test_idx
# final_df['sf_found'] = found_sfs

In [20]:
final_df.to_csv('data/neme_diverse.csv')

In [21]:
final_df[final_df.test_idx.isin(test_idxs)]

AttributeError: 'DataFrame' object has no attribute 'test_idx'