In [1]:
from matplotlib import pyplot as plt
import math
import numpy as np
import pandas as pd
import random
import seaborn as sns
from sklearn.metrics.pairwise import manhattan_distances, pairwise_distances
import time


# Introduction

In [2]:
import load_dataset
X, y = load_dataset.main('generate', 10000, 
                         n_features=30,
                         n_informative=12,
                         n_redundant=3,
                         n_repeated=1,
                         n_clusters_per_class=3)

print('Shape: ', X.shape)

0.5045
Shape:  (10000, 30)


In [3]:
from __future__ import print_function
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score



X_train, X_test, y_train, y_test = train_test_split(X, y)

#clf = RandomForestClassifier(100)
#clf = SVC(probability=True)
#clf = SGDClassifier('modified_huber')
clf = MLPClassifier(activation='identity')
#clf = BernoulliRBM()
'''logistic = LogisticRegression()
rbm = BernoulliRBM(random_state=0, verbose=True)
clf = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
'''
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_p = clf.predict_proba(X_test)

print('accuracy ', sum(y_pred == y_test)/len(y_test))
print('auc ', roc_auc_score(y_test, y_pred))

#clf.feature_importances_

accuracy  0.632
auc  0.631579217066


In [4]:
def pred(obs):
    if len(obs.shape) > 1 and obs.shape[1] > 1:
        return np.array([x[1] for x in clf.predict_proba(obs)])
    else:
        return clf.predict_proba(obs.reshape(1, -1))[0][1]

In [5]:
def distance_first_ennemy(observation, n=1):
    D = pairwise_distances(X, observation.reshape(1, -1), metric='euclidean')
    idxes = sorted(enumerate(D), key=lambda x:x[1])
    out = []
    dists = []
    k = 0
    while len(out) < n:
        i = idxes[k]
        if clf.predict(X[i[0]].reshape(1, -1)) != clf.predict(observation.reshape(1, -1)):
            out.append(X[i[0]])
            dists.append(pairwise_distances(X[i[0]].reshape(1, -1), observation.reshape(1, -1))[0][0])
        k += 1
    return out, dists

# Work

In [6]:
from scipy.stats import pearsonr, spearmanr

def interpretability_scores(obs_to_interprete, ennemy):
    eucl = pairwise_distances(obs_to_interprete.reshape(1,- 1), ennemy.reshape(1, -1))[0][0]
    var_non0 = sum((obs_to_interprete - ennemy) != 0)
    pearson = pearsonr(obs_to_interprete, ennemy)[0]
    #profil =
    return {'distance': eucl, 'nb_directions_move': var_non0, 'pearson':pearson}

# Benchmark

In [132]:
import exploration.path_to_ennemies
import imp
imp.reload(exploration.path_to_ennemies)

<module 'exploration.path_to_ennemies' from '/home/laugel/Documents/thesis/code/highgarden/highgarden/exploration/path_to_ennemies.py'>

In [7]:

def interpretability_metrics(X, prediction_function, interpretability_method, obs_to_interprete, **kwargs):
    t1 = time.time()
    nearest_ennemy = interpretability_method(X, prediction_function, obs_to_interprete, **kwargs)
    scores = interpretability_scores(obs_to_interprete, nearest_ennemy)
    scores['time'] = time.time() - t1
    return scores, nearest_ennemy


def benchmark_oneobs(X, prediction_function, modules, obs_to_interprete):
    df_benchmark = []
    ennemies = []
    index = []
    for mod_name, mod in modules.items():
        row, ennemy = interpretability_metrics(X, prediction_function, mod.main, obs_to_interprete)
        ennemies.append(ennemy)
        cols = list(row.keys())
        row = list(row.values())
        index.append(mod_name)
        df_benchmark.append(row)
    index_distance = cols.index('distance') 
    df_s = sorted(zip(df_benchmark, ennemies, index), key=lambda pair:pair[0][index_distance])
    df_benchmark, ennemies, index = [x[0] for x in df_s], [x[1] for x in df_s], [x[2] for x in df_s]
    ennemies = np.array(ennemies)
    df_benchmark = pd.DataFrame(df_benchmark, columns=cols, index=index)
    
    return df_benchmark, ennemies

def benchmark_several(X, prediction_function, modules_to_aggregate, number_observations):
    i = 0
    for i in range(number_observations):
        print('observation ', i)
        idx_test = np.random.randint(X_test.shape[0])
        obs_to_interprete = X_test[idx_test]
        benchmark, _ = benchmark_oneobs(X, prediction_function, modules_to_aggregate, obs_to_interprete)
        if i == 0:
            out = benchmark/float(number_observations)
        else:
            out += benchmark/float(number_observations)
    return out
    


In [8]:
import exploration.uniform_growing_spheres
import exploration.path_to_ennemies
import exploration.uniform_growing_spheres_featsel
import exploration.path_to_ennemies_featsel


modules_ = {'gs': exploration.uniform_growing_spheres,
            'gs_feat_sel': exploration.uniform_growing_spheres_featsel,
          'pte': exploration.path_to_ennemies,
          'pte_feat_sel': exploration.path_to_ennemies_featsel}  
          

modules2 = {'gs': exploration.uniform_growing_spheres,
            'gs_feat_sel': exploration.uniform_growing_spheres_featsel,
           'pte':exploration.path_to_ennemies,
           'pte_feat_sel': exploration.path_to_ennemies_featsel}

idx_test = np.random.randint(X_test.shape[0])
idx = int(np.where(np.all(X_test[idx_test] == X, axis=1))[0])
obs_to_interprete = X[idx]
fe, dfe = distance_first_ennemy(obs_to_interprete)
fe = fe[0]
dfe = dfe[0]

bench, ennemies = benchmark_oneobs(X, pred, modules_, obs_to_interprete)

#benchmark_dataset = benchmark_several(X, pred, modules2, 50)

print(pred(obs_to_interprete))
bench


zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  4 Final radius (array([  1.78416034e-19]), array([  1.90310436e-19]))
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  3 Final radius (array([  8.32608159e-20]), array([  9.51552182e-20]))
0.603385030993


Unnamed: 0,time,nb_directions_move,pearson,distance
gs_feat_sel,2.046218,7,0.992124,0.182054
gs,2.232459,30,0.98795,0.23134
pte_feat_sel,3.301854,5,0.986319,0.252913
pte,3.22511,30,0.891019,0.768394


# Results

In [11]:
def distance(obs1, obs2):
    return pairwise_distances(obs1.reshape(1,- 1), obs2.reshape(1, -1))[0][0]
def generate_between(obs1, obs2, n=10000):
    news = []
    for i in range(n):
        a = random.random() * 1
        new = a * obs2 + (1 - a)*obs1
        news.append(new)
    return news
def pred_segment(obs1, obs2):
    preds = [int(pred(x)>= 0.5) for x in generate_between(obs1, obs2)]
    return float(sum(preds))/float(len(preds))

pred_segment(ennemies[2], ennemies[0])

0.0

# Test

In [12]:
def distance_first_ally(observation, n=1):
    D = pairwise_distances(X, observation.reshape(1, -1), metric='euclidean')
    idxes = sorted(enumerate(D), key=lambda x:x[1])
    out = []
    dists = []
    k = 0
    while len(out) < n:
        i = idxes[k]
        if clf.predict(X[i[0]].reshape(1, -1)) == clf.predict(observation.reshape(1, -1)):
            out.append(X[i[0]])
            dists.append(pairwise_distances(X[i[0]].reshape(1, -1), observation.reshape(1, -1))[0][0])
        k += 1
    return out, dists

In [13]:
#trouve le plus proche ennemi sur un segment obs, ennemy
def aze(obs_to_interprete, ennemy, n_layer=10000):
    PRED_OBS = int(pred(obs_to_interprete)>=0.5)
    min_d = 9999999999
    closest = []
    for i in range(n_layer):
        alpha = random.random()
        new = alpha * obs_to_interprete + (1 - alpha) * ennemy
        new_d = distance(new, obs_to_interprete)
        if (int(pred(new)>=0.5) != PRED_OBS) & (new_d < min_d):
            closest = new
            min_d = new_d
    return closest

In [14]:
idx_test = np.random.randint(X_test.shape[0])
idx = int(np.where(np.all(X_test[idx_test] == X, axis=1))[0])
obs_to_interprete = X[idx]
fe, _ = distance_first_ennemy(obs_to_interprete, n=50)
fa, _ = distance_first_ally(obs_to_interprete, n=50)
#closests_from_ennemies = [aze(obs_to_interprete, x) for x in fe]

"""center_a = sum(fa)/len(fa)
center_e = sum(fe)/len(fe)"""

e_gsf = exploration.uniform_growing_spheres_featsel.main(X, pred, obs_to_interprete)
e_ptef = exploration.path_to_ennemies_featsel.main(X, pred, obs_to_interprete)

'''for i in closests_from_ennemies + [e_gsf, e_ptef]:
    print(interpretability_scores(obs_to_interprete, i))'''

zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  2 Final radius (array([  2.95331174e-42]), array([  3.93774898e-42]))


'for i in closests_from_ennemies + [e_gsf, e_ptef]:\n    print(interpretability_scores(obs_to_interprete, i))'

# New Algorithm

# Other: Robustness tests

In [28]:
idx_test = np.random.randint(X_test.shape[0])
idx = int(np.where(np.all(X_test[idx_test] == X, axis=1))[0])
obs_to_interprete = X[idx]

e_gsf = exploration.uniform_growing_spheres_featsel.main(X, pred, obs_to_interprete)
'''
On veut tester que e_gsf est robuste, cest à dire que si je le bouge un peu je garde le meme ennemi
CHANGER DE METHODE OU DE TEST
'''
e_news = []
for n in range(50):
    epsilon = 2
    alpha = (np.random.random(X.shape[1]) - 0.5)*epsilon
    new = e_gsf + alpha
    if int(pred(new)>=0.5) != int(pred(obs_to_interprete)>=0.5):
        e_news.append(exploration.uniform_growing_spheres_featsel.main(X, pred, obs_to_interprete))

print(len(e_news))
from scipy.stats import f_oneway
t, p =  f_oneway(*e_news)
p

zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  5 Final radius (array([  2.78905970e-39]), array([  2.87902937e-39]))
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  4 Final radius (array([  1.34954502e-39]), array([  1.43951469e-39]))
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  3 Final radius (array([  6.29787675e-40]), array([  7.19757343e-40]))
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  1 Final radius (array([  8.99696678e-39]), array([  1.79939336e-38]))
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  6 Final radius (array([  5.66808907e-39]), array([  5.75805874e-39]))
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
zoom
Final nb of iterations  5 Final radius (array([  2.7890

0.99999999999999989

# Other: Convexity tests

In [None]:
score = 0
for i in range(100):
    print('observation ', i)
    idx_test = np.random.randint(X_test.shape[0])
    obs_to_interprete = X_test[idx_test]
    pte = exploration.path_to_ennemies.main(X, pred, obs_to_interprete)
    gs = exploration.uniform_growing_spheres.main(X, pred, obs_to_interprete)
    score += ((pred_segment(pte, gs) - 0.5)**2)*4
score/100
