In [1]:
import sklearn.datasets as ds
import pandas as pd

class Custom_dataset():
    def __init__(self,name):
        self.data = pd.read_csv('datasets/'+name+'.csv')
        self.data= pd.get_dummies(self.data,  prefix = "one_hot")
        self.DESCR = 'Description'
        self.feature_names = self.data.columns
        self.data = self.data.to_numpy()
    
    def data(self):
        return self.data.to_numpy()
    
DATASETS ={
    'wine':[ds.load_wine(),[]],
    'breast_cancer':[ds.load_breast_cancer(),[]],
    #'digits':[ds.load_digits(),[]],
    'analysis':[Custom_dataset('classifiers_analysis_results'),[]],
    'TEST1':[Custom_dataset('1_informative'),[]],
    'TEST2':[Custom_dataset('Three_clusters'),[]],
    'TEST3':[Custom_dataset('TEST3_5_informative_50_features'),[]],
    'TEST4':[ds.load_iris(),[]]
}



In [2]:
import sklearn.metrics as metrics

METRICS ={
    'accuracy_score' : metrics.accuracy_score,
    'balanced_accuracy_score' : metrics.balanced_accuracy_score,
    #'top_k_accuracy_score' : metrics.top_k_accuracy_score,
    'f1_score' : metrics.f1_score,
    'roc_auc_score' : metrics.roc_auc_score
}

import dimensionality_reduction

D_R = {
    'pca':dimensionality_reduction.Pca,
    'tsne':dimensionality_reduction.Tsne,
    'Feature_agglomeration':dimensionality_reduction.Feature_agglomeration,
}

import classifiers

CLASSIFIERS = {
    #'Tree':classifiers.Tree_classifier,
    'Knn':classifiers.Knn_classifier,
    'Svc':classifiers.SVC_classifier,
    'Decision_tree':classifiers.DecisionTree_classifier,
    'Naive_Bayes':classifiers.Naive_Bayes_classifier
}

In [3]:
class hashmap():
    d = dict({})
    i=0
    
    def add(self,element):
        self.i+=1
        self.d[self.i]=element
        return self.i
    
    def get(self,i):
        print('hello')
        return self.d[i]
    
h = hashmap()

In [12]:
from bokeh.layouts import column,row
from bokeh.models import ColumnDataSource, CustomJS, Slider, Select
from bokeh.plotting import Figure, output_file, save
from bokeh.models.widgets import Select,Button
from numpy.random import randint
from bokeh import events
from bokeh.embed import components

def get_vis(df,dr,ds):
    source = ColumnDataSource(df)
    TOOLS = "box_select,lasso_select"

    plot = Figure(tools = TOOLS,title=dr+' of '+ds,name ='scatter')
    plot.scatter('x', 'y', source=source,color='color',line_color='black',line_width=0.3)
    select_cluster = Select(title="Selected cluster:", value="1", options=['%s'%(i)for i in range(0,2)])
    button = Button(label="Next")
    
    select_cluster.js_on_change("value", CustomJS(args=dict(source=source),code="""
    cl = +this.value;
    """))
    source.selected.js_on_change('indices',CustomJS(args=dict(source=source),code="""
        console.log("source changed");
        var data = source.data;
        cb_obj.indices.forEach(element =>(data.color[element]=col(cl),data.label[element]=cl))
        source.change.emit();
    """))
    button.js_on_event(events.ButtonClick,CustomJS(args=dict(source=source),code="""
    console.log("Button pressed");
    //window.location.href = "/explanation?dataset="+JSON.stringify(source.data['label']).replaceAll('#','%23') 
    //window.open("/explanation?dataset="+JSON.stringify(source.data['label']).replaceAll('#','%23'),'_self',source.data)

    //openWindowWithPost(source.data['label'])
    new_open_window(source.data['label'])

    """))

    return row(column(select_cluster,button),plot)

# SERVER

In [13]:
#Analyze submodularpick parameter selection for convergence with different explanation count values
import classifiers
from werkzeug.serving import run_simple
from werkzeug.wrappers import Request, Response
from flask import Flask,render_template,send_file,make_response,request
import json
from json import JSONEncoder 

import pandas as pd
import numpy as np

from sklearn import ensemble, model_selection
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from lime.lime_tabular import LimeTabularExplainer
from lime import submodular_pick

from my_util import filter_arguments,decode_parameters

import matplotlib.pyplot as plt

#h = hashmap()
app = Flask(__name__)

def index_containing_substring(the_list, substring):
    for i, s in enumerate(the_list):
        if substring in s:
              return i
    return -1

def reduce_dim(df):
    pca = PCA()
    x_pca = np.array(pca.fit_transform(df))
    display(pca.explained_variance_ratio_)
    return pd.DataFrame(x_pca[:,0:2],columns=['x','y'])

@app.route('/datasets_entries')
def dataset_entries():
    print(list(DATASETS.keys()))
    return json.dumps(list(DATASETS.keys()))

@app.route('/dataset_<dataset>_details')
def dataset_details(dataset):
    return json.dumps(DATASETS[dataset][0].DESCR)

@app.route('/get_DR_<dr_method>')
def get_dr_info(dr_method):
    print('DR: ',dr_method)
    if(dr_method == '*'):
        return json.dumps(list(D_R.keys()))
    
    response = []
    defaults = D_R[dr_method].get_parameters_default()
    for name,value in D_R[dr_method].get_parameters_type().items():
        if(isinstance(value,list)):
            response.append(['categorical',name,defaults[name],value])
        if(isinstance(value,str)):
            response.append([value,name,defaults[name]])
    return json.dumps(response)

@app.route('/test_scatter_flask')
def test_scatter():
    dr = D_R[request.args.get('D_R')]
    initial_df = DATASETS[request.args.get('dataset')][0].data
    init_params =decode_parameters(request.args.get('D_R_params'),dr.args)
    
    df = dr.get_reduced(initial_df,init_params)
    if(request.args.get('selection') != None):
        color = np.array(['blue','yellow'])
        c=h.get(int(request.args.get('selection')))
        try:
            df['color'] = color[c]
            df['label'] = c
        except:
            df['color'] = 'blue'    
            df['label'] = 0
    else:
        df['color'] = 'blue'    
        df['label'] = 0
    
    layout = get_vis(df,request.args.get('D_R'),request.args.get('dataset'))
    script_bok, div_bok = components(layout)
    return render_template('test_scatter_flask.html',script_bok=script_bok, div_bok= div_bok)

@app.route('/get_classifier_<classifier>')
def get_classifier_info(classifier):
    print('Classifier: ',classifier)
    if(classifier == '*'):
        print('* selected')
        return json.dumps(list(CLASSIFIERS.keys()))
    
    response = []
    defaults = CLASSIFIERS[classifier].get_parameters_default()
    for name,value in CLASSIFIERS[classifier].get_parameters_type().items():
        if(isinstance(value,list)):
            response.append(['categorical',name,defaults[name],value])
        if(isinstance(value,str)):
            response.append([value,name,defaults[name]])
    return json.dumps(response)

@app.route('/get_classifier_performances_<metrics>-<classifiers>')
def get_classifier_perf(metrics,classifiers):
    metrics = metrics.split(',')
    classifiers = classifiers.split(',')
    
    if(len(metrics)==0 or len(classifiers)==0):
        resp = make_response(pd.Dataframe({}).to_csv())
        resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
        resp.headers["Content-Type"] = "text/csv"
        return resp

    df = DATASETS[request.args.get('dataset')][0].data
    target = h.get(int(request.args.get('selection')))
    X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.20, random_state=420)    
    
    if(len(classifiers)==1 and classifiers[0] == '*'):
        trained_classifiers = [[c,CLASSIFIERS[c].get_model().fit(X_train,y_train)] for c in CLASSIFIERS.keys()]
    elif(len(classifiers)==1):
        params = decode_parameters(request.args.get('classifier_params'),CLASSIFIERS[classifiers[0]].args)
        print(params)
        trained_classifiers = [[classifiers[0],CLASSIFIERS[classifiers[0]].get_model(params).fit(X_train,y_train)]]
    else:
        trained_classifiers = [[c,CLASSIFIERS[c].get_model().fit(X_train,y_train)] for c in classifiers]

    display(trained_classifiers)
    [display(c[0],confusion_matrix(y_test, c[1].predict(X_test))) for c in trained_classifiers]
    
    results=[]
    for metric in metrics:
        results.extend([[c[0],metric, METRICS[metric](y_test,c[1].predict(X_test))] for c in trained_classifiers ])
    results = np.array(results)
    df= pd.DataFrame(results,columns = ['classifier','metric','value'])
    display(df)
    resp = make_response(df.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp

@app.route('/compare_default_<metrics>-<classifiers>')
def compare_default_perfomance(metrics,classifiers):
    metrics = metrics.split(',')
    c = classifiers.split(',')

    
    if(len(metrics)==0 or len(classifiers)==0):
        resp = make_response(pd.Dataframe({}).to_csv())
        resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
        resp.headers["Content-Type"] = "text/csv"
        return resp
    c=c[0]
    df = DATASETS[request.args.get('dataset')][0].data
    target = h.get(int(request.args.get('selection')))
    X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.20, random_state=420)    
    
    
    print(CLASSIFIERS[c].args)
    print(request.args.get('classifier_params'))
    init_params =decode_parameters(request.args.get('classifier_params'),CLASSIFIERS[c].args)
    print(init_params,type(init_params))
    trained_classifiers=[]
    trained_classifiers.append(['default_'+c,CLASSIFIERS[c].get_model().fit(X_train,y_train)])
    trained_classifiers.append([c,CLASSIFIERS[c].get_model(init_params).fit(X_train,y_train)])

    results=[]
    
    

    for metric in metrics:
        [print(c) for c in trained_classifiers]
        results.extend([[c[0],metric, METRICS[metric](y_test,c[1].predict(X_test))] for c in trained_classifiers ])
    results = np.array(results)
    print(results)
    df= pd.DataFrame(results,columns = ['classifier','metric','value'])
    display(df)
    resp = make_response(df.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp

@app.route('/get_metrics')
def get_metrics():
    return json.dumps(list(METRICS.keys()))

@app.route('/scatter_d3')
def test_scatter_d3():
    print(request.args)
    print(request.args.get('D_R'))

    dr = D_R[request.args.get('D_R')]
    initial_df = DATASETS[request.args.get('dataset')][0].data
    init_params =decode_parameters(request.args.get('D_R_params'),dr.args)
    display(request.args.get('D_R_params'))
    display(init_params)
    df = dr.get_reduced(initial_df,init_params)
    sel = int(request.args.get('selection'))
    try:
        df['selected'] = h.get(sel)
    except:
        df['selected'] = [0]*initial_df.shape[0]
        
    resp = make_response(df.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp


@app.route('/favicon.ico')
def icon():
    return None

@app.route('/')
def route():
    return render_template('dataset_selection.html')

@app.route('/save_test', methods=['GET','POST'])
def save_test():
    target = list(map(int, request.data.decode("utf-8").split(',')))
    i=h.add(target)
    response = make_response(str(i), 200)
    response.mimetype = "text/plain"
    return response

@app.route('/get_hist_<hist>', methods=['GET','POST'])
def get_exp_hist(hist):
    df,cat = DATASETS[request.args.get('dataset')]
    data = pd.DataFrame(df.data,columns = df.feature_names)
    
    if('=' in hist):
        hist = hist.split('=')[0]
        
    data = np.array(data[hist])
    target = np.array(h.get(int(request.args.get('selection'))))

    df = pd.DataFrame(data[target==0])
    df.hist(bins=10)
    df = pd.DataFrame(data[target==1])
    df.hist(bins=10)
    plt.show()

    resp = {'0':list(data[target==0]),'1':list(data[target==1])}
    return json.dumps(resp)

@app.route('/corr_matrix')
def corr_matrix():
    df,cat = DATASETS[request.args.get('dataset')]
    data = pd.DataFrame(df.data,columns = df.feature_names)
    data = data.corr().round(2)
    data=data.set_index(df.feature_names[0],drop=True)
    resp = make_response(data.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp


@app.route('/elaborate_dataset_id_<my_id>')
def evaluate_id(my_id):
    df,cat = DATASETS[request.args.get('dataset')]
    data = df.data
    target = h.get(int(my_id))
    
    rf = CLASSIFIERS[request.args.get('classifier')]
    init_params =decode_parameters(request.args.get('classifier_params'),rf.args)
    print(init_params)
    rf= rf.get_model(init_params)

    train, test, labels_train, labels_test = train_test_split(data,target,train_size=0.80, test_size=0.20)
    rf.fit(train, labels_train)

    explainer = LimeTabularExplainer(train, feature_names=df.feature_names, class_names=['target'], 
                                     categorical_features=cat, verbose=False, mode='classification',
                                     discretize_continuous=False)

    predict_fn = lambda x: rf.predict_proba(x)

    #Single explanation test
    exp = explainer.explain_instance(test[1,:], predict_fn, num_features=2)
    exp.show_in_notebook(show_all=False)
    print(exp.available_labels())
    
    sp_obj = submodular_pick.SubmodularPick(data = train,explainer=explainer,
                                                predict_fn=rf.predict_proba ,sample_size=100,
                                                #num_features=10,
                                                num_exps_desired=30,top_labels=2)
    #plt.figure(figsize=(13,13))
    #tree.plot_tree(rf,filled=True)
    #plt.show()
    #print(export_text(rf, feature_names=df.feature_names))
    display(confusion_matrix(labels_test, rf.predict(test)))
    #[exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_obj.sp_explanations];
    
    W=pd.DataFrame([dict(this.as_list()) for this in sp_obj.explanations])
    display(W)
    #Sort matrix
    W=W.fillna(0)#W.mean()
    
    #Order by abs mean
    order = W.apply(lambda c: c.abs().sum(),axis = 0)
    ordered = np.argsort(-order)
    display(ordered)
    cols = [W.columns[i] for i in ordered]
    W = W[cols]
    #Reduce W if too many columns
    if(W.shape[1]>21):
        W=W.iloc[:, 0:21]
    W =  W.set_index(W.columns[0])
    
    resp = make_response(W.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp

@app.route('/<page>', methods=['GET', 'POST'])
def rout(page):
    print(page)
    return render_template(page+'.html')#,params=request.form['json']

if __name__ == '__main__':
    run_simple('localhost', 9001, app,reloader_interval = 5000)


 * Running on http://localhost:9001/ (Press CTRL+C to quit)
127.0.0.1 - - [07/May/2021 12:56:17] "[37mGET /test_scatter_flask?dataset=wine&D_R_params=[]&D_R=pca HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:56:17] "[36mGET /static/Navigation.js HTTP/1.1[0m" 304 -
127.0.0.1 - - [07/May/2021 12:56:43] "[37mGET /test_scatter_flask?dataset=wine&D_R_params=[]&D_R=pca HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:42] "[37mGET /test_scatter_flask?dataset=wine&D_R_params=[]&D_R=pca HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:42] "[36mGET /static/Navigation.js HTTP/1.1[0m" 304 -
127.0.0.1 - - [07/May/2021 12:57:44] "[37mPOST /save_test HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:44] "[37mGET /classifier_selection?dataset=wine&D_R_params=[]&D_R=pca&selection=1 HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:44] "[36mGET /static/Performance.js HTTP/1.1[0m" 304 -


classifier_selection


127.0.0.1 - - [07/May/2021 12:57:44] "[37mGET /get_metrics HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:44] "[37mGET /get_classifier_* HTTP/1.1[0m" 200 -
[2021-05-07 12:57:44,343] ERROR in app: Exception on /get_classifier_performances_-* [GET]
Traceback (most recent call last):
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnv

Classifier:  *
* selected
Classifier:  performances_-*
hello


[2021-05-07 12:57:44,513] ERROR in app: Exception on /get_classifier_performances_accuracy_score,balanced_accuracy_score,f1_score,roc_auc_score-* [GET]
Traceback (most recent call last):
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_requ

hello


127.0.0.1 - - [07/May/2021 12:57:46] "[37mGET /test_scatter_flask?dataset=wine&D_R_params=[]&D_R=pca HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:51] "[37mPOST /save_test HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:51] "[37mGET /classifier_selection?dataset=wine&D_R_params=[]&D_R=pca&selection=2 HTTP/1.1[0m" 200 -
127.0.0.1 - - [07/May/2021 12:57:52] "[37mGET /get_metrics HTTP/1.1[0m" 200 -


classifier_selection


[2021-05-07 12:57:52,072] ERROR in app: Exception on /get_classifier_performances_-* [GET]
Traceback (most recent call last):
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\Cristian\Miniconda3\envs\visualizationEnviroment\lib\site-packages\flask\app.py", line 1950, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\Cristian\Miniconda3\envs\visualization

Classifier:  performances_-*
Classifier:  *
* selected
hello


[['Knn', KNeighborsClassifier()],
 ['Svc',
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('svc', SVC(probability=True))])],
 ['Decision_tree', DecisionTreeClassifier()],
 ['Naive_Bayes', GaussianNB()]]

'Knn'

array([[13,  0],
       [ 0, 23]], dtype=int64)

'Svc'

array([[13,  0],
       [ 1, 22]], dtype=int64)

'Decision_tree'

array([[13,  0],
       [ 0, 23]], dtype=int64)

'Naive_Bayes'

array([[13,  0],
       [ 3, 20]], dtype=int64)

Unnamed: 0,classifier,metric,value
0,Knn,accuracy_score,1.0
1,Svc,accuracy_score,0.9722222222222222
2,Decision_tree,accuracy_score,1.0
3,Naive_Bayes,accuracy_score,0.9166666666666666
4,Knn,balanced_accuracy_score,1.0
5,Svc,balanced_accuracy_score,0.9782608695652174
6,Decision_tree,balanced_accuracy_score,1.0
7,Naive_Bayes,balanced_accuracy_score,0.934782608695652
8,Knn,f1_score,1.0
9,Svc,f1_score,0.9777777777777776


127.0.0.1 - - [07/May/2021 12:57:52] "[37mGET /get_classifier_performances_accuracy_score,balanced_accuracy_score,f1_score,roc_auc_score-*?dataset=wine&D_R_params=[]&D_R=pca&selection=2&classifier_params=[]&classifier=Knn HTTP/1.1[0m" 200 -


hello


[['Knn', KNeighborsClassifier()],
 ['Svc',
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('svc', SVC(probability=True))])],
 ['Decision_tree', DecisionTreeClassifier()],
 ['Naive_Bayes', GaussianNB()]]

'Knn'

array([[13,  0],
       [ 0, 23]], dtype=int64)

'Svc'

array([[13,  0],
       [ 1, 22]], dtype=int64)

'Decision_tree'

array([[13,  0],
       [ 0, 23]], dtype=int64)

'Naive_Bayes'

array([[13,  0],
       [ 3, 20]], dtype=int64)

Unnamed: 0,classifier,metric,value
0,Knn,accuracy_score,1.0
1,Svc,accuracy_score,0.9722222222222222
2,Decision_tree,accuracy_score,1.0
3,Naive_Bayes,accuracy_score,0.9166666666666666
4,Knn,balanced_accuracy_score,1.0
5,Svc,balanced_accuracy_score,0.9782608695652174
6,Decision_tree,balanced_accuracy_score,1.0
7,Naive_Bayes,balanced_accuracy_score,0.934782608695652
8,Knn,f1_score,1.0
9,Svc,f1_score,0.9777777777777776


127.0.0.1 - - [07/May/2021 12:57:52] "[37mGET /get_classifier_performances_accuracy_score,balanced_accuracy_score,f1_score,roc_auc_score-*?dataset=wine&D_R_params=[]&D_R=pca&selection=2&classifier_params=[]&classifier=Knn HTTP/1.1[0m" 200 -


In [6]:
np.array(h.get(10))

hello


KeyError: 10

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt
plt.figure(figsize=(7,7))
tree.plot_tree(rf,filled=True)
plt.show()
display()

In [None]:
from sklearn.tree import export_text

print(export_text(rf, feature_names=['sepal length','sepal width','petal length']))

In [None]:
i = 2
explainer = LimeTabularExplainer(train, feature_names=[ 'sepal length','sepal width','petal length'], class_names=['petal width'], categorical_features=[], 
                                 verbose=False, mode='regression',discretize_continuous=False)
exp = explainer.explain_instance(test.to_numpy()[i], rf.predict, num_features=5)
exp.show_in_notebook(show_table=True)



In [None]:
d.get_dataset('test').corr()

In [None]:
YE OLD CODE
"""@app.route('/elaborate_dataset', methods=['GET', 'POST'])
def evaluate():
    df = d.get_dataset('test')
    target = json.loads(request.form['json'])
    target = list(map(int, target['selected'].split(',')))
    print(h.add(target))

    train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(df.drop(['petal width'], axis=1),
                                                                                      target,
                                                                                      train_size=0.80, test_size=0.20)
    rf=c.get_classifier('tree').fit(train, labels_train);
    display(train)
    iris_features =['sepal length', 'sepal width', 'petal length', 'petal width', 'species__Iris-setosa', 'species__Iris-versicolor','species__Iris-virginica']
    explainer = LimeTabularExplainer(train, feature_names=iris_features, class_names=['target'], categorical_features=[], verbose=False, mode='regression',discretize_continuous=False)
    display(explainer)
    sp_obj = submodular_pick.SubmodularPick(explainer, train.to_numpy(), rf.predict,sample_size=10
                                            , num_features=4, num_exps_desired=10)
    [exp.as_pyplot_figure() for exp in sp_obj.sp_explanations];
    plt.show()
    W=pd.DataFrame([dict(this.as_list()) for this in sp_obj.explanations])
    W =  W.set_index('sepal length')
    display(W)
    resp = make_response(W.to_csv())
    resp.headers["Content-Disposition"] = "attachment; filename=export.csv"
    resp.headers["Content-Type"] = "text/csv"
    return resp"""

In [None]:
def test(my_id =1):
    df,cat = d.get_data_cat('test')
    target = h.get(1)
    train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(df,target,train_size=0.80, test_size=0.20)
    rf=c.get_classifier('tree').fit(train, labels_train);

    iris_features =['sepal length', 'sepal width', 'petal length', 'petal width','one_hot_Iris-setosa', 'one_hot_Iris-versicolor', 'one_hot_Iris-virginica']

    #reshaped_decision = np.reshape(decision, (len(decision),1))
    explainer = LimeTabularExplainer(train, feature_names=iris_features, class_names=['non_selected','selected'], 
                                     categorical_features=[], verbose=False, mode='classification',
                                     discretize_continuous=False)

    predict_fn = lambda x: rf.predict_proba(x)#
    exp = explainer.explain_instance(test.iloc[1], predict_fn, num_features=5)
    exp.show_in_notebook(show_all=False)
    print(exp.available_labels())

    sp_obj = submodular_pick.SubmodularPick(data = train.to_numpy(),explainer=explainer, 
                                            predict_fn=rf.predict_proba ,sample_size=10,num_features=5,
                                            num_exps_desired=10,top_labels=3)
    display(sp_obj.sp_explanations[0].as_list(),dir(sp_obj.sp_explanations[0]))
    W=pd.DataFrame([dict(this.as_list()) for this in sp_obj.explanations])
    #display(W)
    #[exp.as_pyplot_figure() for exp in sp_obj.sp_explanations]
    df=pd.DataFrame({})
    
    for this_label in range(2):
        dfl=[]
        for i,exp in enumerate(sp_obj.sp_explanations):
            l=exp.as_list(label=this_label)
            l.append(("exp number",i))
            dfl.append(dict(l))
        dftest=pd.DataFrame(dfl)
        df=df.append(pd.DataFrame(dfl,index=[this_label for i in range(len(sp_obj.sp_explanations))]))
    display(df)
    [exp.as_pyplot_figure(label=exp.available_labels()[0]) for exp in sp_obj.sp_explanations];
test()


In [None]:
d = DATASETS['iris'][0]
temp = pd.DataFrame(d.data,columns =d.feature_names)
temp2= np.array(h.get(6))

def plt_diff(col):
    db = temp[col]
    x = db[np.where(temp2 == 0)[0]].values
    y = db[np.where(temp2 == 1)[0]].values
    bins = np.linspace(0, 10, 100)

    plt.hist(x, bins, alpha=0.5, label='0')
    plt.hist(y, bins, alpha=0.5, label='1')
    plt.legend(loc='upper right')
    plt.title(col)
    plt.show()
    
[plt_diff(col) for col in DATASETS['iris'][0].feature_names]

In [None]:
"{criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, ccp_alpha=0.0}".replace('=',':')
"n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None".replace('=',':')

In [None]:
target = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
#DATASETS['imp_3_red_2'][0].feature_names

In [None]:

d = DATASETS['imp_4'][0]

dr= D_R['pca']
temp = dr.get_reduced(d.data,{})
temp2= np.array(h.get(2))
colors = ['blue','yellow']
colors2 = ['blue','yellow','green']

fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(20,10))

ax2.scatter(temp['x'],temp['y'],c = [colors2[c] for c in temp2])

ax1.scatter(temp['x'],temp['y'],color = [colors2[int(c)] for c in target])
plt.show()

In [None]:
df,cat = DATASETS['test']
data = pd.DataFrame(df.data,columns = df.feature_names)
data = data.corr().round(2)
data=data.set_index(df.feature_names[0],drop=True)
data

In [None]:
a=[1,2,3]
a.append([4,5,6])
a