In [33]:
!pip install lime --user
!pip install tabulate --user
!pip install pdfkit --user

from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

from lime import lime_tabular

import os
from multiprocessing import Process
import pandas as pd
import numpy as np
import sklearn
import lime
import pickle

# Directory holding the pickled models (laid out as <experiment>/<dataset>/<suffix>.pkl by explain()).
PICKLES_PATH = (
    "/afs/inesc-id.pt/home/aof/thesis/"
    "Parkinson-speech-detection/classification/pickles"
)
# Directory holding the train/test CSV subsets.
SUBSETS_PATH = (
    "/afs/inesc-id.pt/home/aof/thesis/"
    "Parkinson-speech-detection/subsets_useful"
)
# Directory the HTML explanations are written to.
EXPLANATIONS_PATH = "/afs/inesc-id.pt/home/aof/public_html"

# <head> fragment prepended to every saved HTML report so the tables pick up Bootstrap styling.
BOOTSTRAP = (
    '<head><link rel="stylesheet" '
    'href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" '
    'integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" '
    'crossorigin="anonymous"></head>'
)

# Plain-language description shown next to each feature in the report.
# Keys are matched as substrings of the lower-cased feature (column) name, in
# insertion order, by get_explanation_or_values().
# Fix: the jitter / shimmer / HNR descriptions were attached to the wrong keys
# (jitter is the cycle-to-cycle frequency variation, shimmer the amplitude
# variation, HNR the periodic-to-noise ratio).
EXPLANATIONS = {
    'f0': 'The number of open/close cycles of the glottis.',
    'hnr': 'The ratio between periodic (associated with normal speech production) and non-periodic (associated with noise) speech components.',
    'jitter': 'Measures the frequency variation between cycles. Affected by lack of control on the vocal cords vibration.',
    'shimmer': 'Measures the amplitude variation between cycles.',
    'mfcc': 'Features that approximate to our perception of the audio quality.',
    'plp': 'Features that approximate to our perception of the audio quality.',
}


# Reference ("healthy") ranges shown next to each feature; empty string when
# no single range applies.
# Fix: the thresholds carried the same key rotation as the descriptions above;
# 1.04 and 3.81 are the MDVP pathology thresholds for jitter (%) and shimmer (%).
# NOTE(review): '< 40' is the value left over for HNR after un-rotating the
# keys; its source and direction could not be confirmed -- please verify.
NORMAL_VALUES = {
    'f0': '105-160 (male) | 175-245 (Female)',
    'hnr': '< 40',
    'jitter': '< 1.04',
    'shimmer': '< 3.81',
    'mfcc': '',
    'plp': '',
}



In [34]:
def write_html_to_file(html, path):
    """Write the string ``html`` to ``path``, replacing any existing file.

    The file is opened in text mode with the platform default encoding and is
    closed even if the write raises.
    """
    with open(path, "w") as html_file:
        html_file.write(html)

In [35]:
def get_explanation_text(feature):
    """Look up the human-readable description for ``feature`` in EXPLANATIONS."""
    description = get_explanation_or_values(feature, EXPLANATIONS)
    return description

def get_normal_values(feature):
    """Look up the reference value range for ``feature`` in NORMAL_VALUES."""
    reference_range = get_explanation_or_values(feature, NORMAL_VALUES)
    return reference_range

def get_explanation_or_values(feature, obj):
    """Return the value of the first key of ``obj`` contained in ``feature``.

    The match is a substring test against the lower-cased feature name and
    keys are tried in the mapping's iteration order. When nothing matches, a
    notice is printed and None is returned.
    """
    needle = feature.lower()
    matched_key = next((candidate for candidate in obj if candidate in needle), None)

    if matched_key is not None:
        return obj[matched_key]

    print("No explanation or normal value found for feature {}".format(feature))
    return None

In [37]:
import sys
sys.path.append('../')

from classification.mlp import MLP
import traceback

def explain_patient(experiment, patient, model, patient_lines, train, feature_names, params, pickle_path, output_path=None):
    """Explain one patient's classification with LIME and render the result.

    Args:
        experiment: experiment name; used to build the "done" marker path.
        patient: patient identifier; the part before the first "_" names the
            marker file and the result row.
        model: MLPClassifier instance (must expose ``predict_proba``).
        patient_lines: lines of the CSV corresponding to the patient to be tested.
        train: training fraction of the CSV, passed to LimeTabularExplainer.
        feature_names: list of features (columns of the CSV); indexed with the
            column positions LIME reports, so it must follow the column order
            of ``train``.
        params: hyper-parameter dict consumed by ``generate_suffix``.
        pickle_path: not used by this function; kept for call compatibility.
        output_path: file path to write the output, None for writing on screen.

    Returns:
        The value returned by ``display_or_save_patient``, or ``[]`` when the
        patient is misclassified, an error occurs while scoring/explaining, or
        no frame was sampled.
    """
    # Classify using model and ignore if it doesn't match label in patient lines
    mlp = MLP(None, None, None, None, model=model)
    try:
        diagnostic_correct = mlp.score(patient_lines, 0.5, None, '')
    except Exception:
        print(traceback.format_exc())
        return []
    if not diagnostic_correct:
        return []

    num_features = 5
    patient_lines = patient_lines[patient_lines.columns.difference(['frameTime', 'label', 'name'])]

    # Build the tabular explainer. Without it nothing below can run, so bail
    # out here instead of failing later with a NameError.
    try:
        explainer = lime.lime_tabular.LimeTabularExplainer(train, feature_names=feature_names, class_names=['HC', 'PD'], discretize_continuous=False)
    except Exception:
        print(traceback.format_exc())
        return []

    # Pass each sampled line to the explainer.
    explanations = []
    for i, row in patient_lines.iterrows():
        # Run only 1 in every 10 time frames (10 samples / second).
        # NOTE(review): ``i`` is the DataFrame index label, not the position
        # within this patient's rows.
        if i % 10 == 0:
            try:
                explanations.append(explainer.explain_instance(row, model.predict_proba, num_features=num_features, top_labels=1))
            except Exception:
                print(traceback.format_exc())
                return []

    if not explanations:
        # No row index was a multiple of 10: there is nothing to average.
        print("No frames sampled for patient {}, nothing to explain".format(patient))
        return []

    # Accumulate {feature: [count, summed weight]} over all explanations.
    total_local_exp = {}
    for explanation in explanations:
        # local_exp = {1: [(12, 0.139704949653412), (44, 0.10278682378094679)]}
        local_exp = explanation.local_exp
        # NOTE(review): with top_labels=1 the only key is the predicted class,
        # so weights for class 0 and class 1 are summed together unchanged --
        # confirm this is the intended meaning of "PD contribution".
        for feature_index, weight in local_exp[0 if 0 in local_exp else 1]:
            feature = feature_names[feature_index]
            count_and_sum = total_local_exp.setdefault(feature, [0, 0.0])
            count_and_sum[0] += 1
            count_and_sum[1] += weight

    # Average weight per feature (feature name -> mean weight).
    averaged_local_exp = {
        feature: weight_sum / count
        for feature, (count, weight_sum) in total_local_exp.items()
    }

    # Normalize the summed predict_proba so it adds up to 1: [P(HC), P(PD)]
    summed_predict_proba = np.array(
        [sum(items) for items in zip(*[explanation.predict_proba for explanation in explanations])],
        dtype=np.float64,
    )
    average_predict_proba = summed_predict_proba / summed_predict_proba.sum()

    # Each feature contribution represents its contribution to
    # classify the patient as PD. Therefore, summing them all will
    # add to P(PD). By dividing it by P(PD) we will project this
    # in the [-1, 1] interval.
    prob_pd = average_predict_proba[1]
    for key in averaged_local_exp:
        averaged_local_exp[key] /= prob_pd

    # Mean value of every feature over all of the patient's rows.
    average_feature_values = {column: patient_lines[column].mean() for column in patient_lines}

    result = display_or_save_patient(averaged_local_exp, average_predict_proba, average_feature_values, output_path, num_features, patient)

    # Leave a marker file recording that this patient has been processed.
    done_path = "/afs/inesc-id.pt/home/aof/thesis/Parkinson-speech-detection/explainability/logs/done_patients/{}/{}_{}.txt".format(experiment, generate_suffix(params), patient.split("_")[0])
    with open(done_path, "w") as done_file:
        done_file.write("Done")
    return result

In [38]:
def generate_suffix(params):
    """Build the "<solver>_<alpha>_<max_iter>_<activation>" tag for a parameter set."""
    ordered_keys = ('solver', 'alpha', 'max_iter', 'activation')
    return "_".join("{}".format(params[key]) for key in ordered_keys)

In [39]:
from IPython.display import HTML, display
import tabulate
import json
import traceback

def display_or_save_patient(averaged_local_exp, average_predict_proba, average_feature_values, output_path, num_features, patient):
    """Render a patient's explanation as HTML and return it as a one-row DataFrame.

    Args:
        averaged_local_exp: dict mapping feature name to its averaged weight.
        average_predict_proba: indexable pair, position 0 shown as HC and 1 as PD.
        average_feature_values: dict mapping feature name to the patient's mean value.
        output_path: HTML file to write (prefixed with BOOTSTRAP); None displays
            the table inline instead.
        num_features: maximum number of feature rows in the table (largest
            absolute weight first); values below 1 produce no rows.
        patient: patient identifier; the part before the first "_" is stored
            in the ``name`` column.

    Returns:
        DataFrame with columns ``name``, ``hc``, ``pd`` and ``features``
        (JSON dump of all weights, sorted by decreasing absolute value).
    """
    prediction_proba = \
            '<table class="table" style="margin: 0 auto;"> \
                <thead> \
                    <tr> \
                        <td style="margin: 2px; text-align:center; font-weight: bold;">HC</td> \
                        <td style="margin: 2px; text-align:center; font-weight: bold;">PD</td> \
                    </tr> \
                </thead> \
                <tr> \
                    <td style="margin: 2px; text-align: center;">{}</td> \
                    <td style="margin: 2px; text-align: center;">{}</td> \
                </tr> \
            </table>' \
            .format("{:.2f}".format(average_predict_proba[0]), "{:.2f}".format(average_predict_proba[1]))

    # Strongest contributions (by absolute weight) first.
    averaged_local_exp = dict(sorted(averaged_local_exp.items(), key=lambda item: -abs(item[1])))
    feature_weight_table = '<table class="table" style="margin: 0 auto;"> \
                                <thead> \
                                    <tr> \
                                        <td style="margin: 2px; text-align:center; font-weight: bold;">Feature</td> \
                                        <td style="margin: 2px; text-align:center; font-weight: bold;">PD probability contribution</td> \
                                        <td style="margin: 2px; text-align:center; font-weight: bold;">Subject\'s average value</td> \
                                        <td style="margin: 2px; text-align:center; font-weight: bold;">Healthy values</td> \
                                        <td style="margin: 2px; text-align:center; font-weight: bold;">Description</td> \
                                    </tr> \
                                </thead>'

    row_template = '<tr> \
                <td style="margin: 2px; text-align:center; font-weight: bold;">{}</td> \
                <td style="margin: 2px; text-align:center;">{}</td> \
                <td style="margin: 2px; text-align:center;">{}</td> \
                <td style="margin: 2px; text-align:center;">{}</td> \
                <td style="margin: 2px; text-align:center;">{}</td> \
            </tr>'

    # Only the top ``num_features`` rows are shown.
    top_features = list(averaged_local_exp.items())[:max(num_features, 0)]
    for feature, weight in top_features:
        normal_values = get_normal_values(feature)
        description = get_explanation_text(feature)
        # The lookups return None for unknown features; show an empty cell
        # rather than the literal text "None".
        feature_weight_table += row_template.format(
            feature,
            weight,
            average_feature_values[feature],
            '' if normal_values is None else normal_values,
            '' if description is None else description,
        )

    feature_weight_table += '</table>'

    table = '<table class="table" style="margin: 0 auto;"> \
                <thead> \
                    <tr> \
                        <td style="width: 45%; margin: 2px; text-align:center; font-weight: bold;">Prediction Probabilities</td> \
                        <td style="width: 45%; margin: 2px; text-align:center; font-weight: bold;">Feature weight</td> \
                    </tr> \
                </thead> \
                <tr> \
                    <td style="width: 45%; margin: 2px; text-align:center">{}</td> \
                    <td style="width: 45%; margin: 2px; text-align:center">{}</td> \
                </tr> \
            </table>' \
        .format(prediction_proba, feature_weight_table)

    if output_path is None:
        display(HTML(table))
    else:
        write_html_to_file(BOOTSTRAP + table, output_path)

    # 'features' is wrapped in a one-element list so the frame gets exactly one row.
    dic = {'name': patient.split("_")[0], 'hc': average_predict_proba[0], 'pd': average_predict_proba[1], 'features': [json.dumps(averaged_local_exp)]}
    results = pd.DataFrame(dic, index=None)

    return results

In [40]:
def generate_mlp_params_list(dry_run=False):
    """Return the MLP parameter sets to explain, grouped by experiment name.

    Each parameter set is a dict with the keys ``dataset``, ``solver``,
    ``activation``, ``alpha`` and ``max_iter``. With ``dry_run`` set, only a
    single ``independent_200`` configuration is returned.
    """
    def build_param_set(dataset, solver, alpha, max_iter):
        # Every listed configuration uses the tanh activation.
        return {
            'dataset': dataset,
            'solver': solver,
            'activation': 'tanh',
            'alpha': alpha,
            'max_iter': max_iter,
        }

    if dry_run:
        return {
            "independent_200": [
                build_param_set('gita', 'lbfgs', 0.0001, 2000),  # Got it
            ]
        }

    # (dataset, solver, alpha, max_iter) per configuration
    grid = {
        "baseline": [
            ('gita', 'adam', 0.0001, 5000),   # Got it
            ('gita', 'lbfgs', 0.0001, 2000),  # Got it
            ('gita', 'adam', 0.001, 2000),    # Got it
            ('gita', 'lbfgs', 0.01, 5000),    # Got it, on the hlt machine
            ('gita', 'lbfgs', 0.01, 2000),    # Got it
        ],
        'baseline_200': [
            ('gita', 'adam', 0.001, 5000),    # Got it
            ('gita', 'lbfgs', 0.001, 5000),   # Got it
            ('gita', 'adam', 0.0001, 5000),   # Got it
            ('gita', 'adam', 0.01, 5000),     # Got it
            ('gita', 'adam', 0.001, 2000),    # Got it
        ],
        'semi': [
            ('mdvr_kcl_gita', 'adam', 0.001, 2000),       # Got it, on the hlt machine
            ('fralusopark_gita', 'lbfgs', 0.0001, 5000),  # Got it
            ('gita_fralusopark', 'adam', 0.0001, 2000),   # Got it
            ('gita_fralusopark', 'adam', 0.01, 5000),     # Got it, on the hlt machine
            ('gita_fralusopark', 'lbfgs', 0.001, 2000),   # Got it, on the hlt machine
        ],
        'semi_200': [
            ('mdvr_kcl_gita', 'lbfgs', 0.001, 5000),      # Got it, on the hlt machine
            ('fralusopark_gita', 'lbfgs', 0.0001, 2000),  # Got it
            ('mdvr_kcl_gita', 'adam', 0.001, 2000),       # Cheated | Got it, on the hlt machine
            ('mdvr_kcl_gita', 'adam', 0.001, 5000),       # Got it, on the hlt machine
            ('mdvr_kcl_gita', 'lbfgs', 0.0001, 5000),     # Got it, on the hlt machine
        ],
        'independent': [
            ('gita', 'lbfgs', 0.001, 5000),   # Got it, on the hlt machine
            ('gita', 'lbfgs', 0.01, 2000),    # Got it, on the hlt machine
            ('gita', 'lbfgs', 0.001, 2000),   # Got it, on the hlt machine
            ('gita', 'lbfgs', 0.0001, 2000),  # Cheated, train and test are from lbfgs, 0.01, 5000 | Got it
            ('gita', 'adam', 0.0001, 5000),   # Got it, on the hlt machine
        ],
        'independent_200': [
            ('gita', 'lbfgs', 0.01, 5000),           # Got it, on the hlt machine
            ('gita', 'lbfgs', 0.0001, 2000),         # Got it
            ('gita', 'lbfgs', 0.0001, 5000),         # Got it
            ('gita', 'lbfgs', 0.001, 2000),          # Got it, on the hlt machine
            ('fralusopark', 'lbfgs', 0.0001, 5000),  # Got it
        ],
    }

    return {
        experiment: [build_param_set(*spec) for spec in specs]
        for experiment, specs in grid.items()
    }

In [41]:
import traceback

def explain(train_path, test_path, experiment, param_set, logs_file, dry_run=False):
    """Explain every patient of one experiment / parameter set and save a CSV summary.

    Args:
        train_path: ";"-separated CSV with the training subset.
        test_path: ";"-separated CSV with the test subset (needs a ``name`` column).
        experiment: experiment name, used in the pickle, HTML, CSV and marker paths.
        param_set: parameter dict (``dataset`` plus the keys read by ``generate_suffix``).
        logs_file: writable text file object receiving progress and error messages.
        dry_run: not used by this function; kept for call compatibility.

    Returns:
        ``train_path`` or ``test_path`` when that file is missing or cannot be
        read; None otherwise (including when the model pickle is missing).
    """
    pickle_file = '{}/{}/{}/{}.pkl'.format(PICKLES_PATH, experiment, param_set['dataset'], generate_suffix(param_set))
    if not os.path.isfile(pickle_file):
        logs_file.write("WARNING: No pickle found @ {}. Skipping...\n".format(pickle_file))
        return None
    # SECURITY: unpickling can execute arbitrary code; only load pickles that
    # were produced by this project.
    with open(pickle_file, 'rb') as pickle_handle:
        model = pickle.load(pickle_handle)

    if not os.path.isfile(train_path):
        logs_file.write("WARNING: No train file found @ {}. Skipping...\n".format(train_path))
        return train_path
    if not os.path.isfile(test_path):
        logs_file.write("WARNING: No test file found @ {}. Skipping...\n".format(test_path))
        return test_path

    # An unreadable file is reported like a missing one instead of crashing
    # later on a None frame.
    try:
        train = pd.read_csv(train_path, sep=";")
    except Exception as e:
        logs_file.write('Train file reading error: ' + str(e) + '\n')
        return train_path
    try:
        test = pd.read_csv(test_path, sep=";")
    except Exception as e:
        logs_file.write('Test file reading error: ' + str(e) + '\n')
        return test_path

    train = train[train.columns.difference(['frameTime', 'label', 'name'])]
    # Index.difference returns the remaining columns sorted, so the names are
    # taken from the filtered frame to stay aligned with the matrix columns
    # handed to LIME (taking them from the raw CSV order mislabels features).
    feature_names = list(train.columns)
    train_matrix = train.to_numpy()

    patients = list(set(test['name']))

    patient_frames = []
    try:
        for patient in patients:
            patient_lines = test[test["name"] == patient]

            output_path_html = "{}/{}/{}/{}_{}.html".format(EXPLANATIONS_PATH, experiment, param_set['dataset'], patient.split('_')[0], generate_suffix(param_set))

            if os.path.isfile(output_path_html):
                print("Explanation exists, ignoring...")
                continue

            logs_file.write("Explaining patient {} to {}\n".format(patient.split('_')[0], output_path_html))

            try:
                patient_explanation = explain_patient(experiment, patient, model, patient_lines, train_matrix, feature_names, param_set, pickle_file, output_path=output_path_html)
            except Exception:
                logs_file.write('Error explaining patient {} for experiment {}\n'.format(patient, experiment))
                logs_file.write(traceback.format_exc() + '\n')
                continue

            # explain_patient returns [] for skipped patients; keep real rows only.
            if isinstance(patient_explanation, pd.DataFrame):
                patient_frames.append(patient_explanation)
    except Exception:
        # Whatever was collected so far is still written by the CSV export below.
        message = traceback.format_exc()
        logs_file.write(message + '\n')
        print(message)

    # Build the summary once instead of appending row by row.
    if patient_frames:
        output_csv = pd.concat(patient_frames, ignore_index=True)
    else:
        output_csv = pd.DataFrame({'name': [], 'hc': [], 'pd': [], 'features': []})

    output_csv.to_csv('/afs/inesc-id.pt/home/aof/thesis/Parkinson-speech-detection/explainability/csv/{}/{}.csv'.format(experiment, generate_suffix(param_set)), index=False)
    with open("/afs/inesc-id.pt/home/aof/thesis/Parkinson-speech-detection/explainability/logs/done_experiments/{}.txt".format(experiment), "w") as done_file:
        done_file.write("Done\n")
    return None

In [42]:
def get_train_test_path(experiment, param_set):
    """Return the (train, test) CSV paths for an experiment / parameter set.

    Both files live under SUBSETS_PATH/<experiment>/<dataset>/ and share the
    suffix produced by ``generate_suffix``.
    """
    stem = "{}/{}/{}/{}".format(SUBSETS_PATH, experiment, param_set['dataset'], generate_suffix(param_set))
    return stem + "_train.csv", stem + "_test.csv"

In [43]:
def get_datasets_names():
    """Return the dataset names available for each experiment.

    The baseline/independent experiments use the three corpora on their own;
    the semi experiments use every ordered pair "<first>_<second>" of two
    different corpora.
    """
    corpora = ['fralusopark', 'gita', 'mdvr_kcl']
    ordered_pairs = [
        first + '_' + second
        for first in corpora
        for second in corpora
        if first != second
    ]

    names = {}
    for experiment in ('baseline', 'baseline_200', 'independent', 'independent_200'):
        names[experiment] = list(corpora)
    for experiment in ('semi', 'semi_200'):
        names[experiment] = list(ordered_pairs)
    return names

In [44]:
def run_experiment_patients(experiment, mlp_params):
    """Run ``explain`` for every parameter set of one experiment.

    Progress is logged to ``logs_<experiment>.txt``. Every truthy value
    returned by ``explain`` (the path of a train/test file it could not use)
    is collected and, if there are any, written one per line to
    ``missing_modules/<experiment>.txt``.

    Args:
        experiment: experiment name.
        mlp_params: iterable of parameter-set dicts for that experiment.
    """
    missing_models = []
    # ``with`` guarantees the log is flushed and closed even if explain() raises.
    with open("/afs/inesc-id.pt/home/aof/thesis/Parkinson-speech-detection/explainability/logs/logs_{}.txt".format(experiment), "w") as logs_file:
        for param_set in mlp_params:
            train_path, test_path = get_train_test_path(experiment, param_set)
            result = explain(train_path, test_path, experiment, param_set, logs_file, dry_run=False)

            if result:
                missing_models.append(result)

    if missing_models:
        with open('/afs/inesc-id.pt/home/aof/thesis/Parkinson-speech-detection/explainability/logs/missing_modules/{}.txt'.format(experiment), "w") as missing_file:
            missing_file.write('\n'.join(missing_models))


In [45]:
def main():
    """Explain all experiments, one worker process per experiment.

    Every process is started before any of them is joined, so the
    experiments run concurrently.
    """
    params_by_experiment = generate_mlp_params_list(dry_run=False)

    workers = [
        Process(target=run_experiment_patients, args=(experiment, params_by_experiment[experiment],))
        for experiment in params_by_experiment.keys()
    ]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()


In [None]:
# Start the full explanation run only when this code is executed directly
# (module name "__main__"), not when it is imported.
if __name__ == '__main__':
    main()

