# 1. Imports

In [1]:
# imports
import collections
import copy
import json
import matplotlib.pyplot as plt
from statistics import median
import numpy as np
import os
import pandas as pd
import pickle
import scipy.stats
import seaborn as sns
import warnings
%matplotlib
warnings.filterwarnings("ignore")

Using matplotlib backend: Qt5Agg


# 2. Functions

In [2]:
def rank_dict(dictionary, reverse=False):
    '''
    Map every key of ``dictionary`` to the rank of its value.

    Ranks are computed with ``scipy.stats.rankdata`` and are therefore
    1-based and ascending (smallest value -> rank 1). When ``reverse`` is
    true every value ``v`` is replaced by ``1 - v`` before ranking, which
    flips the order so that the largest value gets rank 1.

    The input dictionary is not modified; the result is keyed in sorted
    key order.

    For example dic={'a':10,'b':2,'c':6}
    returns {'a':3.0,'b':1.0,'c':2.0} by default and
    {'a':1.0,'b':3.0,'c':2.0} with reverse=True.
    '''
    # Keys are unique, so sorting the (key, value) pairs orders by key only.
    ordered = sorted(dictionary.items())

    if reverse:
        ordered = [(name, 1 - score) for name, score in ordered]

    ranks = scipy.stats.rankdata([score for _, score in ordered])

    return {name: rank for (name, _), rank in zip(ordered, ranks)}

In [3]:
def sum_dict_values(a, b, allow_subsets=False):
    '''
    Add the values of ``b`` onto the matching keys of ``a``.

    The result has exactly the keys of ``a``: a key present in both
    dictionaries maps to ``a[key] + b[key]``, a key present only in ``a``
    keeps its value from ``a``. Keys that exist only in ``b`` are NOT
    carried over. Neither input is modified.

    ``allow_subsets`` is accepted for backward compatibility with existing
    callers but has no effect: key mismatches are tolerated silently.
    '''
    return {key: a[key] + b[key] if key in b else a[key] for key in a}

In [4]:
def divide_dict_values(d, denominator):
    '''
    Return a new dictionary in which every value of ``d`` has been divided
    by ``denominator`` (d / denominator). ``d`` itself is left unchanged.
    '''
    return {key: value / denominator for key, value in d.items()}

In [5]:
def determine_relevant(data, max_items=None, max_interactions=None):
    '''
    Order the entries of ``data`` by the median of their values, highest
    first, and return them as two parallel lists.

    data: mapping from a name to a sequence of numbers. Names containing
        '__' are treated as interaction terms.
    max_items: if not None, keep only the first ``max_items`` entries of
        the final ordering.
    max_interactions: if not None, keep at most this many interaction
        terms (the ones with the highest medians); None means no limit.

    Returns (sorted_values, keys), where ``sorted_values[i]`` is
    ``data[keys[i]]``.
    '''
    sorted_values = []
    keys = []
    interactions_seen = 0

    for key in sorted(data, key=lambda k: median(data[k]), reverse=True):
        if '__' in key:
            interactions_seen += 1
            # Only enforce the cap when one was given: comparing an int with
            # the default None would raise TypeError.
            if max_interactions is not None and interactions_seen > max_interactions:
                continue

        sorted_values.append(data[key])
        keys.append(key)

    if max_items is not None:
        sorted_values = sorted_values[:max_items]
        keys = keys[:max_items]

    return sorted_values, keys

In [6]:
def marginal_plots(sorted_values, keys, fig_title, max_items=7):
    '''
    Draw one violin per entry of ``sorted_values`` and save the figure.

    sorted_values: sequence of numeric sequences, one per violin.
    keys: names matching ``sorted_values`` position by position; each is
        passed through ``format_name`` to build the x tick label.
    fig_title: figure title, also used as the output file name
        ("../output_plots/<fig_title>.jpg").
    max_items: maximum number of violins to draw (default 7, the limit
        that used to be hard-coded).
    '''
    # Truncate values and labels together; plt.xticks needs as many labels
    # as tick positions.
    sorted_values = list(sorted_values)[:max_items]
    keys = [format_name(key) for key in list(keys)[:len(sorted_values)]]
    positions = list(range(len(sorted_values)))

    plt.figure(figsize=(12, 10))
    plt.violinplot(sorted_values, positions)
    # Dashed black zero line. The style is given once, via keywords, rather
    # than both in a format string and in ``linestyle``.
    plt.plot([-0.5, len(sorted_values) - 0.5], [0, 0], color='k', linestyle='--', lw=1)
    plt.xticks(positions, keys, rotation=45, ha='right')
    plt.ylabel('marginal contribution')
    plt.xlabel("Hyperparameters")
    plt.title(fig_title)
    # Save before showing: with a blocking backend the figure is gone once
    # show() returns, and savefig would then write an empty image.
    plt.savefig("../output_plots/"+fig_title+".jpg", bbox_inches='tight', pad_inches=0)
    plt.show()

In [7]:
def format_name(name):
    '''
    Turn a hyperparameter name into a human-readable label.

    ``name`` is split on '__'; every part that is a known hyperparameter is
    replaced by its label and the parts are re-joined with ' / '. Names
    made of three or more parts use the abbreviated labels, shorter names
    use the full ones. Unknown parts are kept as they are.
    '''
    full_labels = {
        'strategy': 'imputation',
        'max_features': 'max. features',
        'min_samples_leaf': 'min. samples leaf',
        'min_samples_split': 'min. samples split',
        'criterion': 'split criterion',
        'learning_rate': 'learning rate',
        'max_depth': 'max. depth',
        'n_estimators': 'iterations',
        'algorithm': 'algorithm',
    }

    abbreviated_labels = {
        'strategy': 'imputation',
        'max_features': 'max. feat.',
        'min_samples_leaf': 'samples leaf',
        'min_samples_split': 'samples split',
        'criterion': 'split criterion',
        'learning_rate': 'learning r.',
        'max_depth': 'max. depth',
        'n_estimators': 'iterations',
        'algorithm': 'algo.',
    }

    pieces = name.split('__')
    # Both tables share the same keys, so the table can be picked up front.
    labels = full_labels if len(pieces) < 3 else abbreviated_labels

    return ' / '.join(labels.get(piece, piece) for piece in pieces)

In [8]:

#df=df.drop(["imputation","time_taken"],axis=1)


def obtain_marginal_contributions(df):
    '''
    This is the main function; it combines the helper functions above.

    df: DataFrame with the columns ``dataset``, ``param`` and
        ``importance``.

    For every distinct dataset the importances are ranked per parameter
    (rank 1 = highest importance) and the ranks are averaged over all
    datasets.

    Returns (total_ranks, marginal_contribution, lst_datasets):
        total_ranks: dict mapping a parameter to its average rank; empty
            when ``df`` has no rows.
        marginal_contribution: defaultdict(list) mapping a parameter to
            every importance recorded for it, across all datasets.
        lst_datasets: distinct datasets in order of first appearance.
    '''
    total_ranks = None
    num_tasks = 0
    marginal_contribution = collections.defaultdict(list)

    lst_datasets = list(df.dataset.unique())

    for dataset in lst_datasets:
        rows = df[df.dataset == dataset]
        param = dict()

        for name, importance in zip(rows["param"], rows["importance"]):
            marginal_contribution[name].append(importance)
            # A parameter listed twice for one dataset keeps its last value.
            param[name] = importance

        ranks = rank_dict(param, reverse=True)
        if total_ranks is None:
            total_ranks = ranks
        else:
            # NOTE(review): the result only keeps the keys of ``ranks``, so a
            # parameter missing from this dataset drops out of the running
            # total - confirm this is intended.
            total_ranks = sum_dict_values(ranks, total_ranks, allow_subsets=False)
        # Count every dataset, including the first one, so the division
        # below yields a true mean and never divides by zero.
        num_tasks += 1

    if total_ranks is None:
        # No datasets at all: there is nothing to average.
        return {}, marginal_contribution, lst_datasets

    total_ranks = divide_dict_values(total_ranks, num_tasks)
    return total_ranks, marginal_contribution, lst_datasets

## 3. Results: 6 plots over 200 datasets for 6 classifiers (obtain_marginal_contributions)

In [9]:
# {"RF","SVM","ET","DT","AB","GB"}
# Iterate a tuple rather than a set so the classifiers are processed, and
# their figures produced, in the same order on every run.
for cls in ("RF", "SVM", "ET", "DT", "AB", "GB"):
    # One CSV per classifier; it must provide the dataset / param /
    # importance columns read by obtain_marginal_contributions.
    df = pd.read_csv("../PerformanceData/"+cls+"_fANOVA_results.csv")
    total_ranks, marginal_contribution, _ = obtain_marginal_contributions(df)
    # Keep every single hyperparameter but only the 3 strongest interactions.
    sorted_values, keys = determine_relevant(marginal_contribution, max_interactions=3)
    marginal_plots(sorted_values, keys, cls+" classifier")