# 1. Imports

In [13]:
# imports
import collections
import copy
import json
import matplotlib.pyplot as plt
from statistics import median
import numpy as np
import os
import pandas as pd
import pickle
import scipy.stats
import seaborn as sns
import warnings
import matplotlib
import matplotlib.font_manager as fm
%matplotlib
# NOTE: this suppresses every warning for the rest of the session.
warnings.filterwarnings("ignore")

sns.set_style("darkgrid")


# Global font settings applied through matplotlib's rc mechanism.
font = {
        'family' : 'serif',
        'size'   : 26}

matplotlib.rc('font', **font)

# Rebuild the matplotlib font cache.
# `font_manager._rebuild` is a private helper that newer matplotlib releases
# no longer ship; calling it unconditionally aborts the cell with
# AttributeError. Use it when present, otherwise fall back to the private
# loader (skipping the on-disk cache), and do nothing if neither exists.
if hasattr(fm, "_rebuild"):
    fm._rebuild()
elif hasattr(fm, "_load_fontmanager"):
    fm.fontManager = fm._load_fontmanager(try_read_cache=False)
"""
all function of this notebook(and some other code.) is based on fallowing amazing github repo:

https://github.com/janvanrijn/openml-pimp

So all credits goes for it.
"""

Using matplotlib backend: Qt5Agg


'\nall function of this notebook(and some other code.) is based on fallowing amazing github repo:\n\nhttps://github.com/janvanrijn/openml-pimp\n\nSo all credits goes for it.\n'

# 2. Functions

In [15]:
def rank_dict(dictionary, reverse=False):
    '''
    Replace every value of a dictionary by its rank among all values.

    Ranks start at 1.0 for the smallest value; tied values share the average
    of the ranks they span (default behaviour of scipy.stats.rankdata).
    The input dictionary is not modified. The returned dictionary lists its
    keys in sorted order.

    dictionary: mapping of key -> numeric value.
    reverse:    when True the largest value gets rank 1.0 instead.

    for example dic={'a':10,'b':2,'c':6}
    will return     {'a':3.0,'b':1.0,'c':2.0}
    and with reverse=True
    will return     {'a':1.0,'b':3.0,'c':2.0}
    '''
    if reverse:
        # Negating flips the ordering exactly. Using `1 - value` would round
        # very small values (e.g. 1e-20 and 2e-20) to the same number and
        # produce ties that do not exist in the data.
        values = {key: -value for key, value in dictionary.items()}
    else:
        values = dict(dictionary)

    ordered_keys = sorted(values)
    ranks = scipy.stats.rankdata([values[key] for key in ordered_keys])

    return dict(zip(ordered_keys, ranks))

In [16]:
def sum_dict_values(a, b, allow_subsets=False):
    '''
    Sum two dictionaries key by key and return the result as a new dictionary.

    A key present in both inputs maps to a[key] + b[key]; a key present in
    only one input keeps its value, so the total of all values is preserved.
    Neither input is modified. Keys of `a` come first in the result, followed
    by the keys that only `b` has.

    a, b:          mappings of key -> addable value.
    allow_subsets: kept for backward compatibility with existing callers;
                   it is not used.
    '''
    result = dict(a)

    for key, value in b.items():
        # Keys only found in b must not be dropped, otherwise the accumulated
        # contribution of that key is silently lost.
        result[key] = result[key] + value if key in result else value

    return result

In [17]:
def divide_dict_values(d, denominator):
    '''
    Return a new dictionary in which every value of d is divided by
    denominator. The keys and their order are kept; d itself is untouched.
    '''
    return {key: value / denominator for key, value in d.items()}

In [18]:
def determine_relevant(data, max_items=None, max_interactions=None):
    '''
    Order the entries of data by the median of their values, largest first.

    data:             mapping of key -> list of numeric values.
    max_items:        keep at most this many entries (None keeps all).
    max_interactions: keep at most this many keys containing '__'
                      (None keeps all). The limit is applied before
                      max_items.

    Returns (sorted_values, keys): two lists of equal length where
    sorted_values[i] is data[keys[i]].
    '''
    ranked_keys = sorted(data, key=lambda k: median(data[k]), reverse=True)

    keys = []
    interactions_seen = 0

    for key in ranked_keys:
        if '__' in key:
            interactions_seen += 1
            # Only enforce the limit when one was given; comparing an int
            # with None raises TypeError.
            if max_interactions is not None and interactions_seen > max_interactions:
                continue

        keys.append(key)

    if max_items is not None:
        keys = keys[:max_items]

    sorted_values = [data[key] for key in keys]

    return sorted_values, keys

In [19]:
def marginal_plots(sorted_values, keys, fig_title, max_violins=9):
    '''
    Draw one violin per entry and save the figure as a PDF.

    sorted_values: list of value lists, one per violin.
    keys:          names matching sorted_values position by position; they
                   are passed through format_name to build the tick labels.
    fig_title:     file name (without extension) of the PDF written to
                   ../output_plots/.
    max_violins:   only the first max_violins entries are drawn.

    Nothing is returned; the figure is closed after saving.
    '''
    # Cut values and labels together: truncating only the values leaves more
    # labels than tick positions.
    sorted_values = list(sorted_values[:max_violins])
    keys = [format_name(key) for key in keys[:max_violins]]

    # Set palette and style before drawing so that the first figure is
    # rendered like all later ones.
    sns.set_palette("RdBu")
    sns.set_style("darkgrid")

    fig = plt.figure(figsize=(12,10))
    sns.violinplot(data=sorted_values, inner='box', scale='width', cut=0, linewidth=2)

    plt.xticks(list(range(len(sorted_values))), keys, rotation=45, ha='right')
    plt.ylabel('Variance Contribution')

    # savefig does not create missing directories.
    os.makedirs("../output_plots", exist_ok=True)
    fig.savefig("../output_plots/"+fig_title+".pdf", bbox_inches='tight', pad_inches=0, format='pdf')
    plt.close(fig)

In [20]:
def format_name(name):
    '''
    Turn a hyperparameter name into a short, readable plot label.

    name is split on '__'; every part found in the lookup table below is
    replaced by its short form, unknown parts are kept as they are, and the
    parts are joined with ' / '.
    '''
    mapping_short = {
        'strategy': 'imputation',
        'max_features': 'max. features',
        'min_samples_leaf': 'min. samples leaf',
        'min_samples_split': 'min. samples split',
        'criterion': 'criterion',
        'learning_rate': 'learning rate',
        'max_depth': 'max. depth',
        'n_estimators': 'n. estimators',
        'algorithm': 'algorithm',
        "('max_features', 'min_samples_leaf')":"max. feat., min. s. leaf",
        "('criterion', 'max_features', 'min_samples_leaf')":"criterion, max. feat., min. s. leaf",
        # This pair does not contain 'criterion', so the label must not either.
        "('max_features', 'min_samples_split')":"max. feat., min. s. split",
        "('bootstrap', 'max_features', 'min_samples_leaf')": "boots., max. feat., min. s. leaf",
        "('bootstrap', 'max_features')": "boots., max. feat.",
        "('bootstrap', 'min_samples_leaf')":"boots., min. s. leaf",
        "('imputation', 'max_features', 'min_samples_leaf')": "imput., max. feat., min. s. leaf",
        "('imputation', 'max_features')": "imput., max. feat.",
        "('imputation', 'min_samples_leaf')": "imput., min. s. leaf",
        "('min_samples_leaf', 'min_samples_split')":"min. s. leaf, min. s. split",
        "('min_samples_leaf', 'n_estimators')":"min. s. leaf, n. esti.",
        "('learning_rate', 'max_depth')": "learning r., max. depth",
        "('learning_rate', 'max_features')": "learning r., max. feat.",
        "('max_depth', 'n_estimators')":"max. depth, n. esti.",
        "('learning_rate', 'n_estimators')": "learning r., n esti.",
        "('learning_rate', 'min_samples_leaf')": "learning r., min. s.leaf",
        "('algorithm', 'max_depth', 'learning_rate')":"alg., max. depth, learning r.",
        "('algorithm', 'max_depth')": "alg., max_depth",
        "('imputation', 'max_depth', 'learning_rate')":"imput., max. depth, learning r.",
        "('gamma', 'kernel')":"gamma, kernel",
        "('imputation', 'kernel')":"imput., kernel",
        "('imputation', 'tol')":"imput., tol",
        "('C', 'imputation')":"C, imput.",
        "('coef0', 'gamma')":"coef0, gamma",
        "('coef0', 'imputation')":"coef0, imput.",
        "('gamma', 'imputation')":"gamma, imput.",
        }

    return ' / '.join(mapping_short.get(part, part) for part in name.split('__'))

In [21]:

#df=df.drop(["imputation","time_taken"],axis=1)
def obtain_marginal_contributions(df):
    '''
    Aggregate per-dataset hyperparameter importances.

    df: DataFrame with the columns "dataset", "param" and "importance".

    For every dataset the importances are ranked with rank_dict (the largest
    importance gets rank 1), the ranks are accumulated with sum_dict_values
    and finally averaged with divide_dict_values.

    Returns (total_ranks, marginal_contribution, lst_datasets):
      total_ranks:           dict param -> rank averaged over all datasets
                             (empty when df has no rows),
      marginal_contribution: defaultdict param -> list of every importance
                             found for that param,
      lst_datasets:          the distinct values of df.dataset.
    '''
    marginal_contribution = collections.defaultdict(list)
    lst_datasets = list(df.dataset.unique())

    total_ranks = None
    num_tasks = 0

    for dataset in lst_datasets:
        rows = df[df.dataset == dataset]
        importances = dict()

        for param, importance in zip(rows["param"], rows["importance"]):
            marginal_contribution[param].append(importance)
            importances[param] = importance

        ranks = rank_dict(importances, reverse=True)
        if total_ranks is None:
            total_ranks = ranks
        else:
            total_ranks = sum_dict_values(ranks, total_ranks, allow_subsets=False)
        # Count every dataset, the first one included; otherwise the average
        # is taken over one dataset too few (and a single dataset divides by
        # zero).
        num_tasks += 1

    if total_ranks is None:
        # No datasets at all: nothing to average.
        return {}, marginal_contribution, lst_datasets

    total_ranks = divide_dict_values(total_ranks, num_tasks)
    return total_ranks, marginal_contribution, lst_datasets

## 3. Results: 6 plots over 200 datasets for 6 classifiers (obtain_marginal_contributions)

In [22]:
# Read the fANOVA results of every classifier and save one violin plot each.
# A tuple is used instead of a set so the classifiers are processed in the
# same order on every run.
for cls in ("RF", "SVM", "ET", "DT", "AB", "GB"):
    df=pd.read_csv("../PerformanceData/"+cls+"_fANOVA_results.csv")
    total_ranks, marginal_contribution, _ = obtain_marginal_contributions(df)
    sorted_values, keys = determine_relevant(marginal_contribution, max_interactions=3)
    marginal_plots(sorted_values, keys, cls+" classifier")