In [None]:
import pandas as pd
import os
import numpy as np

import seaborn as sns 
from matplotlib import pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'retina' to 'svg'



In [None]:
# utils 

joinpath = os.path.join


def create_dir(path):
    """Create the directory ``path`` if it does not exist yet.

    Missing parent directories are created as well, and calling the function
    for a directory that already exists is a no-op.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the racy "check, then mkdir" pattern
    # and, unlike os.mkdir, also works for nested paths such as 'a/b/c'.
    os.makedirs(path, exist_ok=True)

In [None]:
# definitions 

FITNESS_MEAN = 'fitness_mean'
FITNESS_SD = 'fitness_sd'
POP_SIZE = 'pop_size'
GENS = 'gens'
REPRESENTATION = 'representation'

ANALYSIS_DIR = 'analysis/'
create_dir(ANALYSIS_DIR)




In [None]:
overview_mr = pd.read_csv('results_mr/overview_mr.csv', sep=';')
overview_mr.insert(0, 'run_name', [f'{run}_mr' for run in overview_mr.run_id])
#[f'{run}_mr' for run in overview_mr.run_id]
overview_mr

In [None]:
overview_dp = pd.read_csv(os.path.join('results','overview.csv'), sep=';')
overview_dp.insert(0, 'run_name', [f'{run}_dp' for run in overview_dp.run_id])

overview_dp

In [None]:
# Stack both overview tables. Each CSV is read with its own default 0..n-1
# index, so ignore_index=True is needed to give the combined table unique
# row labels.
overview = pd.concat([overview_dp, overview_mr], axis=0, ignore_index=True)
overview.info()

In [None]:
overview.info()

In [None]:
overview['fitness_mean']

In [None]:
overview.loc[overview.run_id == 23, ].sort_values(by='fitness_mean').head(10)

In [None]:


configdims = [
    'co_p', 'crossover', 'difficulty', 'diversity_measure','early_stopping_patience',
    'elitism', 'epochs', 'fitness_sharing','gens', 'mu_p', 
    'mutation', 'optim', 'pop_size', 'representation','selection'
]

techdims = [
    'user_id', 'comments'
]

iddims = [
    'run_name', 'run_id', 'gs_id'
]

metricdims = [
    'duration', 'fitness_mean', 'fitness_sd',
       'stopped_early'
]

def _unique_values_table(frame, dims):
    """Return a long table with one row per column in ``dims``: its name and its unique values."""
    return pd.DataFrame({
        'param': list(dims),
        # dtype=object keeps each array of unique values as a single cell,
        # regardless of whether the arrays happen to have equal length.
        'values': pd.Series([frame[d].unique() for d in dims], dtype=object),
    })


def analyse_config(overview, configdims, analysis_name):
    """Summarise which configuration parameters vary within a set of runs.

    Writes three ';'-separated CSV files into ``ANALYSIS_DIR/analysis_name``
    (the directory is created if needed) and prints the same information:

    * ``configdims_vary.csv``     - parameters whose number of distinct values is not 1
    * ``configdims_constant.csv`` - parameters with exactly one distinct value
    * ``info.csv``                - number of rows (configurations) in ``overview``

    Parameters
    ----------
    overview : pandas.DataFrame
        One row per configuration; must contain all columns in ``configdims``.
    configdims : list of str
        Names of the configuration columns to inspect.
    analysis_name : str
        Name of the sub-directory of ``ANALYSIS_DIR`` that receives the files.

    Returns
    -------
    list of str
        Names of the varying parameters, in the order of ``configdims``.

    Notes
    -----
    Distinct values are counted with ``DataFrame.nunique``, which ignores NaN
    by default, so a column consisting only of NaN has a count of 0 and is
    therefore reported among the varying parameters.
    """
    outpath = joinpath(ANALYSIS_DIR, analysis_name)
    create_dir(outpath)

    n_unique = overview[configdims].nunique()
    vary_names = n_unique[n_unique != 1].index.tolist()
    constant_names = n_unique[n_unique == 1].index.tolist()

    # Build the tables explicitly. DataFrame.apply(pd.Series.unique) returns a
    # DataFrame instead of a Series whenever all columns have the same number
    # of unique values, which would leave no 'param'/'values' columns.
    configdims_vary = _unique_values_table(overview, vary_names)
    configdims_vary.to_csv(joinpath(outpath, 'configdims_vary.csv'), index=False, sep=';')

    configdims_constant = _unique_values_table(overview, constant_names)
    configdims_constant.to_csv(joinpath(outpath, 'configdims_constant.csv'), index=False, sep=';')

    n_combos = overview.shape[0]
    info = pd.DataFrame({
        'n_combos': [n_combos]
    })
    info.to_csv(joinpath(outpath, 'info.csv'), index=False, sep=';')

    print(f'N_combos: {n_combos}')
    print(f'configdims_constant:\n{configdims_constant}')
    print(f'configdims_vary:\n{configdims_vary}')

    return vary_names

def cetris_paribus(overview_df, configdims_vary, dim, mode='pivot', verbose=False):
    """Compare the values of one parameter while all other varying parameters are held fixed.

    The runs are pivoted so that every row is one combination of the other
    varying parameters and every column is one value of ``dim``; the cells
    hold ``FITNESS_MEAN``. The "winner" of a row is the column with the
    lowest fitness.

    Parameters
    ----------
    overview_df : pandas.DataFrame
        One row per configuration. Each combination of ``configdims_vary``
        must occur at most once, otherwise ``pandas.pivot`` raises.
    configdims_vary : list of str
        Names of all varying configuration columns (including ``dim``).
    dim : str
        The parameter whose values are compared.
    mode : {'pivot', 'winner', 'value_counts'}, default 'pivot'
        * ``'pivot'``        - return the pivot table itself
        * ``'winner'``       - return, per row, the column label with the lowest fitness
        * ``'value_counts'`` - return how often each column label wins
    verbose : bool, default False
        Print the pivot table before returning.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Depends on ``mode``, see above.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    valid_modes = ('pivot', 'winner', 'value_counts')
    if mode not in valid_modes:
        # Previously an unknown mode only surfaced as an UnboundLocalError at the return.
        raise ValueError(f'mode must be one of {valid_modes}, got {mode!r}')

    cols_hold_constant = [d for d in configdims_vary if d != dim]

    grid = pd.pivot(overview_df, index=cols_hold_constant, columns=[dim], values=[FITNESS_MEAN])

    if verbose:
        print(grid)

    if mode == 'pivot':
        return grid

    # Only computed when actually requested.
    winners = grid.idxmin(axis=1)
    if mode == 'winner':
        return winners
    return winners.value_counts()
    

# grid run 0 
- compare selection 
- pop_size

In [None]:
ex0 = overview.loc[overview.run_name == '0_dp']

ex0.sort_values(by=[REPRESENTATION, FITNESS_MEAN])

In [None]:
configdims_vary_ex0 = analyse_config(ex0, configdims, 'ex0_popSize_selectio')

In [None]:
dim ='pop_size'
cetris_paribus(ex0, configdims_vary_ex0, dim, 'value_counts')

In [None]:
cetris_paribus(ex0, configdims_vary_ex0, dim, 'pivot')

In [None]:
dim = 'selection'

cetris_paribus(ex0, configdims_vary_ex0, dim, 'value_counts')

In [None]:
cetris_paribus(ex0, configdims_vary_ex0, dim, 'winner')

In [None]:
dim = 'selection'

# Fitness per configuration, one column per value of `dim`.
res = cetris_paribus(ex0, configdims_vary_ex0, dim, 'pivot')

# Distance of every option to the winner (lowest fitness) of its row.
# DataFrame.min skips NaN, whereas the builtin min() gives order-dependent
# results as soon as a row contains NaN.
res = res.sub(res.min(axis=1), axis=0)

# Replace the two-level column index by its second level (the values of `dim`)
# and drop the row index.
res = pd.DataFrame(
    res.values,
    columns=res.columns.get_level_values(1).tolist()
)

plotdata = pd.melt(res, var_name=dim, value_name=FITNESS_MEAN)

# Distance 0 marks the winner of a row; summarise the remaining options only.
non_winner_distance = plotdata.loc[plotdata[FITNESS_MEAN] != 0, FITNESS_MEAN]
print(f'Mean: {non_winner_distance.mean()}')
print(f'Sd: {non_winner_distance.std()}')
print(f'Min: {non_winner_distance.min()}')

In [None]:
dim = 'representation'

cetris_paribus(ex0, configdims_vary_ex0, dim, 'value_counts')

In [None]:
# NOTE(review): cb_distance_to_winner is defined further down in this notebook,
# so on a fresh kernel (Restart & Run All) this call raises NameError.
cb_distance_to_winner(ex0, configdims_vary_ex0, dim,)

In [None]:
# One panel per column of `res`.
# NOTE(review): `res` is expected to be the distance table built in an earlier cell.
ncols = res.shape[1]

# squeeze=False always returns a 2-D array of axes, so the loop also works
# when `res` has a single column.
fig, axes = plt.subplots(ncols=ncols, squeeze=False)

# Loop variables are deliberately not called `dim` / `plotdata`, so the
# globals of the same name used by other cells are not overwritten.
for panel, col in zip(axes.ravel(), res.columns):
    series = res[col]
    print(col)
    print(series)
    # Pass the vectors directly: with `data=<Series>` seaborn cannot resolve
    # the column label given as `y`.
    sns.lineplot(x=series.index, y=series.values, ax=panel)
    panel.set_title(str(col))
    
    



# first grid run


In [None]:
grid1 = overview.loc[(overview.run_name.isin(['24_dp'])),:]

configdims_vary_g1 = analyse_config(
    overview=grid1,
    configdims=configdims,
    analysis_name='01_grid1')




In [None]:
grid1.sort_values(FITNESS_MEAN).head(20)

In [None]:
dim = 'fitness_sharing'

cetris_paribus(grid1, configdims_vary_g1, dim, 'pivot')

In [None]:
cetris_paribus(grid1, configdims_vary_g1, dim, 'value_counts')

## representation

In [None]:
# representation 

group = 'representation'

# Hue order: groups sorted by ascending mean fitness.
mean_fitness_by_group = grid1.groupby(group)[FITNESS_MEAN].mean()
h_order = mean_fitness_by_group.sort_values().index.tolist()

fig, ax = plt.subplots(1, 1)
sns.boxplot(
    data=grid1,
    x=group,
    y=FITNESS_MEAN,
    hue=group,
    hue_order=h_order,
    ax=ax,
)
ax.tick_params(labelrotation=45)



In [None]:
dim = 'representation'


def cb_distance_to_winner(overview, configdims_vary, dim):
    """Compute how far each value of ``dim`` is from the best value, all else held fixed.

    For every combination of the other varying parameters, the lowest
    ``FITNESS_MEAN`` among the values of ``dim`` is subtracted from all of
    them, so the winner of a combination has distance 0. Mean, standard
    deviation, minimum and maximum of the non-zero distances are printed.

    Parameters
    ----------
    overview : pandas.DataFrame
        One row per configuration, as accepted by ``cetris_paribus``.
    configdims_vary : list of str
        Names of all varying configuration columns (including ``dim``).
    dim : str
        The parameter whose values are compared.

    Returns
    -------
    pandas.DataFrame
        Long table with the columns ``dim`` (the parameter value) and
        ``FITNESS_MEAN`` (the distance to the winner).
    """
    grid = cetris_paribus(overview, configdims_vary, dim, 'pivot')

    # Row-wise distance to the minimum. DataFrame.min skips NaN, whereas the
    # builtin min() is order-dependent when a combination is missing.
    distances = grid.sub(grid.min(axis=1), axis=0)

    # Keep only the second column level (the values of `dim`) and drop the row index.
    distances = pd.DataFrame(
        distances.values,
        columns=distances.columns.get_level_values(1).tolist()
    )

    plotdata = pd.melt(distances, var_name=dim, value_name=FITNESS_MEAN)

    # Distance 0 marks the winner of a row; summarise the remaining options only.
    non_winner_distance = plotdata.loc[plotdata[FITNESS_MEAN] != 0, FITNESS_MEAN]
    print(f'Mean: {non_winner_distance.mean()}')
    print(f'Sd: {non_winner_distance.std()}')
    print(f'Min: {non_winner_distance.min()}')
    print(f'Max: {non_winner_distance.max()}')

    return plotdata

cb_distance_to_winner(
    overview=grid1[grid1.representation.isin(['maintain_init_puzzle', 'with_replacement', 'without_replacement'  ])], 
    configdims_vary=configdims_vary_g1, 
    dim=dim
)


In [None]:
dim = 'representation'

# Restrict the comparison to two representations.
grid1_two_repr = grid1[grid1.representation.isin(['random_mix', 'maintain_init_puzzle' ])]
res = cetris_paribus(grid1_two_repr, configdims_vary_g1, dim, 'pivot')

# Distance of every option to the winner (lowest fitness) of its row.
# DataFrame.min skips NaN, whereas the builtin min() gives order-dependent
# results when a combination is missing for one representation.
res = res.sub(res.min(axis=1), axis=0)

# Replace the two-level column index by its second level (the values of `dim`)
# and drop the row index.
res = pd.DataFrame(
    res.values,
    columns=res.columns.get_level_values(1).tolist()
)

plotdata = pd.melt(res, var_name=dim, value_name=FITNESS_MEAN)

# Distance 0 marks the winner of a row; summarise the remaining options only.
non_winner_distance = plotdata.loc[plotdata[FITNESS_MEAN] != 0, FITNESS_MEAN]
print(f'Mean: {non_winner_distance.mean()}')
print(f'Sd: {non_winner_distance.std()}')
print(f'Min: {non_winner_distance.min()}')



In [None]:
configdims_vary_g1[0:-1]
grid1.groupby(configdims_vary_g1[0:-1]).representation

In [None]:
cetris_paribus(overview_df=grid1, configdims_vary=configdims_vary_g1, dim=REPRESENTATION, mode='value_counts', verbose=False)


## maintain init vs random 
- best of each
- compare history

In [None]:
# TODO:

## ex2: explore maintain_init_puzzle
- subset of ex1 PLUS Sudoku specific operators (['24_dp', '12_mr'])

In [None]:
analysis_name = 'ex2-maintain'

ex2 = overview.loc[
    (overview.run_name.isin(['24_dp', '12_mr']))
    & (overview.representation == 'maintain_init_puzzle')
    ,:]

configdims_vary_ex2 = analyse_config(ex2, configdims, analysis_name)

### overall variance


In [None]:

fig, (ax1, ax2) = plt.subplots(ncols=2)
sns.boxplot(data=ex2, y=FITNESS_MEAN, ax=ax1)
sns.boxplot(data=ex2, y=FITNESS_SD, ax=ax2)
fig.tight_layout()

In [None]:
from copy import deepcopy
plotdata = deepcopy(ex2)

plotdata = plotdata.sort_values(by=FITNESS_MEAN, axis=0, ascending=True).reset_index()
plotdata

In [None]:
# Distribution of each varying parameter among the 20 best configurations
# (`plotdata` is sorted by ascending fitness).
plotdata_top20 = plotdata[configdims_vary_ex2].head(20)

# Defined here so the cell does not depend on a later cell having run first.
outpath = joinpath(ANALYSIS_DIR, analysis_name)

# squeeze=False always returns a 2-D array of axes, so the loop also works
# when only one parameter varies.
fig, axes = plt.subplots(
    ncols=plotdata_top20.shape[1],
    sharey=True,
    squeeze=False
)

for panel, (name, col) in zip(axes.ravel(), plotdata_top20.items()):
    # Name the columns explicitly: what value_counts().reset_index() calls
    # them differs between pandas 1.x and 2.x.
    ratios = (
        col.value_counts(normalize=True)
        .rename_axis(name)
        .reset_index(name='ratio')
    )
    print(name)
    print(ratios)
    print('\n')

    sns.barplot(data=ratios, x=name, y='ratio', ax=panel)
    plt.setp(panel.xaxis.get_majorticklabels(), rotation=90)

fig.suptitle('Top20: Distribution of parameter options', size=12)
fig.tight_layout()
fig.savefig(joinpath(outpath, 'top20.pdf'))



In [None]:
# Same distribution as a stack of horizontal bar charts, one row per parameter.
plotdata_top20 = plotdata[configdims_vary_ex2].head(20)

# squeeze=False always returns a 2-D array of axes, so the loop also works
# when only one parameter varies.
fig, axes = plt.subplots(
    nrows=plotdata_top20.shape[1],
    sharex=True,
    squeeze=False
)

for panel, (name, col) in zip(axes.ravel(), plotdata_top20.items()):
    # Name the columns explicitly: what value_counts().reset_index() calls
    # them differs between pandas 1.x and 2.x.
    ratios = (
        col.value_counts(normalize=True)
        .rename_axis(name)
        .reset_index(name='ratio')
    )
    print(name)
    print(ratios)
    print('\n')

    sns.barplot(data=ratios, y=name, x='ratio', ax=panel, orient='h')

fig.tight_layout()
    

In [None]:
# figsize=(15,10)
fig, ax = plt.subplots()

# One point per configuration, in order of ascending mean fitness.
sns.pointplot(data=plotdata, x=plotdata.index, y=FITNESS_MEAN, ax=ax)

# Error bars of +/- one SD. `plotdata` was re-indexed 0..n-1 after sorting and
# the i-th category of a categorical plot sits at x = i, so the coordinates
# are taken from the data instead of being read back from the artists seaborn
# created (an implementation detail that differs between seaborn versions).
x_coords = np.arange(len(plotdata))
ax.errorbar(x_coords, plotdata[FITNESS_MEAN], yerr=plotdata[FITNESS_SD], fmt=' ', zorder=-1, color='black', capsize=2)
ax.xaxis.set_visible(False)

outpath = joinpath(ANALYSIS_DIR, analysis_name)
create_dir(outpath)  # no-op if the directory already exists

fig.suptitle('Compare Fitness of configs \n Errorbar shows ± SD', size=12)
fig.savefig(joinpath(outpath, 'fitness.pdf'))


In [None]:

for dim in configdims_vary_ex2: 
    print(dim)
    res = cetris_paribus(ex2, configdims_vary_ex2, dim, mode='value_counts')
    print(res)


# exp4 - diff2

In [None]:
ex4 = overview.loc[overview.run_name == '25_dp']
plotdata = deepcopy(ex4)

plotdata = plotdata.sort_values(by=FITNESS_MEAN, axis=0, ascending=True).reset_index()
plotdata


In [None]:
analysis_name = 'ex4'
analyse_config(ex4, configdims, analysis_name)

In [None]:
# figsize=(15,10)
fig, ax = plt.subplots()

# One point per configuration, in order of ascending mean fitness.
sns.pointplot(data=plotdata, x=plotdata.index, y=FITNESS_MEAN, ax=ax)

# Error bars of +/- one SD. `plotdata` was re-indexed 0..n-1 after sorting and
# the i-th category of a categorical plot sits at x = i, so the coordinates
# are taken from the data instead of being read back from the artists seaborn
# created (an implementation detail that differs between seaborn versions).
x_coords = np.arange(len(plotdata))
ax.errorbar(x_coords, plotdata[FITNESS_MEAN], yerr=plotdata[FITNESS_SD], fmt=' ', zorder=-1, color='black', capsize=2)
ax.xaxis.set_visible(False)

outpath = joinpath(ANALYSIS_DIR, analysis_name)
create_dir(outpath)  # no-op if the directory already exists

fig.suptitle('Compare Fitness of configs \n Errorbar shows ± SD', size=12)
fig.savefig(joinpath(outpath, 'fitness.pdf'))

# scratch

In [None]:
    # plot scores 
    def plot_scores(scores, width=200, height=200, dodge=True):
        """Show a point plot of mean scores per model and config with error bars.

        ``scores`` is read through the columns 'model', 'mean' and 'config'
        and through its ``sd`` attribute, which supplies the error-bar
        lengths. The figure size in inches is ``width / 200`` by
        ``height / 200``; ``dodge`` is forwarded to ``sns.pointplot``.
        The figure is displayed with ``plt.show()``; nothing is returned.
        """
        dpi = 200
        plt.figure(figsize=(width / dpi, height / dpi))
        ax = sns.pointplot(data=scores, x='model', y='mean', hue='config', alpha=.7, dodge=dodge, join=False, scale=.5)

        # Collect the drawn point positions; the error bars are attached to
        # them in the same order in which the points were created.
        offsets = [point for collection in ax.collections for point in collection.get_offsets()]
        x_coords = [point[0] for point in offsets]
        y_coords = [point[1] for point in offsets]

        ax.errorbar(x_coords, y_coords, yerr=scores.sd, fmt=' ', zorder=-1, color='black', capsize=2)

        # Legend above the axes, spanning its full width.
        plt.legend(
            bbox_to_anchor=(0., 1.02, 1., .102),
            loc='lower left',
            ncol=1,
            mode="expand",
            borderaxespad=0.,
            prop={'size': 6},
        )
        plt.tight_layout()
        plt.xticks(rotation=90)
        ax.set(ylabel='mean micro f1 score')

        plt.show()
       
    # generate and save plot
    #plot_scores(scores, explorations_path, 'comp_all', 1200, 1400, .4)

# pop size

In [None]:
from copy import deepcopy

In [None]:
def filter_runs(overview, colname, values):
    """Return the rows of ``overview`` whose ``colname`` is one of ``values``.

    Parameters
    ----------
    overview : pandas.DataFrame
        Table to filter; it is not modified.
    colname : str
        Column to match on.
    values : scalar or list-like
        A single value (including a string) or a collection of values
        (list, tuple, set, array, ...) to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with their original index.
    """
    # Strings count as scalars here; any other non-list-like is wrapped as well.
    if not pd.api.types.is_list_like(values):
        values = [values]
    # Copy only the selected rows instead of deep-copying the whole table up front.
    return overview.loc[overview[colname].isin(values)].copy()
    

In [None]:
plotdata = filter_runs(overview, 'run_id', 0)
plotdata.info()

In [None]:
#plotdata.loc['cat_maintain_init_puzzle'] = np.where(plotdata.representation == 'maintain_init_puzzle', 1, 0)

In [None]:
plotdata

In [None]:

fig, ax = plt.subplots(1,1)

# 1 for runs that use the 'maintain_init_puzzle' representation, 0 otherwise.
# The comparison is vectorised: DataFrame.iterrows() yields (index, row)
# tuples, so `row.representation` on its items raises AttributeError.
group = (plotdata.representation == 'maintain_init_puzzle').astype(int).tolist()
group
#sns.boxplot(data = plotdata, y=FITNESS_MEAN, x=group, hue=group, ax=ax)


In [None]:
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs.
for column in plotdata.items():
    print(column)

In [None]:
plotdata[(plotdata.fitness_mean < 40) & (plotdata.pop_size == 100)]

In [None]:
# `Series in [0]` asks for the truth value of an element-wise comparison and
# raises ValueError; isin() builds the intended boolean row mask.
overview.loc[overview['run_id'].isin([0])]