In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import sys
import math
sys.path.append('/home/eduardo/PycharmProjects/treemap-analysis/code')

import Parser
import SpatialMetrics
import TemporalMetrics

In [3]:
technique_list = Parser.list_techniques()
dataset_list = Parser.list_datasets()
print(technique_list)
print(dataset_list)

['app', 'git', 'hil', 'moo', 'nac', 'new', 'pbm', 'pbs', 'pbz', 'snd', 'spi', 'sqr', 'str']
['animate.css', 'AudioKit', 'bdb', 'beets', 'brackets', 'caffe', 'calcuta', 'cpython', 'earthdata-search', 'emcee', 'exo', 'exports', 'fsharp', 'gimp', 'hiv', 'hospitalrun-frontend', 'Hystrix', 'iina', 'jenkins', 'Leaflet', 'm-coffee', 'm-names', 'OptiKey', 'osquery', 'PhysicsJS', 'pybuilder', 'scikitlearn', 'shellcheck', 'soundnode-app', 'spacemacs', 'standard', 'uws']


# Visual Metrics

Let $w_k$ and $h_k$ be the weight and height of cell $c_k$, and $a_k$ be the relative weight of item $k$ relative to the complete tree weight.

$Q^{AR}_k = \min(w_k,h_k)/\max(w_k,h_k)$

$Q^{WAR}_k = a_k * Q^{AR}_k$

In [30]:
# If the metrics were already computed, read from file. Else, recompute and save to file.
try:
    ar_df = pd.read_csv('ar.csv', index_col=0)
    war_df = pd.read_csv('war.csv', index_col=0)
    
except:
    ar_df  = pd.DataFrame(columns=dataset_list, index=technique_list) # Aspect ratio dataframe
    war_df = pd.DataFrame(columns=dataset_list, index=technique_list) # Weighted aspect ratio dataframe
    for technique in technique_list:
        for dataset in dataset_list:
            history = Parser.parse_rectangles(technique, dataset)
            for i, df in enumerate(history):
                ar_df[dataset][technique]  = SpatialMetrics.q_ar(df)['q_ar'].mean()
                war_df[dataset][technique] = SpatialMetrics.q_weighted_ar(df)['q_w_ar'].mean()

    ar_df.to_csv('ar.csv')
    ar_df.to_csv('war.csv')    

### How much each technique in $Q^{ar}$ score for each dataset 

In [32]:
ar_df

Unnamed: 0,animate.css,AudioKit,bdb,beets,brackets,caffe,calcuta,cpython,earthdata-search,emcee,...,OptiKey,osquery,PhysicsJS,pybuilder,scikitlearn,shellcheck,soundnode-app,spacemacs,standard,uws
app,0.684565,0.659055,0.445995,0.662962,0.635311,0.690907,0.603092,0.684429,0.666995,0.681847,...,0.562277,0.675816,0.650434,0.710802,0.665772,0.668284,0.687974,0.626794,0.739188,0.70198
git,0.461645,0.454805,0.251037,0.400137,0.392522,0.453855,0.433477,0.317625,0.212654,0.344499,...,0.395375,0.474618,0.363336,0.527232,0.274224,0.484265,0.370244,0.421906,0.413298,0.356951
hil,0.630774,0.435393,0.332571,0.479805,0.303583,0.370615,0.389041,0.336044,0.383148,0.465973,...,0.46293,0.341557,0.332893,0.378496,0.422687,0.256631,0.436462,0.406484,0.413442,0.400631
moo,0.629194,0.368385,0.25587,0.416297,0.334311,0.251678,0.448974,0.40812,0.199319,0.513805,...,0.46293,0.412599,0.368525,0.284382,0.437752,0.227899,0.458906,0.396532,0.359182,0.477957
nac,0.551992,0.398819,0.286815,0.343779,0.319154,0.395018,0.373129,0.360574,0.296851,0.390329,...,0.336626,0.401643,0.423617,0.385819,0.355646,0.387849,0.3895,0.388563,0.315709,0.353041
new,0.667569,0.567579,0.335528,0.530474,0.520653,0.554568,0.561581,0.521457,0.497821,0.50886,...,0.535917,0.552576,0.572394,0.566798,0.528815,0.6068,0.558541,0.495402,0.275742,0.477449
pbm,0.622484,0.550473,0.334689,0.42414,0.495053,0.528365,0.463841,0.488546,0.487231,0.505689,...,0.448904,0.528203,0.507423,0.533881,0.511144,0.328786,0.473934,0.482408,0.367774,0.476632
pbs,0.660418,0.614854,0.33712,0.548278,0.475022,0.59493,0.459847,0.55153,0.541601,0.591333,...,0.562277,0.569044,0.552945,0.574005,0.537025,0.589508,0.561601,0.537996,0.463491,0.548236
pbz,0.67336,0.553062,0.344971,0.51748,0.524025,0.550637,0.527375,0.548549,0.533954,0.513561,...,0.506441,0.548509,0.524762,0.58316,0.532376,0.417262,0.56104,0.48025,0.473946,0.475757
snd,0.518291,0.114074,0.228111,0.10817,0.191458,0.150805,0.209768,0.116347,0.097881,0.361535,...,0.273351,0.214281,0.282249,0.025534,0.223167,0.100708,0.150015,0.209306,0.270424,0.198423


### Count and sort ranks for $Q^{ar}$ and $Q^{war}$

In [109]:
ar_rank_count = pd.DataFrame(0, columns=range(1, len(technique_list) + 1), index=technique_list)

for column in ar_df:
    df = ar_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        ar_rank_count[rank + 1][tech] += 1

for column in war_df:
    df = ar_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        ar_rank_count[rank + 1][tech] += 1
    
ar_rank_count

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
app,54,6,2,2,0,0,0,0,0,0,0,0,0
git,0,0,0,0,2,0,0,4,18,4,18,14,4
hil,0,0,2,0,0,2,6,12,4,14,12,12,0
moo,0,2,0,4,0,2,2,2,14,18,8,10,2
nac,0,0,0,0,0,0,0,0,6,12,24,22,0
new,0,6,10,12,18,10,0,4,0,0,2,2,0
pbm,0,0,0,0,6,22,18,6,6,6,0,0,0
pbs,0,12,26,14,4,2,4,2,0,0,0,0,0
pbz,0,2,14,22,12,4,6,0,2,2,0,0,0
snd,0,0,0,0,0,0,0,0,0,2,0,4,58


In [110]:
def rank_row(row):
    value = 0
    for rank, n_times in enumerate(row):
        value += (rank + 1) * n_times
    return value
    
ar_rank_count['avg'] = ar_rank_count.apply(lambda row: rank_row(row.values), axis=1)
ar_rank_count.sort_values(by='avg', ascending=True)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,avg
app,54,6,2,2,0,0,0,0,0,0,0,0,0,80
sqr,10,34,6,2,12,0,0,0,0,0,0,0,0,164
pbs,0,12,26,14,4,2,4,2,0,0,0,0,0,234
pbz,0,2,14,22,12,4,6,0,2,2,0,0,0,298
new,0,6,10,12,18,10,0,4,0,0,2,2,0,318
spi,0,0,4,8,8,12,18,6,8,0,0,0,0,402
pbm,0,0,0,0,6,22,18,6,6,6,0,0,0,450
str,0,2,0,0,2,10,10,28,6,6,0,0,0,482
moo,0,2,0,4,0,2,2,2,14,18,8,10,2,602
hil,0,0,2,0,0,2,6,12,4,14,12,12,0,608


# Stability Metrics

Let $c_k(t_i)$ and $c_k(t_j)$ be two cells in two consecutive versions $T(t_i)$ and $T(t_j=t_{i+1})$ for the same node in a dynamic tree. We use for $\delta c_k$ the average sum of distances between the four closest corresponding corners of $c_k(t_i)$ and $c_k(t_j)$, normalized by the treemap diagonal $\sqrt{W^2+H^2}$, so $\delta \in [0,1]$.

We next define the \emph{data change} between nodes $n_k(t_i)$ and $n_k(t_j)$ as $\delta a_k = |a_k(t_i)-a_k(t_j)|$, where $a_k$ is the data attribute of $n_k$. If either of $n_k(t_i)$ or $n_k(t_j)$ does not exist, \emph{i.e.}, a node was created or deleted in versions $t_i$ or $t_j$, we set the respective attribute to zero. We normalize $\delta a_k$ by the maximal $a_k(t_i)$ for all nodes $n_k$ in all versions $t_j$ in a dataset, so $\delta a_k \in [0,1]$.


$Q^{RATIO}_k =  (1-\delta c_k) / (1 - \delta a_k)$

$Q^{WRATIO}_k =  a_k * Q^{RATIO}_k$


In [34]:
try:
    ratio_df = pd.read_csv('ratio.csv', index_col=0)
    wratio_df = pd.read_csv('wratio.csv', index_col=0)
    p_df = pd.read_csv('p.csv', index_col=0)
    wp_df = pd.read_csv('wp.csv', index_col=0)
    
except:
    ratio_df  = pd.DataFrame(columns=dataset_list, index=technique_list) # Ratio dataframe
    wratio_df = pd.DataFrame(columns=dataset_list, index=technique_list) # Weighted ratio dataframe
    p_df  = pd.DataFrame(columns=dataset_list, index=technique_list)     # Pearson correlation dataframe
    wp_df = pd.DataFrame(columns=dataset_list, index=technique_list)     # Weighted Pearson correlation dataframe
    
    for technique in technique_list:
        for dataset in dataset_list:
            history = Parser.parse_rectangles(technique, dataset)
            for i in range(1, len(history)):
                df = TemporalMetrics.delta_vis(history[i - 1], history[i])
                df = pd.merge(df, TemporalMetrics.delta_data_by_area(history[i - 1], history[i]))
                df = pd.merge(df, TemporalMetrics.relative_weight(history[i - 1], history[i]))

                # ratio metric
                ratio_df[dataset][technique]  = TemporalMetrics.q_ratio(df)['q_ratio'].mean()
                wratio_df[dataset][technique] = TemporalMetrics.q_weighted_ratio(df)['q_w_ratio'].mean()
                
                # pearson correlation
                p_df[dataset][technique]  = max(TemporalMetrics.pearson(df)[0], 0)
                wp_df[dataset][technique] = max(TemporalMetrics.weighted_pearson(df), 0)

    ratio_df.to_csv('ratio.csv')
    wratio_df.to_csv('wratio.csv')
    p_df.to_csv('p.csv')
    wp_df.to_csv('wp.csv')

  r = r_num / r_den
  weighted_correlation = xy_weighted_covariance / math.sqrt(xx_weighted_covariance * yy_weighted_covariance)


In [36]:
ratio_df

Unnamed: 0,animate.css,AudioKit,bdb,beets,brackets,caffe,calcuta,cpython,earthdata-search,emcee,...,OptiKey,osquery,PhysicsJS,pybuilder,scikitlearn,shellcheck,soundnode-app,spacemacs,standard,uws
app,0.953005,0.999616,0.999923,0.998439,1,0.991312,0.9522,0.997972,0.998051,1,...,1.0,0.999187,1,1.0,0.997451,0.979447,1,0.942215,1,0.970716
git,0.932883,0.999631,0.999934,0.999682,1,0.998684,0.973433,0.999748,0.999129,1,...,0.995266,0.998984,1,0.999927,0.999366,0.998542,1,0.977415,1,0.999328
hil,0.993526,0.999418,0.999931,0.999709,1,0.995722,0.955108,0.995303,0.999167,1,...,1.0,0.998953,1,1.0,0.999075,0.999201,1,0.962458,1,0.999021
moo,0.99234,0.999474,0.999928,0.999664,1,0.993395,0.963774,0.995833,0.998778,1,...,1.0,0.99867,1,1.0,0.999126,0.999305,1,0.95307,1,0.999135
nac,0.94113,0.999393,0.999937,0.999694,1,0.998675,0.964919,0.999818,0.998865,1,...,0.988114,0.999341,1,0.999917,0.999505,0.999186,1,0.977074,1,0.999083
new,0.916109,0.999587,0.999939,0.999177,1,0.997699,0.962751,0.996057,0.999132,1,...,0.972212,0.999056,1,0.999931,0.999348,0.999381,1,0.897226,1,0.999431
pbm,0.924394,0.637792,0.999936,0.999809,1,0.998357,0.972708,0.999816,0.99865,1,...,0.974949,0.999398,1,0.999957,0.994152,0.998634,1,0.964173,1,0.999045
pbs,0.990388,0.999485,0.999931,0.990486,1,0.992834,0.966799,0.988545,0.996741,1,...,1.0,0.998245,1,1.0,0.999335,0.999113,1,0.967561,1,0.975915
pbz,0.897265,0.999658,0.999934,0.999752,1,0.995984,0.950506,0.996813,0.996118,1,...,0.99054,0.998741,1,0.999912,0.999299,0.998896,1,0.955468,1,0.985857
snd,0.942571,0.999308,0.999946,0.999786,1,0.999013,0.975645,0.99978,0.999055,1,...,0.99579,0.999116,1,0.999969,0.999555,0.99904,1,0.970755,1,0.999211


In [118]:
stab_rank_count = pd.DataFrame(0, columns=range(1, len(technique_list) + 1), index=technique_list)

for column in ratio_df:
    df = ratio_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        stab_rank_count[rank + 1][tech] += 1

for column in wratio_df:
    df = wratio_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        stab_rank_count[rank + 1][tech] += 1
        
for column in p_df:
    df = p_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        stab_rank_count[rank + 1][tech] += 1

for column in wp_df:
    df = wp_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        stab_rank_count[rank + 1][tech] += 1
    
stab_rank_count

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
app,36,10,9,9,4,2,2,4,7,14,13,8,10
git,20,28,16,9,8,10,7,4,6,4,6,4,6
hil,14,14,25,8,6,8,5,14,8,5,9,7,5
moo,7,11,13,26,5,10,14,7,10,9,7,9,0
nac,5,19,15,8,29,15,6,8,5,8,7,1,2
new,3,5,6,7,12,26,11,14,12,15,8,3,6
pbm,6,7,7,12,15,13,29,7,3,11,2,7,9
pbs,4,3,10,16,6,9,10,25,11,9,9,6,10
pbz,7,4,5,5,5,7,13,13,35,4,10,14,6
snd,16,17,7,11,12,8,9,5,6,27,2,5,3


In [119]:
stab_rank_count['avg'] = stab_rank_count.apply(lambda row: rank_row(row.values), axis=1)
stab_rank_count.sort_values(by='avg', ascending=True)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,avg
git,20,28,16,9,8,10,7,4,6,4,6,4,6,627
nac,5,19,15,8,29,15,6,8,5,8,7,1,2,701
hil,14,14,25,8,6,8,5,14,8,5,9,7,5,744
app,36,10,9,9,4,2,2,4,7,14,13,8,10,769
snd,16,17,7,11,12,8,9,5,6,27,2,5,3,771
moo,7,11,13,26,5,10,14,7,10,9,7,9,0,776
pbm,6,7,7,12,15,13,29,7,3,11,2,7,9,861
new,3,5,6,7,12,26,11,14,12,15,8,3,6,924
pbs,4,3,10,16,6,9,10,25,11,9,9,6,10,948
pbz,7,4,5,5,5,7,13,13,35,4,10,14,6,1023


### The APP technique looks suspicious. Very low value compared to the others, and, eventhough I don't know much about it, I believe it is a very unstable technique. Maybe there is an implementation issue.

### Lets see for which datasets it ranks high.

In [126]:
for column in ratio_df:
    df = ratio_df.sort_values(by=column, ascending=False)[column]
    for rank, tech in enumerate(df.index.values):
        if rank == 0 and tech == 'app':
            print(df)
            print()

app    1
git    1
hil    1
moo    1
nac    1
new    1
pbm    1
pbs    1
pbz    1
snd    1
spi    1
sqr    1
str    1
Name: brackets, dtype: object

app    1
git    1
hil    1
moo    1
new    1
pbm    1
pbs    1
pbz    1
snd    1
spi    1
sqr    1
str    1
nac    1
Name: emcee, dtype: object

app           1
hil           1
moo           1
nac           1
new           1
pbm           1
pbs           1
pbz           1
snd           1
spi           1
sqr           1
str           1
git    0.998896
Name: fsharp, dtype: object

app    0.998354
git     0.99769
nac    0.997282
hil    0.997236
pbs    0.996703
sqr    0.996615
moo    0.996612
spi    0.996543
pbz    0.996472
pbm    0.996465
new    0.996027
snd    0.993764
str    0.977137
Name: iina, dtype: object

app    0.998829
snd     0.99805
git    0.997347
sqr    0.993673
pbm     0.99238
spi    0.992106
str    0.990784
new    0.988418
pbz    0.988253
nac    0.988002
pbs    0.986101
moo    0.980023
hil    0.977563
Name: Leaflet, dtype: objec

### It looks like the precision is not enough, whenever values get rounded to 1, it uses alphabetical order to rank indexes, putting APP at the top. I'll try to fix it next. 