### Analyzing the Variance within Trials

Objective: How goos is the statistical tests for measuring trial and version (original, fixed) variance?

In [43]:
import pandas as pd
import os



In [44]:
folder = "../../results"

out_folder = "../../results/trials"


In [45]:
df = pd.read_csv('../../results/raw-results.csv')

  interactivity=interactivity, compiler=compiler, result=result)


#### Statistical Methods

1. One-way ANOVA 
2. Wilcoxon Test
3. Moore's Median Test

In [55]:
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

ALPHA = 0.01

def split_list_equal(a_list):
    half = len(a_list) // 2
    #print('%d - %d' % (half, len(a_list)))
    return a_list[:half], a_list[-half:]


def analyze_anova(score):
    _, p = stats.f_oneway(*score)
    return p < ALPHA


def analyze_wilcoxon(score):
    try:
        x, y = split_list_equal(score) # split into two
        _, p = stats.wilcoxon(np.ravel(x), np.ravel(y)) 
        return p < ALPHA
    except ValueError as e:
        #print(e)
        pass
        
    return "WRONG"


def analyze_median(score):
    try:
        _, p, _, _ = stats.median_test(*score)
        return p < ALPHA
    
    except ValueError as e:
        print(e)
        
    return "WRONG"


def analyze_normality(score):
    _, p = stats.normaltest(score)
    return p < ALPHA

In [60]:
def analyze_versions(exp):
    versions_score = [t.Score.values for name, t in exp.groupby(by='Version')]
    
    # Tagged for the normality test (needs to be separated)
    dict_score = dict([(name, t.Score.values) for name, t in exp.groupby(by='Version')])
    
    exp['Normality (Original)'] = analyze_normality(dict_score['original'])
    exp['Normality (Fixed)'] = analyze_normality(dict_score['fixed_full'])
    exp['Version ANOVA'] = analyze_anova(versions_score)
    exp['Version Wilcoxon'] = analyze_wilcoxon(versions_score)
    exp['Version Median'] = analyze_median(versions_score)
    
    return exp


def analyze_trial(exp): 
    trials_score = [t.Score.values for name, t in exp.groupby(by='Trial')]
    
    exp['Trials ANOVA'] = analyze_anova(trials_score)
    exp['Trials Wilcoxon'] = analyze_wilcoxon(trials_score)
    exp['Trials Median'] = analyze_median(trials_score)
    
    return exp
    
    
df = df.groupby(by=['Project', 'Package', 'Class', 'Method', 'Experiment', 'Version']).apply(analyze_trial)
df = df.groupby(by=['Project', 'Package', 'Class', 'Method', 'Experiment']).apply(analyze_versions)

  r_plus = np.sum((d > 0) * r, axis=0)
  r_minus = np.sum((d < 0) * r, axis=0)
  f = msb / msw
  z = (T - mn - correction) / se
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


All values are below the grand median (0.0).
All values are below the grand median (0.0).
All values are below the grand median (0.0).
All values are below the grand median (0.0).
All values are below the grand median (0.0).
All values are below the grand median (0.0).


  "anyway, n=%i" % int(n))
  term2 = np.where(denom < 0, term1, np.power((1-2.0/A)/denom, 1/3.0))


All values are below the grand median (0.0).
All values are below the grand median (0.0).
All values are below the grand median (0.0).


In [61]:
# Just get the first row as a representative - 
out = df.groupby(by=['Project', 'Package', 'Class', 'Method', 'Experiment', 'Version']).apply(lambda x: x.iloc[0])
out.to_csv(os.path.join(folder, 'statistical_tests.csv'))


PermissionError: [Errno 13] Permission denied: '../../results\\statistical_tests.csv'

### How many cases had significant differences within Trials?

In [62]:
total = len(out)

anova_t = out.loc[out['Trials ANOVA'] == True]
print('ANOVA %d our of %d scenarios' % (len(anova_t), total))

wilcoxon_t = out.loc[out['Trials Wilcoxon'] == True]
print('Wilcoxon %d our of %d scenarios' % (len(wilcoxon_t), total))

median_t = out.loc[out['Trials Median'] == True]
print('Median %d our of %d scenarios' % (len(median_t), total))


ANOVA 108 our of 414 scenarios
Wilcoxon 126 our of 414 scenarios
Median 169 our of 414 scenarios


### How many cases had reported significant difference between Versions?

In [63]:
anova_v = out[out['Version ANOVA'] == True]
print('ANOVA %d our of %d scenarios' % (len(anova_v), total))

wilcoxon_v = out[out['Version Wilcoxon'] == True]
print('Wilcoxon %d our of %d scenarios' % (len(wilcoxom_v), total))

median_v = out[out['Version Median'] == True]
print('Median %d our of %d scenarios' % (len(median_v), total))

ANOVA 142 our of 414 scenarios
Wilcoxon 294 our of 414 scenarios
Median 194 our of 414 scenarios


### How many cases had significant differences on both Trials AND Versions (BADCASE)



In [64]:
anova = pd.merge(anova_t, anova_v, how='inner')
print('ANOVA %d our of %d scenarios' % (len(anova), total))

wilcoxon = pd.merge(wilcoxon_t, wilcoxon_v, how='inner')
print('Wilcoxon %d our of %d scenarios' % (len(wilcoxon), total))

median = pd.merge(median_t, median_v, how='inner')
print('Median %d our of %d scenarios' % (len(median), total))

ANOVA 63 our of 414 scenarios
Wilcoxon 80 our of 414 scenarios
Median 89 our of 414 scenarios


### How many cases had no significantr differences only on Versions (GOOD CASE)

In [65]:
def remove_intersection(df1, df2):
    return df1.loc[df1.index.difference(df2.index)]

anova_only_v = remove_intersection(anova_v, anova_t)
print('ANOVA %d our of %d scenarios' % (len(anova_only_v), total))

wilcoxon_only_v = remove_intersection(wilcoxon_v, wilcoxon_t)
print('Wilcoxon %d our of %d scenarios' % (len(wilcoxon_only_v), total))

media_only_v = remove_intersection(median_t, median_v)
print('Median %d our of %d scenarios' % (len(media_only_v), total))

ANOVA 79 our of 414 scenarios
Wilcoxon 190 our of 414 scenarios
Median 80 our of 414 scenarios


In [57]:
### Generating a Report

In [None]:
def generate_report(df):
    
    diff = len(df[df['Factor_normalized'] > 0.0])
    evaluated = len(df)
    return pd.DataFrame({
        'Evaluated': evaluated,
        'Impact': diff,
        '%': '%.2f' % (diff / evaluated * 100)
    }, index=df.name)
    

report = df.groupby(by=['Project','Experiment'], squeeze=True).apply(generate_report)

### The most impacted benchmarks

In [75]:
# 1. OKIO - indexOfByte benchmark

#df.iloc[['']]

out['IndexOfElementBenchmark']


KeyError: 'IndexOfElementBenchmark'