In [192]:
!pip install PyNonpar

In [193]:
import numpy as np
import pandas as pd
from statistics import mean, stdev
from math import sqrt
from scipy import stats
from PyNonpar.twosample import wilcoxon_mann_whitney_test

# Data

In [194]:
# Naming: <Dataset>_data_<group>_<model>, where "sim"/"dissim" are the
# "Textually Similar"/"Textually Dissimilar" groups named in the Shapiro
# report further down. Each active sample holds 4 values.

# Eclipse, BM25
Eclipse_data_sim_BM25 = (1.12, 1.35, 1.57, 2.24)
Eclipse_data_dissim_BM25 = (2.57, 2.94, 4.04, 6.25)

# Firefox, BM25
Firefox_data_sim_BM25 = (11.57, 20.90, 26.68, 47.20)
Firefox_data_dissim_BM25 = (20.71, 34.68, 40.07, 58.09)

# Mobile, BM25
Mobile_data_sim_BM25 = (22.00, 44.00, 48.00, 78.00)
Mobile_data_dissim_BM25 = (15.73, 28.09, 35.96, 59.55)

# Disabled alternative: 9-value versions of the BM25 samples above. The active
# 4-value tuples look like a subset of these (first three values plus the
# maximum) -- TODO confirm which version the reported results should use.
# NOTE(review): the first Firefox "sim" value is 11.56 here but 11.57 above.
# Eclipse_data_sim_BM25 = (1.12,1.35,1.57,2.02,2.02,2.02,2.24,2.24,2.24)
# Eclipse_data_dissim_BM25 = (2.57,2.94,4.04,4.41,4.78,4.78,5.88,6.25,6.25)

# Firefox_data_sim_BM25 = (11.56,20.90,26.68,31.72,33.58,35.63,39.55,44.03,47.20)
# Firefox_data_dissim_BM25 = (20.71,34.68,40.07,45.83,47.55,48.90,53.06,55.88,58.09)

# Mobile_data_sim_BM25 = (22,44,48,58,60,64,70,78,78)
# Mobile_data_dissim_BM25 = (15.73,28.09,35.96,43.82,44.94,47.19,51.69,56.18,59.55)

# Eclipse, LDA+GloVe
Eclipse_data_sim_LDA = (0, 9.0, 13.5, 20.0)
Eclipse_data_dissim_LDA = (0, 5.5, 8.5, 15.5)

# Firefox, LDA+GloVe
Firefox_data_sim_LDA = (0, 3.0, 3.5, 6.0)
Firefox_data_dissim_LDA = (0, 6.0, 8.5, 14.5)

# Mobile, LDA+GloVe
Mobile_data_sim_LDA = (0, 2.46, 13.11, 22.95)
Mobile_data_dissim_LDA = (0, 4.58, 6.11, 13.74)

# Statistical Significance Test

#### Shapiro–Wilk normality test. Null hypothesis: the sample is drawn from a normal distribution.
#### If p-value ≤ 0.05: the null hypothesis is rejected (i.e. the variable is NOT normally distributed).
#### If p-value > 0.05: the null hypothesis cannot be rejected (i.e. the variable MAY BE normally distributed).

In [195]:
#Significance Test
def significance_test_shapiro(data_name, algo_name, data_sim, data_dissim):
    data = data_sim + data_dissim;
    
    shapiro_sim = stats.shapiro(data_sim)
    shapiro_dissim = stats.shapiro(data_dissim)
    shapiro_BM25 = stats.shapiro(data)
    
    dict_result = {'Data':[], 'Statistics':[], 'P-Value':[]}
    
    result = pd.DataFrame(dict_result)
    
    result.loc[len(result.index)] = ['Textually Similar', shapiro_sim.statistic, shapiro_sim.pvalue]
    result.loc[len(result.index)] = ['Textually Dissimilar', shapiro_dissim.statistic, shapiro_dissim.pvalue]
    result.loc[len(result.index)] = ['Textually Similar & Dissimilar', shapiro_BM25.statistic, shapiro_BM25.pvalue]
    
    result = result.style.set_caption(data_name+' '+algo_name+' : ')
    display(result)

In [196]:
# Normality check on the Eclipse samples, once per model.
significance_test_shapiro(
    data_name='Eclipse',
    algo_name='BM25',
    data_sim=Eclipse_data_sim_BM25,
    data_dissim=Eclipse_data_dissim_BM25,
)
significance_test_shapiro(
    data_name='Eclipse',
    algo_name='LDA+GloVe',
    data_sim=Eclipse_data_sim_LDA,
    data_dissim=Eclipse_data_dissim_LDA,
)

Unnamed: 0,Data,Statistics,P-Value
0,Textually Similar,0.929077,0.588985
1,Textually Dissimilar,0.894547,0.40448
2,Textually Similar & Dissimilar,0.876609,0.174729


Unnamed: 0,Data,Statistics,P-Value
0,Textually Similar,0.99236,0.96923
1,Textually Dissimilar,0.993376,0.974069
2,Textually Similar & Dissimilar,0.949097,0.702121


In [197]:
# Normality check on the Firefox samples, once per model.
significance_test_shapiro(
    data_name='Firefox',
    algo_name='BM25',
    data_sim=Firefox_data_sim_BM25,
    data_dissim=Firefox_data_dissim_BM25,
)
significance_test_shapiro(
    data_name='Firefox',
    algo_name='LDA+GloVe',
    data_sim=Firefox_data_sim_LDA,
    data_dissim=Firefox_data_dissim_LDA,
)

Unnamed: 0,Data,Statistics,P-Value
0,Textually Similar,0.948557,0.707116
1,Textually Dissimilar,0.985476,0.933301
2,Textually Similar & Dissimilar,0.969269,0.892212


Unnamed: 0,Data,Statistics,P-Value
0,Textually Similar,0.973019,0.860078
1,Textually Dissimilar,0.995554,0.983867
2,Textually Similar & Dissimilar,0.916692,0.403602


In [198]:
# Normality check on the Mobile samples, once per model.
significance_test_shapiro(
    data_name='Mobile',
    algo_name='BM25',
    data_sim=Mobile_data_sim_BM25,
    data_dissim=Mobile_data_dissim_BM25,
)
significance_test_shapiro(
    data_name='Mobile',
    algo_name='LDA+GloVe',
    data_sim=Mobile_data_sim_LDA,
    data_dissim=Mobile_data_dissim_LDA,
)

Unnamed: 0,Data,Statistics,P-Value
0,Textually Similar,0.962871,0.796961
1,Textually Dissimilar,0.965712,0.814776
2,Textually Similar & Dissimilar,0.966348,0.867909


Unnamed: 0,Data,Statistics,P-Value
0,Textually Similar,0.922441,0.550718
1,Textually Dissimilar,0.960688,0.783237
2,Textually Similar & Dissimilar,0.89261,0.247431


# Significance Test for Normally Distributed Data

#### Cohen suggested the following interpretation of the effect size d (absolute value):
#### d = 0.2: effect size is 'small',
#### d = 0.5: effect size is 'medium',
#### d = 0.8: effect size is 'large'.

In [199]:
#Test when Data is Normally Distributed
def significance_test_ttest_ind(data_name, algo_name, data_sim, data_dissim):
    t_value, p_value = stats.ttest_ind(data_sim, data_dissim)
    
    cohen_d = (mean(data_sim) - mean(data_dissim)) / (sqrt((stdev(data_sim) ** 2 + stdev(data_dissim) ** 2) / 2))

    dict_result = {'Test Statistic':[], 'P-Value':[], 'Cohens-d Value':[]}
    
    result = pd.DataFrame(dict_result)
    
    result.loc[len(result.index)] = [t_value, p_value, cohen_d]
    
    result = result.style.set_caption(data_name+' '+algo_name+' : ')
    display(result)

In [200]:
# Independent-samples t-test on the Eclipse samples, once per model.
significance_test_ttest_ind(
    data_name='Eclipse',
    algo_name='BM25',
    data_sim=Eclipse_data_sim_BM25,
    data_dissim=Eclipse_data_dissim_BM25,
)
significance_test_ttest_ind(
    data_name='Eclipse',
    algo_name='LDA+GloVe',
    data_sim=Eclipse_data_sim_LDA,
    data_dissim=Eclipse_data_dissim_LDA,
)

Unnamed: 0,Test Statistic,P-Value,Cohens-d Value
0,-2.760112,0.032849,-1.951694


Unnamed: 0,Test Statistic,P-Value,Cohens-d Value
0,0.613394,0.562132,0.433735


In [201]:
# Independent-samples t-test on the Firefox samples, once per model.
significance_test_ttest_ind(
    data_name='Firefox',
    algo_name='BM25',
    data_sim=Firefox_data_sim_BM25,
    data_dissim=Firefox_data_dissim_BM25,
)
significance_test_ttest_ind(
    data_name='Firefox',
    algo_name='LDA+GloVe',
    data_sim=Firefox_data_sim_LDA,
    data_dissim=Firefox_data_dissim_LDA,
)

Unnamed: 0,Test Statistic,P-Value,Cohens-d Value
0,-1.092462,0.316542,-0.772487


Unnamed: 0,Test Statistic,P-Value,Cohens-d Value
0,-1.270798,0.250847,-0.89859


In [202]:
# Independent-samples t-test on the Mobile samples, once per model.
significance_test_ttest_ind(
    data_name='Mobile',
    algo_name='BM25',
    data_sim=Mobile_data_sim_BM25,
    data_dissim=Mobile_data_dissim_BM25,
)
significance_test_ttest_ind(
    data_name='Mobile',
    algo_name='LDA+GloVe',
    data_sim=Mobile_data_sim_LDA,
    data_dissim=Mobile_data_dissim_LDA,
)

Unnamed: 0,Test Statistic,P-Value,Cohens-d Value
0,0.892056,0.406714,0.630779


Unnamed: 0,Test Statistic,P-Value,Cohens-d Value
0,0.587377,0.578369,0.415338


# Significance Test for Non-Gaussian Distribution

In [203]:
#Test when Data is not Normally Distributed
def significance_test_wilcoxon(data_name, algo_name, data_sim, data_dissim):
    t_value, p_value = stats.wilcoxon(data_sim, data_dissim)
    
    data_sim_list= list(data_sim)
    data_dissim_list = list(data_dissim)
    
    wilcoxon_result = PyNonpar.twosample.wilcoxon_mann_whitney_test(data_sim_list, data_dissim_list, method= "asymptotic")
   
    r_val = abs(wilcoxon_result.statistic)/sqrt(len(data_sim_list))
    
    dict_result = {'Test Statistic':[], 'P-Value':[], 'Effect Value':[]}
    
    result = pd.DataFrame(dict_result)
    
    result.loc[len(result.index)] = [t_value, p_value, r_val]
    
    result = result.style.set_caption(data_name+' '+algo_name+' : ')
    display(result)