# Hypothesis Test Demo

## Some Package Import and Data Preparation

In [7]:
from numpy import *
from scipy import stats

# data 1 is the performance score of 10-fold cv of model 1
random.seed(1)
data1 = [float('{:.4f}'.format(i)) for i in random.rand(10)]
# data 2 is the performance score of 10-fold cv of model 2
random.seed(2)
data2 = [float('{:.4f}'.format(i)) for i in random.rand(10)+0.4]
# data 3 is the performance score of 10-fold cv of model 3
random.seed(3)
data3 = [float('{:.4f}'.format(i)) for i in random.rand(10)+0.03]

# Print the data
print('Performance of model 1:',data1)
print('Performance of model 2:',data2)
print('Performance of model 3: ',data3)

Performance of model 1: [0.417, 0.7203, 0.0001, 0.3023, 0.1468, 0.0923, 0.1863, 0.3456, 0.3968, 0.5388]
Performance of model 2: [0.836, 0.4259, 0.9497, 0.8353, 0.8204, 0.7303, 0.6046, 1.0193, 0.6997, 0.6668]
Performance of model 3:  [0.5808, 0.7381, 0.3209, 0.5408, 0.9229, 0.9263, 0.1556, 0.2372, 0.0815, 0.4708]


## Mean and Confidence interval

In [3]:
# Set significance level
alpha = 0.05

# Mean of model 1
mean1 = mean(data1)

# Confidence interval of model 1
lower = percentile(data1, alpha/2)
upper = percentile(data1, alpha/2)
print('The model has a mean of %f with a %d%% confidence interval: [%.4f, %.4f]' % (mean1,(1-alpha)*100,lower, upper))

The model has a mean of 0.314630 with a 95% confidence interval: [0.0003, 0.0003]


## Student's t-Test for One-Sample

In [4]:
# Calculate the T-test for the mean of ONE group of scores. Let' have model 1 for example.

# Set significance level
alpha = 0.05

# Test if mean of model 1 is equal to mean1.
t,p = stats.ttest_1samp(data1, mean1)
print('t statistic value is %.4f.\nP-value is %.4f.'%(t,p))

# Conclude
if p > alpha:
    print('Probably the average performance of model 1 is %.4f at a %d%% confidence level.' %(mean1,(1-alpha)*100))
else:
    print('Probably the average performance of model 1 is not %.4f at a %d%% confidence level.' %(mean1,(1-alpha)*100))

t statistic value is 0.0000.
P-value is 1.0000.
Probably the average performance of model 1 is 0.3146 at a 95% confidence level.


## Paired Student's t-Test

Approach 1

In [5]:
# Set significance level
alpha = 0.05

# Calculate the difference of model 1 and model 2
diff_clf = [a-b for a,b in zip(data1,data2)]

# Test if the mean of difference is equal to 0.
t,p = stats.ttest_1samp(diff_clf, 0.0)
print('t statistic value is %.4f.\nP-value is %.4f.'%(t,p))

# Conclude
if p > 0.05:
    print('Probably the same distribution at a %d%% confidence level.' %((1-alpha)*100))
else:
    print('Probably different distributions at %d%% confidence level.' %((1-alpha)*100))

t statistic value is -4.0680.
P-value is 0.0028.
Probably different distributions at 95% confidence level.


Approach 2

In [7]:
# This is a two-sided test for the null hypothesis that 2 related samples have identical average values. Let's have model 1 and 2 for example

# Set significance level
alpha = 0.05

# Test if the distributions are the same.
t,p = stats.ttest_rel(data1,data2)
print('t statistic value is %.4f.\nP-value is %.4f.'%(t,p))

# Conclude
if p > alpha:
    print('Probably the same distribution at a %d%% confidence level.' %((1-alpha)*100))
else:
    print('Probably different distributions at a %d%% confidence level.' %((1-alpha)*100))

t statistic value is -4.0680.
P-value is 0.0028.
Probably different distributions at a 95% confidence level.


In [8]:
# Set significance level
alpha = 0.05

# Another example for model 1 and 3.
t,p = stats.ttest_rel(data1,data3)
print('t statistic value is %.4f.\nP-value is %.4f.'%(t,p))

# Conclude
if p > alpha:
    print('Probably the same distribution at a %d%% confidence level.' %((1-alpha)*100))
else:
    print('Probably different distributions at a %d%% confidence level.' %((1-alpha)*100))

t statistic value is -1.5411.
P-value is 0.1577.
Probably the same distribution at a 95% confidence level.


## Wilcoxon Signed-rankTest

In [9]:
# The Wilcoxon signed-rank test tests the null hypothesis that two related paired samples come from the same distribution. In particular, it tests whether the distribution of the differences x - y is symmetric about zero. It is a non-parametric version of the paired T-test.

# Set significance level
alpha = 0.05

# Test if the distributions are the same.
t,p = stats.wilcoxon(data1,data2)
print('t statistic value is %.4f.\nP-value is %.4f.'%(t,p))

# Conclude
if p > alpha:
    print('Probably the same distribution at a %d%% confidence level.' %((1-alpha)*100))
else:
    print('Probably different distributions at a %d%% confidence level.' %((1-alpha)*100))

t statistic value is 2.0000.
P-value is 0.0093.
Probably different distributions at a 95% confidence level.


## McNemar’s Test

In [10]:
# Example of implementing the mcnemar test

from statsmodels.stats.contingency_tables import mcnemar

# Set significance level
alpha = 0.05

# Define contingency table
table = [[100, 12],
         [1, 30]]

# Implement mcnemar test
result = mcnemar(table, exact=True)
print('statistic=%.3f, p-value=%.3f.' % (result.statistic, result.pvalue))

# Conclude
if p > alpha:
	print('Same proportions of errors (fail to reject H0) at a %d%% confidence level.' %((1-alpha)*100))
else:
	print('Different proportions of errors (reject H0) at a %d%% confidence level.' %((1-alpha)*100))

statistic=1.000, p-value=0.003.
Different proportions of errors (reject H0) at a 95% confidence level.


## Friedman Test and Nemenyi Test

Friedman test

In [11]:
# The Friedman test tests the null hypothesis that the samples have the same distribution. 

# Set significance level
alpha = 0.05

# Test if the distributions are the same.
t,p = stats.friedmanchisquare(data1,data2,data3)
print('t statistic value is %.4f\nP-value is %.4f'%(t,p))

# Conclude
if p > alpha:
    print('Probably the same distribution at a %d%% confidence level.' %((1-alpha)*100))
else:
    print('Probably different distributions at a %d%% confidence level.' %((1-alpha)*100))

t statistic value is 6.2000
P-value is 0.0450
Probably different distributions at a 95% confidence level.


The post-hoc test: Nemenyi test

In [70]:
# The post-hoc test is deploed after we reject the null hypothesis of Friedman test. Here, we adopt Nemenyi test as an example.

# Set significance level
alpha = 0.05
# Set the number of models to be compared
k =3

# Obtain the value of q_alpha from 
import rpy2.robjects as robjects
r = robjects.r
q_alpha = r['qtukey'](1-alpha,k,float('inf'))/sqrt(2)  #qtukey(0.95,3,Inf)/sqrt(2) in R language
print('Critical value of q_alpha is: ',q_alpha)

# Calculate critical difference
CD = q_alpha*sqrt(k*(k+1)/(6*N))
print('Critical difference(CD) is: ',CD)

# Rank data
data = vstack((data1,data2,data3)).T
data = data.astype(float)
for i in range(len(data)):
    data[i] = stats.rankdata(data[i])
# print(data)
                  
# Compute the average rank and the difference between the average of the rank
avgrank = []
for i in range(len(data[0])):
   avgrank.append(mean(data[:,i]))
print('The average rank of each model is:', avgrank)
diff = []
for i in range(k):
    if i == 0:
        diff = abs(avgrank - avgrank[i])
    else: 
        diff = vstack((diff,abs(avgrank - avgrank[i])))
print('The difference table is:\n',diff)

# Conclude
for i in range(k):
    for j in range(i+1,k):
        if diff[i,j] > CD:
            print('The performance of model %d and %d is different at %.2f significance level.\n' %(i+1,j+1,alpha))
        else:
            print('The performance of model %d and %d is the same at %.2f significance level.\n' %(i+1,j+1,alpha))   

Critical value of q_alpha is:  [2.34370059]
Critical difference(CD) is:  [1.04813477]
The average rank of each model is: [1.5, 2.6, 1.9]
The difference table is:
 [[0.  1.1 0.4]
 [1.1 0.  0.7]
 [0.4 0.7 0. ]]
The performance of model 1 and 2 is different at 0.05 significance level.

The performance of model 1 and 3 is the same at 0.05 significance level.

The performance of model 2 and 3 is the same at 0.05 significance level.

