# # Statistical tests

Apply statistical tests (Student's t-test and the Wilcoxon signed-rank test) to k-NN and decision tree classifiers on the Iris data with 10-fold cross-validation.
In which cases are there statistically significant differences between the observed accuracies?

In [25]:
from scipy import stats
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

In [26]:
#perform a t-test for Decision Tree and KNeighborsClassifier
def t_test_DT_NN(dataset, n_neighbors = 7, weights = 'distance', min_samples_leaf=1, criteria = 'gini', log=True, n_splits=10, alpha=0.05):
    """Compare a decision tree and a k-NN classifier with a paired t-test.

    Both classifiers are fitted and scored on the same ``KFold`` splits; the
    two lists of per-fold accuracies are then passed to
    ``scipy.stats.ttest_rel``.  The p-value and the resulting verdict are
    always printed.  Nothing is returned.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes
        Both attributes must support indexing with the index arrays yielded
        by ``KFold.split`` (e.g. the object returned by ``load_iris()``).
    n_neighbors, weights
        Forwarded unchanged to ``KNeighborsClassifier``.
    min_samples_leaf
        Forwarded unchanged to ``DecisionTreeClassifier``.
    criteria
        Forwarded to ``DecisionTreeClassifier`` as ``criterion``.
    log : bool
        When true, additionally print the accuracies of every fold and the
        mean / standard deviation of the per-fold differences.
    n_splits : int
        Number of cross-validation folds.
    alpha : float
        Significance level the p-value is compared against.
    """
    clf_decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criteria)
    clf_neigh = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)

    # NOTE(review): KFold is used without shuffling, so folds follow the row
    # order of the dataset -- confirm this is intended for class-sorted data.
    kf = KFold(n_splits=n_splits)
    accuracy_DT = []
    accuracy_NN = []
    for fold, (train, test) in enumerate(kf.split(dataset.data, dataset.target), 1):
        clf_decision_tree.fit(dataset.data[train], dataset.target[train])
        accuracy_decision_tree = accuracy_score(dataset.target[test], clf_decision_tree.predict(dataset.data[test]))
        clf_neigh.fit(dataset.data[train], dataset.target[train])
        accuracy_neigh = accuracy_score(dataset.target[test], clf_neigh.predict(dataset.data[test]))
        if log:
            print("\nFold: %s - Accuracy: (Decision Tree Classifier): %s" % (fold, accuracy_decision_tree))
            print("Fold: %s - Accuracy: (Decision KNeighborsClassifier): %s" % (fold, accuracy_neigh))
            print("DT-NN: %s" % (accuracy_decision_tree - accuracy_neigh))
        accuracy_DT.append(accuracy_decision_tree)
        accuracy_NN.append(accuracy_neigh)

    if log:
        # Per-fold difference DT - NN, only needed for the summary lines.
        accuracy_difference = np.subtract(accuracy_DT, accuracy_NN)
        print("\nAVG: %s" % np.mean(accuracy_difference))
        print("\nStdev: %s" % np.std(accuracy_difference))

    # Null hypothesis rejected if the p-value is smaller than the significance level
    statistic, p_value = stats.ttest_rel(accuracy_DT, accuracy_NN)
    significant = p_value < alpha
    print("\np-value: %s < %s == %s" % (p_value, alpha, str(significant)))
    # The trailing space lives in the inserted word so the "rejected" variant
    # does not end up with doubled spaces.
    negation = "" if significant else "not "
    print("\nNull hypothesis {0}rejected, the difference between the two algorithms is {0}found significant.".format(negation))

In [27]:
# Load the Iris dataset once; the cells below reuse `iris`.
iris = load_iris()
# Paired t-test with 7 neighbours; per-fold logging stays at its default (on).
t_test_DT_NN(dataset=iris, n_neighbors=7)


Fold: 1 - Accuracy: (Decision Tree Classifier): 1.0
Fold: 1 - Accuracy: (Decision KNeighborsClassifier): 1.0
DT-NN: 0.0

Fold: 2 - Accuracy: (Decision Tree Classifier): 1.0
Fold: 2 - Accuracy: (Decision KNeighborsClassifier): 1.0
DT-NN: 0.0

Fold: 3 - Accuracy: (Decision Tree Classifier): 1.0
Fold: 3 - Accuracy: (Decision KNeighborsClassifier): 1.0
DT-NN: 0.0

Fold: 4 - Accuracy: (Decision Tree Classifier): 0.933333333333
Fold: 4 - Accuracy: (Decision KNeighborsClassifier): 1.0
DT-NN: -0.0666666666667

Fold: 5 - Accuracy: (Decision Tree Classifier): 0.933333333333
Fold: 5 - Accuracy: (Decision KNeighborsClassifier): 0.866666666667
DT-NN: 0.0666666666667

Fold: 6 - Accuracy: (Decision Tree Classifier): 0.866666666667
Fold: 6 - Accuracy: (Decision KNeighborsClassifier): 0.866666666667
DT-NN: 0.0

Fold: 7 - Accuracy: (Decision Tree Classifier): 1.0
Fold: 7 - Accuracy: (Decision KNeighborsClassifier): 1.0
DT-NN: 0.0

Fold: 8 - Accuracy: (Decision Tree Classifier): 0.866666666667
Fold: 8 -

In [28]:
# Different values of K (1-10): repeat the paired t-test for each neighbourhood
# size, printing only the p-value and the verdict.
for k in range(1, 11):
    print("\n\nFor K = {0}".format(k))
    t_test_DT_NN(iris, log=False, n_neighbors=k)



For K = 1

p-value: 0.343436396138 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 2

p-value: 0.212579892323 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 3

p-value: 0.678309741806 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 4

p-value: 0.555445442106 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 5

p-value: 0.678309741806 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 6

p-value: 1.0 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 7

p-value: 0.343436396138 < 0.05 == False

Null hypothesis not rejected, the difference 

In [29]:
import random 

def get_ramdom_params(rng=None):
    """Draw one random hyper-parameter combination for the two classifiers.

    Parameters
    ----------
    rng : object with a ``randint(a, b)`` method, optional
        Source of randomness, e.g. ``random.Random(seed)`` for a reproducible
        draw.  When omitted, the module-level ``random`` generator is used.

    Returns
    -------
    tuple
        ``(k, weights, min_samples_leaf, criteria)`` where ``k`` and
        ``min_samples_leaf`` are integers in ``[1, 20]``, ``weights`` is
        ``'uniform'`` or ``'distance'`` and ``criteria`` is ``'gini'`` or
        ``'entropy'``.
    """
    if rng is None:
        rng = random
    # Draw order is fixed (criteria, weights, k, min_samples_leaf) so that a
    # given seed always yields the same combination.  A drawn 1 selects the
    # second entry of each pair.
    criteria = ('entropy', 'gini')[rng.randint(0, 1)]
    weights = ('distance', 'uniform')[rng.randint(0, 1)]
    k = rng.randint(1, 20)
    min_samples_leaf = rng.randint(1, 20)
    return k, weights, min_samples_leaf, criteria

In [30]:
# Ten paired t-tests, each with a randomly drawn configuration: k, "distance"-weighted
# or uniform voting, minimum samples per leaf and split criterion.
for trial in range(1, 11):
    params = get_ramdom_params()
    k, weights, min_samples_leaf, criteria = params
    print("\n\nFor K = %s | Weights: %s | min_samples_leaf = %s | criteria = %s" % params)
    t_test_DT_NN(
        iris,
        n_neighbors=k,
        weights=weights,
        min_samples_leaf=min_samples_leaf,
        criteria=criteria,
        log=False,
    )



For K = 7 | Weights: distance | min_samples_leaf = 10 | criteria = gini

p-value: 0.193422059603 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 12 | Weights: uniform | min_samples_leaf = 10 | criteria = entropy

p-value: 1.0 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 13 | Weights: uniform | min_samples_leaf = 16 | criteria = gini

p-value: 0.678309741806 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 7 | Weights: distance | min_samples_leaf = 12 | criteria = entropy

p-value: 0.193422059603 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 5 | Weights: uniform | min_samples_leaf = 1 | criteria = gini

p-value: 0.678309741806 < 0.05 == False

Null hypothesis not rejected, the d

In [31]:
#perform a Wilcoxon's for Decision Tree and KNeighborsClassifier
def wilcoxon_test_DT_NN(dataset, n_neighbors = 7, weights = 'distance', min_samples_leaf=1, criteria = 'gini', log=True, n_splits=10, alpha=0.05):
    """Compare a decision tree and a k-NN classifier with Wilcoxon's signed-rank test.

    Both classifiers are fitted and scored on the same ``KFold`` splits; the
    two lists of per-fold accuracies are then passed to
    ``scipy.stats.wilcoxon``.  The p-value and the resulting verdict are
    always printed.  Nothing is returned.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes
        Both attributes must support indexing with the index arrays yielded
        by ``KFold.split`` (e.g. the object returned by ``load_iris()``).
    n_neighbors, weights
        Forwarded unchanged to ``KNeighborsClassifier``.  ``weights``
        defaults to ``'distance'``; the string ``'weights'`` is not a
        weighting scheme the classifier knows.
    min_samples_leaf
        Forwarded unchanged to ``DecisionTreeClassifier``.
    criteria
        Forwarded to ``DecisionTreeClassifier`` as ``criterion``.
    log : bool
        When true, additionally print the accuracies of every fold and the
        mean / standard deviation of the per-fold differences.
    n_splits : int
        Number of cross-validation folds.
    alpha : float
        Significance level the p-value is compared against.
    """
    clf_decision_tree = tree.DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criteria)
    clf_neigh = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)

    # NOTE(review): KFold is used without shuffling, so folds follow the row
    # order of the dataset -- confirm this is intended for class-sorted data.
    kf = KFold(n_splits=n_splits)
    accuracy_difference = []
    accuracy_DT = []
    accuracy_NN = []
    for fold, (train, test) in enumerate(kf.split(dataset.data, dataset.target), 1):
        clf_decision_tree.fit(dataset.data[train], dataset.target[train])
        accuracy_decision_tree = accuracy_score(dataset.target[test], clf_decision_tree.predict(dataset.data[test]))
        clf_neigh.fit(dataset.data[train], dataset.target[train])
        accuracy_neigh = accuracy_score(dataset.target[test], clf_neigh.predict(dataset.data[test]))
        if log:
            print("\nFold: %s - Accuracy: (Decision Tree Classifier): %s" % (fold, accuracy_decision_tree))
            print("Fold: %s - Accuracy: (Decision KNeighborsClassifier): %s" % (fold, accuracy_neigh))
            print("DT-NN: %s" % (accuracy_decision_tree - accuracy_neigh))
        accuracy_difference.append(accuracy_decision_tree - accuracy_neigh)
        accuracy_DT.append(accuracy_decision_tree)
        accuracy_NN.append(accuracy_neigh)

    if log:
        print("\nAVG: %s" % np.mean(accuracy_difference))
        print("\nStdev: %s" % np.std(accuracy_difference))

    # Null hypothesis rejected if the p-value is smaller than the significance level
    if np.any(accuracy_difference):
        statistic, p_value = stats.wilcoxon(accuracy_DT, accuracy_NN)
    else:
        # Every fold gave identical accuracies, so there is no non-zero
        # difference to rank and the test is degenerate.
        # NOTE(review): SciPy's handling of this case appears to vary between
        # releases (error vs. NaN) -- it is reported here as an undefined
        # p-value, which falls through to "not rejected".
        p_value = float('nan')
    significant = p_value < alpha
    print("\np-value: %s < %s == %s" % (p_value, alpha, str(significant)))
    # The trailing space lives in the inserted word so the "rejected" variant
    # does not end up with doubled spaces.
    negation = "" if significant else "not "
    print("\nNull hypothesis {0}rejected, the difference between the two algorithms is {0}found significant.".format(negation))

In [32]:
# Ten Wilcoxon signed-rank tests, each with a randomly drawn configuration: k,
# "distance"-weighted or uniform voting, minimum samples per leaf and split criterion.
for run in range(1, 11):
    k, weights, min_samples_leaf, criteria = get_ramdom_params()
    header = "\n\nFor K = %s | Weights: %s | min_samples_leaf = %s | criteria = %s"
    print(header % (k, weights, min_samples_leaf, criteria))
    settings = dict(n_neighbors=k, weights=weights, min_samples_leaf=min_samples_leaf, criteria=criteria)
    wilcoxon_test_DT_NN(iris, log=False, **settings)



For K = 7 | Weights: uniform | min_samples_leaf = 10 | criteria = gini

p-value: 1.0 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 13 | Weights: uniform | min_samples_leaf = 20 | criteria = entropy

p-value: 0.654720846019 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 16 | Weights: distance | min_samples_leaf = 18 | criteria = gini

p-value: 0.563702861651 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 3 | Weights: distance | min_samples_leaf = 5 | criteria = entropy

p-value: 0.654720846019 < 0.05 == False

Null hypothesis not rejected, the difference between the two algorithms is not found significant.


For K = 4 | Weights: distance | min_samples_leaf = 14 | criteria = entropy

p-value: 0.256839257958 < 0.05 == False

Null hypothesis not rejected, t