In [1]:
import numpy as np
import pandas as pd
import itertools

from scipy import stats
from statsmodels.stats.descriptivestats import sign_test
from statsmodels.stats.weightstats import zconfint
from statsmodels.stats.weightstats import *

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Sample of 10 observations; m0 is the reference value subtracted from the
# sample before the Wilcoxon signed-rank test in the next cell.
data = np.array([49,58,75,110,112,132,151,276,281,362])
m0 = 200

In [4]:
# One-sample Wilcoxon signed-rank test on the shifted sample (data - m0),
# requesting the normal approximation (mode='approx'); the result
# (statistic, p-value) is rounded to 4 decimals for display.
np.round(stats.wilcoxon(data - m0, mode='approx'), 4)

array([17.    ,  0.2845])

In [5]:
# Two independent samples (12 and 9 observations) compared with the
# Mann-Whitney U test in the next cell.
data1 = np.array([22,22,15,13,19,19,18,20,21,13,13,15,])
data2 = np.array([17,18,18,15,12,4,14,15,10])

In [6]:
# One-sided Mann-Whitney U test: alternative="greater" puts data1 on the
# "greater" side of the comparison against data2.
stats.mannwhitneyu(data1, data2, alternative="greater")

MannwhitneyuResult(statistic=81.0, pvalue=0.02900499272087373)

In [7]:
# Recompute the one-sided Mann-Whitney p-value from data1/data2 rather than
# hard-coding it, so this cell stays in sync with the data; rounded to 4 decimals.
round(stats.mannwhitneyu(data1, data2, alternative="greater").pvalue, 4)

0.029

In [8]:
# Load the tab-separated Challenger data set; the following cells use its
# `Incident` and `Temperature` columns.
data3 = pd.read_csv("data/challenger.txt", sep="\t")

In [9]:
# Temperatures of the rows where Incident == 1, re-indexed from 0.
i1 = data3.loc[data3["Incident"] == 1, "Temperature"].reset_index(drop=True)

In [10]:
# Temperatures of the rows where Incident == 0, re-indexed from 0.
i0 = data3.loc[data3["Incident"] == 0, "Temperature"].reset_index(drop=True)

In [79]:
def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        Observations to resample. May be a pandas Series, a NumPy array or a
        plain sequence; it is converted with ``np.asarray`` so no ``.values``
        attribute is required.
    n_samples : int
        Number of bootstrap resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array indexed by an ``(n_samples, len(data))`` grid of random
        positions; each row is one resample of the same size as ``data``.

    Notes
    -----
    Indices come from NumPy's global RNG (``np.random.randint``), so seed it
    with ``np.random.seed`` beforehand for reproducible results.
    """
    values = np.asarray(data)
    size = len(values)
    # Every row holds `size` positions drawn uniformly from [0, size).
    indices = np.random.randint(0, size, (n_samples, size))
    return values[indices]
def stat_intervals(stat, alpha):
    """Return the two-sided percentile interval of ``stat`` at level ``alpha``.

    The bounds are the ``100 * alpha / 2`` and ``100 * (1 - alpha / 2)``
    percentiles of ``stat``, returned as a two-element array (lower, upper).
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [80]:
# Seed NumPy's global RNG explicitly (np.random) so reproducibility does not
# depend on what the bare name `random` happens to be bound to in the namespace.
np.random.seed(0)

# 1000 bootstrap resamples of each temperature group (Incident == 0 / Incident == 1).
i0_b = get_bootstrap_samples(i0, 1000)
i1_b = get_bootstrap_samples(i1, 1000)

In [81]:
# Percentile bootstrap interval (alpha = 0.05) for the difference of group means,
# i0 - i1, computed row-wise over the bootstrap resamples; rounded to 4 decimals.
np.round(stat_intervals(i0_b.mean(axis=1) - i1_b.mean(axis=1), .05), 4)

array([1.423 , 7.9386])

In [71]:
# z-based confidence interval (statsmodels `zconfint`) computed over the
# per-resample differences of bootstrap means.
# NOTE(review): the difference here is i1 - i0, the opposite sign of the
# percentile-interval cell (i0 - i1).
# NOTE(review): this looks like an interval for the mean of the bootstrap
# distribution rather than for the difference itself, which would explain why
# it is much narrower than the percentile interval — confirm intent.
# NOTE(review): the execution count (71) predates the bootstrap cells (79-81),
# so the recorded output may be stale.
np.round(zconfint(i1_b.mean(axis=1) - i0_b.mean(axis=1)), 4)

array([-4.7392, -4.5263])

In [50]:
def permutation_t_stat_ind(sample1, sample2):
    """Test statistic for two independent samples: mean(sample1) - mean(sample2)."""
    first_mean = np.mean(sample1)
    second_mean = np.mean(sample2)
    return first_mean - second_mean

def get_random_combinations(n1, n2, max_combinations):
    """Generate random splits of ``range(n1 + n2)`` into two index groups.

    Starts from the identity ordering and adds ``max_combinations - 1`` random
    shuffles (via ``np.random.shuffle``). Orderings are kept in a set, so
    repeated shuffles collapse and fewer than ``max_combinations`` splits may
    be returned.

    Returns a list of ``(first, second)`` tuples, where ``first`` holds the
    leading ``n1`` positions of an ordering and ``second`` the remaining ones.
    """
    order = list(range(n1 + n2))
    seen = {tuple(order)}  # the unshuffled ordering is always included
    remaining = max_combinations - 1
    while remaining > 0:
        np.random.shuffle(order)
        seen.add(tuple(order))
        remaining -= 1
    return [(perm[:n1], perm[n1:]) for perm in seen]

def permutation_zero_dist_ind(sample1, sample2, max_combinations=None):
    """Build the permutation null distribution of the difference of means.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples; they are pooled with ``np.hstack``.
    max_combinations : int or None
        If truthy, the pooled sample is split using random index splits from
        ``get_random_combinations``. If ``None`` (or ``0``), every way of
        choosing ``len(sample1)`` positions out of the pooled sample is
        enumerated.

    Returns
    -------
    list
        For each split, mean of the first group minus mean of the second group.
    """
    joined_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n = len(joined_sample)
    if max_combinations:
        indices = get_random_combinations(n1, len(sample2), max_combinations)
    else:
        positions = range(n)
        indices = []
        for first in itertools.combinations(positions, n1):
            chosen = set(first)
            # Build the complement eagerly: a lazy filter/lambda would only be
            # evaluated later, after the loop variable has moved on, and would
            # pair every split with the complement of the last combination.
            rest = [pos for pos in positions if pos not in chosen]
            indices.append((list(first), rest))
    distr = [
        joined_sample[list(first)].mean() - joined_sample[list(second)].mean()
        for first, second in indices
    ]
    return distr

def permutation_test(sample, mean, max_permutation=None, alternative="two-sided"):
    """Permutation test p-value for the difference of means of two samples.

    Parameters
    ----------
    sample : array-like
        First sample.
    mean : array-like
        Second sample (despite its name, it is passed on as the second sample
        to both the statistic and the null-distribution helpers).
    max_permutation : int or None
        Forwarded to ``permutation_zero_dist_ind`` as its ``max_combinations``.
    alternative : {"two-sided", "less", "greater"}
        Which tail(s) of the null distribution count as extreme.

    Returns
    -------
    float
        Fraction of null-distribution values at least as extreme as the
        observed statistic.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognized values.
    """
    if alternative not in ("two-sided", "less", "greater"):
        raise ValueError("alternative not recognized\nshould be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample, mean)
    null_values = permutation_zero_dist_ind(sample, mean, max_permutation)

    if alternative == 'two-sided':
        threshold = abs(observed)
        extreme = sum(1 for value in null_values if abs(value) >= threshold)
    elif alternative == 'less':
        extreme = sum(1 for value in null_values if value <= observed)
    else:  # 'greater'
        extreme = sum(1 for value in null_values if value >= observed)
    return extreme / len(null_values)

In [55]:
# Seed NumPy's global RNG explicitly (np.random): the random splits are drawn
# with np.random.shuffle, and the bare name `random` may be bound to something else.
np.random.seed(0)
permutation_test(i0, i1, max_permutation=10000)

0.007