# Hypothesis Testing

## **Permutation Sampling** 

We have two datasets; the null hypothesis is that both were drawn from the same probability distribution.

In [None]:
# Inputs (expected to be available already in the session)
force_a  # frog tongue impacts of adults
force_b  # frog tongue impacts of juveniles

# Observed test statistic: difference of mean impact force (diff_of_means is in Dependencies)
empirical_diff_means = diff_of_means(force_a, force_b)

# Simulate the null hypothesis with 10,000 permutation replicates
# (draw_perm_reps is in Dependencies)
perm_replicates = draw_perm_reps(force_a, force_b, diff_of_means, size=10000)

# p-value: fraction of replicates at least as large as the observed difference
at_least_observed = perm_replicates >= empirical_diff_means
p = np.sum(at_least_observed) / len(perm_replicates)

# Report the result
print('p-value =', p)

## **One Sample Tests**

We have one dataset and a single reference value; the null hypothesis is that the dataset's true mean equals that value, i.e. that the value is consistent with the data.

In [None]:
# Given data
newcomb_value = 299860  # speed of light value in km/s
michelson_speed_of_light  # dataset of speed of light measurements

# Re-center Michelson's measurements so that their mean equals Newcomb's value
michelson_mean = np.mean(michelson_speed_of_light)
michelson_shifted = michelson_speed_of_light - michelson_mean + newcomb_value

# Test statistic: mean of the (bootstrap) sample minus Newcomb's value.
# The function name matches the way it is called in the rest of this cell.
def diff_from_newcomb(data, newcomb_value=299860):
    """
    Return the mean of `data` minus `newcomb_value`.

    data: array of measurements
    newcomb_value: reference value subtracted from the mean (default 299860)
    """
    return np.mean(data) - newcomb_value

# Observed test statistic, computed from the original (unshifted) measurements
diff_obs = diff_from_newcomb(michelson_speed_of_light)

# Bootstrap replicates of the test statistic drawn from the shifted dataset
# (draw_bs_reps is in Dependencies)
bs_replicates = draw_bs_reps(michelson_shifted, diff_from_newcomb, size=10000)

# p-value: fraction of replicates at or below the observed statistic.
# Compares against diff_obs (the name assigned above) and divides by the
# actual number of replicates rather than a repeated literal.
p_value = np.sum(bs_replicates <= diff_obs) / len(bs_replicates)

## **Two Sample Tests**

We have two datasets; the null hypothesis is that their means are equal, without requiring that their probability distributions be identical.

In [None]:
# Inputs (expected to be available already in the session)
force_a  # tongue forces of adult frogs
force_b  # tongue forces of juvenile frogs
forces_concat = np.concatenate((force_a, force_b))  # the arrays are passed as one tuple
empirical_diff_means  # observed difference in means between force_a and force_b

# Pooled mean of all forces: mean_force
mean_force = np.mean(forces_concat)

# Shift each sample so that both have the pooled mean
mean_a = np.mean(force_a)
mean_b = np.mean(force_b)
force_a_shifted = force_a - mean_a + mean_force
force_b_shifted = force_b - mean_b + mean_force

# 10,000 bootstrap replicates of the mean from each shifted array
# (draw_bs_reps is in Dependencies)
bs_replicates_a = draw_bs_reps(force_a_shifted, np.mean, size=10000)
bs_replicates_b = draw_bs_reps(force_b_shifted, np.mean, size=10000)

# Replicates of the difference of means: bs_replicates
bs_replicates = bs_replicates_a - bs_replicates_b

# p-value: fraction of replicates at least as large as the observed difference
n_at_least_observed = np.sum(bs_replicates >= empirical_diff_means)
p = n_at_least_observed / len(bs_replicates)
print('p-value =', p)

## **A/B Testing**

In this example we have a new website design, and we want to know whether it results in a higher click-through rate. We assign half of the users to the old design and half to the new design, then compare the two click-through rates to see whether the difference is statistically significant.

In [None]:
import numpy as np

# Given datasets of click-through outcomes, one per design. These bare names are
# placeholders: the arrays are expected to exist already in the session.
# NOTE(review): presumably A is the old design and B the new one — confirm.
clickthrough_A  # array of 0s and 1s
clickthrough_B  # NOTE(review): assumed to be a 0/1 array like clickthrough_A

In [None]:
def diff_frac(data_A, data_B):
    """
    Return the fraction of ones in data_B minus the fraction of ones in data_A.

    Each fraction is the sum of the array divided by its length.
    """
    rate_A = np.sum(data_A) / len(data_A)
    rate_B = np.sum(data_B) / len(data_B)
    difference = rate_B - rate_A
    return difference

In [None]:
# Observed value of the test statistic
diff_frac_obs = diff_frac(clickthrough_A, clickthrough_B)

# Permutation test: 10,000 replicates of diff_frac on permuted samples.
# draw_perm_reps (see Dependencies) runs the permute-then-evaluate loop, so the
# cell relies only on helpers that are defined in this notebook.
perm_replicates = draw_perm_reps(clickthrough_A, clickthrough_B,
                                 diff_frac, size=10000)

# p-value: fraction of replicates at least as large as the observed difference
p_value = np.sum(perm_replicates >= diff_frac_obs) / len(perm_replicates)

p_value

## **Correlation Testing**

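How can we tell whether an observed correlation is real rather than due to chance? The null hypothesis is that the variables are uncorrelated. We simulate it by permuting one of the arrays, which destroys any correlation between the two variables while leaving each variable's own values unchanged.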

In [None]:
# Observed Pearson correlation between illiteracy and fertility: r_obs
r_obs = pearson_r(illiteracy, fertility)

# Storage for the permutation replicates: perm_replicates
n_perm = 10000
perm_replicates = np.empty(n_perm)

# Draw replicates
for rep in range(n_perm):
    # Shuffle the illiteracy measurements, then recompute the correlation
    # against the unshuffled fertility values
    perm_replicates[rep] = pearson_r(np.random.permutation(illiteracy), fertility)

# p-value: fraction of replicates at least as large as the observed correlation
n_at_least_observed = np.sum(perm_replicates >= r_obs)
p = n_at_least_observed / len(perm_replicates)
print('p-val =', p)

## Dependencies

In [None]:
# DataCamp Statistical Thinking II
def diff_of_means(data_1, data_2):
    """
    Return the mean of data_1 minus the mean of data_2.
    """
    mean_1 = np.mean(data_1)
    mean_2 = np.mean(data_2)
    return mean_1 - mean_2

In [None]:
# DataCamp Statistical Thinking II
def draw_perm_reps(data_1, data_2, func, size=1):
    """
    Generate `size` permutation replicates of a two-sample statistic.

    For each replicate, permutation_sample(data_1, data_2) produces a pair of
    permuted samples and `func` (user defined, called with the two samples) is
    evaluated on them. The results are returned as an array of length `size`.
    """
    replicates = np.empty(size)

    for idx in range(size):
        # One permutation sample pair, passed straight to the test statistic
        replicates[idx] = func(*permutation_sample(data_1, data_2))

    return replicates

In [None]:
def draw_bs_reps(data, func, size=1):
    """
    Draw `size` bootstrap replicates.

    Each replicate is the value returned by bootstrap_replicate_1d(data, func);
    the replicates are returned as an array of length `size`.
    """
    replicates = np.empty(size)

    # Fill the array one replicate at a time
    for idx in range(size):
        one_replicate = bootstrap_replicate_1d(data, func)
        replicates[idx] = one_replicate

    return replicates

In [4]:
# DataCamp Statistical Thinking I
def ecdf(data):
    """
    Compute the ECDF of a one-dimensional array of measurements.

    Returns (x, y): x is the sorted data and y holds the values
    1/n, 2/n, ..., 1, where n is the number of data points.
    """
    count = len(data)
    sorted_values = np.sort(data)
    cumulative_fraction = np.arange(1, count + 1) / count
    return sorted_values, cumulative_fraction

In [None]:
# DataCamp Statistical Thinking II
def permutation_sample(data1, data2):
    """
    Generate a permutation sample from two data sets.

    The two data sets are concatenated and randomly permuted; the first
    len(data1) entries form the first sample and the remainder the second.
    """
    n_first = len(data1)

    # Pool both data sets and shuffle the pooled values
    shuffled = np.random.permutation(np.concatenate((data1, data2)))

    # Split back into two samples at the original size of data1
    return shuffled[:n_first], shuffled[n_first:]