# Permutation Sampling

Course: DataCamp Statistical Thinking II

In [None]:
# module needed to handle arrays
import numpy as np

In [None]:
# Vote-share arrays supplied by the course environment (not defined in this file)
dem_share_PA  # percent of votes that were Democratic, Pennsylvania counties
dem_share_OH  # the same quantity for Ohio counties

# Pool both states into a single array, Pennsylvania entries first
dem_share_both = np.concatenate([dem_share_PA, dem_share_OH])

In [None]:
# Randomly reorder the pooled vote shares. For an array argument,
# np.random.permutation returns a shuffled copy, so dem_share_both keeps
# its original order.
dem_share_perm = np.random.permutation(dem_share_both)

In [None]:
# Cut the shuffled array at the Pennsylvania sample size: the leading piece
# becomes the PA permutation sample and the remainder the OH one
perm_sample_PA, perm_sample_OH = np.split(dem_share_perm,
                                          [len(dem_share_PA)])

## Visualise the results of permutation sampling

In [None]:
# Overlay the ECDFs of 50 permutation samples, then draw the ECDFs of the
# original data on top of them.
# NOTE(review): `plt` is never imported in this file -- presumably
# matplotlib.pyplot provided by the course environment; confirm.
# rain_june / rain_november are likewise not defined in this file.
for _ in range(50):
    # Shuffle the pooled data and split it back into two samples
    # (permutation_sample is defined under Dependencies)
    perm_sample_1, perm_sample_2 = permutation_sample(rain_june, rain_november)

    # ECDF of each permutation sample (ecdf is defined under Dependencies)
    x_1, y_1 = ecdf(perm_sample_1)
    x_2, y_2 = ecdf(perm_sample_2)

    # Markers only, at alpha=0.02, so each individual sample is drawn faintly
    _ = plt.plot(x_1, y_1, marker='.', linestyle='none',
                 color='red', alpha=0.02)
    _ = plt.plot(x_2, y_2, marker='.', linestyle='none',
                 color='blue', alpha=0.02)

# ECDFs of the original data with default opacity: June in red, November in blue
x_1, y_1 = ecdf(rain_june)
x_2, y_2 = ecdf(rain_november)
_ = plt.plot(x_1, y_1, marker='.', linestyle='none', color='red')
_ = plt.plot(x_2, y_2, marker='.', linestyle='none', color='blue')

# Add a 2% margin around the data, label both axes, and render the figure
plt.margins(0.02)
_ = plt.xlabel('monthly rainfall (mm)')
_ = plt.ylabel('ECDF')
plt.show()

## Perform a hypothesis test w/ p-value

In [None]:
# Impact-force measurements supplied by the course environment
force_a  # frog tongue impacts of adults
force_b  # frog tongue impacts of juveniles

# Observed statistic: mean of force_a minus mean of force_b
# (diff_of_means is defined under Dependencies)
empirical_diff_means = diff_of_means(force_a, force_b)

# 10,000 permutation replicates of that same statistic
# (draw_perm_reps is defined under Dependencies)
perm_replicates = draw_perm_reps(
    force_a, force_b, diff_of_means, size=10000
)

# p-value: fraction of replicates at least as large as the observed difference
p = np.mean(perm_replicates >= empirical_diff_means)

# Report the p-value
print('p-value =', p)

## Dependencies

In [None]:
# DataCamp Statistical Thinking II
def permutation_sample(data1, data2):
    """Return two permutation samples drawn from the pooled inputs.

    The two data sets are concatenated, the pooled values are randomly
    reordered with ``np.random.permutation``, and the result is split so
    that the first sample has ``len(data1)`` entries and the second sample
    holds the rest.
    """
    # Pool the observations, data1 first
    pooled = np.concatenate([data1, data2])

    # Random reordering of the pooled observations
    shuffled = np.random.permutation(pooled)

    # Split point: the size of the first data set
    cut = len(data1)

    return shuffled[:cut], shuffled[cut:]

In [4]:
# DataCamp Statistical Thinking I
def ecdf(data):
    """
    Return the x and y values of the ECDF of a one-dimensional data set.

    x holds the measurements in ascending order; y holds the matching
    cumulative fractions 1/n, 2/n, ..., 1, where n is the number of
    measurements.
    """
    # Measurements in ascending order
    x = np.sort(data)

    # Cumulative fraction for each sorted measurement, ending at 1
    count = len(data)
    y = np.arange(1, count + 1) / count

    return x, y

In [None]:
# DataCamp Statistical Thinking II
def draw_perm_reps(data_1, data_2, func, size=1):
    """Compute `size` permutation replicates of a test statistic.

    Each replicate is `func` applied to one fresh permutation sample of
    `data_1` and `data_2`. `func` is supplied by the caller and is called
    with the two samples as positional arguments. The replicates are
    returned in a NumPy array of length `size`.
    """
    # Pre-allocated output, filled one replicate per pass
    replicates = np.empty(size)

    for idx in range(size):
        # Fresh shuffle of the pooled data, split back into two samples
        shuffled_pair = permutation_sample(data_1, data_2)

        # Test statistic for this shuffle
        replicates[idx] = func(*shuffled_pair)

    return replicates

In [None]:
# DataCamp Statistical Thinking II
def diff_of_means(data_1, data_2):
    """
    Return the mean of data_1 minus the mean of data_2.
    """
    mean_1 = np.mean(data_1)
    mean_2 = np.mean(data_2)

    return mean_1 - mean_2