In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 18 ##

In [None]:
# Load the scores table from CSV; later cells read its 'Section' and
# 'Midterm' columns.
scores = Table.read_table('scores_by_section.csv')
scores

In [None]:
# Group the rows by 'Section' (no aggregation function: one row per section with its row count).
scores.group('Section')

In [None]:
# Average the remaining columns within each section; .show() displays every row.
scores.group('Section', np.average).show()

In [None]:
# Observed value of the test statistic, entered by hand.
# NOTE(review): presumably the average midterm score of the section under
# study, read off the per-section averages — confirm which section.
observed_average = 13.6667

In [None]:
# Draw 27 rows from the whole class without replacement.
# NOTE(review): 27 is presumably the size of the section being tested — confirm.
random_sample = scores.sample(27, with_replacement=False)
random_sample

In [None]:
# Average 'Midterm' score of that one random sample.
np.average(random_sample.column('Midterm'))

In [None]:
# Simulate one value of the test statistic
# under the hypothesis that the section is like a random sample from the class

def random_sample_midterm_avg(sample_size=27):
    """Return the average midterm score of one random sample from `scores`.

    Draws `sample_size` rows from the `scores` table without replacement and
    averages their 'Midterm' column.

    sample_size: number of rows to draw. Defaults to 27, the sample size used
        elsewhere in this notebook, so existing no-argument calls are unchanged.
    """
    sampled_rows = scores.sample(sample_size, with_replacement=False)
    return np.average(sampled_rows.column('Midterm'))

In [None]:
# Simulate 50,000 copies of the test statistic.
# The values are gathered in a list and converted to an array once at the end:
# calling np.append inside the loop would copy the whole array on every
# iteration.

sample_averages = np.array(
    [random_sample_midterm_avg() for _ in np.arange(50000)]
)

In [None]:
# Compare the simulated distribution of the statistic
# and the actual observed statistic

# One-column table of the simulated averages, drawn as a 20-bin histogram.
averages_tbl = Table().with_column('Random Sample Average', sample_averages)
averages_tbl.hist(bins = 20)
# Red dot on the horizontal axis (y = 0) at the observed average.
plots.scatter(observed_average, 0, color = 'red', s=40);

In [None]:
# Proportion of simulated averages that are at or below the observed average.
# Dividing by the array's own length keeps the proportion correct if the
# number of simulations is changed.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)

In [None]:
# 5% point of the simulated averages: the sorted value whose index is 5% of
# the number of simulated values (5% of 50,000 = 2500).

five_percent_index = averages_tbl.num_rows // 20
five_percent_point = averages_tbl.sort(0).column(0).item(five_percent_index)
five_percent_point

In [None]:
# Redraw the histogram of simulated averages and mark the 5% point.
averages_tbl.hist(bins = 20)
# Vertical gold line at x = five_percent_point, drawn from y = 0 to y = 0.35.
plots.plot([five_percent_point, five_percent_point], [0, 0.35], color='gold', lw=2)
plots.title('Area to the left of the gold line: 5%');

## Deflategate

In [None]:
# Load the football measurements; later cells read the 'Team', 'Blakeman'
# and 'Prioleau' columns.
football = Table.read_table('deflategate.csv')
football.show()

In [None]:
# Estimated starting pressures, one entry per football:
# the Patriots' footballs were ~12.5psi (first 11 entries),
# the Colts' footballs were ~13psi (last 4 entries)
initials = np.concatenate([np.full(11, 12.5), np.full(4, 13.0)])
initials

In [None]:
# Weights measured at halftime
# Each football's value is the average of its 'Blakeman' and 'Prioleau' entries.
# NOTE(review): the starting values are given in psi, so "weights" here
# presumably means pressures — confirm.
halftime_wts = (football.column('Blakeman')+football.column('Prioleau'))/2
halftime_wts

In [None]:
# Append three columns to football: the averaged halftime measurement, the
# estimated starting value, and their difference ('Drop' = start - halftime).
football = football.with_columns(
    'Weight at Halftime', halftime_wts,
    'Estimate at Start', initials,
    'Drop', initials - halftime_wts
)
football.show()

In [None]:
def difference_in_average_drop(t):
    """Return the difference between the two groups' average 'Drop' in `t`.

    `t` must have 'Team' and 'Drop' columns. The rows are grouped by 'Team'
    and 'Drop' is averaged within each group; the result is the second
    group's average minus the first group's, in the order `group` returns them.
    """
    per_team = t.select('Team', 'Drop').group('Team', np.average)
    drop_averages = per_team.column(1)
    second_group = drop_averages.item(1)
    first_group = drop_averages.item(0)
    return second_group - first_group

# Observed value of the test statistic, computed from the actual data.
observed = difference_in_average_drop(football)
observed

In [None]:
# Split football into two one-column tables so the drops can be shuffled
# separately and re-attached to the unshuffled team labels.
group_labels = football.select('Team')
drops = football.select('Drop')

In [None]:
# One simulated value of the statistic: sample all rows of drops without
# replacement (a random reordering), attach them to the original team labels,
# and recompute the difference in average drop.
shuffled_drops = drops.sample(with_replacement=False).column(0)
shuffled_tbl = group_labels.with_column('Drop', shuffled_drops)
difference_in_average_drop(shuffled_tbl)

In [None]:
# Build the simulated distribution: 10,000 values of the statistic, each
# computed after shuffling the drops among the team labels.
# Values are appended to a Python list and converted to an array once at the
# end; np.append inside the loop would copy the whole array every iteration.
shuffled_differences = []

for i in np.arange(10000):
    shuffled_drops = drops.sample(with_replacement=False).column(0)
    shuffled_tbl = group_labels.with_column('Drop', shuffled_drops)
    shuffled_differences.append(difference_in_average_drop(shuffled_tbl))

sampled_stats = np.array(shuffled_differences)

In [None]:
# Histogram of the simulated statistics, with a vertical line (y from 0 to
# 1.4) at the observed value. Assigning to _ suppresses the plot's repr output.
Table().with_column('Null distribution', sampled_stats).hist()
_ = plots.plot([observed, observed], [0, 1.4])

In [None]:
# Proportion of simulated statistics that are at least as large as the observed one.
np.count_nonzero(sampled_stats >= observed)/len(sampled_stats)

In [None]:
# Add 'Combined' = average of the columns at positions 1 and 2, then drop
# those two columns.
# NOTE(review): positions 1 and 2 are presumably 'Blakeman' and 'Prioleau' —
# confirm against the CSV. Because football is reassigned here, re-running
# this cell operates on different columns than the first run.
football = football.with_column(
    'Combined', (football.column(1)+football.column(2))/2
    ).drop(1, 2)
football.show()

In [None]:
# An array of eleven 1.0s; the next cell scales arrays like this to build the
# starting pressures.
np.ones(11)

In [None]:
# Starting pressure assigned to each football: 12.5 for the first 11 entries
# (Patriots) and 13 for the last 4 (Colts).
patriots_start = np.full(11, 12.5)
colts_start = np.full(4, 13.0)
start = np.concatenate([patriots_start, colts_start])
start

In [None]:
# Pressure drop per football: starting value minus the 'Combined' measurement.
drop = start - football.column('Combined')
football = football.with_column('Pressure Drop', drop)
football.show()

In [None]:
# 'Combined' is no longer needed once 'Pressure Drop' exists; remove it and
# show the per-team average of each remaining column.
football = football.drop('Combined')
football.group('Team', np.average)

In [None]:
# Average 'Pressure Drop' for each team.
# Select the two relevant columns before grouping: football can still carry
# other numeric columns added by earlier cells, and column(1) of the grouped
# table must be the pressure-drop average rather than whichever column happens
# to come first.
observed_means = football.select('Team', 'Pressure Drop').group('Team', np.average).column(1)

# Second group's average minus the first group's, in the order group returns them.
observed_difference = observed_means.item(1) - observed_means.item(0)
observed_difference

The function `difference_of_means` takes three arguments:

- the table of data
- the label of the column containing the numerical variable whose average is of interest
- the label of the column containing the two group labels

It returns the difference between the means of the two groups. 

In [None]:
def difference_of_means(table, label, group_label):
    """Return the difference between the two groups' means of column `label`.

    table: table holding the data
    label: label of the numerical column to average
    group_label: label of the column holding the two group labels

    Rows are grouped by `group_label` and `label` is averaged per group; the
    result is the second group's mean minus the first group's, in the order
    `group` returns them.
    """
    group_means = (
        table.select(label, group_label)
             .group(group_label, np.average)
             .column(1)
    )
    return group_means.item(1) - group_means.item(0)

In [None]:
# Difference between the two teams' mean 'Pressure Drop', via the general function.
difference_of_means(football, 'Pressure Drop', 'Team')

In [None]:
def one_simulated_difference(table, label, group_label):
    """Return one difference of group means computed with shuffled group labels.

    table: table holding the data
    label: label of the numerical column to average
    group_label: label of the column holding the two group labels

    The group labels are taken from a without-replacement sample of all rows
    of `table` and attached to the unshuffled `label` column as
    'Shuffled Label'; the difference of means is computed on that table.
    """
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)
    values_only = table.select(label)
    relabeled = values_only.with_column('Shuffled Label', permuted_labels)
    return difference_of_means(relabeled, label, 'Shuffled Label')

In [None]:
# Simulate `repetitions` values of the statistic with shuffled team labels.
# The values are gathered in a list and converted to an array once; np.append
# inside the loop would copy the whole array on every iteration.
repetitions = 10000
differences = np.array([
    one_simulated_difference(football, 'Pressure Drop', 'Team')
    for _ in np.arange(repetitions)
])

In [None]:
# P-value: proportion of simulated differences that are at least as large as
# the observed difference. Dividing by the number of simulated values keeps
# the proportion correct if the repetition count is changed.
empirical_P = np.count_nonzero(differences >= observed_difference) / len(differences)
empirical_P

In [None]:
# Histogram of the simulated differences, with a red dot on the horizontal
# axis (y = 0) at the observed difference, followed by the numeric summary.
Table().with_column('Difference Between Group Averages', differences).hist()
plots.scatter(observed_difference, 0, color='red', s=30)
plots.title('Prediction Under the Null Hypothesis')
print('Observed Difference:', observed_difference)
print('Empirical P-value:', empirical_P)