In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Statistical Significance

In [None]:
# Load the score data from the CSV file into a datascience Table.
scores = Table.read_table('scores_by_section.csv')
# NOTE(review): a bare expression is only rendered when it is the last line of
# its cell; if more code follows in the same cell, this table is not displayed.
scores

# Observed value of the test statistic that the simulated averages are compared to.
# NOTE(review): presumably the Midterm average of the section under study,
# hard-coded here rather than computed from `scores` -- confirm.
observed_average = 13.6667

# Simulate one value of the test statistic 
# under the hypothesis that the section is like a random sample from the class

def random_sample_midterm_avg(sample_size=27):
    """Simulate one value of the test statistic under the null hypothesis.

    Draws `sample_size` rows from the global `scores` table at random, without
    replacement, and averages their 'Midterm' values.

    sample_size: number of rows to draw. Defaults to 27, the value that was
        previously hard-coded here (presumably the size of the section being
        tested -- confirm).
    Returns: the average of the 'Midterm' column of the random sample.
    """
    random_sample = scores.sample(sample_size, with_replacement=False)
    return np.average(random_sample.column('Midterm'))

# Simulate 50,000 copies of the test statistic

# Number of simulated values of the test statistic.
num_repetitions = 50000

# Build the array of simulated averages in a single step. Growing it with
# np.append inside the loop copies the whole array on every iteration, which
# is quadratic work overall; collecting into a list first is linear and
# produces the same float array.
sample_averages = np.array(
    [random_sample_midterm_avg() for _ in np.arange(num_repetitions)]
)

# Compare the simulated distribution of the statistic
# and the actual observed statistic

# Histogram of the simulated averages, with the observed average
# marked as a red dot on the horizontal axis (height 0).
averages_tbl = Table().with_column(
    'Random Sample Average', sample_averages
)
averages_tbl.hist(bins=20)
plots.scatter(x=observed_average, y=0, s=40, color='red');

In [None]:
# Simulate one value of the test statistic 
# under the hypothesis that the section is like a random sample from the class

def random_sample_midterm_avg(sample_size=27):
    """Return the 'Midterm' average of one random sample of rows from `scores`.

    This is one simulated value of the test statistic under the hypothesis
    that the section is like a random sample from the class.
    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; whichever cell runs last wins.

    sample_size: number of rows drawn from the global `scores` table, without
        replacement. Defaults to 27, the value that was previously hard-coded
        (presumably the section size -- confirm).
    Returns: the average of the sampled rows' 'Midterm' values.
    """
    sampled_rows = scores.sample(sample_size, with_replacement=False)
    return np.average(sampled_rows.column('Midterm'))

In [None]:
# Simulate 50,000 copies of the test statistic

# Number of simulated values of the test statistic.
num_repetitions = 50000

# Collect all simulated averages, then convert to an array once. Calling
# np.append in the loop re-copies the growing array on every iteration
# (quadratic in the number of repetitions); this form is linear and yields
# the same float array.
sample_averages = np.array(
    [random_sample_midterm_avg() for _ in np.arange(num_repetitions)]
)

In [None]:
# Compare the simulated distribution of the statistic
# and the actual observed statistic

# One-column table of the simulated averages, so it can be drawn as a histogram.
averages_tbl = Table().with_column(
    'Random Sample Average', sample_averages
)
averages_tbl.hist(bins=20)
# Red dot on the horizontal axis at the observed value of the statistic.
plots.scatter(x=observed_average, y=0, s=40, color='red');

In [None]:
# Empirical P-value: the proportion of simulated averages that are at most the
# observed average. Dividing by the actual number of simulated values (instead
# of a hard-coded 50000) keeps the proportion correct if the number of
# repetitions changes, and np.count_nonzero counts inside NumPy rather than
# iterating over the array with the built-in sum.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)

In [None]:
# Position, in sorted order, that has 5% of the simulated averages before it
# (2500 when there are 50,000 simulated values). Deriving it from the table
# keeps it in step with the number of repetitions.
five_percent_index = averages_tbl.num_rows // 20

five_percent_point = averages_tbl.sort(0).column(0).item(five_percent_index)
five_percent_point

In [None]:
# Redraw the histogram and mark the 5% cutoff with a vertical gold line.
averages_tbl.hist(bins=20)
plots.plot(
    [five_percent_point] * 2,   # x: the same value twice gives a vertical segment
    [0, 0.35],                  # y: from the axis up to height 0.35
    color='gold',
    linewidth=2,
)
plots.title('Area to the left of the gold line: 5%');

# A/B Testing

In [None]:
# Load the births data from the CSV file into a datascience Table and display it.
births = Table.read_table('baby.csv')
births

Let's quickly look at the table and ask:
1. What does each row represent?
2. What do the columns represent?

**Question:** Was there an association between baby health and whether the mother was a smoker?

In [None]:
# Keep only the two columns this analysis uses: the group label
# ('Maternal Smoker') and the numerical variable ('Birth Weight').
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')
smoking_and_birthweight

#### How many smokers and nonsmokers were there?
What table method can help us determine this?

<details>
<summary>Solution</summary>
  smoking_and_birthweight.group('Maternal Smoker')
</details>


### Distribution of birth weights for the groups
We want to look at the distribution of birth weight 
for these two groups. 

What visualization should we use?
<details>
<summary>Solution</summary>
  histogram!
</details>
<details>
<summary>Code</summary>
  smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')
</details>


Do we see a difference between the two groups?

<details>
<summary>Solution</summary>
  Yes, the babies whose mothers smoked were a little lighter on average
</details>

(back to slides)
## Test Statistic

In [None]:
# Group the rows by 'Maternal Smoker' and apply np.average to the remaining
# column, giving one row per group with that group's average birth weight.
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
# Array of the group averages (second column of means_table).
means = means_table.column(1)
# Observed test statistic: average of the second group minus average of the
# first group, in the row order of means_table.
# NOTE(review): presumably smokers minus non-smokers -- confirm against the
# row order shown when means_table is displayed.
observed_difference = means.item(1) - means.item(0)
observed_difference

In [None]:
def difference_of_means(table, label, group_label):
    """Compute the difference between the two group averages of a numerical column.

    table: table holding the data
    label: column label of the numerical variable
    group_label: column label of the variable that identifies the group
    Returns: average of `label` in the second group minus its average in the
    first group, taking the groups in the row order produced by `group`.
    """
    # Restrict to the two relevant columns, average `label` within each group,
    # and take the column of averages (index 1; index 0 holds the group values).
    group_means = (
        table
        .select(label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    second_group_mean = group_means.item(1)
    first_group_mean = group_means.item(0)
    return second_group_mean - first_group_mean

In [None]:
# Repeat the observed-difference computation through the helper function,
# starting from the full births table.
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

(back to slides)
# Random Permutation (Shuffling)

In [None]:
# A small five-row table used to demonstrate sampling and shuffling.
letters = Table().with_column('Letter', make_array('a', 'b', 'c', 'd', 'e'))

In [None]:
# With no arguments, sample() draws as many rows as the table has, with
# replacement, so a letter can appear more than once and others not at all.
letters.sample()

In [None]:
# Drawing all rows without replacement returns every row exactly once in
# random order, i.e. a random permutation (shuffle) of the table.
letters.sample(with_replacement = False)

In [None]:
# Put a shuffled copy of the 'Letter' column next to the original column.
letters.with_column('Shuffled', letters.sample(with_replacement = False).column(0))

# Simulation Under Null Hypothesis

In [None]:
# Display the two-column table again before its labels are shuffled.
smoking_and_birthweight

### Permute/Shuffle our data

In [None]:
# Shuffle all rows (sample every row without replacement) and keep only the
# group labels, now in random order.
shuffled_labels = (
    smoking_and_birthweight
    .sample(with_replacement=False)
    .column('Maternal Smoker')
)

In [None]:
# Attach the shuffled labels as an extra column, keeping the original labels
# alongside for comparison.
original_and_shuffled = smoking_and_birthweight.with_column('Shuffled Label', shuffled_labels)

In [None]:
# Display the original labels and the shuffled labels side by side.
original_and_shuffled

In [None]:
# Difference of group means when the groups are defined by the shuffled labels.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

In [None]:
# Difference of group means when the groups are defined by the original labels.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')

## Permutation Test


In [None]:
def one_simulated_difference(table, label, group_label):
    """Simulate one value of the test statistic after shuffling the group labels.

    table: table holding the data
    label: column label of the numerical variable
    group_label: column label of the variable that identifies the group
    Returns: difference of means of the two groups, computed by
    `difference_of_means` with the shuffled labels in place of the originals.
    """
    # Shuffle the rows (sample all of them without replacement) and read off
    # the group labels in their shuffled order.
    permuted_rows = table.sample(with_replacement=False)
    shuffled_labels = permuted_rows.column(group_label)

    # Pair the numerical values, in their original order, with the shuffled labels.
    values_only = table.select(label)
    shuffled_table = values_only.with_column('Shuffled Label', shuffled_labels)

    return difference_of_means(shuffled_table, label, 'Shuffled Label')

In [None]:
# One simulated difference; the labels are shuffled at random on each call,
# so re-running this cell can give a different value.
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Number of shuffles (simulated differences) to generate.
num_permutations = 2500

# Build the array of simulated differences in one step. np.append inside the
# loop copies the growing array on every iteration; collecting the values in
# a list and converting once avoids that and gives the same float array.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in np.arange(num_permutations)
])

In [None]:
# Histogram of the simulated differences, for comparison with the observed
# difference printed alongside it.
simulated_tbl = Table().with_column(
    'Difference Between Group Means', differences
)
simulated_tbl.hist()
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');