In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Comparing Two Samples
Let's walk through these two functions together

In [None]:
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the difference between the two group means of a numerical column.

    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: mean of the second group minus mean of the first group,
    with the groups taken in the row order produced by `group`
    """
    # keep only the two relevant columns, average within each group,
    # and pull out the resulting column of group means
    group_means = (
        table.select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    first_mean = group_means.item(0)
    second_mean = group_means.item(1)
    return second_mean - first_mean

In [None]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate the difference of group means once, using shuffled labels.

    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: Difference of means of the two groups after shuffling labels
    """
    # draw rows without replacement; their group labels are the shuffled labels
    resampled_rows = table.sample(with_replacement = False)
    shuffled_labels = resampled_rows.column(group_label)

    # pair the original numerical values with the shuffled labels
    numeric_only = table.select(numeric_label)
    shuffled_table = numeric_only.with_column('Shuffled Label', shuffled_labels)

    return difference_of_means(shuffled_table, numeric_label, 'Shuffled Label')

(back to slides)
# Randomized Control Trial
Let's look at the example from the textbook that discusses whether
botulinum toxin A causes pain relief.

The data is stored in a csv file named `bta.csv`.
How can we load in the data?

In [None]:
bta = ...
bta.show()

**QUESTION:** What do the columns and values represent?

What table function can we use to help us figure this out? 

*Hint: what function shows us the statistics about the values a column has?*

<details>
<summary>Solution</summary>
  bta.pivot('Result', 'Group')
</details>

In [None]:
bta.pivot("Result", "Group")

**QUESTION:** What statistic should we use here to see a difference between the groups?

<details>
<summary>Review</summary>
  Copy the following in markdown below <br>
    ![title](hypothesis_testing_stat_review.png)
</details>

<details>
<summary>Solution</summary>
  difference of means
</details>

**QUESTION:** How do we compute that difference?

<details>
<summary>Solution</summary>
  bta.group('Group', np.average)
    <br><br>
  bta_means_tbl = bta.group('Group', np.average)
   <br><br>
    observed_stat = bta_means_tbl.column(1).item(1) - bta_means_tbl.column(1).item(0)
    <br>
    observed_stat
</details>


In [None]:
bta_means_tbl = bta.group("Group", np.average)
bta_means_tbl

In [None]:
observed_stat = bta_means_tbl.column(1).item(1) - bta_means_tbl.column(1).item(0)
observed_stat

##  Testing the Hypothesis

**Question:** How can we use the functions from the top?
<details>
<summary>Solution</summary>
  difference_of_means(bta, 'Result', 'Group')
</details>  

In [None]:
observed_diff = ...
observed_diff

**Question:** How can we simulate this statistic once?
<details>
<summary>Solution</summary>
  one_simulated_difference(bta, 'Result', 'Group')
</details> 

In [None]:
one_simulated_difference(bta, 'Result', 'Group')

Now let's simulate that 10k times

In [None]:
# answer is below
# Number of label shuffles to simulate. Kept as an int so the loop count and
# the length check below share one exact value instead of the float literal 1e4.
num_simulations = 10000

all_simulated_stats = make_array()
for i in np.arange(num_simulations):
    simulated_stat = one_simulated_difference(bta, 'Result', 'Group')
    all_simulated_stats = np.append(all_simulated_stats, simulated_stat)

# sanity check: one statistic was recorded per simulation
len(all_simulated_stats) == num_simulations

In [None]:
Table().with_column('Mean difference of Treatment - Control outcomes', all_simulated_stats).hist() #bins=np.arange(-0.6, 0.6, 0.01))

**Question:** Is the observed statistic statistically significant?

In [None]:
observed_stat

**Question:** Which of the following is our p-value? 

Hint: Which tail should we use?

In [None]:
# Two candidate p-values: the proportion of simulated statistics lying in
# each tail relative to the observed statistic. np.count_nonzero counts the
# True entries of the comparison array without looping in Python.
num_simulated = len(all_simulated_stats)

# p-value1
p_1 = np.count_nonzero(all_simulated_stats >= observed_stat) / num_simulated

# p-value2
p_2 = np.count_nonzero(all_simulated_stats <= observed_stat) / num_simulated

p_1, p_2

**Question:** Is this statistically significant?

(back to slides)
# Percentiles

In [None]:
# Manually compute the 55th percentile.
x = make_array(43, 20, 51, 7, 28, 34)

In [None]:
# Step 1. Sort the data
np.sort(x)

In [None]:
# Step 2. Figure out where 55th percentile would be.
# The Xth percentile is the first value on the sorted list that is at least as large as X% of the elements

In [None]:
# OR: 1 Line of Code using percentile()
percentile(55, x)

(back to slides)
## Percentiles questions

In [None]:
s= [1, 7, 3, 9, 5]

In [None]:
#1 
percentile(10, s) == 0

In [None]:
# 2 
percentile(39, s) == percentile(40, s)

In [None]:
# 3
percentile(40, s) == percentile(41, s) 

In [None]:
# 4
percentile(50, s) == 5 

# Estimation

### Sample Median

In [None]:
sf = Table.read_table('san_francisco_2015.csv')
sf.show(5)

**Question:** What is this dataset?

In [None]:
# salary_sf is only defined in a later cell, so build the same selection
# inline here; otherwise this cell raises NameError on Restart & Run All.
sf.select(3, 11, 21).group(0).barh(0)

In [None]:
# skip

In [None]:
# skip

In [None]:
# skip

In [None]:
# We only care about salary for now
salary_sf = sf.select(3, 11, 21)
salary_sf

**Question:** Who is making the most money?

**Question:** Who is making the least money?

**Question:** What is the typical salary? Should we compute the mean or median for this?

How can we compute the median with what we covered today?
<details>
<summary>Solution</summary>
  percentile(50, sf.column('Total Compensation'))
</details>  

In [None]:
pop_median = ...
pop_median

In [None]:
sf_bins = np.arange(0, 700000, 25000)
sf.hist('Total Compensation', bins=sf_bins)
plots.title('Population Distribution');

### Estimating Salaries
Say we don't have everyone's salaries. What would we do to estimate the 50th percentile of salaries?

In [None]:
# skip

In [None]:
# skip

In [None]:
# skip

In [None]:
our_sample = sf.sample(300, with_replacement=False)
our_sample.show(5)

In [None]:
estimate_median = percentile(50, our_sample.column('Total Compensation'))
estimate_median

In [None]:
our_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Sample Distribution');

**Question:** How far off is our estimate from the true median?

In [None]:
# Question: How far off is our estimate from the true median?

In [None]:
pop_median, estimate_median, abs(pop_median - estimate_median)

## Variability of the Estimate

Let's implement the following function:

<details>
<summary>Solution</summary>
  our_sample = sf.sample(samp_size, with_replacement=False)
  return percentile(50, our_sample.column('Total Compensation'))
</details>


In [None]:
def generate_sample_median(samp_size):
    """
    Exercise stub: the body has not been written yet.

    Takes: samp_size, the size of the random sample to draw from the sf table

    Returns: once implemented, the median of a random sample of samp_size
    from the sf table. As written, there is no return statement, so the
    function returns None.
    """
    # Question: sample with or without replacement?
    
    

sample_median = generate_sample_median(300)
sample_median

**Question:** What is our error?


<details>
<summary>Solution</summary>
  error = sample_median - pop_median
</details>


In [None]:
error = ...
error

(back to slides)
# Quantifying Uncertainty

Let's take 1k estimates, where each sample has 300 individuals


In [None]:
sample_medians = make_array()

for i in np.arange(1000):
    new_median = generate_sample_median(300)
    sample_medians = np.append(sample_medians, new_median)
sample_medians

Let's plot the medians

In [None]:
med_bins = np.arange(90000, 125001, 2500)
Table().with_column(
    'Sample Medians', sample_medians
).hist(bins = med_bins)

plots.scatter(pop_median, -1e-6, color="red");

Let's plot the errors

In [None]:
err_bins = np.arange(-15000, 12501, 2500)
Table().with_column(
    'Errors', sample_medians - pop_median
).hist(bins = err_bins)

plots.scatter(0, -1e-6, color="red");

(back to slides)
# Bootstrap

In [None]:
our_sample

In [None]:
# Take a bootstrap (re)sample of size 300, WITH replacement

# Sample from our sample
boot_sample = our_sample.sample(with_replacement=True)
#boot_sample

Let's compare the median from our sample with the median of the bootstrapped sample

In [None]:
our_sample_median = percentile(50, our_sample.column('Total Compensation'))
boot_sample_median = percentile(50, boot_sample.column('Total Compensation'))

In [None]:
# Show the bootstrap sample 
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('1 Bootstrap sample');

# Report the three medians side by side. our_sample_median and
# boot_sample_median were both computed in the previous cell, so reuse
# them rather than recomputing the bootstrap median here.
print("Population Median =       ", pop_median)
print("Our Sample Median =       ", our_sample_median)
print("Bootstrap Sample Median = ", boot_sample_median)

### Multiple Bootstraps
Let's take 1k bootstraps. Let's start by implementing the following function

<details>
<summary>Solution</summary>
   single_sample = our_sample.sample()
   return percentile(50, single_sample.column('Total Compensation'))
</details>  

In [None]:
def one_bootstrap_median():
    """
    Exercise stub for a single bootstrap replication.

    Returns: once the blanks are filled in, the median of one bootstrap
    resample. As written, both blanks hold the `...` placeholder, so the
    function returns Ellipsis.
    """
    single_sample = ...
    return ...

Now let's keep track of 1k bootstrapped medians

In [None]:
# Bootstrap our sample 1000 times
bootstrap_medians = ...
for i in np.arange(1000):
    new_median = ...
    bootstrap_medians = ...

Let's visualize these bootstrapped medians

In [None]:
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

plots.scatter(pop_median, 0, color="red");
plots.scatter(our_sample_median, 0, color="blue");
plots.title('Bootstrap Medians (1K Bootstraps from our Sample)');

# 95% Confidence Interval

**Question**: How could we make an interval based on the middle 95% of bootstrap samples?

- *Hint 1:* Remember we stored the bootstrapped medians in an array called `bootstrap_medians`
- *Hint 2:* What did we learn about at the beginning of this lecture?


<details>
<summary>Solution</summary>
  left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)
</details>  

In [None]:
# Make an interval based on the middle 95% of bootstrap samples

left = ...
right = ...

In [None]:
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

plots.plot([left, right], [0,0], color="gold",lw=3, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);
plots.title('Bootstrap Medians (1K Bootstraps from our Sample)');

## Another Example: Mean Maternal Age

In [None]:
# This time we have a sample, but no population data!
births = Table.read_table('baby.csv')
births.show(5)

How can we see a distribution of maternal ages?

<details>
<summary>Solution</summary>
  births.hist('Maternal Age')
</details>  

What is the mean age?

In [None]:
mean_age = ...
mean_age

Now let's use bootstrapping to find sample means

<details>
<summary>Solution</summary>
  np.mean(births.sample().column('Maternal Age'))
</details>  

In [None]:
def one_bootstrap_mean():
    """
    Exercise stub for a single bootstrap replication of the mean.

    Returns: once the blank is filled in, the mean of one bootstrap
    resample. As written, the blank holds the `...` placeholder, so the
    function returns Ellipsis.
    """
    return ...

Let's compute 1k bootstrapped samples

In [None]:
bootstrap_means = make_array()

for i in np.arange(1000):
    new_mean = one_bootstrap_mean()
    bootstrap_means = np.append(bootstrap_means, new_mean)
    
left = percentile(2.5, bootstrap_means)
right = percentile(97.5, bootstrap_means)

In [None]:
Table().with_column('Bootstrap means', bootstrap_means).hist()

plots.plot([left,right], [0,0], color="gold", lw=3, zorder=1);
plots.scatter(mean_age,0,color="blue", zorder=2);
plots.title('Bootstrap Means (1K Bootstraps from our Sample)');