In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(any_numbers):
    """Rescale an array of numbers into standard units.

    Each value is re-expressed as its distance from the array's average,
    measured in multiples of the array's standard deviation (np.std).
    """
    center = np.average(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def correlation(t, x, y):
    """Return r, the correlation coefficient of columns x and y of table t.

    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Return the slope of the regression line of y on x, in original units.

    Computed as r * SD(y) / SD(x), where r is the correlation of the
    two columns of table t.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Return the intercept of the regression line of y on x, in original units.

    Computed as mean(y) - slope * mean(x), so the line passes through
    the point of averages.
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fit(t, x, y):
    """Return the regression line's fitted y value for every x in column x of t."""
    return slope(t, x, y) * t.column(x) + intercept(t, x, y)

def plot_residuals(t, x, y):
    """Draw the scatter plot with its fitted line, then a residual plot.

    Also prints the correlation coefficient r and the root mean squared
    error of the regression. Residuals are actual y minus fitted y.
    """
    t.scatter(x, y, fit_line=True)
    residuals = t.column(y) - fit(t, x, y)
    rmse = np.mean(residuals ** 2) ** 0.5
    print('r:', correlation(t, x, y))
    print('RMSE:', rmse)
    # Two-column table (x, Residual) plotted by column position.
    residual_table = t.select(x).with_column('Residual', residuals)
    residual_table.scatter(0, 1)

# Regression Model

In [None]:
def draw_and_compare(true_slope, true_int, sample_size):
    """Simulate noisy points around a known line and compare it with the fitted line.

    Draws sample_size x-values from a normal(50, 5) distribution, places each
    point on the line y = true_slope * x + true_int, and adds normal(0, 6)
    noise to the y-value. Four scatter plots are then produced:
    the points with the true line, the points alone, the points with the
    regression line, and the points with both lines.
    """
    x = np.random.normal(50, 5, sample_size)
    noise = np.random.normal(0, 6, sample_size)
    y = (true_slope * x + true_int) + noise
    sample = Table().with_columns('x', x, 'y', y)

    # The true line is drawn between the smallest and largest sampled x.
    x_ends = np.array([np.min(x), np.max(x)])

    def overlay_true_line():
        plots.plot(x_ends, true_slope * x_ends + true_int, lw=2, color='green')

    sample.scatter('x', 'y')
    overlay_true_line()
    plots.title('True Line, and Points Created')

    sample.scatter('x', 'y')
    plots.title('What We Get to See')

    sample.scatter('x', 'y', fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    sample.scatter('x', 'y', fit_line=True)
    overlay_true_line()
    plots.title('Regression Line (blue) and True Line (green)')
    
# Simulate 10 noisy points around the true line y = 2x - 5 and draw the comparison plots.
draw_and_compare(2, -5, 10)

**Question:** Were the regression line and the true line the same?

Answer:

Let's draw the same plots, but for 100 and then 1000 points.

In [None]:
# Same true line (y = 2x - 5), now with 100 simulated points.
draw_and_compare(2, -5, 100)

In [None]:
# Same true line (y = 2x - 5), now with 1000 simulated points.
draw_and_compare(2, -5, 1000)

(ask before we run the above)
**True or False Question:** The more points we have, the further away the regression line will be from the True line

(back to slides)
# Prediction Variability


In [None]:
# Read baby.csv (relative path) into a table and preview its first 5 rows.
baby = Table.read_table('baby.csv')
baby.show(5)

**Question:** Is there a linear association between the length of a pregnancy and the birth weight?

Visualize, then quantify!

**Question:** Let's find the best-fit line and make predictions

Helpful functions: `fit`, `standard_units`, `plot_residuals`, ...


Let's look at the error of our predictions.

**Question:** How can we do that?

There is not much structure in the residuals (the plot is a blob), which means this is a fairly linear association.

Let's focus on the middle of the residuals and pick 300 as an x-value

In [None]:
# Predict birth weight at a chosen number of gestational days using the
# regression line fitted to the full baby table.
x = 300
a = slope(baby, 'Gestational Days', 'Birth Weight')
b = intercept(baby, 'Gestational Days', 'Birth Weight')
predicted_y = a * x + b
baby.scatter('Gestational Days', 'Birth Weight', fit_line=True)
# Plot the marker at the same x used for the prediction, so changing x above
# keeps the gold dot on the fitted line.
plots.scatter(x, predicted_y, color='gold', s=200);

When `x=300`, the predicted y is ...

In [None]:
# Display the prediction computed in the previous cell.
predicted_y

We found this by using the prediction function. 

In [None]:
def prediction_at(t, x, x_label='Gestational Days', y_label='Birth Weight'):
    """Return the regression-line prediction of y_label at the value x.

    t: table containing the two columns to regress.
    x: the x-value (in original units) at which to evaluate the fitted line.
    x_label, y_label: labels of the predictor and response columns. They
        default to the columns this notebook uses for the baby table, so
        existing two-argument calls behave exactly as before.
    """
    a = slope(t, x_label, y_label)
    b = intercept(t, x_label, y_label)
    return a * x + b


We can make a prediction for a new sample of our data

In [None]:
# Repeat 4 times: draw a resample of baby with Table.sample() (default arguments),
# refit the line on it, and mark that line's prediction at x = 300 in gold.
for i in np.arange(4):
    resample = baby.sample()
    predicted_y = prediction_at(resample, 300)
    resample.scatter('Gestational Days', 'Birth Weight', fit_line=True)
    plots.scatter(300, predicted_y, color='gold', s=200)

**Question:** Is there variation in our predicted y-value when `x==300`?

Let's draw 10 different regression lines based on 10 resamples

In [None]:
# One row per resample: the fitted slope and intercept, plus the fitted
# line's height at x = 150, 300 and 350.
lines = Table(['slope', 'intercept', 'at 150', 'at 300', 'at 350'])

for i in range(10):
    resample = baby.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 150 + b, a * 300 + b, a * 350 + b])
    
# Overlay every fitted line (drawn from x = 150 to x = 350) on the original
# scatter plot; each dot marks that line's prediction at x = 300.
baby.scatter('Gestational Days', 'Birth Weight')
for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([150, 350], [line.item('at 150'), line.item('at 350')], lw=1)
    plots.scatter(300, line.item('at 300'), s=200)

**Question:** Where do we get very different regressions (estimated y-values)?

Let's repeat this but zoom in on `x==300`

In [None]:
# Same construction as before, but the line heights are recorded at
# x = 291, 300 and 309 so the plot covers only a narrow window around 300.
lines = Table(['slope', 'intercept', 'at 291', 'at 300', 'at 309'])

for i in range(10):
    resample = baby.sample()
    a = slope(resample, 'Gestational Days', 'Birth Weight')
    b = intercept(resample, 'Gestational Days', 'Birth Weight')
    lines.append([a, b, a * 291 + b, a * 300 + b, a * 309 + b])
    
# No scatter of the data here: only the line segments and their
# predictions at x = 300 are drawn.
for i in np.arange(lines.num_rows):
    line = lines.row(i)
    plots.plot([291, 309], [line.item('at 291'), line.item('at 309')], lw=1)
    plots.scatter(300, line.item('at 300'), s=30)

**Question:** What is the kind of variability we see in this zoomed in figure?

Now we can use the bootstrap to compute a confidence interval for the prediction.

In [None]:
# This will take about 15 seconds so let's walk through the code

%%time
def bootstrap_prediction(table, x, y, new_x, repetitions=5000):
    """Bootstrap the regression prediction at new_x and display the results.

    For each of `repetitions` rounds, a resample is drawn with
    table.sample(), the regression line of y on x is refit, and its height
    at new_x is recorded. The 2.5th and 97.5th percentiles of those heights
    are reported as an approximate 95% confidence interval for the height
    of the true line at new_x. A histogram of the predictions is drawn with
    the interval marked in yellow. Nothing is returned.
    """

    def predict_once():
        boot = table.sample()
        return slope(boot, x, y) * new_x + intercept(boot, x, y)

    predictions = [predict_once() for _ in np.arange(repetitions)]

    # Endpoints of the middle 95% of the bootstrap predictions
    left = percentile(2.5, predictions)
    right = percentile(97.5, predictions)

    # Histogram with the interval drawn along the horizontal axis
    Table().with_column('Prediction', predictions).hist(bins=20)
    plots.xlabel('predictions at x='+str(new_x))
    plots.plot([left, right], [0, 0], color='yellow', lw=8)
    print('Approximate 95%-confidence interval for height of true line:')
    print(left, right, '(width =', right - left, ')')
    
# Bootstrap the predicted birth weight at 300 gestational days (default 5000 repetitions).
bootstrap_prediction(baby, 'Gestational Days', 'Birth Weight', 300)

**Question:** What happens to our confidence interval if we want to make a prediction further from the center, so say `x==330`?

**Question:** What happens to our confidence interval if we want to make a prediction closer to the center, so say `x==270`?

(back to slides)

# Slope Inference

**Question:** How do we find the slope of 'Gestational Days', 'Birth Weight'?

This value is different than *r*. 

**Question:** why is that the case?

Let's take 4 samples and see what the slope is each time

In [None]:
# Draw 4 resamples and plot each one with its own fitted regression line.
for i in np.arange(4):
    baby.sample().scatter('Gestational Days', 'Birth Weight', fit_line=True)

**Question:** Do the slopes look similar?

*Hint: It is easier to determine this by looking at the ends of the line* 

Let's show these lines together in one graph

In [None]:
# Scatter the original data once, then overlay the regression line fitted
# to each of 4 resamples, drawn between x = 150 and x = 350.
baby.scatter('Gestational Days', 'Birth Weight')
for i in np.arange(4):
    resample = baby.sample()
    s = slope(resample, 'Gestational Days', 'Birth Weight')
    c = intercept(resample, 'Gestational Days', 'Birth Weight')
    xlims = make_array(150, 350)
    plots.plot(xlims, s*xlims + c, lw=4)

Let's do this 5k times

In [None]:
# Collect the regression slope from each of 5000 resamples and plot
# their distribution as a histogram.
slopes = []
for i in np.arange(5000):
    resample = baby.sample()
    resample_slope = slope(resample, 'Gestational Days', 'Birth Weight')
    slopes.append(resample_slope)
Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)

**Question:** How can we find confidence intervals of the estimated slope?

In [None]:
# Exercise: replace each `...` with an endpoint of the interval computed from
# `slopes` (bootstrap_slope in the next cell uses the 2.5th and 97.5th percentiles).
left = ...
right = ...
[left, right]

In [None]:
def bootstrap_slope(table, x, y, repetitions=5000):
    """Bootstrap the regression slope of y on x and display the results.

    Draws `repetitions` resamples with table.sample(), records the slope of
    the regression line fitted to each, and takes the 2.5th and 97.5th
    percentiles of those slopes as an approximate 95% confidence interval
    for the true slope. A histogram of the bootstrap slopes is drawn with
    the interval marked in yellow, and the slope from the original table
    is printed alongside the interval. Nothing is returned.
    """
    # One fitted slope per resample
    slopes = [slope(table.sample(), x, y) for _ in np.arange(repetitions)]

    # Endpoints of the middle 95% of the bootstrap slopes
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)

    # Slope fitted to the table as given, for comparison with the interval
    observed_slope = slope(table, x, y)

    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    plots.plot([left, right], [0, 0], color='yellow', lw=8)
    print('Slope of regression line:', observed_slope)
    print('Approximate 95%-confidence interval for the true slope:')
    print(left, right)
    
# Columns are passed by position: index 1 as x and index 0 as y.
# NOTE(review): presumably these are 'Gestational Days' and 'Birth Weight' — confirm against baby.csv.
bootstrap_slope(baby, 1, 0)

In [None]:
# Columns are passed by position: index 2 as x and index 1 as y.
# NOTE(review): check baby.csv for which variables these are.
plot_residuals(baby, 2, 1)

In [None]:
# Bootstrap the slope for the same pair of columns as the residual plot above (index 2 as x, index 1 as y).
bootstrap_slope(baby, 2, 1)

**Question:** How do we know there was actually a slope and that the variability wasn't caused by an error?

(slides for answer)

# Classification Examples: Medicine



In [None]:
# Read ckd.csv (relative path), shorten the 'Blood Glucose Random' label to
# 'Glucose', and preview the first 3 rows.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)

This dataset represents a doctor's patients

**Question:** How many patients are there?

These patients are classified by whether or not they have kidney disease.

**Question:** Which column do you think indicates this?

**Question:** Now that we know it is `Class`, how many patients had kidney disease and how many didn't?
    <details>
<summary>Solution</summary>
  ckd.group('Class')
</details>


**Question:** Can we predict whether a patient has kidney disease based on their glucose levels?
    <details>
<summary>Solution</summary>
  Maybe
</details>


**Question:** What's our method to determine if we can?
       <details>
<summary>Solution</summary>
    <h3>Visualize then quantify</h3>
    
</details>

**Question:** How can we visualize?

In [None]:
# NOTE: this call appears to be left failing on purpose — the notes that follow say
# "Let's fix this error", and the next cell repeats it with `group=` instead of `colors=`.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

Let's fix this error

In [None]:
# Scatter Glucose (y) against White Blood Cell Count (x), with points grouped by the 'Class' column.
ckd.scatter('White Blood Cell Count', 'Glucose', group='Class')

Let's talk about this visualization. How would we classify whether a patient had kidney disease based on their glucose levels and white blood cell count?

(skip)

**Question:** What about predicting kidney disease based on Hemoglobin and Glucose?

In [None]:
# make the visualization here