In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## Linear regression

In [None]:
def standard_units(any_numbers):
    """Rescale an array of numbers to standard units.

    Each value is replaced by its deviation from the array's mean,
    divided by the array's standard deviation (np.std, default ddof=0).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    deviations = any_numbers - center
    return deviations / spread

def correlation(t, x, y):
    """Compute r, the correlation coefficient of columns x and y of table t.

    r is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Slope of the regression line for predicting y from x (original units).

    Computed as r * SD(y) / SD(x), where r is the correlation of the
    two columns of table t.
    """
    r = correlation(t, x, y)
    sd_y = np.std(t.column(y))
    sd_x = np.std(t.column(x))
    return r * sd_y / sd_x

def intercept(t, x, y):
    """Intercept of the regression line for predicting y from x (original units).

    Computed as mean(y) - slope * mean(x), so the line passes through
    the point of averages.
    """
    mean_y = np.mean(t.column(y))
    mean_x = np.mean(t.column(x))
    return mean_y - slope(t, x, y) * mean_x

def fit(table, x, y):
    """Evaluate the regression line of y on x at every x value in the table.

    Returns an array with one fitted (predicted) y value per row.
    """
    line_slope = slope(table, x, y)
    line_intercept = intercept(table, x, y)
    x_values = table.column(x)
    return line_slope * x_values + line_intercept

In [None]:
# Read the Little Women table from a hosted CSV file and move the
# 'Periods' column to the first position.
lw = Table.read_table('http://inferentialthinking.com/notebooks/little_women.csv').move_to_start('Periods')
# Display the first 3 rows.
lw.show(3)

In [None]:
# Read the shotput table from a hosted CSV file.
shotput = Table.read_table('http://inferentialthinking.com/notebooks/shotput.csv')
# Display the first 3 rows.
shotput.show(3)

## Residuals

In [None]:
# Scatter plot of 'Characters' (vertical) against 'Periods' (horizontal),
# with a fitted line overlaid.
lw.scatter('Periods', 'Characters', fit_line=True)

In [None]:
# Observed y values.
y = lw.column('Characters')
# Heights of the regression line at each observed 'Periods' value.
fitted = fit(lw, 'Periods', 'Characters')
# Residual = observed value minus fitted value, one per row.
residuals = y - fitted

In [None]:
# Sum of the residuals. For the regression line defined by slope() and
# intercept() this is zero mathematically, so expect a value that is
# tiny relative to the data (floating-point rounding only).
sum(residuals)

In [None]:
# Root mean squared error: square the residuals, average them, then
# take the square root.
np.mean(residuals**2)**0.5

In [None]:
# Residual plot: residuals (vertical) against 'Periods' (horizontal).
# with_column returns a new table, so lw itself is not modified.
lw.with_column('residual', residuals).scatter('Periods', 'residual')

In [None]:
def plot_residuals(t):
    """Draw regression diagnostics for the first two columns of table t.

    Column 0 is used as x and column 1 as y. The function:
      1. draws the scatter plot of y against x with a fitted line,
      2. prints the sum of the residuals and the root mean squared error,
      3. draws the residuals against x.

    Any columns beyond the first two are ignored.
    """
    t.scatter(0, 1, fit_line=True)
    y = t.column(1)
    fitted = fit(t, 0, 1)
    residuals = y - fitted
    print('Sum of residuals:', sum(residuals))
    print('RMSE:', np.mean(residuals**2)**0.5)
    # Select the residual column by label, not by position: index 2 is
    # only the appended column when t has exactly two columns, so a wider
    # table would otherwise plot the wrong column.
    t.with_column('Residual', residuals).scatter(0, 'Residual')

In [None]:
# Regression diagnostics for shotput; plot_residuals uses the table's
# first column as x and its second column as y.
plot_residuals(shotput)

## Dugong

In [None]:
# Read the dugong table from a text file hosted at statsci.org.
dugong = Table.read_table('http://www.statsci.org/data/oz/dugongs.txt')
# Display the first 3 rows.
dugong.show(3)

In [None]:
# Regression diagnostics for dugong (first column as x, second as y).
plot_residuals(dugong)

In [None]:
# Read the us_women table from a hosted CSV file.
us_women = Table.read_table('http://inferentialthinking.com/notebooks/us_women.csv')
# Display the first 3 rows.
us_women.show(3)

In [None]:
# Correlation coefficient r between the 'height' and 'ave weight' columns.
correlation(us_women, 'height', 'ave weight')

In [None]:
# Regression diagnostics for us_women (first column as x, second as y);
# compare the residual plot with the correlation computed above.
plot_residuals(us_women)

## Variance

In [None]:
# Histogram of the 'Characters' column.
lw.hist('Characters')

In [None]:
# Recompute the observed values, fitted values and residuals for the
# Little Women regression (plot_residuals used locals of the same names,
# but those did not affect these globals).
y = lw.column('Characters')
fitted = fit(lw, 'Periods', 'Characters')
residuals = y - fitted

In [None]:
# Standard deviation of the observed y values.
np.std(y)

In [None]:
# Squaring the standard deviation gives the variance.
np.std(y) ** 2

In [None]:
# Variance computed directly; same quantity as np.std(y) ** 2.
np.var(y)

In [None]:
# Correlation coefficient between 'Periods' and 'Characters'.
r = correlation(lw, 'Periods', 'Characters')
r

In [None]:
# r squared, to compare with the variance ratio of the fitted values below.
r ** 2

In [None]:
# 1 - r squared, to compare with the variance ratio of the residuals below.
1 - r ** 2

In [None]:
# Fraction of the variance of y accounted for by the fitted values.
# For the regression line this equals r ** 2 (up to rounding).
np.var(fitted) / np.var(y)

In [None]:
# Fraction of the variance of y left in the residuals.
# For the regression line this equals 1 - r ** 2 (up to rounding).
np.var(residuals) / np.var(y)

In [None]:
# Variance decomposition: for the regression line, the variance of the
# fitted values plus the variance of the residuals equals np.var(y)
# (up to rounding).
np.var(fitted) + np.var(residuals)

In [None]:
# Ratio of the SD of the fitted values to the SD of y; for the regression
# line this equals the absolute value of r (up to rounding).
np.std(fitted) / np.std(y)

## Regression Model

In [None]:
def draw_and_compare(true_slope, true_int, sample_size,
                     x_mean=50, x_sd=5, error_sd=6):
    """Simulate data from a linear regression model and draw four figures.

    The x values are drawn from a normal distribution with mean x_mean and
    SD x_sd. Each y is the height of the true line at x plus an independent
    normal error with mean 0 and SD error_sd.

    Figures drawn, in order:
      1. the sample with the true line (green),
      2. the sample alone,
      3. the sample with its fitted regression line,
      4. the sample with both the regression line and the true line.

    Parameters
    ----------
    true_slope, true_int : numbers
        Slope and intercept of the true line.
    sample_size : int
        Number of (x, y) points to generate.
    x_mean, x_sd : numbers, optional
        Mean and SD of the normal distribution of x (defaults 50 and 5).
    error_sd : number, optional
        SD of the normal errors added to the true line (default 6).

    No random seed is set here, so every call draws a new sample.
    """
    # Draw x first and the errors second; keeping this order means that a
    # caller who seeds np.random gets the same sample as before.
    x = np.random.normal(x_mean, x_sd, sample_size)
    xlims = np.array([np.min(x), np.max(x)])
    errors = np.random.normal(0, error_sd, sample_size)
    y = (true_slope * x + true_int) + errors
    sample = Table().with_columns('x', x, 'y', y)

    def plot_true_line():
        # Overlay the true line on the current figure, spanning the
        # observed range of x.
        plots.plot(xlims, true_slope*xlims + true_int, lw=2, color='green')

    sample.scatter(0, 1)
    plot_true_line()
    plots.title('True Line, and Points Created')

    sample.scatter(0, 1)
    plots.title('What We Get to See')

    sample.scatter(0, 1, fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    sample.scatter(0, 1, fit_line=True)
    plot_true_line()
    plots.title("Regression Line and True Line")
    
# True line y = 2x - 5 with a small sample of 10 points.
draw_and_compare(2, -5, 10)

In [None]:
# Same true line with 100 points.
draw_and_compare(2, -5, 100)

In [None]:
# Same true line with 1000 points.
draw_and_compare(2, -5, 1000)