In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## Lecture 12 ##

## Chebyshev's Bounds ##

In [None]:
# Load baby.csv into a Table; the Chebyshev and standard-units cells below all read from `births`.
births = Table.read_table('baby.csv')

In [None]:
# Show the column labels so we know which columns are available.
births.labels

In [None]:
# Histogram every column of the table.
# NOTE(review): overlay = False presumably draws one plot per column rather than shared axes — confirm.
births.hist(overlay = False)

In [None]:
# Mean and SD of the 'Maternal Height' column; the next cell reads `mean` and `sd`.
mh = births.column('Maternal Height')
mean, sd = np.mean(mh), np.std(mh)
(mean, sd)

In [None]:
# Rows whose maternal height lies between mean - 3*sd and mean + 3*sd.
# NOTE(review): are.between is believed to include the lower bound and exclude the upper — confirm.
within_3_SDs = births.where('Maternal Height', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Proportion of all rows that fall inside the 3-SD window selected above.
within_3_SDs.num_rows/births.num_rows

In [None]:
# What's Chebyshev's Bound for 3 SDs?
# Chebyshev: for any distribution, the proportion of values within z SDs of the
# mean is at least 1 - 1/z**2. With z = 3 that is 8/9, roughly 0.889.
chebyshev_bound_3_sds = 1 - 1/3**2
chebyshev_bound_3_sds

In [None]:
# See if Chebyshev's bounds work for different distributions
# Go through all of the different columns
# Print what percentage are within 1-5 SDs
#   Average plus or minus 1 SDs: x %
#   Average plus or minus 2 SDs: y %, etc.
for label in births.labels:
    # assumes every column is numeric (or boolean) — TODO confirm for baby.csv
    values = np.asarray(births.column(label), dtype=float)
    # Separate names so the `mean` / `sd` globals from the height cell are not overwritten.
    col_mean = np.mean(values)
    col_sd = np.std(values)
    print()
    print(label)
    for z in np.arange(1, 6):
        # Count the values whose distance from the mean is at most z SDs.
        within = np.count_nonzero(np.abs(values - col_mean) <= z * col_sd)
        percent = round(100 * within / len(values), 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')


## Standard Units ##

In [None]:
def standard_units(x):
    """Convert array x to standard units.

    Each value is re-expressed as the number of SDs it lies above (positive)
    or below (negative) the mean of x, so the result has mean 0 and SD 1.

    x: a numeric array (or any sequence numpy can convert to a float array).
    Returns a float array of the same length as x.

    The SD is the population SD (np.std with its default ddof=0). If every
    value in x is identical the SD is 0 and the result is not finite.
    """
    values = np.asarray(x, dtype=float)
    return (values - np.mean(values)) / np.std(values)

In [None]:
# Values of the 'Maternal Age' column, used for the standard-units demo below.
ages = births.column('Maternal Age')

In [None]:
# The same ages passed through standard_units.
ages_standard_units = standard_units(ages)

In [None]:
# Sanity check: for values computed as (x - mean) / SD these should be
# approximately 0 and 1, up to floating-point rounding.
np.mean(ages_standard_units), np.std(ages_standard_units)

In [None]:
# Two-column table pairing each age in years with the same age in standard units.
both = (
    Table()
    .with_column('Age in Years', ages)
    .with_column('Age in Standard Units', ages_standard_units)
)
both

In [None]:
# Mean and SD of the ages in their original units, for comparison with the standard-unit values.
np.mean(ages), np.std(ages)

In [None]:
# Histogram of ages in years; bin edges run 15, 17, ..., 45 (width 2).
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# Histogram of the same ages in standard units; bin edges start at -2.2 in steps of 0.35.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
# Clip the horizontal axis to [-2, 3.1]; the trailing `;` suppresses the returned limits in the cell output.
plots.xlim(-2, 3.1);

## The SD and Bell-Shaped Curves ##

In [None]:
# NOTE(review): `heights` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel unless it is defined elsewhere. Add the
# cell that loads it before this one (TODO confirm the data source).
heights

In [None]:
# Scatter plot with 'MidParent' on the horizontal axis.
# NOTE(review): with a single label, the other columns are presumably plotted against it — confirm.
heights.scatter('MidParent')

In [None]:
# Load hybrid.csv; later cells use its 'mpg', 'msrp', 'acceleration' and 'class' columns.
hybrid = Table.read_table('hybrid.csv')

In [None]:
# Display the table to see its columns before plotting.
hybrid

In [None]:
# 'mpg' on the horizontal axis against 'msrp' on the vertical axis.
hybrid.scatter('mpg', 'msrp')

In [None]:
# 'acceleration' on the horizontal axis against 'msrp' on the vertical axis.
hybrid.scatter('acceleration', 'msrp')

In [None]:
# Keep only the rows whose 'class' is 'SUV', then show how many there are.
suv = hybrid.where('class', 'SUV')
suv.num_rows

In [None]:
# Same mpg-vs-msrp plot as for all hybrids, restricted to the SUV rows.
suv.scatter('mpg', 'msrp')

In [None]:
# Re-plot the SUV data with both variables converted to standard units, so the
# axes are measured in SDs from the mean rather than in mpg and dollars.
Table().with_columns(
    'mpg (standard units)', standard_units(suv.column('mpg')),
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter('mpg (standard units)', 'msrp (standard units)')
# Use the same [-3, 3] window on both axes.
plots.xlim(-3, 3)
# The trailing `;` suppresses the returned limits in the cell output, as in the histogram cell.
plots.ylim(-3, 3);

### Calculating $r$ ###

In [None]:
# Small hand-made data set (six points) used to compute r step by step.
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_column('x', x).with_column('y', y)
t

In [None]:
# Plot the six points: 'x' horizontal, 'y' vertical, small red markers.
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Step 1 of computing r: add each variable converted to standard units.
t = (
    t
    .with_column('x (standard units)', standard_units(x))
    .with_column('y (standard units)', standard_units(y))
)
t

In [None]:
# Step 2 of computing r: multiply the two standard-unit columns element by element.
# Columns are looked up by label; these are the columns at positions 2 and 3.
x_su = t.column('x (standard units)')
y_su = t.column('y (standard units)')
su_product = x_su * y_su
t = t.with_column('product of standard units', su_product)
t

In [None]:
# r is the average of the products of standard units
# (the 'product of standard units' column sits at position 4 of t)

products = t.column('product of standard units')
r = np.mean(products)
r

In [None]:
def correlation(tbl, x, y):
    """Return the correlation coefficient r of columns x and y.

    tbl is a table exposing .column(label);
    x and y are column labels.

    r is the average of the products of the two columns after each has been
    converted to standard units, (value - mean) / SD, using the population SD.
    The result is symmetric in x and y. If either column is constant its SD is
    0 and the result is not finite.
    """
    x_values = np.asarray(tbl.column(x), dtype=float)
    y_values = np.asarray(tbl.column(y), dtype=float)
    # Standardize inline so this function does not depend on any other cell having run.
    x_su = (x_values - np.mean(x_values)) / np.std(x_values)
    y_su = (y_values - np.mean(y_values)) / np.std(y_values)
    return np.mean(x_su * y_su)

In [None]:
# Should reproduce the value of r computed step by step from the products of standard units.
correlation(t, 'x', 'y')

In [None]:
# r between 'mpg' and 'msrp' for the SUV rows plotted earlier.
correlation(suv, 'mpg', 'msrp')

In [None]:
# Repeated on purpose as the baseline for swapping the two variables in the next cell.
correlation(t, 'x', 'y')

In [None]:
# Swap the roles of the variables: a product of standard units does not depend on order,
# so r is expected to be the same as for ('x', 'y').
correlation(t, 'y', 'x')

In [None]:
# 'x' on the horizontal axis, for comparison with the axes-swapped plot in the next cell.
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Same points with the axes swapped: 'y' horizontal, 'x' vertical.
t.scatter('y', 'x', s=30, color='red')

In [None]:
# r for the axes-swapped plot; expected to equal correlation(t, 'x', 'y').
correlation(t, 'y', 'x')

## $r$ Interpretation ##

### Nonlinearity ###

In [None]:
# A perfect parabola: y is exactly x squared, with x running from -4 to 4 in steps of 0.5.
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_column('x', new_x).with_column('y', new_x**2)
nonlinear.scatter('x', 'y', s=30, color='r')

In [None]:
# r measures only linear association: for this symmetric parabola it is expected to be
# essentially 0 even though y is completely determined by x.
correlation(nonlinear, 'x', 'y')

### Outliers ###

In [None]:
# Four points lying exactly on the line y = x.
line = (
    Table()
    .with_column('x', make_array(1, 2, 3, 4))
    .with_column('y', make_array(1, 2, 3, 4))
)
line.scatter('x', 'y', s=30, color='r')

In [None]:
# Points exactly on an upward-sloping line: r is expected to be 1.
correlation(line, 'x', 'y')

In [None]:
# The same four points on y = x plus one extra point, (5, 0), far off the line.
outlier = (
    Table()
    .with_column('x', make_array(1, 2, 3, 4, 5))
    .with_column('y', make_array(1, 2, 3, 4, 0))
)
outlier.scatter('x', 'y', s=30, color='r')

In [None]:
# One outlier can dominate r: for these five points the products of deviations sum to 0,
# so r is expected to drop from 1 to 0.
correlation(outlier, 'x', 'y')

### Ecological Correlation ###

In [None]:
# Load sat2014.csv and order the rows by the 'State' column.
# NOTE(review): presumably one row per state, i.e. aggregated rather than individual scores — confirm.
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014

In [None]:
# 'Critical Reading' on the horizontal axis against 'Math' on the vertical axis, one point per row.
sat2014.scatter('Critical Reading', 'Math')

In [None]:
# r between the two score columns across the rows of sat2014.
# NOTE(review): if each row is a group average, this r describes the groups, not individuals — confirm.
correlation(sat2014, 'Critical Reading', 'Math')