In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Familiar Dataset from Data 8 ##

In [None]:
# Load the births table; 'baby.csv' is resolved relative to the notebook's working directory.
births = pd.read_csv('baby.csv')

In [None]:
# Preview the first rows to see which columns are available.
births.head()

## Bivariate Distributions ##

### 1. Scatter Plot ###

In [None]:
# No `kind` given, so seaborn uses its default joint view (a scatter plot of the pairs).
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births);

### 2. Hexbin Plot ###

In [None]:
# Same pair of columns, binned into hexagons so that dense regions stand out.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births, kind='hex');

### 3. Contour Plot ###

In [None]:
# Same pair of columns, drawn as contours of a kernel density estimate.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births, kind='kde');

### Correlation ###

In [None]:
def standard_units(x):
    """Convert x to standard units.

    Each value is expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of x. The standard
    deviation is np.std's default, i.e. the population SD (ddof=0).

    Parameters:
        x: array-like of numbers (NumPy array or pandas Series).

    Returns:
        An array-like of the same shape as x.
    """
    center = np.mean(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

def correlation(x, y):
    """Return the correlation coefficient r between x and y.

    r is the mean of the element-wise products of x and y after each has
    been converted to standard units.

    Parameters:
        x, y: array-likes of numbers with matching lengths.

    Returns:
        A single float.
    """
    x_su = standard_units(x)
    y_su = standard_units(y)
    return np.mean(x_su * y_su)

In [None]:
# Load the jumps table; 'jumps.csv' is resolved relative to the notebook's working directory.
jumps = pd.read_csv('jumps.csv')

In [None]:
# Display the loaded table.
jumps

In [None]:
# Pass the columns by keyword: seaborn deprecated positional x/y in 0.11 and
# rejects them with a TypeError from 0.12 onward.
sns.jointplot(data=jumps, x='triple', y='vertical');

In [None]:
# Correlation between the two jump columns, computed with the standard-units definition above.
correlation(jumps['triple'], jumps['vertical'])

In [None]:
# Cross-check against pandas' built-in pairwise correlation matrix.
# NOTE(review): on pandas >= 2.0 this raises if the table has any non-numeric
# column (numeric_only defaults to False) -- confirm jumps.csv is all-numeric.
jumps.corr()

### Simple Linear Regression ###

In [None]:
# Scatter plot with the fitted regression line and no confidence band.
# ci=None is the documented way to turn the band off; ci=False is not None, so
# seaborn would still run the bootstrap and compute a degenerate 0% interval.
sns.lmplot(x='triple', y='vertical', data=jumps, ci=None);

### Slope and Intercept of the Regression Line ###

In [None]:
def slope(x, y):
    """Return the slope of the regression line for predicting y from x.

    The slope is r * SD(y) / SD(x), where r is the correlation between x
    and y and the SDs are np.std's default (population) values.

    Parameters:
        x: array-like of predictor values.
        y: array-like of response values, same length as x.

    Returns:
        A single float, in units of y per unit of x.
    """
    r = correlation(x, y)
    sd_y = np.std(y)
    sd_x = np.std(x)
    return r * sd_y / sd_x

def intercept(x, y):
    """Return the intercept of the regression line for predicting y from x.

    The value is chosen so that the line passes through the point of
    averages (mean of x, mean of y).

    Parameters:
        x: array-like of predictor values.
        y: array-like of response values, same length as x.

    Returns:
        A single float, in units of y.
    """
    b = slope(x, y)
    return np.mean(y) - b*np.mean(x)

In [None]:
# Pull the two columns out once so the later cells can refer to them by short names.
triple, vertical = jumps['triple'], jumps['vertical']

In [None]:
# Regression line for predicting vertical from triple:
# b_hat is the slope and a_hat is the intercept.
b_hat = slope(triple, vertical)
a_hat = intercept(triple, vertical)

# Fitted vertical value for every observed triple value.
est_vertical = a_hat + b_hat*triple

In [None]:
# Overlay the hand-computed fitted values on seaborn's regression line; the points
# fall on the line when slope() and intercept() agree with seaborn's fit.
# ci=None (rather than False) is the documented way to disable the confidence band
# and also skips the bootstrap resampling.
sns.lmplot(x='triple', y='vertical', data=jumps, ci=None)
plt.scatter(triple, est_vertical);

### Fitted or Estimated Values ###

In [None]:
# Fitted vertical value at a single triple value. The prediction is computed
# from triple_value so the input is stated in one place only.
triple_value = 550
fitted_550 = a_hat + b_hat*triple_value
fitted_550

### Errors or Residuals ###

In [None]:
# Residual = observed value minus the value fitted by the regression line.
residuals = vertical - est_vertical

# Residual plot: residuals against the predictor.
plt.scatter(x=triple, y=residuals);

In [None]:
# Residuals from the regression line average to zero, up to floating-point rounding.
np.mean(residuals)

### Comparing Two Lines ###

In [None]:
# Average vertical value: the height of the flat comparison line drawn in the next cell.
np.mean(vertical)

In [None]:
# Compare the regression line with a flat line at the mean of vertical (red),
# drawn across the observed range of triple.
# ci=None (rather than False) is the documented way to disable the confidence band
# and also skips the bootstrap resampling.
sns.lmplot(x='triple', y='vertical', data=jumps, ci=None)
plt.plot([min(triple), max(triple)], [np.mean(vertical)] * 2, color='red');

### Anscombe's Data ###

In [None]:
# Load Anscombe's data from a CSV in the working directory and display it.
anscombe = pd.read_csv('anscombe.csv')
anscombe

## Log Transformations ##

$$
y = 2^x
$$

$$
\log(y) = x\log(2) = cx, \quad \text{where } c = \log(2)
$$

In [None]:
# Exponential relationship y = 2^x, evaluated at the integers 1 through 10.
x = np.arange(10) + 1
y = np.power(2, x)

In [None]:
# On the original scale, y = 2^x curves sharply upward.
plt.scatter(x, y);

In [None]:
# Taking the log of y straightens the exponential: log(y) = x*log(2) is linear in x.
plt.scatter(x, np.log(y));

$$
y = 2x^3
$$

$$
\log(y) = \log(2) + 3\log(x)
$$

So $w = av + b$, where $w = \log(y)$, $v = \log(x)$, $a = 3$, and $b = \log(2)$: the relationship is linear once both variables are log-transformed.

In [None]:
# Power-law relationship y = 2x^3, evaluated at the integers 1 through 10.
x = np.arange(10) + 1
y = np.power(x, 3) * 2

In [None]:
# On the original scale, y = 2x^3 is a curve, not a line.
plt.scatter(x, y);

In [None]:
# Logging only y is not enough for a power law: log(y) = log(2) + 3*log(x) is still curved in x.
plt.scatter(x, np.log(y));

In [None]:
# Logging both variables gives a straight line with slope 3 and intercept log(2).
plt.scatter(np.log(x), np.log(y));