In [1]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Plot styling: seaborn defaults with the 'talk' context.
sns.set()
sns.set_context('talk')
# Keep printed arrays and DataFrames compact.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 9)
# Use the fully qualified option name: the bare 'precision' pattern is
# ambiguous on newer pandas (it also matches 'styler.format.precision') and
# raises OptionError there.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Outputs sliders that show rows and columns of df

    df: DataFrame to page through
    nrows: number of rows shown at once; also the step of the row slider
    ncols: number of columns shown at once; also the step of the column slider

    An axis that already fits in a single window gets a fixed offset of 0
    instead of a slider.
    '''
    def peek(row=0, col=0):
        return df.iloc[row:row + nrows, col:col + ncols]

    total_rows, total_cols = df.shape

    # The slider maximum is the last valid offset (length - 1). Using the
    # length itself would allow an offset one past the end, where the
    # window is an empty slice.
    row_arg = (0, total_rows - 1, nrows) if total_rows > nrows else fixed(0)
    col_arg = ((0, total_cols - 1, ncols)
               if total_cols > ncols else fixed(0))

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(total_rows, total_cols))

def display_df(df, rows=pd.options.display.max_rows,
               cols=pd.options.display.max_columns):
    '''
    Displays df with temporary row and column display limits

    rows, cols: limits applied only while df is displayed. Their defaults
    are the pandas option values read once, when this function is defined.
    '''
    overrides = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*overrides):
        display(df)

## Polynomial Regression

In [None]:
# Load seaborn's 'mpg' dataset, then drop every row that has a missing value.
mpg = sns.load_dataset('mpg')
mpg = mpg.dropna()
mpg

In [None]:
# Tabulate the model's intercept and coefficients next to their term labels.
# NOTE(review): `model` is not defined in any cell visible here — presumably
# a regression fit on hp and hp^2 from an omitted cell; confirm before running.
pd.DataFrame({
    'term': ['bias', 'hp', 'hp^2'],
    'coef': [model.intercept_, *model.coef_]
})

## One-Hot Encoding

In [None]:
# Display the mpg table again as the starting point for this section.
mpg

In [None]:
# One box of mpg values per origin category; the trailing ';' hides the repr.
sns.boxplot(data=mpg, x='origin', y='mpg');

In [None]:
# Build a design matrix with a constant bias column plus 0/1 indicator
# columns for the 'usa' and 'europe' values of `origs`.
# NOTE(review): `origs` is not defined in any cell visible here — presumably
# the 'origin' column of mpg; confirm before running.
hot = pd.DataFrame({
    'bias': 1,
    'origin=usa': (origs == 'usa').astype(int),
    'origin=europe': (origs == 'europe').astype(int),
})
hot

## Adding Too Many Features

In [None]:
# Read icecream.csv (path is relative to the working directory) and show it.
# Later cells plot its 'sweetness' and 'overall' columns.
ice = pd.read_csv('icecream.csv')
ice

In [None]:
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally. The trailing ';' hides the Axes repr.
sns.scatterplot(x='sweetness', y='overall', data=ice);

In [None]:
def mse(pred, truth):
    '''
    Returns the mean of the squared differences between pred and truth
    '''
    residuals = pred - truth
    squared_errors = residuals ** 2
    return np.mean(squared_errors)

def ice_mse(model):
    '''
    Returns the MSE of model's predictions on the notebook-level X against y
    '''
    predictions = model.predict(X)
    return mse(predictions, y)

In [None]:
def draw_pred(model):
    '''
    Plots the ice cream points with model's predictions drawn as a red line

    model: object whose predict method is called with a single-column 2D
    array of 50 evenly spaced values from 4 to 12
    '''
    to_draw = np.linspace(4, 12, 50)
    # -1 lets numpy infer the row count, so the reshape stays correct if the
    # number of points above is changed.
    line = model.predict(to_draw.reshape(-1, 1))
    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally.
    sns.scatterplot(x='sweetness', y='overall', data=ice)
    sns.lineplot(x=to_draw, y=line, color='r', linewidth=3)

In [None]:
def poly_model(degree, X, y):
    '''
    Fits a two-step pipeline on X and y and returns the result of fit

    The steps are a PolynomialFeatures expansion of the given degree
    followed by a LinearRegression created with fit_intercept=False.
    '''
    # NOTE(review): presumably the intercept is disabled because the
    # polynomial expansion already supplies a constant column — confirm.
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),
        ('lin_reg', LinearRegression(fit_intercept=False)),
    ]
    pipeline = Pipeline(steps)
    return pipeline.fit(X, y)

In [None]:
# Placeholder left unfilled. Later cells index `models[deg]` for every degree
# from 1 through 10, so this presumably should map each degree to a fitted
# model (e.g. built with poly_model) — TODO confirm and fill in.
models = ...

In [None]:
# 2x2 grid: one panel per polynomial degree, each titled with that model's MSE.
plt.figure(figsize=(10, 8))
for panel, deg in enumerate([1, 2, 5, 10], start=1):
    plt.subplot(2, 2, panel)
    draw_pred(models[deg])
    plt.title(f'Degree {deg} poly, MSE = {ice_mse(models[deg]):.2f}')
    # Same y-range in every panel so the fits are visually comparable.
    plt.ylim(3.5, 7.5)
plt.tight_layout()

In [None]:
# MSE of each model in `models`, for degrees 1 through 10.
degrees = np.arange(1, 11)
all_loss = [ice_mse(models[deg]) for deg in degrees]
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
sns.lineplot(x=degrees, y=all_loss)
plt.title('Training error')
plt.xlabel('Degree of Poly Features')
plt.ylabel('MSE');

## Bias-Variance Tradeoff

In [None]:
# Read clothes.csv (path is relative to the working directory) and show it.
# The next cell plots its 'airVol' and 'evapRes' columns.
clothes = pd.read_csv('clothes.csv')
clothes

In [None]:
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally. The trailing ';' hides the Axes repr.
sns.scatterplot(x='airVol', y='evapRes', data=clothes);