In [None]:
# HIDDEN
import warnings
# Ignore numpy dtype warnings. These warnings are caused by an interaction
# between numpy and Cython and can be safely ignored.
# Reference: https://stackoverflow.com/a/40846742
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import nbinteract as nbi

# Apply seaborn's default plot styling, then switch to the 'talk' plotting
# context for every figure drawn in this notebook.
sns.set()
sns.set_context('talk')
# Keep printed numpy arrays compact: summarize arrays with more than 20
# elements, show two decimals, and suppress scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Cap how much of a DataFrame is rendered by default.
pd.set_option('display.max_rows', 7)
pd.set_option('display.max_columns', 9)
# Use the fully-qualified option name. The abbreviated 'precision' is matched
# as a pattern and is ambiguous on pandas versions that also define
# 'styler.format.precision', which raises an OptionError.
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
# pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# HIDDEN
def df_interact(df, nrows=7, ncols=7):
    '''
    Show a pageable window onto df, driven by ipywidgets sliders.

    A window of at most nrows x ncols cells is displayed. A row (column)
    slider stepping by nrows (ncols) is created only when df has more rows
    (columns) than fit in a single window; otherwise that offset is fixed
    at 0. The full shape of df is printed afterwards.
    '''
    total_rows = len(df)
    total_cols = len(df.columns)

    def peek(row=0, col=0):
        # Slice the window whose top-left corner sits at (row, col).
        row_stop = row + nrows
        col_stop = col + ncols
        return df.iloc[row:row_stop, col:col_stop]

    if total_rows > nrows:
        row_arg = (0, total_rows, nrows)
    else:
        row_arg = fixed(0)

    if total_cols > ncols:
        col_arg = (0, total_cols, ncols)
    else:
        col_arg = fixed(0)

    interact(peek, row=row_arg, col=col_arg)
    print('({} rows, {} columns) total'.format(*df.shape))

def display_df(df, rows=50, cols=pd.options.display.max_columns):
    '''
    Display df with temporarily overridden pandas display limits.

    rows and cols replace 'display.max_rows' and 'display.max_columns' only
    for the duration of the display call. The default for cols is the
    max_columns setting in effect when this function is defined.
    '''
    limits = ('display.max_rows', rows, 'display.max_columns', cols)
    with pd.option_context(*limits):
        display(df)

## Cross-Validation

Data: Hourly traffic volume for westbound I-94 in Minneapolis-St Paul, MN, from 2012 to 2018. The dataset also includes weather and holiday features.

In [None]:
# Read the traffic CSV from the working directory and page through it with
# the slider-based viewer.
tr = pd.read_csv('traffic_one_hot.csv')
df_interact(tr)

You should implement K-fold CV yourself to really understand it. For now, we'll use sklearn.

In [None]:
from sklearn.model_selection import KFold

# Placeholder: kf is currently just Ellipsis. Presumably a KFold splitter is
# meant to be constructed here -- TODO fill in.
kf = ...

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

def keep_cols(X, cols):
    '''
    Placeholder: not implemented yet, so it currently returns None.

    NOTE(review): presumably meant to return only the columns `cols` of X
    (e.g. for use with FunctionTransformer) -- TODO confirm and implement.
    '''
    pass

def poly_model(degree):
    '''
    Placeholder: not implemented yet, so it currently returns None.

    NOTE(review): presumably meant to build a polynomial regression model of
    the given degree -- TODO confirm and implement.
    '''
    pass

def mini_model(cols):
    '''
    Placeholder: not implemented yet, so it currently returns None.

    NOTE(review): presumably meant to build a model restricted to the
    columns `cols` -- TODO confirm and implement.
    '''
    pass

In [None]:
from sklearn.model_selection import cross_validate

# Placeholders: all three names are currently Ellipsis, so the cell displays
# Ellipsis. Presumably this is where one model is cross-validated and its
# validation error computed -- TODO fill in.
model = ...
scores = ...
valid_error = ...
valid_error

In [None]:
def cv(model):
    '''
    Placeholder: currently returns Ellipsis regardless of model.

    NOTE(review): presumably meant to return the cross-validation results
    for model -- TODO confirm and implement.
    '''
    return ...

In [None]:
def val_error(cv):
    '''
    Placeholder: currently returns Ellipsis regardless of cv.

    NOTE(review): presumably meant to summarize the validation error from a
    cross-validation result -- TODO confirm and implement.
    '''
    return ...

In [None]:
# Placeholder: as written, the braces form a set literal containing Ellipsis
# rather than column data, so the real columns still need to be filled in.
# The error helpers in this notebook read the 'cv' and 'name' columns of
# this frame.
models = pd.DataFrame({
    ...
})

It takes some work, but we can extract all the errors for plotting:

In [None]:
def get_train_errs(models):
    '''
    Return, for each row of models, the negated 'train_score' entry of that
    row's 'cv' value.
    '''
    def negated_train_score(cv):
        return -cv['train_score']

    return models['cv'].apply(negated_train_score)

def get_valid_errs(models):
    '''
    Return, for each row of models, the natural log of the negated
    'test_score' entry of that row's 'cv' value.

    Unlike get_train_errs, the values returned here are log-transformed.
    '''
    def log_valid_error(cv):
        positive_scores = -cv['test_score']
        return np.log(positive_scores)

    return models['cv'].apply(log_valid_error)

def make_errors(models, errs):
    '''
    Reshape per-model error arrays into a long-format frame for plotting.

    models: DataFrame with a 'name' column; its index labels the models.
    errs: Series indexed like models whose values are equal-length arrays
        of errors (one entry per fold).

    Returns a DataFrame with one row per (model, fold) and the columns
    'index' (the model's index label), 'mse' (the error value) and 'name'
    (looked up from models).
    '''
    # pd.DataFrame.from_items was removed in pandas 1.0. A dict preserves
    # insertion order, so this builds the same one-column-per-model frame.
    wide = pd.DataFrame(dict(zip(errs.index, errs.values)))
    return (wide
            .melt(var_name='index', value_name='mse')
            .merge(models[['name']], left_on='index', right_index=True))

def plot_errors(errors, log=False, err_type='Training'):
    '''
    Draw a point plot of error by model, with standard-deviation bars.

    errors: long-format DataFrame with 'name' and 'mse' columns.
    log: only switches the y-axis label to 'log(mse)'; the data is plotted
        as given and is not transformed here.
    err_type: prefix used in the plot title.
    '''
    plt.figure(figsize=(8, 4))
    # x and y are passed by keyword: seaborn 0.12+ takes `data` as the first
    # positional argument, so positional column names no longer work.
    # `ci='sd'` is kept for compatibility with older seaborn; newer releases
    # spell this errorbar='sd'.
    sns.pointplot(x='name', y='mse', ci='sd', data=errors)
    plt.xlabel('Model')
    plt.ylabel('log(mse)' if log else 'mse')
    plt.title(f'{err_type} Error')

## Regularization

In [None]:
# Read the water dataset from the working directory and display it.
water = pd.read_csv('water.csv')
water

In [None]:
# Column names are passed by keyword: seaborn 0.12+ takes `data` as the first
# positional argument, so positional x/y no longer work.
sns.scatterplot(x='water_level_change', y='water_flow', data=water)

In [None]:
# X keeps the first column as a one-column DataFrame (list indexer);
# y is the second column as a Series.
# NOTE(review): presumably these are 'water_level_change' and 'water_flow'
# respectively -- confirm against the CSV's column order.
X = water.iloc[:, [0]]
y = water.iloc[:, 1]

In [None]:
from sklearn.linear_model import Ridge, Lasso

def ridge_model(degree, alpha=1.0):
    '''
    Placeholder: not implemented yet, so it currently returns None.

    NOTE(review): presumably meant to build a degree-`degree` polynomial
    Ridge model with regularization strength alpha -- TODO confirm and
    implement.
    '''
    pass

def lasso_model(degree, alpha=1.0):
    '''
    Placeholder: not implemented yet, so it currently returns None.

    NOTE(review): presumably meant to build a degree-`degree` polynomial
    Lasso model with regularization strength alpha -- TODO confirm and
    implement.
    '''
    pass

In [None]:
from itertools import chain

def plot_model(model, ax=None):
    '''
    Fit model on the notebook-level X and y, then draw its predictions over
    a scatter plot of the water data.

    model: estimator exposing fit and predict; it is fitted in place.
    ax: matplotlib Axes to draw on; defaults to the current Axes.

    Predictions are evaluated at 100 evenly spaced inputs in [-50, 50] and
    the y-axis is clamped to [-5, 55].
    '''
    # Test against None explicitly rather than relying on Axes truthiness.
    if ax is None:
        ax = plt.gca()
    model.fit(X, y)
    xs = np.linspace(-50, 50, 100)
    ys = model.predict(xs.reshape((-1, 1)))
    # x and y are passed by keyword: seaborn 0.12+ takes `data` as the first
    # positional argument, so positional x/y no longer work.
    sns.scatterplot(x='water_level_change', y='water_flow', data=water, ax=ax)
    sns.lineplot(x=xs, y=ys, ax=ax, color=sns.xkcd_rgb['dark gold'])
    ax.set_ylim(-5, 55)
    
def plot_models(models, cols=2):
    '''
    Draw each model in models on its own panel of a shared-axis grid.

    The grid has cols columns and as many rows as are needed to hold every
    model; panels are filled row by row via plot_model.
    '''
    n_rows = int(np.ceil(len(models) / cols))
    _, grid = plt.subplots(
        n_rows, cols,
        figsize=(10, 4 * n_rows),
        sharex=True,
        sharey=True,
        squeeze=False,
    )
    # squeeze=False keeps grid two-dimensional, so .flat walks it row-major.
    for panel, fitted in zip(grid.flat, models):
        plot_model(fitted, panel)
    plt.tight_layout()

In [None]:
# Plot a ridge model and a lasso model side by side, both built with the
# same polynomial degree but different alpha values.
degree = 12

plot_models([
    ridge_model(degree, alpha=0.001),
    lasso_model(degree, alpha=1.0),
])

In [None]:
def coefs(model, name='coef'):
    '''
    Collect the fitted parameters of the last step of model into a frame.

    model must expose .steps (a list of pairs); the second item of the last
    pair must provide intercept_ and an iterable coef_. The result has a
    single column called name, holding the intercept in row 0 followed by
    the entries of coef_, and an index named 'deg'.
    '''
    final_step = model.steps[-1]
    estimator = final_step[1]
    values = (estimator.intercept_,) + tuple(estimator.coef_)
    return pd.DataFrame({name: values}).rename_axis('deg')

def log_coefs(coefs):
    '''
    Return the base-10 logarithm of the absolute value of every entry of
    the coefs frame.
    '''
    magnitudes = coefs.apply(np.abs)
    return magnitudes.apply(np.log10)

def plot_coefs(coefs):
    '''
    Draw a line plot of the log_coefs transform of the coefs frame.
    '''
    logged = log_coefs(coefs)
    logged.plot.line()