# CME538 - Introduction to Data Science
## Lecture 9.1 - Diagnosing Bias & Variance

## Setup Notebook

In [None]:
# Import 3rd party libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Local imports
from bias_variance_visualization import func, fit_model, bias_variance_visualizer

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')

# Let's Create Some Data
Consider the following cubic function.
$$g(x) = \theta_0 + \theta_1 \cdot x + \theta_2 \cdot x^{2} + \theta_3 \cdot x^{3}$$

Where:
- $\theta_0$ =  1.000
- $\theta_1$ = -0.003
- $\theta_2$ =  0.050 
- $\theta_3$ =  0.003

Now, let's plot our function for `-10 <= x <= 10`.

In [None]:
# Plot the noise-free function g(x) over -10 <= x < 10.
# The grid is built once and reused for both the x values and the func() call.
x_grid = np.arange(-10, 10, 0.01)

# Work on an explicit Axes instead of pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(10, 7))

# x and y must be passed by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them with a TypeError from 0.12 onwards.
sns.lineplot(x=x_grid, y=func(x_grid), color='#fc4f30', lw=4, label='g(x)', ax=ax)

ax.xaxis.set_tick_params(labelsize=16)
ax.yaxis.set_tick_params(labelsize=16)
ax.set_xlabel('x', fontsize=22)
ax.set_ylabel('y', fontsize=22)
ax.set_xlim([-10, 10])
ax.set_ylim([-3, 8])
ax.legend(loc=2, fontsize=16)
plt.show()

Next, let's create a large dataset by adding random noise to the true values.

In [None]:
# Build the synthetic dataset: x drawn uniformly from [-10, 10) and
# y = func(x) plus zero-mean Gaussian noise with standard deviation 0.3.
n_samples = 10000

# A seeded generator makes the dataset identical on every Restart & Run All;
# the original drew from NumPy's unseeded global state, so each run differed.
rng = np.random.default_rng(0)

x = rng.uniform(-10, 10, n_samples)
y = func(x) + rng.normal(0, 0.3, n_samples)
data = pd.DataFrame({'x': x, 'y': y})
data.head()

Plot just the samples.

In [None]:
# Scatter plot of the noisy samples only.
fig, ax = plt.subplots(figsize=(10, 7))

# Raw string: '\e' is not a valid Python escape, so a plain literal triggers an
# invalid-escape warning. The resulting label text is unchanged.
sns.scatterplot(x='x', y='y', data=data, label=r'y = g(x) + $\epsilon$', ax=ax)

ax.xaxis.set_tick_params(labelsize=16)
ax.yaxis.set_tick_params(labelsize=16)
ax.set_xlabel('x', fontsize=22)
ax.set_ylabel('y', fontsize=22)
ax.set_xlim([-10, 10])
ax.set_ylim([-3, 8])
ax.legend(loc=2, fontsize=16)
plt.show()

Plot the samples and the true function.

In [None]:
# Overlay the noise-free curve g(x) on the noisy samples.
# Grid for the curve, built once and reused for both x and func(x).
x_grid = np.arange(-10, 10, 0.01)

fig, ax = plt.subplots(figsize=(10, 7))

# Raw string: '\e' is not a valid Python escape, so a plain literal triggers an
# invalid-escape warning. The resulting label text is unchanged.
sns.scatterplot(x='x', y='y', data=data, label=r'y = g(x) + $\epsilon$', ax=ax)

# x and y must be passed by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them with a TypeError from 0.12 onwards.
sns.lineplot(x=x_grid, y=func(x_grid), color='#fc4f30', lw=4, label='g(x)', ax=ax)

ax.xaxis.set_tick_params(labelsize=16)
ax.yaxis.set_tick_params(labelsize=16)
ax.set_xlabel('x', fontsize=22)
ax.set_ylabel('y', fontsize=22)
ax.set_xlim([-10, 10])
ax.set_ylim([-3, 8])
ax.legend(loc=2, fontsize=16)
plt.show()

Lastly, let's try fitting a model to the complete dataset.

In [None]:
# Fit model
# NOTE(review): the first argument (2) is presumably the polynomial degree --
# confirm against fit_model in bias_variance_visualization.
x_plotting, y_plotting, y_train_pred, y_test_pred, model = fit_model(
    2, LinearRegression(fit_intercept=True), data
)

# Plot model and data on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 7))

# Raw strings: '\e' and '\h' are not valid Python escapes, so plain literals
# trigger invalid-escape warnings. The resulting label text is unchanged.
sns.scatterplot(x='x', y='y', data=data, label=r'y = g(x) + $\epsilon$', ax=ax)
ax.plot(x_plotting, y_plotting, label=r'$\hat{y}$', color='#6d904f', lw=4)

ax.xaxis.set_tick_params(labelsize=16)
ax.yaxis.set_tick_params(labelsize=16)
ax.set_xlabel('x', fontsize=22)
ax.set_ylabel('y', fontsize=22)
ax.set_xlim([-10, 10])
ax.set_ylim([-3, 8])
ax.legend(loc=2, fontsize=16)
plt.show()

# Underfitting: Scenario 1
- Model: Polynomial Order 1
- Test Size: 30%
- Sample Size: 1000
- Simulations: 25

In [None]:
# Underfitting scenario 1 -- settings mirror the list in the heading cell:
# polynomial order 1, 30% test split, sample size 1000, 25 simulations.
bias_variance_visualizer(
    data=data,
    ml_model=LinearRegression(fit_intercept=True),
    deg=1,
    n_samples=1000,
    n_models=25,
    test_split=0.3,
    folder='underfitting_1',
)

# Underfitting: Scenario 2
- Model: Polynomial Order 1
- Test Size: 30%
- Sample Size: 50
- Simulations: 25

In [None]:
# Underfitting scenario 2 -- settings mirror the list in the heading cell:
# polynomial order 1, 30% test split, sample size 50, 25 simulations.
bias_variance_visualizer(
    data=data,
    ml_model=LinearRegression(fit_intercept=True),
    deg=1,
    n_samples=50,
    n_models=25,
    test_split=0.3,
    folder='underfitting_2',
)

# Overfitting: Scenario 1
- Model: Polynomial Order 10
- Test Size: 30%
- Sample Size: 5000
- Simulations: 25

In [None]:
# Overfitting scenario 1 -- settings mirror the list in the heading cell:
# polynomial order 10, 30% test split, sample size 5000, 25 simulations.
bias_variance_visualizer(
    data=data,
    ml_model=LinearRegression(fit_intercept=True),
    deg=10,
    n_samples=5000,
    n_models=25,
    test_split=0.3,
    folder='Overfitting_1',
)

# Overfitting: Scenario 2
- Model: Polynomial Order 10
- Test Size: 30%
- Sample Size: 1000
- Simulations: 25

In [None]:
# Overfitting scenario 2 -- settings mirror the list in the heading cell:
# polynomial order 10, 30% test split, sample size 1000, 25 simulations.
bias_variance_visualizer(
    data=data,
    ml_model=LinearRegression(fit_intercept=True),
    deg=10,
    n_samples=1000,
    n_models=25,
    test_split=0.3,
    folder='Overfitting_2',
)

# Overfitting: Scenario 3
- Model: Polynomial Order 10
- Test Size: 30%
- Sample Size: 100
- Simulations: 25

In [None]:
# Overfitting scenario 3 -- settings mirror the list in the heading cell:
# polynomial order 10, 30% test split, sample size 100, 25 simulations.
bias_variance_visualizer(
    data=data,
    ml_model=LinearRegression(fit_intercept=True),
    deg=10,
    n_samples=100,
    n_models=25,
    test_split=0.3,
    folder='Overfitting_3',
)

# Overfitting: Scenario 4
- Model: Polynomial Order 10
- Test Size: 30%
- Sample Size: 10
- Simulations: 25

In [None]:
# Overfitting scenario 4 -- settings mirror the list in the heading cell:
# polynomial order 10, 30% test split, sample size 10, 25 simulations.
bias_variance_visualizer(
    data=data,
    ml_model=LinearRegression(fit_intercept=True),
    deg=10,
    n_samples=10,
    n_models=25,
    test_split=0.3,
    folder='Overfitting_4',
)