In [None]:
%load_ext autoreload
%autoreload 1
# custom functions being developed interactively
%aimport utils_practice_version
import utils_practice_version as utils

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.utils import shuffle

In [None]:
# Part 1 - Fixing high bias.
# Technique 1 - Try polynomial features.

# Load the first lab dataset. prepare_dataset's result is unpacked into
# train / cross validation / test inputs and targets (test is unused here).
(x_train, y_train, x_cv, y_cv, x_test, y_test) = utils.prepare_dataset(
    "data/c2w3_lab2_data1.csv"
)

# Report the dimensions of the training and cross validation splits.
# The trailing "\n" on the target lines leaves a blank line after each pair.
print(
    f"the shape of the training set (input) is: {x_train.shape}",
    f"the shape of the training set (target) is: {y_train.shape}\n",
    f"the shape of the cross validation set (input) is: {x_cv.shape}",
    f"the shape of the cross validation set (target) is: {y_cv.shape}\n",
    sep="\n",
)

# Show a small sample of the inputs rather than the whole array
print(f"first 5 rows of the training inputs (1 feature):\n {x_train[:5]}\n")

# Plain (unregularized) linear regression estimator
model = LinearRegression()

# Fit and plot polynomial regression models with max_degree=10,
# drawn against a baseline level of 400
utils.train_plot_poly(
    model,
    x_train,
    y_train,
    x_cv,
    y_cv,
    baseline=400,
    max_degree=10,
)

In [None]:
# Same polynomial training/plot as the previous cell, but with the baseline
# lowered to 250.
# NOTE: relies on `model` and the data1 splits created in the previous cell.
utils.train_plot_poly(
    model,
    x_train,
    y_train,
    x_cv,
    y_cv,
    baseline=250,
    max_degree=10,
)

In [None]:
# Technique 2 - Add features
# Swap in the second lab dataset. This rebinds x_train, y_train, x_cv, y_cv,
# x_test and y_test, so later cells see this data instead of data1.
(x_train, y_train, x_cv, y_cv, x_test, y_test) = utils.prepare_dataset(
    "data/c2w3_lab2_data2.csv"
)

# Report the dimensions of the training and cross validation splits.
# The trailing "\n" on the target lines leaves a blank line after each pair.
print(
    f"the shape of the training set (input) is: {x_train.shape}",
    f"the shape of the training set (target) is: {y_train.shape}\n",
    f"the shape of the cross validation set (input) is: {x_cv.shape}",
    f"the shape of the cross validation set (target) is: {y_cv.shape}\n",
    sep="\n",
)

# Show a small sample of the inputs rather than the whole array
print(f"first 5 rows of the training inputs (2 features):\n {x_train[:5]}\n")

In [None]:
# Fresh linear regression estimator for the two-feature dataset
model = LinearRegression()

# Fit and plot polynomial regression models with max_degree=7 against a
# baseline of 250.
# NOTE: uses the splits loaded from data/c2w3_lab2_data2.csv in the previous cell.
utils.train_plot_poly(
    model,
    x_train,
    y_train,
    x_cv,
    y_cv,
    baseline=250,
    max_degree=7,
)

In [None]:
# Technique 3 - Decrease lambda, the regularization parameter
# Candidate lambda values to plot, listed from largest (10) down to smallest (0.06)
reg_params = [
    10,
    5,
    2,
    1,
    0.5,
    0.2,
    0.1,
    0.08,
    0.06,
]

# Train a degree-4 polynomial model for each lambda and plot against a
# baseline of 250.
# NOTE: uses whichever splits are currently bound to x_train / y_train / x_cv / y_cv.
utils.train_plot_reg_params(
    reg_params,
    x_train,
    y_train,
    x_cv,
    y_cv,
    baseline=250,
    degree=4,
)

In [None]:
# Part 2 - Fixing high variance
# Technique 1 - Increase lambda, the regularization term.
# Candidate lambda values to plot, listed from smallest (0.01) up to largest (1)
reg_params = [
    0.01,
    0.02,
    0.05,
    0.1,
    0.2,
    0.5,
    1,
]

# Train a degree-4 polynomial model for each lambda and plot against a
# baseline of 250.
# NOTE: uses whichever splits are currently bound to x_train / y_train / x_cv / y_cv.
utils.train_plot_reg_params(
    reg_params,
    x_train,
    y_train,
    x_cv,
    y_cv,
    baseline=250,
    degree=4,
)

In [None]:
# Technique 2 - Try smaller sets of features
# To illustrate how removing features can improve performance, polynomial
# regression is run on 2 datasets: the 2-feature data used above and a variant
# with an added random ID column (3 features). This cell previews both; the
# remaining 2 columns are identical between the two datasets.

# First, the 2-feature dataset (no random ID column)
(x_train, y_train, x_cv, y_cv, x_test, y_test) = utils.prepare_dataset(
    "data/c2w3_lab2_data2.csv"
)

# Show a small sample of the 2-feature inputs
print(f"first 5 rows of the training set with 2 features:\n {x_train[:5]}\n")

# Next, the dataset that includes the randomID feature.
# NOTE: this rebinds the same names, so after this cell x_train etc. hold the
# 3-feature splits, not the 2-feature ones.
(x_train, y_train, x_cv, y_cv, x_test, y_test) = utils.prepare_dataset(
    "data/c2w3_lab2_data3.csv"
)

# Show a small sample of the 3-feature inputs
print(
    f"first 5 rows of the training set with 3 features (1st column is a random ID):\n {x_train[:5]}\n"
)

In [None]:
# Linear regression estimator shared by both dataset runs
model = LinearRegression()

# Describe each dataset to compare: the CSV path, the label attached to it,
# and the line style attached to it.
file1 = dict(
    filename="data/c2w3_lab2_data3.csv",
    label="3 features",
    linestyle="dotted",
)
file2 = dict(
    filename="data/c2w3_lab2_data2.csv",
    label="2 features",
    linestyle="solid",
)
# The 3-feature dataset is listed first, then the 2-feature one
files = [file1, file2]

# Train and plot for every dataset in `files`, with max_degree=4 and a
# baseline of 250. The helper receives file paths, not the x_train / x_cv
# variables from earlier cells.
utils.train_plot_diff_datasets(
    model,
    files,
    baseline=250,
    max_degree=4,
)

In [None]:
# Technique 3 - Get more training examples
# Lastly, try to reduce the cross validation error by getting more examples.
# This cell trains a 4th degree polynomial model and plots its learning curve
# to show how the errors behave as more examples are used.

# Load the fourth lab dataset, rebinding the shared split variables
(x_train, y_train, x_cv, y_cv, x_test, y_test) = utils.prepare_dataset(
    "data/c2w3_lab2_data4.csv"
)

# Report the dimensions of the full training and cross validation splits.
# The trailing "\n" on the target lines leaves a blank line after each pair.
print(
    f"the shape of the entire training set (input) is: {x_train.shape}",
    f"the shape of the entire training set (target) is: {y_train.shape}\n",
    f"the shape of the entire cross validation set (input) is: {x_cv.shape}",
    f"the shape of the entire cross validation set (target) is: {y_cv.shape}\n",
    sep="\n",
)

# Plain linear regression estimator for the learning-curve run
model = LinearRegression()

# Train a degree-4 polynomial model on subsets of the dataset and plot the
# learning curve against a baseline of 250.
utils.train_plot_learning_curve(
    model,
    x_train,
    y_train,
    x_cv,
    y_cv,
    baseline=250,
    degree=4,
)