# [HW4] Problem: Overparameterized Linear Regression


In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import interactive
import ipywidgets as widgets
from ipywidgets import fixed

# Import various helpful functions
from helpers import *


## Part (g): Solve the weighted LS problem

**Solve the weighted least-squares regression to obtain the final coefficients $\hat{\boldsymbol{\alpha}}$** using the sklearn `LinearRegression` model. Remember that we first find the coefficients for _weighted_ features $\hat{\boldsymbol{\beta}}$, then use the weights to find the final coefficients.


In [None]:
from sklearn.linear_model import LinearRegression
def solve_ls(phi, y, weights=None):
    """Solve the weighted least-squares problem and return the final coefficients.

    Each feature column of ``phi`` is scaled by its weight, a linear regressor
    without intercept is fitted on the scaled features to obtain ``beta``, and
    the weights are applied once more to express the solution in terms of the
    original (unweighted) features: ``alpha = weights * beta``.

    Parameters
    ----------
    phi : 2-D array whose second dimension indexes the ``d`` features.
    y : training targets; either one target per row of ``phi`` or a 2-D array
        with one column per independent target vector.
    weights : optional length-``d`` feature weights. Defaults to all ones,
        i.e. ordinary (unweighted) least squares.

    Returns
    -------
    alpha : coefficients on the unweighted features. Has the layout of
        ``LinearRegression.coef_``: shape ``(d,)`` for 1-D ``y`` and one row
        per target column for 2-D ``y``.
    loss : mean squared training residual ``mean((y - phi @ alpha.T) ** 2)``.
    """
    d = phi.shape[1]
    if weights is None:
        weights = np.ones(d)
    # Broadcasting scales column j of phi by weights[j].
    phi_weighted = weights * phi
    # `normalize` is intentionally not passed: it was removed in
    # scikit-learn 1.2 and its former default was already False.
    LR = LinearRegression(fit_intercept=False)
    ### start g1 ###
    LR.fit(phi_weighted, y)
    beta = LR.coef_
    # phi_weighted @ beta == phi @ (weights * beta), so rescaling beta by the
    # weights gives the coefficients on the original features.
    alpha = beta * weights
    ### end g1 ###
    loss = np.mean((y - phi @ alpha.T)**2)
    return alpha, loss


Next, we have implemented for you a solve function that calculates the regression coefficients for the cases in parts (a)-(c). Take a look at the coefficients learned in each case, and notice how the noise energy is aliased across $d > n$ features.


In [None]:
# Seed NumPy's global RNG so the random draws below are reproducible.
seed = 127
np.random.seed(seed)

# Problem sizes: s (number of favored features), n (training points),
# d (total number of features).
s, n, d = 25, 128, 257

# Number of independent noise realizations, and which one gets plotted.
num_training_noise_seeds, noise_seed_idx = 10, 5

# Selectors forwarded to the helper functions (features, inputs, target).
phi_type, x_type, f_type = 'fourier', 'grid', 'cos2'

# Standard deviation of the additive Gaussian training noise.
awgn_std = 0.2

# Number of test points used to estimate the prediction error.
n_test = 10000

# Parameter of the bi-level feature weighting.
gamma = 0.9


def solve(s, n, d, num_training_noise_seeds,
          phi_type, x_type, f_type,
          awgn_std, n_test, noise_seed_idx,
          gamma,
          plot_all=True):
    """Fit the weighted least-squares model on signal, noise, and signal + noise.

    Three cases are evaluated on the same training features: the noiseless
    targets, pure noise targets, and their sum. The signal + noise
    coefficients are obtained by adding the first two solutions rather than
    solving a third problem. Training loss and test prediction error are
    printed for each case, together with the closed-form quantities
    (1-SU)^2, CNs^2, CNe^2 and lambda computed from the parameters.

    Parameters
    ----------
    s : passed to ``get_bilevel_weights`` and used in the lambda formula;
        must satisfy ``d >= s``.
    n : number of training inputs requested from ``generate_x``; must
        satisfy ``d >= n``.
    d : number of features requested from ``featurize``.
    num_training_noise_seeds : number of independent noise vectors drawn
        (one column each).
    phi_type, x_type, f_type : selectors forwarded to ``featurize``,
        ``generate_x`` (training inputs only) and ``generate_y``.
    awgn_std : standard deviation of the zero-mean Gaussian training noise.
    n_test : number of test inputs; these are always generated with
        ``x_type='uniform_random'``.
    noise_seed_idx : column index of the noise realization that is plotted.
    gamma : passed to ``get_bilevel_weights`` and used in the lambda formula.
    plot_all : if True, all six panels are drawn into one 2x3 figure. If
        False, only the signal + noise prediction and coefficient plots are
        drawn, each called with ``show=True``.

    Returns
    -------
    None. Results are reported through ``print`` and matplotlib.
    """
    # The SU / CNs / CNe values requested by the original TODO are printed below.
    assert(d >= n)
    assert( d >= s)
    # Training data: inputs, their feature matrix, and noiseless targets.
    x_train = generate_x(x_type=x_type, n=n)
    phi_train = featurize(x_train, d, phi_type)
    y_train = generate_y(x=x_train, f_type=f_type)

    # Test data is generated independently of the training x_type.
    x_test= generate_x(x_type = 'uniform_random', n=n_test)
    y_test = generate_y(x=x_test, f_type=f_type)
    phi_test = featurize(x_test, d, phi_type)

    weights = get_bilevel_weights(s,gamma,d)
    plot_weights(weights, gamma, s)

    # Expected prediction error
    lambd = s * (1 - gamma) / (n * gamma)
    SU = 1. / (1 + lambd)
    CNs_sqr = (n / d) * (lambd ** 2 / (1 + lambd) ** 2)
    CNe_sqr = awgn_std ** 2 * ((s / n) * ((1 + n * lambd ** 2 / d) / (1 + lambd) ** 2) + (n - s) / d)
    print("(1-SU)^2: {:.4f}, CNs^2: {:.4f}, CNe^2: {:.4f}".format(
            (1-SU)**2, CNs_sqr, CNe_sqr))
    print("lambda: "+str(round(lambd,4)))

    # Generate noise
    # One column per noise realization; broadcasting adds each column to y_train.
    noise = np.random.normal(0, awgn_std, size = [y_train.shape[0], num_training_noise_seeds])
    y_train_noisy = y_train[:,None] + noise

    # Solve the noiseless case
    coeffs_noiseless, loss_noiseless  = solve_ls(phi_train, y_train, weights)
    # Reference coefficients: unweighted fit on the first n features only,
    # zero-padded to the full coefficient length.
    true_coeffs = np.zeros_like(coeffs_noiseless)
    true_coeffs[:n] = solve_ls(phi_train[:,:n], y_train)[0]
    y_test_pred_noiseless = phi_test @ coeffs_noiseless
    if plot_all:
        plt.figure(figsize=[16, 9])
        plt.subplot(2,3,1)
        plot_prediction(x_train, y_train, x_test, y_test, y_test_pred_noiseless, show=not plot_all)
        plt.subplot(2,3,4)
        plot_coeffs(coeffs_noiseless, true_coeffs, title = 'Signal only', show=not plot_all)
    pred_error_noiseless = np.mean((y_test_pred_noiseless - y_test)**2)
    print("Noiseless Training Loss: ", loss_noiseless)
    print("Noiseless Prediction Error: ", round(pred_error_noiseless,3))

    # Solve the pure noise case
    # All noise realizations are fitted in one call.
    coeffs_noise, loss_noise  = solve_ls(phi_train, noise, weights)
    # NOTE(review): assumes solve_ls returns one row per noise realization;
    # the transpose puts realizations in columns so phi_test @ coeffs_noise works.
    coeffs_noise = coeffs_noise.T
    true_coeffs_noise = np.zeros(d)
    y_test_pred_noise = phi_test @ coeffs_noise
    if plot_all:
        plt.subplot(2,3,2)
        plot_prediction(x_train, noise[:,noise_seed_idx], x_test, np.zeros_like(x_test), y_test_pred_noise[:,noise_seed_idx], show=not plot_all)
        plt.subplot(2,3,5)
        plot_coeffs(coeffs_noise[:,noise_seed_idx], true_coeffs_noise, title = 'Noise only', show=not plot_all)
    # The noiseless target for pure noise is zero, so the error is the mean
    # squared prediction itself.
    pred_error_noise = np.mean((y_test_pred_noise)**2)
    print("Pure Noise Training Loss: ", loss_noise)
    print("Pure Noise Prediction Error: ", round(pred_error_noise,3))

    # Solve the signal + noise case
    # Here we take advantage of the linearity of the interpolator to avoid solving
    # another LS problem.
#     coeffs, loss  = solve_ls(phi_train, y_train_noisy, weights)
#     coeffs = coeffs.T
    coeffs = coeffs_noise + coeffs_noiseless[:,None]
    loss =  np.mean((phi_train @ coeffs - y_train_noisy)**2)
    y_test_pred = phi_test @ coeffs

    # Unlike the two cases above, these plot calls sit outside the
    # `if plot_all` guard, so they also run when plot_all is False.
    if plot_all:
        plt.subplot(2,3,3)
    plot_prediction(x_train, y_train_noisy[:,noise_seed_idx], x_test, y_test, y_test_pred[:,noise_seed_idx], show=not plot_all)

    if plot_all:
        plt.subplot(2,3,6)
    plot_coeffs(coeffs[:,noise_seed_idx], true_coeffs, title = 'Signal + Noise', show=not plot_all)
    # Prediction error averaged over test points and all noise realizations.
    pred_error = np.mean((y_test_pred - y_test[:,None])**2)
    print("Final Training Loss: ", loss)
    print("Final Prediction Error: ", round(pred_error,3))


# Run the experiment once with the parameters defined above.
solve(
    s=s,
    n=n,
    d=d,
    num_training_noise_seeds=num_training_noise_seeds,
    phi_type=phi_type,
    x_type=x_type,
    f_type=f_type,
    awgn_std=awgn_std,
    n_test=n_test,
    noise_seed_idx=noise_seed_idx,
    gamma=gamma,
)


Play around with the interactive plot below. **Comment on the interactions between $\gamma$ and the prediction error, $s$ and the prediction error, $d$ and the prediction error, and the behavior as $\sigma_{awgn}$ increases.**


In [None]:
# Interactive version of `solve`: s, d, awgn_std and gamma get widgets built
# by the helper functions; every argument wrapped in `fixed` is held constant.
interactive_plot = interactive(
    solve,
    s=generate_s_widget(50),
    n=fixed(200),
    d=generate_d_widget(),
    num_training_noise_seeds=fixed(num_training_noise_seeds),
    phi_type=fixed(phi_type),
    x_type=fixed(x_type),
    f_type=fixed(f_type),
    awgn_std=generate_awgn_std_widget(),
    n_test=fixed(n_test),
    noise_seed_idx=fixed(noise_seed_idx),
    gamma=generate_gamma_widget(),
    plot_all=fixed(True),
)
# Bare expression so the notebook renders the widget.
interactive_plot


_Your comments here..._


Finally, use the function `vary_everything_together` to tie $s$, $d$, and $\gamma$ to $n$ through power laws ($s = n^p$, $d = n^q$, $\gamma = n^{-r}$), then explore the behavior as $n$ varies. **Comment on the prediction error as $n$ varies for several combinations of $p$, $q$, and $r$.** You may want to restrict yourself to $n \leq 250$ and $q \leq 1.5$, as solving the regression problem becomes slow for larger values.


In [None]:
def vary_everything_together(n, p, q, r):
    """Derive s, d and gamma from n through power laws.

    s     = floor(n^p)
    d     = floor(n^q), bumped to the next odd integer when even (q >= 1)
    gamma = n^(-r)  (0 <= r <= q-p keeps the first s features favored)

    Prints a warning when gamma falls below s/d. Returns ``(s, d, gamma)``.
    """
    s = int(n ** p)
    # OR-ing with 1 adds one to an even d and leaves an odd d unchanged.
    d = int(n ** q) | 1
    gamma = n ** (-r)
    if s / d > gamma:
        print("WARNING: No longer favoring first s features")
    return s, d, gamma

# TODO: Vary n, for various combinations of p, q, and r
n, p, q, r = 250, 0.8, 1.5, 0.05

# Derive the remaining problem parameters from n and report them.
s, d, gamma = vary_everything_together(n, p, q, r)
print("s: %d, d: %d, gamma: %f" % (s, d, gamma))
print("s/n: %f, n/d: %f, lambda: %f" % (s/n, n/d, s * (1 - gamma) / (n * gamma)))

# Re-run the experiment with the derived s, d and gamma; all other settings
# come from the earlier parameter cell.
solve(
    s=s,
    n=n,
    d=d,
    num_training_noise_seeds=num_training_noise_seeds,
    phi_type=phi_type,
    x_type=x_type,
    f_type=f_type,
    awgn_std=awgn_std,
    n_test=n_test,
    noise_seed_idx=noise_seed_idx,
    gamma=gamma,
)


_Your comments here..._


## Part (i)
### Connection to ridge regression

The $\lambda$ parameter induced by the extra features can be used (with appropriate scaling) in ridge regression with the low-frequency features. Below you can see that the survival of the true coefficient and the shape of the learned functions are almost the same. **Explore how $s$ and $\gamma$ affect $\lambda$ and the correspondence between the two versions of regression.**


In [None]:
# Re-seed NumPy's global RNG so this part is reproducible on its own.
seed = 127
np.random.seed(seed)

# s, n and d are not reset here; the assignments are left commented out.
# s = 25
# n = 128
# d = 257

# Number of independent noise realizations, and which one gets plotted.
num_training_noise_seeds, noise_seed_idx = 10, 5

# Selectors for the feature map, training inputs and target function.
phi_type, x_type, f_type = 'fourier', 'grid', 'cos2'

# Noise standard deviation, test-set size and bi-level weighting parameter.
awgn_std = 0.2
n_test = 10000
gamma = 0.9

from helpers import solve2


In [None]:
# Interactive version of `solve2`: only s and gamma get widgets; n, d and
# every other argument are pinned with `fixed`, and the noise level is zero.
interactive_plot = interactive(
    solve2,
    s=generate_s_widget(50),
    n=fixed(100),
    d=fixed(1001),
    num_training_noise_seeds=fixed(num_training_noise_seeds),
    phi_type=fixed(phi_type),
    x_type=fixed(x_type),
    f_type=fixed(f_type),
    awgn_std=fixed(0),
    n_test=fixed(n_test),
    noise_seed_idx=fixed(noise_seed_idx),
    gamma=generate_gamma_widget(0.5),
    plot_all=fixed(True),
)
# Bare expression so the notebook renders the widget.
interactive_plot


### Impulse response of training point

The plots below visualize the impulse response of a single training point, $\mathbf{y} = [0,\ldots,1,0,\ldots,0]$. **Explore how the shape of the function using the first $s$ features changes with $s$ and how it impacts the learned function. Explore how the full impulse response changes shape with $d$ and $\gamma$ and how that impacts the learned function.**


In [None]:
# Re-seed NumPy's global RNG so this part is reproducible on its own.
seed = 127
np.random.seed(seed)

# Problem sizes for the impulse-response experiment: s, n, d.
s, n, d = 12, 32, 1025

# Number of independent noise realizations, and which one gets plotted.
num_training_noise_seeds, noise_seed_idx = 10, 5

# Selectors for the feature map, training inputs and target function.
phi_type, x_type, f_type = 'fourier', 'grid', 'cos2'

# Noise standard deviation, test-set size and bi-level weighting parameter.
awgn_std = 0.2
n_test = 10000
gamma = 0.9

from helpers import solve3
# solve3(s, n, d, num_training_noise_seeds,
#       phi_type, x_type, f_type,
#       awgn_std, n_test, noise_seed_idx, gamma)


In [None]:
# Interactive version of `solve3`: s, d and gamma get widgets; the remaining
# arguments are pinned with `fixed`, and the noise level is zero.
# `max_s=n` and `gamma_min=s/d` read the module-level n, s and d once, when
# the widgets are constructed.
interactive_plot = interactive(
    solve3,
    s=generate_s_widget(11, max_s=n),
    n=fixed(32),
    d=generate_d_widget(),
    num_training_noise_seeds=fixed(num_training_noise_seeds),
    phi_type=fixed(phi_type),
    x_type=fixed(x_type),
    f_type=fixed(f_type),
    awgn_std=fixed(0),
    n_test=fixed(n_test),
    noise_seed_idx=fixed(noise_seed_idx),
    gamma=generate_gamma_widget(0.9, gamma_min=s/d),
    plot_all=fixed(True),
)
# Bare expression so the notebook renders the widget.
interactive_plot
