## Linear Regression

In [2]:
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from typing import Tuple

# import src.plot
# from src.utils import generate_synthetic_data, load_who_life_expectancy

In [4]:
# Fix NumPy's global RNG state so that reruns of the notebook are repeatable.
np.random.seed(0)

# (display name, loader callable) pairs for the regression datasets of interest.
regression_datasets = [
    ("California Housing", datasets.fetch_california_housing),
    ("Diabetes", datasets.load_diabetes),
]

# Seven log-spaced thresholds covering 1e-3 through 1e0 (inclusive).
singular_value_cutoffs = np.logspace(start=-3, stop=0, num=7)

In [6]:

# Load the complete California Housing feature matrix and target vector.
X, y = datasets.fetch_california_housing(return_X_y=True)

# One ablation will be to make the true underlying relationship linear and noiseless.
# To do this, we need to know the ideal linear relationship. Unfortunately, we don't have
# any way to know this in practice, so we'll use all the data as our best guess.
#
# Solve the least-squares problem directly instead of evaluating
# inv(X.T @ X) @ X.T @ y: forming and inverting the Gram matrix squares the
# condition number of X and fails outright when X.T @ X is singular, whereas
# lstsq returns the (minimum-norm) least-squares solution in either case.
beta_ideal = np.linalg.lstsq(X, y, rcond=None)[0]

# Result accumulators, one per experimental condition. Despite the `_df`
# suffix these start out as plain Python lists.
# NOTE(review): presumably filled per subset size and converted to DataFrames
# later -- confirm against the rest of the notebook.
dataset_loss_unablated_df = []
dataset_loss_no_small_singular_values_df = []
dataset_loss_no_residuals_in_ideal_fit_df = []
dataset_loss_test_features_in_training_feature_subspace_df = []

In [16]:
# Sanity check: (rows, columns) of the full feature matrix.
X.shape

(20640, 8)

In [17]:
# Sanity check: the target vector should have one entry per row of X.
y.shape

(20640,)

In [10]:
# Peek at the raw regression targets.
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [13]:
# A sweep over several training-set sizes is currently disabled:
#   subset_sizes = np.arange(1, 40, 1)
#   for subset_size in subset_sizes:
# A single fixed size is used instead.
subset_size = 10

# Shuffle the rows and keep exactly `subset_size` of them for training; every
# other row goes to the test split. The row indices are split alongside the
# data so each split can be traced back to its original rows.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X,
    y,
    np.arange(len(X)),
    test_size=len(X) - subset_size,
    shuffle=True,
    random_state=2,
)

In [14]:
# Sanity check: the training split should contain `subset_size` rows.
X_train.shape

(10, 8)

In [15]:
# Sanity check: the test split should contain all remaining rows.
X_test.shape

(20630, 8)

In [18]:
# Total number of rows in the full dataset (training + test).
X.shape[0]

20640

In [19]:
# BEGIN: Unablated linear fit.
# Fit by applying the pseudoinverse of X_train, assembled from its thin SVD.
U, S, Vt = np.linalg.svd(X_train, full_matrices=False, compute_uv=True)
# Smallest strictly positive singular value of the training features.
min_singular_value = np.min(S[S > 0.0])
# Reciprocal of each singular value, with zero singular values mapped to zero
# (pseudoinverse convention). Division by an exact zero is expected here, so
# silence that warning locally instead of letting it leak into the output.
with np.errstate(divide="ignore"):
    S_inverted = 1.0 / S
S_inverted[np.isinf(S_inverted)] = 0.0
beta_hat_unablated = Vt.T @ np.diag(S_inverted) @ U.T @ y_train
# Mean squared error of the fitted coefficients on the training split...
y_train_pred = X_train @ beta_hat_unablated
train_mse_unablated = mean_squared_error(y_train, y_train_pred)
# ...and on the held-out test split.
y_test_pred = X_test @ beta_hat_unablated
test_mse_unablated = mean_squared_error(y_test, y_test_pred)
# END: Unablated linear fit.

# BEGIN: Test features projected onto the training feature subspace.
# X_train.T @ pinv(X_train @ X_train.T) @ X_train is the orthogonal projector
# onto the row space of X_train; right-multiplying X_test by it projects every
# test row onto the span of the training rows.
X_hat_test = (
    X_test @ X_train.T @ np.linalg.pinv(X_train @ X_train.T) @ X_train
)
# Difference between each projected test row and the original test row.
X_test_diff = X_hat_test - X_test
# Mean, over test rows, of the inner product of that difference with beta_ideal.
X_test_diff_inner_beta_ideal = np.mean(X_test_diff @ beta_ideal)
# END: Test features projected onto the training feature subspace.


In [20]:
# Mean squared error of the unablated fit on the training split.
train_mse_unablated

0.0107481282748253

In [21]:
# Mean squared error of the unablated fit on the held-out test split.
test_mse_unablated

5.464836582639113