# Q4 Linear Regression + SVD to find weights and minimize RMSE

In [19]:
from sklearn.neighbors import KDTree
import numpy
from data.data_utils import load_dataset
import numpy as np
import scipy
import matplotlib.pyplot as plt
from scipy.linalg import svd
from pdb import set_trace
import time

## Useful functions

In [20]:
def linear_reg_svd(x_train, y_train, x_test):
    """
    X = US(V^T)
    want to minimize norm(y - Xw) -> rewrite w=v(U^T)y/sigma where sigma is the sigular vlaue of S -> w, y found
    """
    U,S,Vt = np.linalg.svd(x_train, full_matrices=False, compute_uv=True)
    weights = Vt.T.dot(U.T.dot(y_train) / np.reshape(S, (-1,1)))
    y_hat = x_test.dot(weights)
    # print(U,S, Vt)
    return y_hat


## Regression

In [23]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mauna_loa')
x = np.vstack([x_valid, x_train])
y = np.vstack([y_valid, y_train])
t0 = time.time()
y_hat = linear_reg_svd(x, y, x_test)
t1 = time.time()
rmse = np.sqrt(np.mean(np.square(y_test-y_hat)))
print("RMSE = %f, time: %f" %(rmse, t1-t0))
# y_hat

RMSE = 0.307285, time: 0.001478


In [24]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('rosenbrock', d=2, n_train=1000)
x = np.vstack([x_valid, x_train])
y = np.vstack([y_valid, y_train])
t0 = time.time()
y_hat = linear_reg_svd(x, y, x_test)
t1 = time.time()
rmse = np.sqrt(np.mean(np.square(y_test-y_hat)))
print("RMSE = %f, time: %f" %(rmse, t1-t0))

RMSE = 0.984087, time: 0.008408


In [25]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('pumadyn32nm')
x = np.vstack([x_valid, x_train])
y = np.vstack([y_valid, y_train])
t0 = time.time()
y_hat = linear_reg_svd(x, y, x_test)
t1 = time.time()
rmse = np.sqrt(np.mean(np.square(y_test-y_hat)))
print("RMSE = %f, time: %f" %(rmse, t1-t0))

RMSE = 0.862237, time: 0.025847


## Classification

In [29]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')
x = np.vstack([x_valid, x_train])
y = np.vstack([y_valid, y_train])
y_train_encoded = np.tile(np.arange(y.shape[1]).reshape((1,-1)), (y.shape[0], 1))[y].reshape(-1,1)
y_test_encoded = np.tile(np.arange(y_test.shape[1]).reshape((1,-1)), (y_test.shape[0], 1))[y_test].reshape(-1,1)
t0 = time.time()
y_hat = linear_reg_svd(x, y_train_encoded, x_test)
t1 = time.time()
acc = np.mean(y_hat == y_test_encoded)
print("Accuracy = %f, time: %f" %(rmse, t1-t0))

Accuracy = 0.862237, time: 0.002640


In [55]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')
x = np.vstack([x_valid, x_train])
y = np.vstack([y_valid, y_train])
y_train_encoded = np.tile(np.arange(y.shape[1]).reshape((1,-1)), (y.shape[0], 1))[y].reshape(-1,1)
y_test_encoded = np.tile(np.arange(y_test.shape[1]).reshape((1,-1)), (y_test.shape[0], 1))[y_test].reshape(-1,1)
t0 = time.time()
y_hat = linear_reg_svd(x, y_train_encoded, x_test)
t1 = time.time()
y_hat = np.rint(y_hat).astype(int)
acc = np.mean(y_hat == y_test_encoded)
print("Accuracy = %f, time: %f" %(rmse, t1-t0))

Accuracy = 0.862237, time: 0.001835


In [31]:
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mnist_small')
x = np.vstack([x_valid, x_train])
y = np.vstack([y_valid, y_train])


t0 = time.time()
y_hat = linear_reg_svd(x, y, x_test)
t1 = time.time()
acc = np.mean(y_hat == y_test)
print("Accuracy = %f, time: %f" %(rmse, t1-t0))

Accuracy = 0.862237, time: 2.002626
