# COSMO Project
By Mathilde Raynal, Etienne Bonvin and Xavier Pantet

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from regressions import *
import pandas as pd

In [3]:
DATA_FOLDER = "data/"

# Alternative feature matrix (disabled): DATA_FOLDER + "feature_mat_radial_compression.npy"
# The active feature file is presumably a PCA-reduced version, judging by its
# name -- TODO confirm.
features_path = DATA_FOLDER + "pca_x.npy"
targets_path = DATA_FOLDER + "CSD500-r_train-H_total.npy"

# X holds the features and y the regression targets used by every cell below.
X = np.load(features_path)
y = np.load(targets_path)

In [4]:
# Sanity check: report the dimensions of the feature matrix and target vector.
print("X:", X.shape)
print("y:", y.shape)

X: (30049, 3004)
y: (30049,)


## Idea 1: Good ol' least squares (MSE loss without regularizer)

We first try a standard and naive implementation of `least_squares` on the full dataset:

In [5]:
# Baseline: fit plain least squares on the full dataset, then score the
# resulting weights on that same data (so this is a training error).
w_star = least_squares(y, X)
loss = rmse(y, X, w_star)
print("Loss =", loss)

Loss = 26.166553759575127


We see that the loss is quite large! We hope to do better with polynomial expansion, applied to a reduced dataset that keeps only a smaller number of features so that we don't need a cluster. We use 4-fold cross-validation to find the best `degree`:

In [6]:
# NOTE(review): this whole cell is disabled (commented out), so running the
# notebook does not execute it and it does not populate rmse_tr / rmse_te.
# When enabled, it runs k-fold cross-validation of `least_squares` for
# degree 0..3 and stores the fold-averaged train/test losses, one entry per degree.
#k_fold = 4
#k_indices = build_k_indices(y, k_fold)

#rmse_tr = []
#rmse_te = []

#for degree in range(4):
#    rmse_tr_tmp = []
#    rmse_te_tmp = []
#    for k in range(k_fold):
#        loss_tr, loss_te, _ = cross_validation(y, X, k_indices, k, degree, least_squares)
#        rmse_tr_tmp.append(loss_tr)
#        rmse_te_tmp.append(loss_te)
#    rmse_tr.append(np.mean(rmse_tr_tmp))
#    rmse_te.append(np.mean(rmse_te_tmp))

In [7]:
# NOTE(review): disabled (commented out). When enabled, it tabulates
# rmse_tr / rmse_te with one "Degree <i>" column per entry and the two rows
# labelled "Train error" and "Test error".
#pd.DataFrame([rmse_tr, rmse_te]).add_prefix("Degree ").rename({0: "Train error", 1: "Test error"})

Indeed, polynomial expansion provides better results. Moreover, we see that the best `degree` is 2.

## Idea 2: Ridge regression (MSE loss with $\mathcal{L}_2$-regularizer)

In [15]:
k_fold = 4
k_indices = build_k_indices(y, k_fold)

# One entry is appended per (lambda_, degree) pair, in loop order:
# lambda_ varies in the outer loop, degree in the inner loop.
rmse_tr, rmse_te = [], []

# Alternative lambda grid (disabled): [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]
for lambda_ in [1e-13, 1e-12, 1e-11]:

    # Two-argument (y, X) wrapper around ridge_regression with the current
    # lambda_ filled in; this is the fitting function handed to cross_validation.
    def ridge_lambda(y, X):
        return ridge_regression(y, X, lambda_)

    for degree in range(1, 3):
        fold_tr, fold_te = [], []
        for k in range(k_fold):
            # The third returned value is not used here.
            loss_tr, loss_te, _ = cross_validation(y, X, k_indices, k, degree, ridge_lambda)
            fold_tr.append(loss_tr)
            fold_te.append(loss_te)
        # Average the per-fold losses for this (lambda_, degree) setting.
        rmse_tr.append(np.mean(fold_tr))
        rmse_te.append(np.mean(fold_te))

In [16]:
# Fold-averaged test losses, one per (lambda_, degree) pair in the order the
# grid search appended them: for each lambda_ in turn, degree 1 then degree 2.
rmse_te

[0.7107496454416272,
 0.7489895213565038,
 0.7107496454416272,
 0.7489895213565052,
 0.7107496454416271,
 0.748989521356491]