# COSMO Project
By Mathilde Raynal, Etienne Bonvin and Xavier Pantet

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from regressions import *
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Directory (with trailing slash) that holds the .npy inputs.
DATA_FOLDER = "data/"
# Alternative feature matrix, currently disabled in favour of the PCA one:
#X = np.load(DATA_FOLDER + "feature_mat_radial_compression.npy")
X = np.load(f"{DATA_FOLDER}pca_x.npy")  # feature matrix
y = np.load(f"{DATA_FOLDER}CSD500-r_train-H_total.npy")  # regression targets

In [4]:
# Sanity check: show the dimensions of the loaded arrays.
print(f"X: {X.shape!s}")
print(f"y: {y.shape!s}")

X: (30049, 3004)
y: (30049,)


## Idea 1: Good ol' least squares (MSE loss without regularizer)

We first try a standard and naive implementation of `least_squares` on the full dataset:

In [5]:
def run_least_squares():
    """Fit ``least_squares`` on the full dataset and print the resulting loss.

    Reads the module-level ``X`` and ``y``. The weights returned by
    ``least_squares`` are scored with ``rmse`` on the same data, and the
    value is printed; nothing is returned.
    """
    weights = least_squares(y, X)
    print(f"Loss = {rmse(y, X, weights)!s}")

We see that the loss is quite large! We hope to do better with polynomial expansion, applied to a reduced dataset that keeps only a small number of features so that we don't need a cluster. We use 4-fold cross-validation to find the best `degree`:

In [6]:
# Number of cross-validation folds.
k_fold = 4
# Fold indices built by the project helper; read as a global by the
# cross-validation functions in this notebook.
k_indices = build_k_indices(y, k_fold)

def run_least_squares_poly():
    """Cross-validate ``least_squares`` for polynomial degrees 0 through 3.

    For every degree, ``cross_validation`` is called once per fold using the
    module-level ``y``, ``X``, ``k_indices`` and ``k_fold``. The train and
    test losses it returns are averaged over the folds, and a summary table
    (one column per degree, one row each for train and test error) is
    printed.

    Returns:
        tuple: ``(rmse_tr, rmse_te)``, two lists holding the fold-averaged
        train and test losses, indexed by degree.
    """
    rmse_tr = []
    rmse_te = []

    for degree in range(4):
        fold_tr = []
        fold_te = []
        for k in range(k_fold):
            loss_tr, loss_te, _ = cross_validation(
                y, X, k_indices, k, degree, least_squares
            )
            fold_tr.append(loss_tr)
            fold_te.append(loss_te)
        rmse_tr.append(np.mean(fold_tr))
        rmse_te.append(np.mean(fold_te))

    # A bare DataFrame expression inside a function is evaluated and thrown
    # away, so the table has to be printed explicitly to be visible.
    summary = (
        pd.DataFrame([rmse_tr, rmse_te])
        .add_prefix("Degree ")
        .rename({0: "Train error", 1: "Test error"})
    )
    print(summary)
    return rmse_tr, rmse_te

Indeed, polynomial expansion provides better results. Moreover, we see that the best `degree` is 1.

## Idea 2: Ridge regression (MSE loss with $\mathcal{L}_2$-regularizer)

In [7]:
# Number of cross-validation folds (this cell rebinds the same global names
# that an earlier cell already assigns).
k_fold = 4
# Fold indices rebuilt by the project helper for the runs that follow.
k_indices = build_k_indices(y, k_fold)

def run_ridge_regression():
    """Cross-validate ``ridge_regression`` over the (lambda, degree) grid.

    The grid is a single regularisation strength (1e-10) combined with
    polynomial degrees 1 and 2. Each combination is evaluated on every fold
    through ``cross_validation``, using the module-level ``y``, ``X``,
    ``k_indices`` and ``k_fold``.

    Returns:
        tuple: ``(rmse_tr, rmse_te)``, two lists of fold-averaged train and
        test losses, one entry per (lambda, degree) pair in iteration order.
    """
    mean_tr, mean_te = [], []

    for lambda_ in [1e-10]:

        def ridge_lambda(y, X):
            # Two-argument solver expected by cross_validation, with the
            # current regularisation strength baked in.
            return ridge_regression(y, X, lambda_)

        for degree in range(1, 3):
            fold_results = [
                cross_validation(y, X, k_indices, k, degree, ridge_lambda)
                for k in range(k_fold)
            ]
            # Each result is (train loss, test loss, <unused third value>).
            mean_tr.append(np.mean([res[0] for res in fold_results]))
            mean_te.append(np.mean([res[1] for res in fold_results]))

    return mean_tr, mean_te

## Idea 3: Lasso (MSE loss with $\mathcal{L}_1$-regularizer)

In [8]:
def run_lasso():
    """Cross-validate the lasso loss, optimised with SGD, over a small grid.

    For each regularisation strength in ``[0, 1e-4, 1e-3, 1e-2, 1e-1]`` and
    each polynomial degree in ``{1, 2}``, every fold is evaluated through
    ``cross_validation`` (module-level ``y``, ``X``, ``k_indices``,
    ``k_fold``). The fold-averaged train and test losses are printed on one
    line per (lambda, degree) pair; nothing is returned.
    """
    for lambda_ in [0, 1e-4, 1e-3, 1e-2, 1e-1]:

        def lasso_lambda(y, X, w):
            # Loss with the current regularisation strength fixed.
            return lasso(y, X, w, lambda_)

        def lasso_stoch_grad_lambda(y, X, w):
            # Matching stochastic gradient with the same strength.
            return lasso_stoch_grad(y, X, w, lambda_)

        def loss_lambda(y, X):
            # Solver handed to cross_validation: SGD started from a zero
            # weight vector. The positional 100 and 1e-4 are presumably the
            # iteration count and step size; confirm against
            # stochastic_gradient_descent's signature.
            return stochastic_gradient_descent(
                y,
                X,
                np.zeros(X.shape[1]),
                100,
                1e-4,
                lasso_lambda,
                lasso_stoch_grad_lambda,
            )

        for degree in range(1, 3):
            fold_tr = []
            fold_te = []
            for k in range(k_fold):
                loss_tr, loss_te, _ = cross_validation(
                    y, X, k_indices, k, degree, loss_lambda
                )
                fold_tr.append(loss_tr)
                fold_te.append(loss_te)
            print(np.mean(fold_tr), np.mean(fold_te))

In [None]:
# Scratch cell: a single detailed SGD run on the full dataset, keeping the
# per-iteration outputs (`ws`, `losses`) for inspection.
lambda_ = 0
# NOTE(review): the two lasso wrappers below are defined but not passed to
# the call that follows, which uses mae / mae_stoch_grad instead. Confirm
# whether that is intentional.
lasso_lambda = lambda y, X, w: lasso(y, X, w, lambda_)
lasso_stoch_grad_lambda = lambda y, X, w: lasso_stoch_grad(y, X, w, lambda_)
# lasso: gamma = 1e-15
ws, losses = stochastic_gradient_descent(y, X, np.zeros(X.shape[1]), 1000, 1e-10, mae, mae_stoch_grad, detail = True)

In [None]:
# Plot `losses` as returned by the detailed SGD call in the previous cell.
plt.plot(losses)
# Disabled alternative: score each entry of `ws` with rmse.
#list(map(lambda w: rmse(y, X, w), ws))

## Idea 4: MAE loss (with SGD)

In [11]:
def run_mae_sgd():
    """Cross-validate the MAE loss, optimised with SGD, for degrees 1 and 2.

    Every fold is evaluated through ``cross_validation`` using the
    module-level ``y``, ``X``, ``k_indices`` and ``k_fold``. The
    fold-averaged train and test losses are collected per degree and the two
    lists are printed; nothing is returned.
    """

    def mae_lambda(y, X):
        # Solver handed to cross_validation: SGD on the MAE loss, started
        # from a zero weight vector.
        return stochastic_gradient_descent(
            y, X, np.zeros(X.shape[1]), 100, 1e-4, mae, mae_stoch_grad
        )

    mean_tr, mean_te = [], []

    # Single-pass placeholder for a regularisation sweep that is disabled:
    #for lambda_ in [0, 1e-4, 1e-3, 1e-2, 1e-1]:
    for _ in [0]:
        for degree in range(1, 3):
            fold_results = [
                cross_validation(y, X, k_indices, k, degree, mae_lambda)
                for k in range(k_fold)
            ]
            # Each result is (train loss, test loss, <unused third value>).
            mean_tr.append(np.mean([res[0] for res in fold_results]))
            mean_te.append(np.mean([res[1] for res in fold_results]))
        print(mean_tr, mean_te)