# Tests and visualizations

## Imports

In [34]:
import numpy as np

## Load data

In [35]:
# Relative path of the directory that holds the CSV data files (used by load_data).
DATA_PATH = "../data/"

In [36]:
def load_data(filename):
    """Load a CSV file from the data directory into a NumPy structured array.

    Args:
        filename: name of the CSV file, relative to DATA_PATH.

    Returns:
        The array produced by np.genfromtxt: one record per data row, with
        field names taken from the header row of the file.
    """
    import os

    # Join rather than concatenate so the result is valid whether or not
    # DATA_PATH ends with a path separator.
    path = os.path.join(DATA_PATH, filename)
    # names=True consumes the first row as field names (it is not merely skipped);
    # dtype=None lets NumPy infer the type of each column individually.
    return np.genfromtxt(path, delimiter=",", names=True, dtype=None)

In [37]:
# Read the training CSV into a structured array (one named field per column).
training_data = load_data("train.csv")
# Field names taken from the CSV header row, i.e. the column names
# (not the target labels, despite the variable name).
training_labels = training_data.dtype.names

  data = np.genfromtxt(path, delimiter=",", names=True, dtype=None) # skip first row (column names)


In [47]:
# Sanity check: show the column names, a blank line, then the first record.
print(training_labels, '\n')
print(training_data[0])

('Id', 'Prediction', 'DER_mass_MMC', 'DER_mass_transverse_met_lep', 'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', 'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt', 'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality', 'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi', 'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt', 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt') 

(100000, b's', 138.47, 51.655, 97.827, 27.98, 0.91, 124.711, 2.666, 3.064, 41.928, 197.76, 1.582, 1.396, 0.2, 32.638, 1.017, 0.381, 51.626, 2.273, -2.414, 16.824, -0.277, 258.733, 2, 67.435, 2.15, 0.444, 46.062, 1.24, -2.475, 113.497)


## Code

In [7]:
def compute_loss(y, tx, w):
    """Return the halved mean squared error of the linear model tx @ w.

    The value computed is (1 / (2 * N)) * sum((y - tx @ w) ** 2).

    Args:
        y: numpy array of shape=(N, ), the targets.
        tx: numpy array of shape=(N, M), the feature matrix.
        w: numpy array of shape=(M,), the model parameters.

    Returns:
        The loss value (a scalar) for the given parameters w.
    """
    n_samples = y.shape[0]
    residual = y - tx @ w
    # For a 1-D residual, residual.T @ residual is the sum of squared errors.
    squared_error_sum = residual.T @ residual
    return squared_error_sum / (2 * n_samples)

In [48]:
def least_squares(y, tx):
    """Calculate the optimal vector w for least squares regression.

    The weights are obtained from the normal equations
    (tx.T @ tx) @ w = tx.T @ y. If tx.T @ tx is singular (for example when
    feature columns are collinear), the minimum-norm least squares solution
    is returned instead of raising.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, M)

    Returns:
        w: the optimal model parameters resulting from the least squares regression
        loss: the loss returned by compute_loss for the parameters w w.r. to y and tx
    """
    # Solve the linear system directly rather than forming an explicit inverse.
    gram = tx.T @ tx
    rhs = tx.T @ y
    try:
        w = np.linalg.solve(gram, rhs)
    except np.linalg.LinAlgError:
        # Singular Gram matrix: the normal equations have no unique solution,
        # so fall back to the minimum-norm least squares solution.
        w = np.linalg.lstsq(tx, y, rcond=None)[0]

    loss = compute_loss(y, tx, w)

    return (w, loss)

In [52]:
def ridge_regression(y, tx, lambda_):
    """Fit ridge regression by solving the regularized normal equations.

    Args:
        y: numpy array of shape (N,), N is the number of samples.
        tx: numpy array of shape (N,D), D is the number of features.
        lambda_: scalar, the regularization strength.

    Returns:
        w: optimal weights, numpy array of shape(D,), D is the number of features.
        loss: the value of compute_loss(y, tx, w) plus the L2 penalty
            lambda_ * sum(w ** 2), i.e. the regularized objective at w.
    """
    # Regularized normal equations, with lambda_prime = 2 * N * lambda_:
    #   (tx.T @ tx + lambda_prime * I) @ w = tx.T @ y
    n_samples = y.shape[0]
    n_features = tx.shape[1]
    lambda_prime = 2 * n_samples * lambda_

    rhs = tx.T @ y
    regularized_gram = tx.T @ tx + lambda_prime * np.eye(n_features)
    w = np.linalg.solve(regularized_gram, rhs)

    # The reported loss includes the penalty term, not just the data-fit term.
    penalty = lambda_ * np.sum(np.square(w))
    loss = compute_loss(y, tx, w) + penalty

    return (w, loss)