# Comparing Different Models

$\lambda=1$, $k=5$, $\text{degree}=9$

Using all feature engineering techniques.

In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from utils.helpers import *
from utils.prediction import *
from utils.preprocess import *
from utils.cross_validation import *

In [3]:
TRAIN_PATH = './data/train.csv'
TEST_PATH = './data/test.csv'

In [4]:
lambda_ = 0
degree = 9
learning_rate = 0.1
max_iter = 2000
k_fold = 5
seed = 20221031
batch_size = 1

In [5]:
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

In [6]:
y_tr = process_y(y_raw_tr)
tx_tr = tx_raw_tr
tx_te = tx_raw_te
print(y_tr.shape)
print(tx_tr.shape)
print(tx_te.shape)

(250000, 1)
(250000, 30)
(568238, 30)


In [7]:
tx_tr[:, [22, 29]] = tx_tr[:, [29, 22]]
tx_te[:, [22, 29]] = tx_te[:, [29, 22]]
tx_tr[tx_tr[:, 0] == -999, 0] = 60
tx_te[tx_te[:, 0] == -999, 0] = 60

In [8]:
# cross validation
k_indices = build_k_indices(y_tr, k_fold, seed)
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(y_tr, tx_tr, k_indices, k=k_fold-1)
print(tx_tr.shape)
print(tx_dev.shape)
print(y_tr.shape)
print(y_dev.shape)

(200000, 30)
(50000, 30)
(200000, 1)
(50000, 1)


In [9]:
# split datasets to different jet nums
# and remove columns with missing values for each jet num
tx_train_list, y_tr_list = split_jet_num(tx_tr, y_tr)
tx_dev_list, y_dev_list = split_jet_num(tx_dev, y_dev)

In [10]:
# add polynomial features
for i in range(3):
    tx_train_list[i] = build_poly(tx_train_list[i], degree)
    tx_dev_list[i] = build_poly(tx_dev_list[i], degree)

In [11]:
for i in range(3):
    print(tx_train_list[i].shape, tx_dev_list[i].shape)

(79917, 162) (19996, 162)
(62257, 198) (15287, 198)
(57826, 261) (14717, 261)


In [12]:
maxs = [0, 0, 0]
mins = [0, 0, 0]
for i in range(3):
    tx_train_list[i], tx_dev_list[i], maxs[i], mins[i] = normalization(
        tx_train_list[i],
        tx_dev_list[i]
    )

## Least Square

In [13]:
def ridge_regression_plot(y_tr, tx_tr, y_dev, tx_dev, lambda_):
    """Ridge regression using normal equations.
    Args:
        y: numpy array of shape (N, 1), N is the number of samples.
        tx: numpy array of shape (N, D), D is the number of features.
        lambda_: scalar.

    Returns:
        w: optimal weights, numpy array of shape(D, 1), D is the number of features.
        loss: scalar
    """
    N, D = tx_tr.shape
    I = np.eye(D)
    w = np.linalg.solve(tx_tr.T @ tx_tr + 2 * N * lambda_ * I, tx_tr.T @ y_tr).reshape(-1, 1)
    train_loss = compute_mse(y_tr, tx_tr, w)
    dev_loss = compute_mse(y_dev, tx_dev, w)

    return w, train_loss, dev_loss

In [14]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):
    train_losses = []
    dev_losses = []
    w_list = []
    lambda_list = [0]

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    for lambda_ in lambda_list:
        w, train_loss, dev_loss = ridge_regression_plot(
            y_tr, tx_tr_fe,
            y_dev, tx_dev_fe,
            lambda_,
        )
        train_losses.append(train_loss)
        dev_losses.append(dev_loss)
        w_list.append(w)

    # cross_validation_visualization(lambda_list, train_losses, dev_losses, i)
    index = np.argmin(dev_losses)
    best_lambda = lambda_list[index]
    best_w = w_list[index]
    print("The best lambda for PRI_JET_NUM = {} is {}.".format(i, best_lambda))

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

The best lambda for PRI_JET_NUM = 0 is 0.
The best lambda for PRI_JET_NUM = 1 is 0.
The best lambda for PRI_JET_NUM = 2 is 0.


In [15]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.82581 0.7660244797655992 0.7087286665792197 0.7362635698820537
Validation
0.82624 0.7634130982367758 0.7108596223759822 0.7361996720714157


## Ridge Regression

In [16]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):
    train_losses = []
    dev_losses = []
    w_list = []
    lambda_list = np.logspace(-10, 1, 12)

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    for lambda_ in lambda_list:
        w, train_loss, dev_loss = ridge_regression_plot(
            y_tr, tx_tr_fe,
            y_dev, tx_dev_fe,
            lambda_,
        )
        train_losses.append(train_loss)
        dev_losses.append(dev_loss)
        w_list.append(w)

    # cross_validation_visualization(lambda_list, train_losses, dev_losses, i)
    index = np.argmin(dev_losses)
    best_lambda = lambda_list[index]
    best_w = w_list[index]
    print("The best lambda for PRI_JET_NUM = {} is {}.".format(i, best_lambda))

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

The best lambda for PRI_JET_NUM = 0 is 1e-06.
The best lambda for PRI_JET_NUM = 1 is 1e-10.
The best lambda for PRI_JET_NUM = 2 is 1e-10.


In [17]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.807565 0.7492718607268154 0.659889525308615 0.7017459567114328
Validation
0.80748 0.7452132576257758 0.6618388647824557 0.7010559006211181


## Linear Regression GD

In [18]:
def mean_squared_error_sgd(y_tr, tx_tr, y_dev, tx_dev, initial_w, max_iters, gamma, batch_size=1):
    """Linear regression using stochastic gradient descent.

    Args:
        y: numpy array of shape=(N, 1)
        tx: numpy array of shape=(N, D)
        initial_w: numpy array of shape=(D, 1). The initial guess (or the initialization) for the model parameters
        max_iters: a scalar denoting the total number of iterations of SGD
        gamma: a scalar denoting the stepsize
        batch_size: default 1, a scalar denoting the batch size

    Returns:
        w: the last weight vector of shape (D, 1)
        loss: the corresponding mse loss
    """

    # Define parameters to store w and loss
    w = initial_w
    train_loss = compute_mse(y_tr, tx_tr, w)
    ws = [initial_w]
    train_losses = [train_loss]

    for n_iter in range(max_iters):
        # implement stochastic gradient descent.
        for y_batch, tx_batch in batch_iter(y_tr, tx_tr, batch_size=batch_size, num_batches=1):

            # compute gradient
            grad = linear_reg_gradient(y_batch, tx_batch, w)

            # update w by gradient
            w = w - gamma * grad

            # compute loss
            train_loss = compute_mse(y_tr, tx_tr, w)
            dev_loss = compute_mse(y_dev, tx_dev, w)

            # store w and loss
            ws.append(w)
            train_losses.append(train_loss)
            dev_losses.append(dev_loss)

    index = np.argmin(dev_losses)
    return ws[index], train_losses[index], dev_losses[index]

In [19]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]
    initial_w = np.random.rand(tx_tr_fe.shape[1], 1)

    best_w, train_loss, dev_loss = mean_squared_error_sgd(
        y_tr, tx_tr_fe,
        y_dev, tx_dev_fe,
        initial_w,
        max_iter,
        learning_rate,
        batch_size=tx_tr_fe.shape[0]
    )

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

In [20]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.73056 0.6746779283019764 0.4144549866643347 0.5134793521243748
Validation
0.72848 0.6633784291619692 0.4140377624017826 0.5098563073146075


## Linear Regression SGD

In [21]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]
    initial_w = np.random.rand(tx_tr_fe.shape[1], 1)

    best_w, train_loss, dev_loss = mean_squared_error_sgd(
        y_tr, tx_tr_fe,
        y_dev, tx_dev_fe,
        initial_w,
        max_iter,
        learning_rate,
        batch_size=1
    )

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

In [22]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.636595 0.46797253889273793 0.4331540670135397 0.4498906305583518
Validation
0.63978 0.4700694314130231 0.4406590829130996 0.4548893798613844


## Logistic Regression GD

In [14]:
def reg_logistic_regression_plot(y_tr, tx_tr, y_dev, tx_dev, lambda_, initial_w, max_iters, gamma, batch_size=8):
    """Regularized logistic regression using gradient descent
    or SGD (y ∈ {0, 1}, with regularization term λ|w|2)

    Args:
        y_tr: numpy array of shape=(N_tr, 1)
        tx_tr: numpy array of shape=(N_tr, D)
        y_dev: numpy array of shape=(N_dev, 1)
        tx_dev: numpy array of shape=(N_dev, D)
        lambda_: a scalar denoting the regularization term
        initial_w: numpy array of shape=(D, 1). The initial guess (or the initialization) for the model parameters
        max_iters: a scalar denoting the total number of iterations of SGD
        gamma: a scalar denoting the stepsize
        batch_size: mini batch size. default 8.
        optimizer: 'gd' (batch sgd), 'ada' (adagrad), and 'adam'. default 'gd'.

    Returns:
        w: the best weight vector of shape (D, 1) for validation
        train_loss: the corresponding mse loss
        dev_loss: the corresponding mse loss
    """

    # Define parameters to store w and loss
    w = initial_w
    ws = [initial_w]
    train_losses = [compute_ce(y_tr, tx_tr, w)]
    dev_losses = [compute_ce(y_dev, tx_dev, w)]

    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(
            y_tr, tx_tr, batch_size=batch_size, num_batches=1
        ):
            # compute gradient
            grad = logistic_reg_gradient(y_batch, tx_batch, w)

            # update w by gradient
            w = w - gamma * (grad + 2 * lambda_ * w)

            # compute loss
            loss = compute_ce(y_tr, tx_tr, w)

            # store w and loss
            ws.append(w)
            train_losses.append(loss)

            # compute dev loss
            dev_losses.append(compute_ce(y_dev, tx_dev, w))
    
    index = np.argmin(dev_losses)
    return ws[index], train_losses[index], dev_losses[index]

In [24]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):
    train_losses = []
    dev_losses = []
    w_list = []
    lambda_list = [0]

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    for lambda_ in lambda_list:
        initial_w = np.random.rand(tx_tr_fe.shape[1], 1)
        w, train_loss, dev_loss = reg_logistic_regression_plot(
            y_tr, tx_tr_fe,
            y_dev, tx_dev_fe,
            lambda_,
            initial_w,
            max_iter,
            learning_rate,
            batch_size=tx_tr_fe.shape[0],
        )
        train_losses.append(train_loss)
        dev_losses.append(dev_loss)
        w_list.append(w)

    index = np.argmin(dev_losses)
    best_lambda = lambda_list[index]
    best_w = w_list[index]
    print("The best lambda for PRI_JET_NUM = {} is {}.".format(i, best_lambda))

    y_tr_pred = np.vstack((y_tr_pred, predict_logistic(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_logistic(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

The best lambda for PRI_JET_NUM = 0 is 0.
The best lambda for PRI_JET_NUM = 1 is 0.
The best lambda for PRI_JET_NUM = 2 is 0.


In [25]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.720055 0.663879945996469 0.3726699022051215 0.47736840631388333
Validation
0.72064 0.6596647350993378 0.3738125952855635 0.47720637772288343


## Logistic Regression SGD

In [26]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):
    train_losses = []
    dev_losses = []
    w_list = []
    lambda_list = [0]

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    for lambda_ in lambda_list:
        initial_w = np.random.rand(tx_tr_fe.shape[1], 1)
        w, train_loss, dev_loss = reg_logistic_regression_plot(
            y_tr, tx_tr_fe,
            y_dev, tx_dev_fe,
            lambda_,
            initial_w,
            max_iter,
            learning_rate,
            batch_size=1,
        )
        train_losses.append(train_loss)
        dev_losses.append(dev_loss)
        w_list.append(w)

    index = np.argmin(dev_losses)
    best_lambda = lambda_list[index]
    best_w = w_list[index]
    print("The best lambda for PRI_JET_NUM = {} is {}.".format(i, best_lambda))

    y_tr_pred = np.vstack((y_tr_pred, predict_logistic(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_logistic(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

The best lambda for PRI_JET_NUM = 0 is 0.
The best lambda for PRI_JET_NUM = 1 is 0.
The best lambda for PRI_JET_NUM = 2 is 0.


In [27]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.70925 0.6225433932210536 0.38734642123212804 0.4775569611155035
Validation
0.71058 0.6206896551724138 0.38946874633517065 0.4786164655017115


## Penalized Logistic Regression GD

In [15]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):
    train_losses = []
    dev_losses = []
    w_list = []
    lambda_list = np.logspace(-10, 1, 12)

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    for lambda_ in lambda_list:
        initial_w = np.random.rand(tx_tr_fe.shape[1], 1)
        w, train_loss, dev_loss = reg_logistic_regression_plot(
            y_tr, tx_tr_fe,
            y_dev, tx_dev_fe,
            lambda_,
            initial_w,
            max_iter,
            learning_rate,
            batch_size=tx_tr_fe.shape[0],
        )
        train_losses.append(train_loss)
        dev_losses.append(dev_loss)
        w_list.append(w)

    index = np.argmin(dev_losses)
    best_lambda = lambda_list[index]
    best_w = w_list[index]
    print("The best lambda for PRI_JET_NUM = {} is {}.".format(i, best_lambda))

    y_tr_pred = np.vstack((y_tr_pred, predict_logistic(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_logistic(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

The best lambda for PRI_JET_NUM = 0 is 1e-07.
The best lambda for PRI_JET_NUM = 1 is 1e-05.
The best lambda for PRI_JET_NUM = 2 is 1e-08.


In [16]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.72174 0.6673293914121201 0.3766778890297757 0.48154496841869915
Validation
0.72112 0.6601112026359144 0.37592353700011727 0.4790405738623627


## Penalized Logistic Regression SGD

In [17]:
train_losses = []
dev_losses = []
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):
    train_losses = []
    dev_losses = []
    w_list = []
    lambda_list = np.logspace(-10, 1, 12)

    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    for lambda_ in lambda_list:
        initial_w = np.random.rand(tx_tr_fe.shape[1], 1)
        w, train_loss, dev_loss = reg_logistic_regression_plot(
            y_tr, tx_tr_fe,
            y_dev, tx_dev_fe,
            lambda_,
            initial_w,
            max_iter,
            learning_rate,
            batch_size=1,
        )
        train_losses.append(train_loss)
        dev_losses.append(dev_loss)
        w_list.append(w)

    index = np.argmin(dev_losses)
    best_lambda = lambda_list[index]
    best_w = w_list[index]
    print("The best lambda for PRI_JET_NUM = {} is {}.".format(i, best_lambda))

    y_tr_pred = np.vstack((y_tr_pred, predict_logistic(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_logistic(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

The best lambda for PRI_JET_NUM = 0 is 1e-05.
The best lambda for PRI_JET_NUM = 1 is 1e-05.
The best lambda for PRI_JET_NUM = 2 is 0.001.


In [18]:
accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.71431 0.6464208242950108 0.36917202279451417 0.46995306035362433
Validation
0.7166 0.6460992907801418 0.37392986982526094 0.4737037587282722
