# Ablation Study

This notebook studies how each part of the feature-engineering pipeline affects model performance.

Model: least squares, solved with the normal equations.

- Full feature engineering with normalization: $x := (x-\min(x))/(\max(x)-\min(x))$ (FFE+N)
- Full feature engineering with standardization: $x := (x-\mu(x))/\sigma(x)$ (FFE+S)
- No outlier clipping, with normalization (FFE+N-O)
- No outlier clipping and the first column `DER_mass_MMC` removed, with normalization (FFE+N-O-FC)
- No outlier clipping, the first column removed, and no polynomial expansion, with normalization (FFE+N-O-FC-PE)
- No outlier clipping, all columns containing `-999` removed, and no polynomial expansion, with normalization (FFE+N-O-ALLNAN-PE)

In [1]:
# IPython magics: load the autoreload extension and set it to mode 2 so that
# edits to the imported project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from utils.helpers import *
from utils.prediction import *
from utils.preprocess import *
from utils.cross_validation import *

In [3]:
# Relative paths of the CSV files handed to load_csv_data in the cells below.
TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"

In [4]:
# Shared experiment settings.
# NOTE(review): the visible cells never read this global lambda_;
# ridge_regression_plot is called with its own default or an explicit value.
lambda_ = 0
# Polynomial degree passed to build_poly.
degree = 9
# NOTE(review): learning_rate, max_iter and batch_size are not referenced by
# any visible cell -- confirm whether they are still needed.
learning_rate = 0.1
max_iter = 2000
# Fold count passed to build_k_indices; k_fold - 1 is passed to
# cross_validation_dataset as the fold index.
k_fold = 5
# Seed passed to build_k_indices.
seed = 20221031
batch_size = 1

In [5]:
def ridge_regression_plot(y_tr, tx_tr, y_dev, tx_dev, lambda_=0):
    """Fit ridge regression on the training split and score both splits.

    The weights solve the regularized normal equations
    ``(tx_tr^T tx_tr + 2 * N * lambda_ * I) w = tx_tr^T y_tr``.

    Args:
        y_tr: training targets, N samples.
        tx_tr: training features, numpy array of shape (N, D).
        y_dev: validation targets.
        tx_dev: validation features, with the same D columns as tx_tr.
        lambda_: scalar regularization strength (0 gives plain least squares).

    Returns:
        w: fitted weights, reshaped to a column of D rows.
        train_loss: compute_mse evaluated on the training split.
        dev_loss: compute_mse evaluated on the validation split.
    """
    n_samples, n_features = tx_tr.shape
    gram = tx_tr.T @ tx_tr
    penalty = 2 * n_samples * lambda_ * np.eye(n_features)
    rhs = tx_tr.T @ y_tr
    w = np.linalg.solve(gram + penalty, rhs).reshape(-1, 1)

    return w, compute_mse(y_tr, tx_tr, w), compute_mse(y_dev, tx_dev, w)

## FFE+S

In [6]:
# FFE+S pipeline: impute column 0 -> train/validation split -> split by jet
# number -> clip -> polynomial expansion -> standardization -> least squares.
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

y_tr = process_y(y_raw_tr)
# Plain assignment: tx_tr / tx_te refer to the raw arrays, so the in-place
# edits below also change tx_raw_tr / tx_raw_te.
tx_tr = tx_raw_tr
tx_te = tx_raw_te

# Swap columns 22 and 29 in both sets.
# NOTE(review): presumably this puts the jet-number column where
# split_jet_num expects it -- confirm against utils.preprocess.
tx_tr[:, [22, 29]] = tx_tr[:, [29, 22]]
tx_te[:, [22, 29]] = tx_te[:, [29, 22]]

# Column 0 uses -999 as a placeholder; turn it into NaN and fill it with the
# median of the remaining values, computed over train and test together.
tx_tr[tx_tr[:, 0] == -999, 0] = np.nan
tx_te[tx_te[:, 0] == -999, 0] = np.nan

median = np.nanmedian(np.hstack((tx_tr[:, 0], tx_te[:, 0])))
tx_tr[np.isnan(tx_tr[:, 0]), 0] = median
tx_te[np.isnan(tx_te[:, 0]), 0] = median

# cross validation: fold index k_fold - 1 is passed as k
k_indices = build_k_indices(y_tr, k_fold, seed)
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(
    y_tr, tx_tr, k_indices, k=k_fold - 1
)

# split datasets to different jet nums
# and remove columns with missing values for each jet num
tx_train_list, y_tr_list = split_jet_num(tx_tr, y_tr)
tx_dev_list, y_dev_list = split_jet_num(tx_dev, y_dev)

# clip outliers: limit every feature to mean +/- 2 std, with the statistics
# taken from the training subset and applied to both subsets
for i in range(3):
    mean = np.mean(tx_train_list[i], axis=0)
    std = np.std(tx_train_list[i], axis=0)
    lower, upper = mean - 2 * std, mean + 2 * std
    tx_train_list[i] = np.clip(tx_train_list[i], lower, upper)
    tx_dev_list[i] = np.clip(tx_dev_list[i], lower, upper)

# add polynomial features
for i in range(3):
    tx_train_list[i] = build_poly(tx_train_list[i], degree)
    tx_dev_list[i] = build_poly(tx_dev_list[i], degree)

# standardize the expanded features; keep the per-subset statistics that
# standardization returns
means = [0, 0, 0]
stds = [0, 0, 0]
for i in range(3):
    tx_train_list[i], tx_dev_list[i], means[i], stds[i] = standardization(
        tx_train_list[i], tx_dev_list[i]
    )

# Fit one model per jet-number subset and stack predictions / labels so the
# metrics below are computed over all subsets together.
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):

    # NOTE: y_tr / y_dev are rebound to the current subset here.
    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    # NOTE(review): lambda_ is left at its default 0 here, while the FFE+N
    # cell passes 1e-8 -- confirm the difference is intended.
    best_w, train_loss, dev_loss = ridge_regression_plot(
        y_tr,
        tx_tr_fe,
        y_dev,
        tx_dev_fe,
    )

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)


accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.61216 0.4005994005994006 0.26299680818503784 0.3175315419944043
Validation
0.61388 0.40045968882602545 0.26562683241468277 0.3193964605513643


## FFE+N

In [7]:
# FFE+N pipeline: impute column 0 -> train/validation split -> split by jet
# number -> clip -> polynomial expansion -> normalization -> least squares.
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

y_tr = process_y(y_raw_tr)
# Plain assignment: tx_tr / tx_te refer to the raw arrays, so the in-place
# edits below also change tx_raw_tr / tx_raw_te.
tx_tr = tx_raw_tr
tx_te = tx_raw_te

# Swap columns 22 and 29 in both sets.
# NOTE(review): presumably this puts the jet-number column where
# split_jet_num expects it -- confirm against utils.preprocess.
tx_tr[:, [22, 29]] = tx_tr[:, [29, 22]]
tx_te[:, [22, 29]] = tx_te[:, [29, 22]]

# Column 0 uses -999 as a placeholder; turn it into NaN and fill it with the
# median of the remaining values, computed over train and test together.
tx_tr[tx_tr[:, 0] == -999, 0] = np.nan
tx_te[tx_te[:, 0] == -999, 0] = np.nan

median = np.nanmedian(np.hstack((tx_tr[:, 0], tx_te[:, 0])))
tx_tr[np.isnan(tx_tr[:, 0]), 0] = median
tx_te[np.isnan(tx_te[:, 0]), 0] = median

# cross validation: fold index k_fold - 1 is passed as k
k_indices = build_k_indices(y_tr, k_fold, seed)
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(
    y_tr, tx_tr, k_indices, k=k_fold - 1
)

# split datasets to different jet nums
# and remove columns with missing values for each jet num
tx_train_list, y_tr_list = split_jet_num(tx_tr, y_tr)
tx_dev_list, y_dev_list = split_jet_num(tx_dev, y_dev)

# clip outliers: limit every feature to mean +/- 2 std, with the statistics
# taken from the training subset and applied to both subsets
for i in range(3):
    mean = np.mean(tx_train_list[i], axis=0)
    std = np.std(tx_train_list[i], axis=0)
    lower, upper = mean - 2 * std, mean + 2 * std
    tx_train_list[i] = np.clip(tx_train_list[i], lower, upper)
    tx_dev_list[i] = np.clip(tx_dev_list[i], lower, upper)

# add polynomial features
for i in range(3):
    tx_train_list[i] = build_poly(tx_train_list[i], degree)
    tx_dev_list[i] = build_poly(tx_dev_list[i], degree)

# normalize the expanded features; the two extra return values are stored
# under the same names (maxs, mins) the other normalization cells use
maxs = [0, 0, 0]
mins = [0, 0, 0]
for i in range(3):
    tx_train_list[i], tx_dev_list[i], maxs[i], mins[i] = normalization(
        tx_train_list[i], tx_dev_list[i]
    )

# Fit one model per jet-number subset and stack predictions / labels so the
# metrics below are computed over all subsets together.
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):

    # NOTE: y_tr / y_dev are rebound to the current subset here.
    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    # This is the only cell that passes a non-zero lambda_ (1e-8).
    best_w, train_loss, dev_loss = ridge_regression_plot(
        y_tr, tx_tr_fe, y_dev, tx_dev_fe, lambda_=1e-8
    )

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)


accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.83079 0.7790923538760375 0.7073295148149767 0.741478618245153
Validation
0.83004 0.7738445781590065 0.7088659552011258 0.7399314481576692


## FFE+N-O

In [8]:
# Reload both CSV files so this experiment starts from unmodified arrays;
# the first value returned for the test file is discarded.
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

In [9]:
# Convert the raw labels with process_y.
y_tr = process_y(y_raw_tr)
# Plain assignment (no copy): later in-place edits of tx_tr / tx_te also
# change tx_raw_tr / tx_raw_te.
tx_tr = tx_raw_tr
tx_te = tx_raw_te

In [10]:
# Swap columns 22 and 29 in both feature arrays (in place).
tx_tr[:, [22, 29]] = tx_tr[:, [29, 22]]
tx_te[:, [22, 29]] = tx_te[:, [29, 22]]
# Replace the -999 entries of column 0 with the constant 60.
# NOTE(review): the FFE+S and FFE+N cells fill this column with the
# train+test median instead -- confirm the constant is intended, since it is
# a second difference from FFE+N besides the missing clipping step.
tx_tr[tx_tr[:, 0] == -999, 0] = 60
tx_te[tx_te[:, 0] == -999, 0] = 60

In [11]:
# cross validation
k_indices = build_k_indices(y_tr, k_fold, seed)
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(
    y_tr, tx_tr, k_indices, k=k_fold - 1
)

In [12]:
# split datasets to different jet nums
# and remove columns with missing values for each jet num
tx_train_list, y_tr_list = split_jet_num(tx_tr, y_tr)
tx_dev_list, y_dev_list = split_jet_num(tx_dev, y_dev)

In [13]:
# add polynomial features
for i in range(3):
    tx_train_list[i] = build_poly(tx_train_list[i], degree)
    tx_dev_list[i] = build_poly(tx_dev_list[i], degree)

In [14]:
# Normalize each subset; normalization receives the training and validation
# parts together and also returns two values that are kept per subset.
maxs, mins = [0] * 3, [0] * 3
for i in range(3):
    scaled = normalization(tx_train_list[i], tx_dev_list[i])
    tx_train_list[i], tx_dev_list[i], maxs[i], mins[i] = scaled

In [15]:
# Fit one least-squares model per subset. Predictions and labels are gathered
# per subset and stacked once at the end; each list starts with an empty
# (0, 1) array so the stacked result is defined even without subsets.
ws = []
tr_pred_parts, tr_true_parts = [np.empty((0, 1))], [np.empty((0, 1))]
dev_pred_parts, dev_true_parts = [np.empty((0, 1))], [np.empty((0, 1))]

for i, tx_tr_fe in enumerate(tx_train_list):
    y_tr, y_dev, tx_dev_fe = y_tr_list[i], y_dev_list[i], tx_dev_list[i]

    best_w, train_loss, dev_loss = ridge_regression_plot(
        y_tr, tx_tr_fe, y_dev, tx_dev_fe
    )

    tr_pred_parts.append(predict_linear(tx_tr_fe, best_w))
    dev_pred_parts.append(predict_linear(tx_dev_fe, best_w))
    tr_true_parts.append(y_tr)
    dev_true_parts.append(y_dev)
    ws.append(best_w)

y_tr_pred, y_tr_true = np.vstack(tr_pred_parts), np.vstack(tr_true_parts)
y_dev_pred, y_dev_true = np.vstack(dev_pred_parts), np.vstack(dev_true_parts)

In [16]:
# Print the four values returned by compute_metrics, training split first.
for split_name, y_true_all, y_pred_all in (
    ("Training", y_tr_true, y_tr_pred),
    ("Validation", y_dev_true, y_dev_pred),
):
    accuracy, precision, recall, f1_score = compute_metrics(y_true_all, y_pred_all)
    print(split_name)
    print(accuracy, precision, recall, f1_score)

Training
0.82581 0.7660244797655992 0.7087286665792197 0.7362635698820537
Validation
0.82624 0.7634130982367758 0.7108596223759822 0.7361996720714157


## FFE+N-O-FC

Note: in this configuration the polynomial expansion degree is lowered from 9 to 7, which gave better performance.

In [17]:
# FFE+N-O-FC pipeline: process_tx2 preprocessing -> train/validation split ->
# split by jet number -> polynomial expansion -> normalization -> least squares.
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

y_tr = process_y(y_raw_tr)
tx_tr = process_tx2(tx_raw_tr)
tx_te = process_tx2(tx_raw_te)

# cross validation: fold index k_fold - 1 is passed as k
k_indices = build_k_indices(y_tr, k_fold, seed)
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(
    y_tr, tx_tr, k_indices, k=k_fold - 1
)

# split datasets to different jet nums
# and remove columns with missing values for each jet num
tx_train_list, y_tr_list = split_jet_num2(tx_tr, y_tr)
tx_dev_list, y_dev_list = split_jet_num2(tx_dev, y_dev)

# add polynomial features; this configuration uses degree 7 instead of the
# shared `degree` setting
fc_degree = 7
for i in range(3):
    tx_train_list[i] = build_poly(tx_train_list[i], fc_degree)
    tx_dev_list[i] = build_poly(tx_dev_list[i], fc_degree)

# normalize each subset, keeping the extra values normalization returns
maxs = [0, 0, 0]
mins = [0, 0, 0]
for i in range(3):
    tx_train_list[i], tx_dev_list[i], maxs[i], mins[i] = normalization(
        tx_train_list[i], tx_dev_list[i]
    )

# Fit one model per jet-number subset and stack predictions / labels so the
# metrics below are computed over all subsets together.
ws = []
y_tr_pred, y_tr_true = np.empty((0, 1)), np.empty((0, 1))
y_dev_pred, y_dev_true = np.empty((0, 1)), np.empty((0, 1))

for i in range(len(tx_train_list)):

    # NOTE: y_tr / y_dev are rebound to the current subset here.
    y_tr = y_tr_list[i]
    tx_tr_fe = tx_train_list[i]
    y_dev = y_dev_list[i]
    tx_dev_fe = tx_dev_list[i]

    best_w, train_loss, dev_loss = ridge_regression_plot(
        y_tr,
        tx_tr_fe,
        y_dev,
        tx_dev_fe,
    )

    y_tr_pred = np.vstack((y_tr_pred, predict_linear(tx_tr_fe, best_w)))
    y_dev_pred = np.vstack((y_dev_pred, predict_linear(tx_dev_fe, best_w)))
    y_tr_true = np.vstack((y_tr_true, y_tr))
    y_dev_true = np.vstack((y_dev_true, y_dev))
    ws.append(best_w)

# The features are not expanded a second time after fitting: the predictions
# above are already stored, so a further build_poly pass would be unused.

accuracy, precision, recall, f1_score = compute_metrics(y_tr_true, y_tr_pred)
print("Training")
print(accuracy, precision, recall, f1_score)

accuracy, precision, recall, f1_score = compute_metrics(y_dev_true, y_dev_pred)
print("Validation")
print(accuracy, precision, recall, f1_score)

Training
0.804635 0.7437615526802218 0.6568143063267894 0.6975891025889092
Validation
0.80522 0.7409261576971214 0.6595520112583558 0.697874980611137


## FFE+N-O-FC-PE

In [18]:
# FFE+N-O-FC-PE pipeline: process_tx2 preprocessing -> train/validation
# split -> split by jet number -> normalization -> least squares.
# There is no clipping and no polynomial expansion in this configuration.
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

y_tr, tx_tr, tx_te = (
    process_y(y_raw_tr),
    process_tx2(tx_raw_tr),
    process_tx2(tx_raw_te),
)

# Train / validation split; fold index k_fold - 1 is passed as k.
k_indices = build_k_indices(y_tr, k_fold, seed)
fold_id = k_fold - 1
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(
    y_tr, tx_tr, k_indices, k=fold_id
)

# One subset per jet-number group, for training and validation alike.
tx_train_list, y_tr_list = split_jet_num2(tx_tr, y_tr)
tx_dev_list, y_dev_list = split_jet_num2(tx_dev, y_dev)

# Normalize each of the three subsets, keeping the extra returned values.
maxs, mins = [0] * 3, [0] * 3
for i in range(3):
    scaled = normalization(tx_train_list[i], tx_dev_list[i])
    tx_train_list[i], tx_dev_list[i], maxs[i], mins[i] = scaled

# Fit one model per subset; predictions and labels are gathered per subset
# and stacked once, each list starting with an empty (0, 1) array.
ws = []
tr_pred_parts, tr_true_parts = [np.empty((0, 1))], [np.empty((0, 1))]
dev_pred_parts, dev_true_parts = [np.empty((0, 1))], [np.empty((0, 1))]

for i, tx_tr_fe in enumerate(tx_train_list):
    y_tr, y_dev, tx_dev_fe = y_tr_list[i], y_dev_list[i], tx_dev_list[i]

    best_w, train_loss, dev_loss = ridge_regression_plot(
        y_tr, tx_tr_fe, y_dev, tx_dev_fe
    )

    tr_pred_parts.append(predict_linear(tx_tr_fe, best_w))
    dev_pred_parts.append(predict_linear(tx_dev_fe, best_w))
    tr_true_parts.append(y_tr)
    dev_true_parts.append(y_dev)
    ws.append(best_w)

y_tr_pred, y_tr_true = np.vstack(tr_pred_parts), np.vstack(tr_true_parts)
y_dev_pred, y_dev_true = np.vstack(dev_pred_parts), np.vstack(dev_true_parts)

# Print the four values returned by compute_metrics, training split first.
for split_name, y_true_all, y_pred_all in (
    ("Training", y_tr_true, y_tr_pred),
    ("Validation", y_dev_true, y_dev_pred),
):
    accuracy, precision, recall, f1_score = compute_metrics(y_true_all, y_pred_all)
    print(split_name)
    print(accuracy, precision, recall, f1_score)

Training
0.758595 0.6930350903843232 0.5319400113681081 0.6018948357891438
Validation
0.7601 0.6903167556993455 0.5379969508619679 0.6047124732245839


## FFE+N-O-ALLNAN-PE

In [19]:
# FFE+N-O-ALLNAN-PE pipeline: keep a fixed subset of columns ->
# train/validation split -> two jet-number groups -> normalization ->
# least squares. No clipping and no polynomial expansion.
y_raw_tr, tx_raw_tr, ids_tr = load_csv_data(TRAIN_PATH)
_, tx_raw_te, ids_te = load_csv_data(TEST_PATH)

y_tr = process_y(y_raw_tr)

# Swap columns 22 and 29 in place on both raw arrays.
for raw_features in (tx_raw_tr, tx_raw_te):
    raw_features[:, [22, 29]] = raw_features[:, [29, 22]]

# Boolean mask of the columns that are kept.
kept_columns = [1, 2, 3, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 29]
col_mask = np.zeros(tx_raw_tr.shape[1], dtype=bool)
col_mask[kept_columns] = True
tx_tr, tx_te = tx_raw_tr[:, col_mask], tx_raw_te[:, col_mask]

# Train / validation split; fold index k_fold - 1 is passed as k.
k_indices = build_k_indices(y_tr, k_fold, seed)
fold_id = k_fold - 1
tx_tr, tx_dev, y_tr, y_dev = cross_validation_dataset(
    y_tr, tx_tr, k_indices, k=fold_id
)

# Split on the last kept column (the jet number): rows with value 0 drop the
# last two columns, rows with value >= 1 drop only the last one.
tr_jet, dev_jet = tx_tr[:, -1], tx_dev[:, -1]
tr_zero, tr_rest = tr_jet == 0, tr_jet >= 1
dev_zero, dev_rest = dev_jet == 0, dev_jet >= 1

tx_train_list = [tx_tr[tr_zero, :-2], tx_tr[tr_rest, :-1]]
y_tr_list = [y_tr[tr_zero], y_tr[tr_rest]]
tx_dev_list = [tx_dev[dev_zero, :-2], tx_dev[dev_rest, :-1]]
y_dev_list = [y_dev[dev_zero], y_dev[dev_rest]]

# Normalize both groups, keeping the extra values normalization returns.
maxs, mins = [0] * 2, [0] * 2
for i in range(2):
    scaled = normalization(tx_train_list[i], tx_dev_list[i])
    tx_train_list[i], tx_dev_list[i], maxs[i], mins[i] = scaled

# Fit one model per group; predictions and labels are gathered per group and
# stacked once, each list starting with an empty (0, 1) array.
ws = []
tr_pred_parts, tr_true_parts = [np.empty((0, 1))], [np.empty((0, 1))]
dev_pred_parts, dev_true_parts = [np.empty((0, 1))], [np.empty((0, 1))]

for i, tx_tr_fe in enumerate(tx_train_list):
    y_tr, y_dev, tx_dev_fe = y_tr_list[i], y_dev_list[i], tx_dev_list[i]

    best_w, train_loss, dev_loss = ridge_regression_plot(
        y_tr, tx_tr_fe, y_dev, tx_dev_fe
    )

    tr_pred_parts.append(predict_linear(tx_tr_fe, best_w))
    dev_pred_parts.append(predict_linear(tx_dev_fe, best_w))
    tr_true_parts.append(y_tr)
    dev_true_parts.append(y_dev)
    ws.append(best_w)

y_tr_pred, y_tr_true = np.vstack(tr_pred_parts), np.vstack(tr_true_parts)
y_dev_pred, y_dev_true = np.vstack(dev_pred_parts), np.vstack(dev_true_parts)

# Print the four values returned by compute_metrics, training split first.
for split_name, y_true_all, y_pred_all in (
    ("Training", y_tr_true, y_tr_pred),
    ("Validation", y_dev_true, y_dev_pred),
):
    accuracy, precision, recall, f1_score = compute_metrics(y_true_all, y_pred_all)
    print(split_name)
    print(accuracy, precision, recall, f1_score)

Training
0.74205 0.6557941171087072 0.5221750980134959 0.5814062931048471
Validation
0.74138 0.6502441513009256 0.5231617215902428 0.5798212835093419
