In [1]:
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from implementations import mean_squared_error_gd, mean_squared_error_sgd, least_squares, ridge_regression, \
    logistic_regression, reg_logistic_regression, predict_simple, predict_logistic, penalized_logistic_regression

In [2]:
# Read the complete training file (sub-sampling disabled). The three returned values are
# used below as labels (Y), feature matrix (X) and sample ids (ids).
TRAIN_CSV = "data/train.csv"
Y, X, ids = load_csv_data(TRAIN_CSV, sub_sample=False)

In [3]:
# Fix NumPy's global random seed so that any NumPy-based randomness in later cells is repeatable.
SEED = 42
np.random.seed(SEED)

In [4]:
# Partition the training data based on the value of PRI_jet_num (column index 22, zero-based).
# partition_data (from helpers) returns three per-partition collections; here the feature
# matrices (xs) and labels (ys) are kept, and the third return value is discarded because
# it is not needed for training.
xs, ys, _ = partition_data(X, Y, ids)

In [5]:
# columns_names = ["DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis", "DER_pt_h", "DER_deltaeta_jet_jet", "DER_mass_jet_jet", "DER_prodeta_jet_jet",	"DER_deltar_tau_lep",	"DER_pt_tot",	"DER_sum_pt",	"DER_pt_ratio_lep_tau",	"DER_met_phi_centrality",	"DER_lep_eta_centrality",	"PRI_tau_pt",	"PRI_tau_eta",	"PRI_tau_phi",	"PRI_lep_pt",	"PRI_lep_eta",	"PRI_lep_phi",	"PRI_met",	"PRI_met_phi",	"PRI_met_sumet",	"PRI_jet_num",	"PRI_jet_leading_pt",	"PRI_jet_leading_eta", "PRI_jet_leading_phi", "PRI_jet_subleading_pt", "PRI_jet_subleading_eta", "PRI_jet_subleading_phi", "PRI_jet_all_pt"]
#
#
# ## Plot Box Plots
# for i, x in enumerate(xs):
#     for j in range(x.shape[1]):
#         plt.boxplot(x[:, j][x[:, j]!=-999])
#         plt.title("BoxPlot for: " +  columns_names[j]  + " of sub-dataset: " + str(i))
#         plt.ylabel("values")
#         plt.show()
#
#
# ## Plot Histograms
# for i, x in enumerate(xs):
#     for j in range(x.shape[1]):
#         plt.hist(x[:, j][x[:, j]!=-999], bins=40)
#         plt.title("Histogram for: " +  columns_names[j]  + " of sub-dataset: " + str(i))
#         plt.ylabel("count")
#         plt.xlabel("value")
#         plt.show()

In [6]:
# Apply feature engineering to every partition of the TRAIN SET.
# NOTE: each entry of `xs` is overwritten with its processed version, so this cell
# should be run only once after the partitioning cell.
for part_idx in range(len(xs)):
    xs[part_idx] = process_features(xs[part_idx])

In [7]:
# Run cross-validation on every sub-dataset to pick its best hyper-parameter (lambda).
# The call below uses 4 folds (nfolds=4).

lambdas = []
for i, (x, y) in enumerate(zip(xs, ys)):
    best_lambda, best_loss = do_cross_validation(x, y, nfolds=4)
    lambdas += [best_lambda]
    print(f'set {i}: Optimal lambda = {best_lambda} leads to loss = {best_loss}')

set 0: Optimal lambda = 4.37547937507418e-10 leads to loss = 0.22464598465609198
set 1: Optimal lambda = 1e-10 leads to loss = 0.2691440073196232
set 2: Optimal lambda = 1e-10 leads to loss = 0.228181252522845
set 3: Optimal lambda = 1.3433993325988987e-10 leads to loss = 0.22813041984996715


In [18]:
# Regularisation strengths used by the ridge regression training cell, one per sub-dataset.
# NOTE: this assignment replaces whatever the cross-validation cell stored in `lambdas`.
lambdas = [1e-4] * 4

# Alternative for the least-squares method (no regularisation):
# lambdas = [0, 0, 0, 0]

In [None]:
# Display the regularisation strengths currently bound to `lambdas` (indexed per sub-dataset
# by the training cell).
lambdas

In [19]:
# Train one ridge-regression model per sub-dataset and validate it on a held-out split.
#
# For each partition, the leading TRAIN_FRACTION of the rows (in their existing order; this
# cell does not shuffle) is used for fitting and the remaining rows for validation. A bias
# column is added with add_bias before fitting. The fitted weight vectors are collected in
# `ws` in the same order as `xs`, and the per-partition validation accuracies in
# `validation_accuracy`.

TRAIN_FRACTION = 0.80  # share of each partition used for fitting; the rest is validation

validation_accuracy = []
ws = []

for i, (x, y) in enumerate(zip(xs, ys)):
    print("training and validation model on sub-dataset number: ", i)
    split_at = int(TRAIN_FRACTION * x.shape[0])
    x_train, x_validation = add_bias(x[:split_at]), add_bias(x[split_at:])
    y_train, y_validation = y[:split_at], y[split_at:]

    # Only the weights are kept; the loss returned by ridge_regression is not used here.
    w, _ridge_loss = ridge_regression(y_train, x_train, lambdas[i])
    ws.append(w)

    train_acc = (predict_simple(x_train, w) == y_train).mean()
    # Predict on the validation split once and reuse the result for both the printout
    # and the bookkeeping list (assumes predict_simple is deterministic).
    val_acc = (predict_simple(x_validation, w) == y_validation).mean()
    print("train accuracy:", train_acc)
    print("validation accuracy:", val_acc)
    validation_accuracy.append(val_acc)

# Weight each partition's validation accuracy by its share of the full training set.
population = np.array([p.shape[0] / X.shape[0] for p in xs])
# Convert the list explicitly instead of relying on implicit list * ndarray broadcasting.
print("\n\nCombined validation accuracy = ", (np.asarray(validation_accuracy) * population).sum())

training and validation model on sub-dataset number:  0
train accuracy: 0.8451269861128488
validation accuracy: 0.8445678827002953
training and validation model on sub-dataset number:  1
train accuracy: 0.8139921012331748
validation accuracy: 0.8111419175962344
training and validation model on sub-dataset number:  2
train accuracy: 0.8508051509813165
validation accuracy: 0.8472608177848353
training and validation model on sub-dataset number:  3
train accuracy: 0.8489086909931758
validation accuracy: 0.8513422061809158


Combined validation accuracy =  0.8353432044771721


In [27]:
# Read the complete test file (sub-sampling disabled); load_csv_data returns the same
# three values as for the training file.
TEST_CSV = "data/test.csv"
y_test, X_test, ids_test = load_csv_data(TEST_CSV, sub_sample=False)

In [28]:
# Partition the test data based on the value of PRI_jet_num (column index 22, zero-based),
# using the same partition_data helper as for the training data. The per-partition features
# (xs_test) and ids (idss_test) are kept; the second return value is discarded.
xs_test, _, idss_test = partition_data(X_test, y_test, ids_test)

In [29]:
# Apply the same feature engineering to every partition of the TEST SET.
# NOTE: each entry of `xs_test` is overwritten with its processed version, so this cell
# should be run only once after the test partitioning cell.
for part_idx in range(len(xs_test)):
    xs_test[part_idx] = process_features(xs_test[part_idx])

In [30]:
# Predict labels for the test set: each partition gets a bias column and is scored with the
# weight vector trained on the partition of the same index.
labels = [
    predict_simple(add_bias(x_part), ws[part_idx])
    for part_idx, x_part in enumerate(xs_test)
]

# Stack the per-partition predictions into one flat array (still in partition order).
y_preds = np.concatenate(labels)

In [31]:
# Produce the submission CSV: ids in ascending order, each paired with its predicted label.
#
# Results are bound to new names instead of overwriting `idss_test` and `y_preds`, so this
# cell can be re-run on its own without first re-running the partitioning and prediction
# cells.
ids_test_flat = np.concatenate(idss_test)
sort_order = np.argsort(ids_test_flat)

ids_test_sorted = ids_test_flat[sort_order]
print("idss_test.shape=", ids_test_sorted.shape)

# y_preds was concatenated partition by partition, like the ids above, so the same
# permutation lines predictions up with their ids (assumes partition_data returns
# features and ids in matching order).
y_preds_sorted = y_preds[sort_order]
print("y_preds.shape=", y_preds_sorted.shape)

create_csv_submission(ids_test_sorted, y_preds_sorted, "submission-0001.csv")

idss_test.shape= (568238,)
y_preds.shape= (568238,)
