In [12]:
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
from implementations import (
    mean_squared_error_gd,
    mean_squared_error_sgd,
    least_squares,
    ridge_regression,
    logistic_regression,
    reg_logistic_regression,
    predict_simple,
    predict_logistic,
    penalized_logistic_regression,
)

In [13]:
# Read the training CSV without sub-sampling; the three values returned by
# load_csv_data are bound to Y, X and ids, in that order.
Y, X, ids = load_csv_data(
    "data/train.csv",
    sub_sample=False,
)

In [14]:
# Ordered (unshuffled) hold-out split: the first 90% of the rows are used for
# training, the remaining rows form the local test set.
N = X.shape[0]
thresh = int(0.9 * N)

# Head of every array -> training part.
X_train, Y_train, ids_train = X[:thresh], Y[:thresh], ids[:thresh]
# Tail of every array -> local test part.
X_test_local, Y_test_local, ids_test_local = X[thresh:], Y[thresh:], ids[thresh:]

In [15]:
# Fix NumPy's global RNG seed so later np.random draws (e.g. the random weight
# initialisation in the training cell) are repeatable across runs.
np.random.seed(seed=42)

In [16]:
# Split the training rows into sub-datasets according to the value of
# PRI_JET_NUM (feature number 22). The third return value of partition_data
# is not used for the training split and is discarded.
xs, ys, _ = partition_data(
    X_train,
    Y_train,
    ids_train,
)

In [17]:
# Data Visualization [Please Uncomment if you want to see the plots]

# columns_names = ["DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis", "DER_pt_h", "DER_deltaeta_jet_jet", "DER_mass_jet_jet", "DER_prodeta_jet_jet",	"DER_deltar_tau_lep",	"DER_pt_tot",	"DER_sum_pt",	"DER_pt_ratio_lep_tau",	"DER_met_phi_centrality",	"DER_lep_eta_centrality",	"PRI_tau_pt",	"PRI_tau_eta",	"PRI_tau_phi",	"PRI_lep_pt",	"PRI_lep_eta",	"PRI_lep_phi",	"PRI_met",	"PRI_met_phi",	"PRI_met_sumet",	"PRI_jet_num",	"PRI_jet_leading_pt",	"PRI_jet_leading_eta", "PRI_jet_leading_phi", "PRI_jet_subleading_pt", "PRI_jet_subleading_eta", "PRI_jet_subleading_phi", "PRI_jet_all_pt"]
#
#
# ## Plot Box Plots
# for i, x in enumerate(xs):
#     for j in range(x.shape[1]):
#         plt.boxplot(x[:, j][x[:, j]!=-999])
#         plt.title("BoxPlot for: " +  columns_names[j]  + " of sub-dataset: " + str(i))
#         plt.ylabel("values")
#         plt.show()
#
#
# ## Plot Histograms
# for i, x in enumerate(xs):
#     for j in range(x.shape[1]):
#         plt.hist(x[:, j][x[:, j]!=-999], bins=40)
#         plt.title("Histogram for: " +  columns_names[j]  + " of sub-dataset: " + str(i))
#         plt.ylabel("count")
#         plt.xlabel("value")
#         plt.show()

In [18]:
# Feature-engineer every sub-dataset of the TRAIN SET, overwriting the entries
# of xs in place.
# Caution: because xs is overwritten, re-running this cell on its own feeds the
# already-processed arrays back into process_features; re-run the partitioning
# cell first to start again from the raw sub-datasets.
for idx in range(len(xs)):
    xs[idx] = process_features(xs[idx])

In [19]:
# Perform 4-fold cross-validation to find the best hyper-parameter (lambda) of the Ridge Regression classifier for each sub-dataset. [Please uncomment if you need to redo the cross-validation.]

# lambdas = []
# # for i, x, y in zip(range(len(xs)), xs, ys):
# #     best_lambda, best_loss = do_cross_validation(x, y, nfolds = 4)
# #     lambdas.append(best_lambda)
# #     print(f'set {i}: Optimal lambda = {best_lambda} leads to loss = {best_loss}')

In [20]:
# Regularisation strengths, one entry per sub-dataset; later cells look them
# up as lambdas[i] for sub-dataset i. All four are currently zero.
lambdas = [0] * 4

In [21]:
# Train one regularised logistic-regression model per sub-dataset and plot its
# learning curves.
#
# For every sub-dataset the rows are split 80/20 (in order, unshuffled) into a
# training and a validation part. The weights are refined in N_CHUNKS calls to
# reg_logistic_regression; training loss/accuracy are recorded after every call
# and validation loss/accuracy after every VAL_EVERY-th call. The final weights
# are collected in `ws`, and the final training-split accuracy / F1 score in
# `accuracies` / `f1scores`, which feed the weighted summary at the bottom.
#
# NOTE(review): a previous run of this cell reported validation accuracies of
# about 0.001 together with overflow warnings. Presumably the labels in `ys`
# and the output of predict_logistic use different encodings (e.g. {-1, 1} vs
# {0, 1}) -- confirm against helpers / implementations before trusting the
# curves.

N_CHUNKS = 2000  # number of calls to reg_logistic_regression per sub-dataset
ITERS_PER_CHUNK = 10  # 5th positional argument; presumably max_iters -- TODO confirm
VAL_EVERY = 100  # evaluate on the validation split every VAL_EVERY chunks


def _plot_curve(train_x, train_y, val_x, val_y, ylabel, title):
    """Draw a train curve and a validation curve on one labelled figure."""
    fig, ax = plt.subplots()
    ax.plot(train_x, train_y, label="train")
    ax.plot(val_x, val_y, label="validation")
    ax.grid()
    ax.set_xlabel("num iter")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    plt.show()
    # Release the figure: this helper is called twice per sub-dataset.
    plt.close(fig)


ws = []
accuracies = []
f1scores = []

for i, (x, y) in enumerate(zip(xs, ys)):
    print("training the model on sub-dataset number: ", i)
    x = add_bias(x)

    lambda_ = lambdas[i]

    # Local name for the split index so the notebook-level N / thresh from the
    # train/local-test split cell are not overwritten by this loop.
    split = int(0.8 * x.shape[0])
    x_train, x_val = x[:split], x[split:]
    y_train, y_val = y[:split], y[split:]

    training_iter, training_loss, training_accuracy = [], [], []
    validation_iter, validation_loss, validation_accuracy = [], [], []

    w = np.random.random(x_train.shape[1])
    lr = 0.001
    for step in range(1, N_CHUNKS + 1):
        # Continue from the current weights for another chunk of iterations.
        # The same lambda_ is used for training and for the validation loss
        # below so that the two loss curves are directly comparable.
        w, loss = reg_logistic_regression(
            y_train, x_train, lambda_, w, ITERS_PER_CHUNK, lr
        )
        training_iter.append(step * ITERS_PER_CHUNK)
        training_loss.append(loss)
        training_accuracy.append(np.mean(predict_logistic(x_train, w) == y_train))

        if step % VAL_EVERY == 0:
            validation_iter.append(step * ITERS_PER_CHUNK)
            validation_loss.append(
                penalized_logistic_regression(y_val, x_val, w, lambda_)[0]
            )
            validation_accuracy.append(np.mean(predict_logistic(x_val, w) == y_val))
            print(step, validation_accuracy[-1], lr)

    ws.append(w)

    # Final metrics on the training split. Without these appends the weighted
    # summary below multiplies an empty list by a length-4 array and raises a
    # broadcasting ValueError.
    # `f1` is expected to come from `from helpers import *` (it is also used by
    # the local-test evaluation cell).
    y_pred_train = predict_logistic(x_train, w)
    accuracy_set_i = np.mean(y_pred_train == y_train)
    print("training accuracy on set" + str(i) + ":", accuracy_set_i)
    accuracies.append(accuracy_set_i)
    f1scores.append(f1(y_train, y_pred_train))

    _plot_curve(
        training_iter, training_loss, validation_iter, validation_loss,
        ylabel="loss", title="Loss curve",
    )
    _plot_curve(
        training_iter, training_accuracy, validation_iter, validation_accuracy,
        ylabel="accuracy", title="accuracy curve",
    )

# Weight every sub-dataset's score by its share of the training rows.
population_percentage = np.array([p.shape[0] / X_train.shape[0] for p in xs])
print(
    "\n\nWeighted mean accuracy for training data = ",
    (np.asarray(accuracies) * population_percentage).sum(),
)
print(
    "Weighted F1-Score for training data = ",
    (np.asarray(f1scores) * population_percentage).sum(),
)

training the model on sub-dataset number:  0


  elementwise_loss = np.log(1+np.exp(xw)) - y*xw
  return 1/(1+np.exp(-t))


100 0.001000111123458162 0.001
200 0.0008889876652961441 0.001


KeyboardInterrupt: 

In [None]:
# Local testing, step 1: prepare the held-out local test set.

# Split the local test rows by the value of PRI_JET_NUM (feature number 22);
# the third return value is not needed here.
xs_test_local, ys_test_local, _ = partition_data(
    X_test_local,
    Y_test_local,
    ids_test_local,
)

# Apply the same feature engineering to each local-test sub-dataset, replacing
# the raw entry at the same index.
for idx in range(len(xs_test_local)):
    xs_test_local[idx] = process_features(xs_test_local[idx])

In [None]:
# Local testing, step 2: score the per-sub-dataset weights in `ws` on the local
# test set and report accuracy and F1, both per sub-dataset and weighted by
# each sub-dataset's share of the local test rows.
# NOTE(review): predictions here come from predict_simple, whereas the logistic
# training cell scores its models with predict_logistic -- confirm that the
# predictor matches whichever model currently sits in `ws`.
accuracies, f1scores = [], []
for i, (x, y) in enumerate(zip(xs_test_local, ys_test_local)):
    x = add_bias(x)
    # Predicted labels for sub-dataset i using its own weight vector.
    y_pred_i = predict_simple(x, ws[i])
    accuracy_set_i = np.mean(y_pred_i == y)
    print(f"test accuracy on set{i}:", accuracy_set_i)
    accuracies.append(accuracy_set_i)
    f1scores.append(f1(y, y_pred_i))

# Fraction of the local test rows that falls into each sub-dataset.
population_percentage = np.array(
    [part.shape[0] / X_test_local.shape[0] for part in xs_test_local]
)
weighted_accuracy = (accuracies * population_percentage).sum()
weighted_f1 = (f1scores * population_percentage).sum()
print("\n\nWeighted mean accuracy for local test set = ", weighted_accuracy)
print("\n\nWeighted F1-Score for local test data = ", weighted_f1)

In [None]:
# Retrain on the whole dataset (X, Y, ids -- i.e. including the rows held out
# for local testing) with ridge regression, one model per sub-dataset, and
# replace `ws` with the resulting weights.
# NOTE(review): this cell is live code, so after it runs the weights used for
# the final predictions are the ridge weights, not the ones from the logistic
# training cell. Comment it out to keep the earlier weights.

Xs, Ys, _ = partition_data(X, Y, ids)
for idx in range(len(Xs)):
    Xs[idx] = process_features(Xs[idx])

Ws = []

for i, (x, y) in enumerate(zip(Xs, Ys)):
    print("training the model on sub-dataset number: ", i)
    x = add_bias(x)
    lambda_ = lambdas[i]
    # The returned loss is not used further.
    w, loss = ridge_regression(y, x, lambda_)
    Ws.append(w)

# From here on `ws` refers to the ridge weights.
ws = Ws

In [None]:
# Read the main (AICrowd) test CSV without sub-sampling; the three values
# returned by load_csv_data are bound to Y_test, X_test and ids_test.
Y_test, X_test, ids_test = load_csv_data(
    "data/test.csv",
    sub_sample=False,
)

In [None]:
# Split the test rows by the value of PRI_JET_NUM (feature number 22). The
# per-partition ids are kept (idss_test) so predictions can be put back in id
# order later; the second return value is discarded.
xs_test, _, idss_test = partition_data(
    X_test,
    Y_test,
    ids_test,
)

In [None]:
# Feature-engineer every sub-dataset of the test set, overwriting the entries
# of xs_test in place.
# Caution: re-running this cell on its own processes the already-processed
# arrays again; re-run the test partitioning cell first.
for idx in range(len(xs_test)):
    xs_test[idx] = process_features(xs_test[idx])

In [None]:
# Predict labels for each test sub-dataset with the weight vector of the same
# index in `ws` (after adding the bias column), then stack the per-partition
# predictions into one array in partition order.
labels = [
    predict_simple(add_bias(x_test), ws[i]) for i, x_test in enumerate(xs_test)
]

y_preds = np.concatenate(labels)

In [None]:
# Produce the submission.csv output.
#
# The predictions in y_preds are in partition order, so the per-partition ids
# are concatenated in the same order, sorted ascending, and the same
# permutation is applied to the predictions.
#
# The sorted results are stored under new names instead of overwriting
# y_preds: the ids are rebuilt from idss_test on every run, so overwriting
# y_preds would permute the predictions a second time if this cell were
# re-run, misaligning them with the ids.
idss_test_concat = np.concatenate(idss_test)
idx_ids_sorted = np.argsort(idss_test_concat)

ids_sorted = idss_test_concat[idx_ids_sorted]
y_preds_sorted = y_preds[idx_ids_sorted]

create_csv_submission(ids_sorted, y_preds_sorted, "submission.csv")