In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Pre-processing of the training data:

In [3]:
from data_processing import *

thresh = 0
std_limit = 5
processed_data, most_frq_mass, jet_num_idx, possible_jets, idx_out, all_mean, all_std = process_data(tX, thresh, std_limit)


## Linear regression using gradient descent:

In [4]:
from implementations import *

max_iters_GD = 10
gamma = 0.1
loss_GD = []
w_GD = []

# For each group run the Linear regression using gradient descent
for jet in possible_jets:
    y_data = y[jet_num_idx[jet][0]]
    initial_w = np.zeros(processed_data[jet].shape[1]+1)
    num_samples = len(y_data)
    tx_offset = np.c_[np.ones(num_samples), processed_data[jet]]
    
    # Gradient descent
    loss, w = least_squares_GD(y_data, tx_offset, initial_w, max_iters_GD, gamma)
    loss_GD.append(loss)
    w_GD.append(w)
    
    # Print the results
    print("Linear regression using gradient descent (jet_num={j}): loss*={l}".format(
      l=loss, j=jet))


Linear regression using gradient descent (jet_num=0): loss*=0.30348548560801025
Linear regression using gradient descent (jet_num=1): loss*=0.39205235851210957
Linear regression using gradient descent (jet_num=2): loss*=0.371602859565645
Linear regression using gradient descent (jet_num=3): loss*=0.38093296875464505


## Linear regression using stochastic gradient descent:

In [5]:
max_iters_SGD = 20
gamma = 0.1
loss_SGD = []
w_SGD = []

# For each group run the Linear regression using stochastic gradient descent
for jet in possible_jets:
    y_data = y[jet_num_idx[jet][0]]
    initial_w = np.zeros(processed_data[jet].shape[1]+1)
    num_samples = len(y_data)
    tx_offset = np.c_[np.ones(num_samples), processed_data[jet]]
    
    # Stochastic gradient descent
    loss, w = least_squares_SGD(y_data, tx_offset, initial_w, max_iters_GD, gamma)
    loss_SGD.append(loss)
    w_SGD.append(w)
    
    # Print the results
    print("Linear regression using stochastic gradient descent (jet_num={j}): loss*={l}".format(
      l=loss, j=jet))

Linear regression using stochastic gradient descent (jet_num=0): loss*=1.0111865801136868
Linear regression using stochastic gradient descent (jet_num=1): loss*=1.0752660517066155
Linear regression using stochastic gradient descent (jet_num=2): loss*=14.08418955065592
Linear regression using stochastic gradient descent (jet_num=3): loss*=1.6675451352125146


## Least squares regression using normal equations:

In [6]:
loss_LS = []
w_LS = []

# For each group run Least Squares
for jet in possible_jets:
    y_data = y[jet_num_idx[jet][0]]
    initial_w = np.zeros(processed_data[jet].shape[1]+1)
    num_samples = len(y_data)
    tx_offset = np.c_[np.ones(num_samples), processed_data[jet]]
    
    # Least Squares
    loss, w = least_squares(y_data, tx_offset)
    loss_GD.append(loss)
    w_GD.append(w)
    
    # Print the results
    print("Least Squares (jet_num={j}): loss*={l}".format(
      l=loss, j=jet))

Least Squares (jet_num=0): loss*=0.2680627711475311
Least Squares (jet_num=1): loss*=0.3691029570396751
Least Squares (jet_num=2): loss*=0.3537527108680059
Least Squares (jet_num=3): loss*=0.357532399372507


## Ridge regression using normal equations:

In [7]:
from hyperparameters import *

k_fold = 4
degrees = []
lambdas = []

loss_RR = []
w_RR = []

# For each group run Ridge Regression 
for jet in possible_jets:
    y_data = y[jet_num_idx[jet][0]]
    
    # Find best hyperparameters
    best_degree, best_lambda, min_testerror, accuracy, best_error, best_acc = find_hyperparameters(y_data, processed_data[jet], k_fold,seed=1)
    lambdas.append(best_lambda)
    degrees.append(best_degree)
    
    # Build polynomial
    tx = build_poly(processed_data[jet], degrees[jet])
    
    # Ridge Regression 
    loss, w = ridge_regression(y_data, tx, lambdas[jet])
    loss_RR.append(loss)
    w_RR.append(w)

    # Print the results
    print("Ridge Regression (jet_num={j}): loss*={l}".format(
      l=loss, j=jet))

Ridge Regression (jet_num=0): loss*=0.23918437365081713
Ridge Regression (jet_num=1): loss*=0.31823864046736117
Ridge Regression (jet_num=2): loss*=0.28951305976086317
Ridge Regression (jet_num=3): loss*=0.2980138231531105


## Logistic regression using gradient descent:

In [9]:
from hyperparameters import *
from costs import *

max_iters_LR = 10000
gamma_LR = 0.001
threshold_LR = 1e-3

loss_LR = []
w_LR = []

# For each group run Logistic Regression
for jet in possible_jets:
    y_data = y[jet_num_idx[jet][0]]
    y_bin = 1*np.equal(y_data,1)
    
    # Build polynomial
    tx = build_poly(processed_data[jet], degrees[jet])
    
    initial_w = np.zeros(tx.shape[1])
    
    # Logistic Regression
    loss, w = logistic_regression(y_bin, tx, initial_w, max_iters_LR, gamma_LR, threshold_LR)
    loss_LR.append(loss)
    w_LR.append(w)
    
    # Print the results
#     print("Logistic Regression (jet_num={j}): loss*={l}".format(
#       l=loss, j=jet))

## Regularized logistic regression using gradient descent:

In [10]:
loss_RLR = []
w_RLR = []

# For each group run Logistic Regression
for jet in possible_jets:
    y_data = y[jet_num_idx[jet][0]]
    y_bin = 1*np.equal(y_data,1)
    
    # Build polynomial
    tx = build_poly(processed_data[jet], degrees[jet])
    
    initial_w = np.zeros(tx.shape[1])
    
    # Regularized Logistic Regression
    loss, w = reg_logistic_regression(y_bin, tx, lambdas[jet], initial_w, max_iters_LR, gamma_LR, threshold_LR)
    loss_RLR.append(loss)
    w_RLR.append(w)
    
    # Regularized Logistic Regression
#     print("Logistic Regression (jet_num={j}): loss*={l}".format(
#       l=loss, j=jet))

## Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = 'test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
w_optimal = w_RR

# Pre-processing test data

# Grouping according to jet number
jet_num_idx = grouping(tX_test)
tx_imp = tX_test

# Imputation
tx_imp[:, 0] = np.where(tx_imp[:, 0] == -999, most_frq_mass, tx_imp[:, 0])
tx_imp[jet_num_idx[0], :] = np.where(tx_imp[jet_num_idx[0], :] == -999, 0, tx_imp[jet_num_idx[0], :])
tx_imp[jet_num_idx[1], :] = np.where(tx_imp[jet_num_idx[1], :] == -999, 0, tx_imp[jet_num_idx[1], :])

# Create predictions
OUTPUT_PATH = 'predictions.csv'

processed_tX_test= []
y_pred = np.empty(tX_test.shape[0])

for jet in possible_jets:
    data = tx_imp[jet_num_idx[jet][0], :]
    num_samples = data.shape[0]
    all_idx = range(data.shape[1])    
    processed_tX_test.append(data[:, np.setdiff1d(all_idx, idx_out[jet])])
    
    # Handle outliers and standardize
    processed_tX_test[jet] = clip_outliers(processed_tX_test[jet], std_limit, all_mean[jet], all_std[jet])
    test_offset = build_poly(processed_tX_test[jet], degrees[jet])
    
    # Create preditions for each jet number
    y_pred[jet_num_idx[jet][0]] = predict_labels(w_optimal[jet], test_offset)

create_csv_submission(ids_test, y_pred, OUTPUT_PATH)