In [1]:
import io, os, sys, types
import numpy as np

In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import datetime

In [3]:
from implementations import *
from proj1_helpers import *
from cross_validation import *

In [4]:
# Names of the dataset variables. jet_num_index below is used to index the
# columns of the feature matrix, so this list is presumably in the same order
# as those columns -- TODO confirm against load_csv_data.
dataset_variables = [
    # "DER_"-prefixed variables
    "DER_mass_MMC",
    "DER_mass_transverse_met_lep",
    "DER_mass_vis",
    "DER_pt_h",
    "DER_deltaeta_jet_jet",
    "DER_mass_jet_jet",
    "DER_prodeta_jet_jet",
    "DER_deltar_tau_lep",
    "DER_pt_tot",
    "DER_sum_pt",
    "DER_pt_ratio_lep_tau",
    "DER_met_phi_centrality",
    "DER_lep_eta_centrality",
    # "PRI_"-prefixed variables
    "PRI_tau_pt",
    "PRI_tau_eta",
    "PRI_tau_phi",
    "PRI_lep_pt",
    "PRI_lep_eta",
    "PRI_lep_phi",
    "PRI_met",
    "PRI_met_phi",
    "PRI_met_sumet",
    "PRI_jet_num",
    "PRI_jet_leading_pt",
    "PRI_jet_leading_eta",
    "PRI_jet_leading_phi",
    "PRI_jet_subleading_pt",
    "PRI_jet_subleading_eta",
    "PRI_jet_subleading_phi",
    "PRI_jet_all_pt",
]

# Position of the jet-number variable inside dataset_variables.
jet_num_index = dataset_variables.index("PRI_jet_num")

In [5]:
# Load the training file; the three returned values are bound to the labels,
# the feature matrix and the ids respectively.
data_train_path = 'data/train.csv' 
y_train, tx_train, ids_train = load_csv_data(data_train_path)

# Load the test file the same way. The first returned value (bound to y_train
# for the training file) is discarded here.
data_test_path = 'data/test.csv' 
_, tx_test, ids_test = load_csv_data(data_test_path)

In [6]:
def standardize(x):
    """Standardize ``x`` along axis 0 to zero mean and unit standard deviation.

    NaN entries are ignored when the mean and standard deviation are computed
    (``np.nanmean`` / ``np.nanstd``) and remain NaN in the result.

    Args:
        x (np.array): Data to standardize; statistics are taken along axis 0.
            May contain NaNs.

    Returns:
        np.array: Array with the same shape as ``x``. A column whose standard
            deviation is 0 (constant column) is only centered, i.e. it becomes
            all zeros instead of NaN.
    """
    centered_data = x - np.nanmean(x, axis=0)
    column_std = np.nanstd(centered_data, axis=0)

    # Dividing a constant column by its std (0) would yield 0/0 = NaN.
    # Substitute 1 as divisor so such a column simply stays centered.
    safe_std = np.where(column_std == 0, 1.0, column_std)

    return centered_data / safe_std

In [7]:
def accuracy(labels_gt, labels_pred):
    """ Computes accuracy, i.e. the fraction of predictions equal to the GT.

    Args:
        labels_gt (np.array): GT labels of shape (N, ).
        labels_pred (np.array): Predicted labels of shape (N, ).

    Returns:
        float: Accuracy, in range [0, 1].
    """
    # Accept any array-like input (lists, boolean arrays, ...).
    labels_gt = np.asarray(labels_gt)
    labels_pred = np.asarray(labels_pred)

    # Number of matching labels divided by the number of GT labels.
    return np.sum(labels_gt == labels_pred) / labels_gt.shape[0]

In [67]:
# Jet-number values used to partition the data: one subset is built for each
# value by comparing the jet-number column against it.
jets = [0, 1, 2, 3]

def prepare_into_jet_subsets(y, tx, ids):
    """Standardize the features and split the data into one subset per jet number.

    Entries of ``tx`` equal to -999 are replaced by NaN before standardizing.
    The jet-number column (``jet_num_index``) is kept unstandardized so it can
    be compared against the values in ``jets``.

    Args:
        y (np.array): Labels aligned with the rows of ``tx``. Pass a 0-d array
            (e.g. ``np.array(0)``) or ``None`` when there are no labels.
        tx (np.array): 2-D float feature matrix. It is NOT modified; the
            function works on a copy.
        ids (np.array): Ids aligned with the rows of ``tx``.

    Returns:
        tuple: ``(y_split, tx_split, ids_split)``, three dicts keyed by jet
            number. ``y_split`` is empty when no labels were given.
    """
    # Copy first: writing NaN into the caller's array would silently change
    # the data seen by every later user of that array.
    tx = np.array(tx, copy=True)
    tx[tx == -999] = np.nan

    std_tx = standardize(tx)
    # Restore the raw jet numbers; they are used as a split key, not a feature
    # value to be standardized.
    std_tx[:, jet_num_index] = tx[:, jet_num_index]

    # A scalar / 0-d (or None) y means "no labels available".
    has_labels = np.shape(y) != ()

    y_split_temp = {}
    tx_split_temp = {}
    split_ids_temp = {}

    # Splits the data into subsets, one for each jet number.
    for jet in jets:
        jet_rows = std_tx[:, jet_num_index] == jet
        split_ids_temp[jet] = ids[jet_rows]
        if has_labels:
            y_split_temp[jet] = y[jet_rows]
        tx_split_temp[jet] = std_tx[jet_rows]
    return y_split_temp, tx_split_temp, split_ids_temp

In [66]:
# Standardize the training data and split labels, features and ids into one
# subset per jet number (dicts keyed by jet number).
y_split, tx_split, ids_split = prepare_into_jet_subsets(y_train, tx_train, ids_train)

In [10]:
# Remove, from every jet subset, the feature columns in which every entry is NaN.
# The removed column indices are remembered per jet in del_indices.
del_indices = {}
for jet in jets:
    jet_features = tx_split[jet]
    n_events = jet_features.shape[0]

    # Column index of every NaN entry, then the NaN count of each such column.
    nan_col_ids = np.nonzero(np.isnan(jet_features))[1]
    col_ids, nan_counts = np.unique(nan_col_ids, return_counts=True)

    # A column is entirely NaN when it holds as many NaNs as there are rows.
    indices_todel = col_ids[nan_counts == n_events]
    del_indices[jet] = indices_todel

    tx_split[jet] = np.delete(jet_features, indices_todel, axis=1)

In [11]:
def nans_to_mean(tx_split_ntm, jet):
    """Replace the NaNs of one jet subset by the mean of their column.

    Args:
        tx_split_ntm (dict): Maps a jet number to its 2-D feature array.
        jet: Key of the subset to process.

    Returns:
        np.array: Copy of ``tx_split_ntm[jet]`` in which every NaN is replaced
            by the mean of the non-NaN entries of the same column. The input
            array is not modified. A column made only of NaNs stays NaN.
    """
    tx_jet = tx_split_ntm[jet]
    tx_jet_no_nan = tx_jet.copy()

    nan_mask = np.isnan(tx_jet)
    if not nan_mask.any():
        # Nothing to impute.
        return tx_jet_no_nan

    # Per-column mean over the valid (non-NaN) entries only.
    column_means = np.nanmean(tx_jet, axis=0)

    # np.nonzero and boolean-mask assignment both walk the array in row-major
    # order, so the i-th NaN entry receives the mean of its own column.
    nan_columns = np.nonzero(nan_mask)[1]
    tx_jet_no_nan[nan_mask] = column_means[nan_columns]
    return tx_jet_no_nan

In [12]:
# Replace every NaN that is still present by the mean of its variable,
# computed within the same jet subset.
for jet in jets:
    tx_split[jet] = nans_to_mean(tx_split, jet)

In [13]:
# Runs ridge regression separately on every jet subset.
# Results are stored per jet: best_degrees[jet] holds the selected polynomial
# degree and w_preds[jet] the fitted weights; both are reused for prediction.
best_degrees = {}
w_preds = {}
for jet in jets:
    # Search grid (identical for every jet; re-created on each iteration).
    degrees = np.arange(2,4)  # candidate degrees: 2 and 3
    k_fold = 3  # passed as k_fold to the selection helper (presumably the number of CV folds -- confirm)
    lambdas = np.logspace(-4, 0, 30)  # 30 values, log-spaced from 1e-4 to 1
    #Computes the best degree and lambda
    # NOTE(review): best_degree_selection is a project helper not visible here;
    # its third return value is discarded.
    best_degree, best_lambda, _ = best_degree_selection(tx_split[jet], y_split[jet],
                                                        degrees, k_fold, lambdas, seed = 1)
    #Saves the degree for predictions
    best_degrees[jet] = best_degree
    
    # Expand the features with the selected degree and fit on the whole subset
    # with the selected lambda; the second return value is discarded.
    poly_tr = build_poly(tx_split[jet], best_degree)
    w_pred, _ = ridge_regression(y_split[jet], poly_tr, best_lambda)

    w_preds[jet] = w_pred
    print("Ridge done for jet subset nr", jet)

Ridge done for jet subset nr 0
Ridge done for jet subset nr 1
Ridge done for jet subset nr 2
Ridge done for jet subset nr 3


In [74]:
# Splits the test data in the same way as the training data.
# There are no test labels: a 0-d array is passed as y, which
# prepare_into_jet_subsets detects through its shape () and therefore skips
# the label split (the returned label dict is discarded).
_, te_tx_split, te_ids_split = prepare_into_jet_subsets(np.array(0), tx_test, ids_test)

In [75]:
# Drop from every test subset exactly the columns that were dropped from the
# training subset of the same jet, so both keep the same feature layout.
for jet in jets:
    removed_cols = del_indices[jet]
    te_tx_split[jet] = np.delete(te_tx_split[jet], removed_cols, axis=1)

In [76]:
# Sets the remaining NaNs to the mean of the variable.
# NOTE(review): nans_to_mean computes the means from the subset it is given,
# so the test data is imputed with test-subset means, not training means --
# confirm this is intended.
for jet in jets:
    te_tx_split[jet] = nans_to_mean(te_tx_split, jet)

In [77]:
# Predict the labels of every test subset with the weights and polynomial
# degree obtained for the same jet during training.
predicted_labels = {}
for jet in jets:
    jet_degree = best_degrees[jet]
    poly_te = build_poly(te_tx_split[jet], jet_degree)
    predicted_labels[jet] = predict_labels(w_preds[jet], poly_te)

In [82]:
# Stitch the per-jet predictions and ids back together. Both are concatenated
# in the order of `jets` so that each label stays aligned with its id, and so
# that this cell follows `jets` instead of hard-coding the subset keys.
pred_labels_final = np.concatenate([predicted_labels[jet] for jet in jets])
ids_final = np.concatenate([te_ids_split[jet] for jet in jets])

In [86]:
# Write the ids and their predicted labels to "prediction.csv"
# (create_csv_submission is a project helper brought in by a star import).
create_csv_submission(ids_final, pred_labels_final, "prediction.csv")