In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Relative path to the training CSV file.
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
# load_csv_data comes from proj1_helpers; per the section heading, the three results are the
# class labels (y), the feature matrix (tX) and the event ids (ids).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [3]:
# -------------------------------------------------- GROUPING ------------------------------------------------------------
# Grouping according to jet number

def grouping(tX):
    """Locate the rows that belong to each jet-number group.

    Column 22 of ``tX`` is compared against the values 0, 1, 2 and 3.

    Returns a list with four entries (one per value, in that order); each
    entry is the raw ``np.where`` result, i.e. a 1-tuple holding the array
    of matching row indices.
    """
    return [np.where(tX[:, 22] == jet_value) for jet_value in (0, 1, 2, 3)]
# Row indices of the training matrix, split into the four jet-number groups.
jet_num_idx = grouping(tX)

In [4]:
# For each of the four jet groups, list the feature columns that contain the -999 sentinel.
# Each group entry is a 1-tuple, so the indexed block is 3-D and the columns sit on axis 2.
tuple(
    np.unique(np.where(tX[jet_num_idx[group], :] == -999)[2])
    for group in range(4)
)

(array([ 0,  4,  5,  6, 12, 23, 24, 25, 26, 27, 28]),
 array([ 0,  4,  5,  6, 12, 26, 27, 28]),
 array([0]),
 array([0]))

In [5]:
# -------------------------------------- IMPUTATION ACCORDING TO JET NUMBER ----------------------------------------------
def imputation(data, jet_num_idx):
    """Return a copy of ``data`` with the -999 placeholders imputed.

    Two passes are applied to the copy (``data`` itself is left untouched):

    1. Column 0 (the mass column): every -999 is replaced by the most
       frequent observed value, where observed values are rounded to the
       nearest integer before being counted.
    2. Rows selected by ``jet_num_idx[0]`` and ``jet_num_idx[1]``: every
       remaining -999, in any column, is replaced by 0.

    Entries of ``jet_num_idx`` beyond the first two are not used.
    """
    SENTINEL = -999
    filled = data.copy()

    # Pass 1 -- mode of the rounded, non-missing masses.
    observed_mass = data[data[:, 0] != SENTINEL, 0]
    mass_histogram = np.bincount(np.round(observed_mass).astype(int))
    filled[filled[:, 0] == SENTINEL, 0] = np.argmax(mass_histogram)

    # Pass 2 -- zero-fill what is left for jet_num = 0 and jet_num = 1.
    for jet in (0, 1):
        rows = jet_num_idx[jet]
        block = filled[rows, :]
        filled[rows, :] = np.where(block == SENTINEL, 0, block)

    return filled
# Imputed version of the training matrix; tX itself is not modified (imputation works on a copy).
tx = imputation (tX, jet_num_idx)

In [6]:
# ----------------------------------------------- VARIANCE THRESHOLD -----------------------------------------------------------
# Features are kept only when their variance is strictly greater than this value.
thresh = 0
# Rows of the imputed matrix that belong to the first jet group (jet_num == 0).
data = tx[jet_num_idx[0][0], :]
def variance(data, thresh):
    """Drop the low-variance columns of ``data``.

    A column is kept when its variance, computed down the rows, is strictly
    greater than ``thresh``.  The surviving columns are returned as a new
    array, in their original order.
    """
    keep_mask = np.var(data, axis=0) > thresh
    return data[:, keep_mask]
# Variance-filtered features of the jet_num == 0 subset.
new_data = variance(data, thresh)

In [7]:
# ---------------------------------------------- CORRELATION COEFFICIENT -------------------------------------------------------
# Correlation threshold. NOTE(review): the call to correlation() in this cell passes the
# literal 0.8 rather than this variable.
thresh_corr = 0.8
# NOTE(review): `data` is re-selected here, but correlation() in this cell is fed `new_data`.
data = tx[jet_num_idx[0][0], :]

def correlation(data, thresh_corr):
    """Remove one side of every highly correlated pair of features.

    The Pearson correlation is computed for every pair of columns of
    ``data``.  A pair ``(i, j)`` with ``j < i`` is flagged when its
    correlation is strictly greater than ``thresh_corr`` (only positive
    correlation is considered).  The flagged pairs give two candidate sets
    of columns -- the higher-index members and the lower-index members --
    and the larger of the two sets is dropped (the lower-index set on a tie).

    Parameters
    ----------
    data : 2-D array, one row per sample and one column per feature.
    thresh_corr : float, correlation above which a pair is flagged.

    Returns
    -------
    A new array holding the remaining columns in their original order.
    """
    num_features = data.shape[1]

    # Start from "nothing flagged" so that entries which are never compared
    # (diagonal, upper triangle) cannot influence the result.
    flagged = np.zeros((num_features, num_features), dtype=bool)
    if num_features > 1:
        # One vectorized call instead of one np.corrcoef call per pair.
        corr_mat = np.atleast_2d(np.corrcoef(data, rowvar=False))
        # Restrict the search to the strict lower triangle (j < i).
        strict_lower = np.tril(
            np.ones((num_features, num_features), dtype=bool), k=-1
        )
        # Comparisons against NaN (e.g. constant columns) are False, so such
        # pairs are never flagged.
        flagged = strict_lower & (corr_mat > thresh_corr)

    rows, cols = np.nonzero(flagged)
    index_out1 = np.unique(rows)   # higher-index member of each flagged pair
    index_out2 = np.unique(cols)   # lower-index member of each flagged pair
    all_idx = np.arange(num_features)

    dropped = index_out1 if len(index_out1) > len(index_out2) else index_out2
    return data[:, np.setdiff1d(all_idx, dropped)]
# Correlation filter applied to the variance-filtered jet_num == 0 subset.
new_data2 = correlation(new_data, thresh_corr=0.8)

In [8]:
# ---------------------------------------------------- STANDARDIZE -------------------------------------------------------------
from data_processing import *
# Labels of the jet_num == 0 rows (same row selection as the features in new_data2).
y_data = y[jet_num_idx[0][0]]
# standardize comes from data_processing; presumably it rescales each feature -- TODO confirm.
tx_std = standardize(new_data2)

In [8]:
#-------------------------------------------------- HANDLING OUTLIERS ----------------------------------------------------------
from data_processing import *
def remove_outliers(data, y, std_limit = 4):
    """Standardize ``data`` and drop every row that contains an outlier.

    A value counts as an outlier when, after standardization, it is greater
    than ``std_limit`` or smaller than ``-std_limit``.  A row is discarded as
    soon as one of its features is an outlier.

    Parameters
    ----------
    data : 2-D array, one row per data point and one column per feature.
    y : array indexed by data point; filtered with the same rows as ``data``.
    std_limit : threshold applied to the standardized values (default 4).

    Returns
    -------
    (standardized_outliers_removed, y_std) : the standardized rows that were
    kept, and the matching entries of ``y``.

    Side effect: prints the percentage of rows that were discarded.
    """
    num_datapoints = np.shape(data)[0]

    # Standardize the array that was passed in rather than a notebook-level
    # global, so the function gives the right answer for any input.
    standardized = standardize(data)

    # Row-wise flag: True when at least one feature lies outside the limits.
    beyond_limit = (standardized > std_limit) | (standardized < -std_limit)
    outlier_rows = np.any(beyond_limit, axis=1)
    num_outlier_rows = int(np.sum(outlier_rows))

    # An empty input would otherwise divide by zero.
    percentage = 100*num_outlier_rows/num_datapoints if num_datapoints else 0.0
    print("Percentage of points containing at least one outlier is", f'{percentage:.3f}%')

    # Sorted integer positions of the rows that are kept.
    keep_rows = np.flatnonzero(~outlier_rows)
    standardized_outliers_removed = standardized[keep_rows]
    y_std = y[keep_rows]

    return standardized_outliers_removed, y_std
# NOTE(review): this rebinds tx_std (previously the standardized jet_num == 0 subset) to the
# outlier-filtered, standardized version of the full imputed matrix; y_std holds its labels.
tx_std, y_std = remove_outliers(tx, y, std_limit = 4)

Percentage of points containing at least one outlier is 7.066%


In [9]:
# Linear regression using gradient descent
from implementations import *
# Define the parameters of the algorithm.
max_iters = 50
gamma = 0.7 # use linspace to test various gamma values and look for the best ?

# Initialization
# One weight per feature plus one for the offset column.
initial_w = np.zeros(tx_std.shape[1]+1)
# Design matrix: a leading column of ones followed by the standardized features.
tx_offset = np.empty([tx_std.shape[0], tx_std.shape[1]+1])
tx_offset[:, 0] = np.ones([tx_std.shape[0]]) 
tx_offset[:, 1:] = tx_std
# NOTE(review): y_data holds the jet_num == 0 labels, while tx_std may have been rebound by the
# outlier-removal cell (whose labels are y_std) -- confirm that y and tx_offset have matching rows.
loss_GD, w_GD = least_squares_GD(y_data, tx_offset, initial_w, max_iters, gamma)

print("Gradient Descent: loss={l}, w = {w}".format(
    l=loss_GD, w=w_GD))


Gradient Descent: loss=42.5924752980188, w = [-0.48971605 -2.60767668 -0.71779636 -3.21291976 -1.52302557 -0.43929834
 -0.49430603 -0.34019738 -1.47949577 -0.04899586  0.02464828 -1.9731975
 -0.03676249  0.00991877  0.02442958 -0.01480398 -1.27873522]


In [None]:
# Time Visualization
# from ipywidgets import IntSlider, interact

# def plot_figure(n_iter):
#     fig = gradient_descent_visualization(
#         gradient_losses, gradient_ws, grid_losses, grid_w0, grid_w1, mean_x, std_x, height, weight, n_iter)
#     fig.set_size_inches(10.0, 6.0)

# interact(plot_figure, n_iter=IntSlider(min=1, max=len(gradient_ws)))

In [135]:
# Linear regression using stochastic gradient descent

# Define the parameters of the algorithm.
max_iters = 50
gamma = 0.7 # use linspace to test various gamma values and look for the best ?

# Initialization
# initial_w = np.zeros(tx.shape[1]+1)
# tx_offset = np.empty([tx.shape[0], tx.shape[1]+1])
# tx_offset[:, 0] = np.ones([tx.shape[0]]) 
# tx_offset[:, 1:] = tx
# NOTE(review): with the initialization above commented out, this cell reuses tx_offset and
# initial_w left over from an earlier cell, and it overwrites loss_GD / w_GD with the SGD results.
# The recorded output shows a loss of about 7e31, which suggests the iteration diverged with this
# gamma -- TODO confirm and retune.
loss_GD, w_GD = least_squares_SGD(y_data, tx_offset, initial_w, max_iters, gamma)

print("Stochastic Gradient Descent: loss={l}, w = {w}".format(
    l=loss_GD, w=w_GD))

Stochastic Gradient Descent: loss=7.2799027472704135e+31, w = [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]


## Generate predictions and save output in csv format for submission:

In [10]:
DATA_TEST_PATH = 'test.csv' # TODO: download test data and supply path here
# The first value returned by load_csv_data (bound to y for the training set) is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = 'predictions.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned anywhere in the visible notebook (the fitted vector is
# named w_GD), and tX_test is passed raw, without the imputation / feature selection /
# standardization / offset column applied to the training features -- confirm before running.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [None]:
# def imputation(tX):
#     # Imputation Using Zero
#     tx_zeros = np.where(tX == -999, 0, tX) 

#     # Imputation using the most frequent value (constant)
#     col_index = np.unique(np.where(tX == -999)[1])
#     tx_mostFrq = tX.copy()
#     for index in col_index:
#         good_idx = np.where(tX[:, index] != -999)
#         round_values = np.round(tX[good_idx, index]).astype(int)
    
#         # Taking care of the negative values
#         neg = round_values[np.where(round_values < 0)]  
#         if neg.size > 0:    
#             # Positive values
#             pos = round_values[np.where(round_values >= 0)]        
#             counts_neg = np.bincount(np.abs(neg))
#             counts_pos = np.bincount(pos)
        
#             if max(counts_neg) > max(counts_pos):
#                 tx_mostFrq[:, index] = np.where(tx_mostFrq[:, index] == -999, -np.argmax(counts_neg), tx_mostFrq[:, index])
        
#             else:
#                 tx_mostFrq[:, index] = np.where(tx_mostFrq[:, index] == -999, np.argmax(counts_pos), tx_mostFrq[:, index])
#         else:
#             counts = np.bincount(round_values[0,:])
#             tx_mostFrq[:, index] = np.where(tx_mostFrq[:, index] == -999, np.argmax(counts), tx_mostFrq[:, index]) 
            
#     tx_mixed = tX.copy()
#     tx_mixed[:, col_index[0]] = tx_mostFrq[:, col_index[0]]
#     tx_mixed[:, col_index[1:]] = tx_zeros[:, col_index[1:]]
    
#     return tx_zeros, tx_mostFrq, tx_mixed