# Exploratory data analysis

In [None]:
import math

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from implementations import *
from proj1_helpers import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# Global font sizes shared by the matplotlib figures in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIG_SIZE, LARGE_SIZE = 11, 14, 16, 24

# Axis labels use the big size, tick labels on both axes the medium one.
plt.rcParams.update({
    'axes.labelsize': BIG_SIZE,
    'xtick.labelsize': MEDIUM_SIZE,
    'ytick.labelsize': MEDIUM_SIZE,
})

In [None]:
def plot_partioned_column(col_name, col_idx, log_scale=False):
    """Draw a 2x2 grid of histograms of one feature, one panel per partition.

    The partitions are the notebook-level arrays ``x0``, ``x1``, ``x2`` and
    ``x3``; they must already be defined when this function is called.

    Args:
        col_name: Name of the feature to plot. Used as a key into ``col_idx``
            and shown in each panel's x-axis label.
        col_idx: Mapping from column name to column index in the partitions.
        log_scale: Forwarded unchanged to ``sns.histplot``.
    """
    fig, ax = plt.subplots(2, 2, figsize=(2 * 7, 2 * 7))
    column = col_idx[col_name]
    # ax.flat walks the grid row by row: (0, 0), (0, 1), (1, 0), (1, 1),
    # so panel k shows the partition with PRI_jet_num = k.
    for jet_num, (panel, part) in enumerate(zip(ax.flat, (x0, x1, x2, x3))):
        sns.histplot(part[:, column], ax=panel, log_scale=log_scale)
        panel.set_xlabel(f'{col_name} | PRI_jet_num = {jet_num}')
        panel.set_ylabel('')

In [None]:
def partition_data(x, y, PRI_jet_num, col_idx):
    """Return the rows of ``x`` and ``y`` whose PRI_jet_num equals the given value.

    Args:
        x: 2-D feature matrix, one row per sample.
        y: Labels, one per row of ``x``.
        PRI_jet_num: Value of the 'PRI_jet_num' column to select.
        col_idx: Mapping from column name to column index in ``x``; must
            contain the key 'PRI_jet_num'.

    Returns:
        Tuple ``(x_i, y_i)`` holding the selected feature rows and their labels.
    """
    # Glue the labels in front of the features so one mask filters both.
    merged = np.c_[y, x]
    # The label now occupies column 0, so every feature column moved right by one.
    jet_column = col_idx['PRI_jet_num'] + 1
    selected = merged[merged[:, jet_column] == PRI_jet_num]
    # Split back into features (columns 1..) and labels (column 0).
    return selected[:, 1:], selected[:, 0]

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
# Relative path to the training CSV.
DATA_TRAIN_PATH = "../data/train.csv"
# load_csv_data is provided by one of the star-imports at the top of the
# notebook (presumably proj1_helpers -- confirm). Its result is unpacked into
# the class labels, the feature matrix and the event ids, in that order.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [None]:
# Sanity check: show the shape of each loaded array, one per line.
print(
    f"y.shape {y.shape}",
    f"tX.shape {tX.shape}",
    f"ids.shape {ids.shape}",
    sep="\n",
)

The class labels are `1` and `-1` and stored in `y`, a column vector.

The features are stored in `tX`.

`col_idx` is a dictionary to map the column names to the index in the `tX` matrix.

In [None]:
# Read only the first line of the CSV (the column names) as strings and drop
# the first two entries, which are Id and Prediction.
header = np.genfromtxt(DATA_TRAIN_PATH, delimiter=",", skip_header=0, dtype=str, max_rows=1)[2:]
# Map every remaining column name to its position.
col_idx = dict(zip(header, range(len(header))))
col_idx

### 1. Distribution in each column

In [None]:
# One histogram per feature column, laid out on a 10 x 3 grid.
# Tip: right-click this cell and disable output scrolling to see all plots at once.
fig, ax = plt.subplots(10, 3, figsize=(3 * 7, 10 * 7))

for col_name, col in col_idx.items():
    # Row-major placement: column k lands in row k // 3, position k % 3.
    i, j = divmod(col, 3)
    sns.histplot(tX[:, col], ax=ax[i, j])
    ax[i, j].set(xlabel=col_name, ylabel="")

### PRI_jet_num

This variable takes only 4 values: 0, 1, 2, 3. We split the data based on it and will train one classifier per group.

In [None]:
# Split features and labels into four groups, one per PRI_jet_num value (0-3).
(x0, y0), (x1, y1), (x2, y2), (x3, y3) = (
    partition_data(tX, y, jet_num, col_idx) for jet_num in range(4)
)

### DER_deltaeta_jet_jet

<span style="color:green; font-weight:bold">Conclusion:</span>
The DER_deltaeta_jet_jet is the same for x0 and x1 and can be safely discarded.

In [None]:
# Histogram of DER_deltaeta_jet_jet in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('DER_deltaeta_jet_jet', col_idx)

### DER_mass_jet_jet

<span style="color:green; font-weight:bold">Conclusion:</span>
The DER_mass_jet_jet is the same for x0 and x1 and can be safely discarded.

In [None]:
# Histogram of DER_mass_jet_jet in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('DER_mass_jet_jet', col_idx)

### DER_prodeta_jet_jet

<span style="color:green; font-weight:bold">Conclusion:</span>
The DER_prodeta_jet_jet is the same for x0 and x1 and can be safely discarded.

In [None]:
# Histogram of DER_prodeta_jet_jet in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('DER_prodeta_jet_jet', col_idx)

### DER_pt_tot

<span style="color:green; font-weight:bold">Conclusion:</span>
Everything seems ok.

In [None]:
# The DER_pt_tot plots showed nothing unusual, so the call is left disabled;
# uncomment the next line to regenerate them.
# plot_partioned_column('DER_pt_tot', col_idx)

### DER_lep_eta_centrality

<span style="color:green; font-weight:bold">Conclusion:</span>
The DER_lep_eta_centrality is undefined in x0 and x1.

In [None]:
# Histogram of DER_lep_eta_centrality in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('DER_lep_eta_centrality', col_idx)

### PRI_jet_leading_pt

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_leading_pt is undefined in **just x0**.

In [None]:
# Histogram of PRI_jet_leading_pt in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_leading_pt', col_idx)

### PRI_jet_leading_eta

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_leading_eta is undefined in **just x0**.

In [None]:
# Histogram of PRI_jet_leading_eta in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_leading_eta', col_idx)

### PRI_jet_leading_phi

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_leading_phi is undefined in **just x0**.

In [None]:
# Histogram of PRI_jet_leading_phi in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_leading_phi', col_idx)

### PRI_jet_subleading_pt

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_subleading_pt is undefined in x0 and x1.

In [None]:
# Histogram of PRI_jet_subleading_pt in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_subleading_pt', col_idx)

### PRI_jet_subleading_eta

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_subleading_eta is undefined in x0 and x1.

In [None]:
# Histogram of PRI_jet_subleading_eta in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_subleading_eta', col_idx)

### PRI_jet_subleading_phi

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_subleading_phi is undefined in x0 and x1.

In [None]:
# Histogram of PRI_jet_subleading_phi in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_subleading_phi', col_idx)

### PRI_jet_all_pt

<span style="color:green; font-weight:bold">Conclusion:</span>
The PRI_jet_all_pt is undefined in **just x0**.

In [None]:
# Histogram of PRI_jet_all_pt in each of the four PRI_jet_num partitions (x0..x3).
plot_partioned_column('PRI_jet_all_pt', col_idx)