# JLab ML Lunch 2 - Data Exploration

* Second ML challenge hosted
* On October 30th, a test dataset will be released, and predictions must be submitted within 24 hours
* Let's take a look at the training data!

In [1]:
%matplotlib widget

In [2]:
import pandas as pd
import missingno as mno
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d

## Training Data

* This shows the state vector ($x,y,z, p_x, p_y, p_z$) for 25 detector stations
* Jupyter-matplotlib widget used for handy visualizations (https://github.com/matplotlib/jupyter-matplotlib)

In [3]:
# Load the training set from a CSV in the working directory and preview the
# first rows to check the column layout.
X_train = pd.read_csv("MLchallenge2_training.csv")
X_train.head()

Unnamed: 0,x,y,z,px,py,pz,x1,y1,z1,px1,...,z23,px23,py23,pz23,x24,y24,z24,px24,py24,pz24
0,0.877079,1.32218,65,-0.24398,-0.053204,2.41426,-10.669,0.330138,176.944,-0.253523,...,341.28,-0.226149,0.106356,2.4076,-27.4327,5.06932,343.405,-0.226602,0.106052,2.40755
1,0.786361,-2.48294,65,0.103229,0.432216,2.59291,7.36649,15.502,176.944,0.205638,...,341.28,0.326228,0.295249,2.58186,25.2686,37.6072,343.405,0.325574,0.295009,2.58195
2,-13.1339,-26.5309,65,0.064432,-0.020771,0.952952,-7.58617,-30.6867,176.944,0.026643,...,341.28,-0.015729,-0.047137,0.948236,-8.45915,-39.7158,343.405,-0.01365,-0.047591,0.948226
3,18.4542,2.80469,65,-0.019384,0.069384,1.83259,18.0433,6.79747,176.944,0.013039,...,341.28,0.039513,0.051584,1.82412,20.7692,11.7062,343.405,0.039183,0.052164,1.8241
4,15.5521,-19.196,65,-0.009768,-0.010642,2.36608,15.0681,-19.7502,176.944,-0.014308,...,341.28,-0.013718,-0.00229,2.35087,13.6575,-20.6234,343.405,-0.014211,-0.002519,2.35085


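* Detectors are (mostly) equidistant in z, 2.125 apart (e.g. `z23` = 341.28 → `z24` = 343.405) — but not all of them: the preview above shows `z` = 65 followed by `z1` = 176.944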
* This should make a time-series approach a bit easier, if we go that route

In [18]:
def plot_one_track_position(df, track_id):
    """Plot the 3D position trajectory of one track.

    The selected row is read positionally in groups of six values, with
    x, y and z taken from offsets 0, 1 and 2 of each group. Only groups
    1 through 24 are drawn; the first group (offsets 0-5) is skipped.
    NOTE(review): this assumes the frame holds nothing but the
    (x, y, z, px, py, pz) station columns, in that order -- confirm if the
    CSV layout ever changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are individual tracks.
    track_id : index label
        Row label of the track, looked up with ``df.loc``.
    """
    track = df.loc[track_id].values

    stations = range(1, 25)
    stride = 6  # values stored per station
    x = [track[stride * i] for i in stations]
    y = [track[1 + stride * i] for i in stations]
    z = [track[2 + stride * i] for i in stations]

    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib
    # releases, so the 3D axes are created explicitly instead.
    ax = fig.add_subplot(111, projection='3d')
    # z is drawn along the horizontal axis so the track reads left to right.
    ax.plot(z, x, y)
    ax.set_title("Track {}".format(track_id))
    ax.set_xlabel("z", fontweight="bold")
    ax.set_ylabel("x", fontweight="bold")
    ax.set_zlabel("y", fontweight="bold")

    plt.show()

In [32]:
def plot_one_track_momentum(df, track_id):
    """Plot the transverse momentum (px, py) of one track against z.

    The selected row is read positionally in groups of six values, with
    z, px and py taken from offsets 2, 3 and 4 of each group. Only groups
    1 through 24 are drawn; the first group (offsets 0-5) is skipped.
    NOTE(review): this assumes the frame holds nothing but the
    (x, y, z, px, py, pz) station columns, in that order -- confirm if the
    CSV layout ever changes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are individual tracks.
    track_id : index label
        Row label of the track, looked up with ``df.loc``.
    """
    track = df.loc[track_id].values

    stations = range(1, 25)
    stride = 6  # values stored per station
    z = [track[2 + stride * i] for i in stations]
    px = [track[3 + stride * i] for i in stations]
    py = [track[4 + stride * i] for i in stations]
    # pz (offset 5) is not plotted, so it is not extracted here.

    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib
    # releases, so the 3D axes are created explicitly instead.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot(z, px, py)
    ax.set_title("Track {}".format(track_id))
    ax.set_xlabel("z", fontweight="bold")
    ax.set_ylabel(r"p$_x$", fontweight="bold")
    ax.set_zlabel(r"p$_y$", fontweight="bold")

    plt.show()

In [21]:
# Draw the position trajectory of the training row labelled 2.
plot_one_track_position(X_train, 2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [33]:
# Draw the (px, py) vs. z curve of the same training row for comparison.
plot_one_track_momentum(X_train, 2)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Now read in the example test data

In [34]:
# First (naive) read of the example test file: the training column names are
# passed explicitly via `names=`, so every row is labelled with the training
# layout and missing trailing values show up as NaN in the preview.
X_test = pd.read_csv("test_in.csv", names=X_train.columns)
X_test.head()

Unnamed: 0,x,y,z,px,py,pz,x1,y1,z1,px1,...,z23,px23,py23,pz23,x24,y24,z24,px24,py24,pz24
0,0.877,1.322,65.0,-0.244,-0.053,2.414,-10.669,0.33,176.944,-0.254,...,,,,,,,,,,
1,0.786,-2.483,65.0,0.103,0.432,2.593,7.366,15.502,176.944,0.206,...,,,,,,,,,,
2,-13.134,-26.531,65.0,0.064,-0.021,0.953,-7.586,-30.687,176.944,0.027,...,,,,,,,,,,
3,18.454,2.805,65.0,-0.019,0.069,1.833,18.043,6.797,176.944,0.013,...,,,,,,,,,,
4,15.552,-19.196,65.0,-0.01,-0.011,2.366,15.068,-19.75,176.944,-0.014,...,341.28,-0.014,-0.002,2.351,343.405,,,,,


In [37]:
# Visualise which cells are missing in the first 100 test rows
# (missingno draws one line per row, so differing row lengths stand out).
mno.matrix(X_test.head(100))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x1a295a40f0>

## One caveat on the test data

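* The last value of each row is actually the z-value of the next step to be predicted, not an x-position (which is the column it lands in when parsed naively)
* Rows have different lengths, so this trailing value does not end up in the same column for every row
* Fix: insert two extra commas before the last number of each row, which shifts it from the x column into the z column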

In [84]:
import re
from io import StringIO

In [93]:
# Read the whole test file as raw text so its rows can be patched with a
# regular expression before pandas parses them.
with open('test_in.csv', 'r') as f:
    data_str = f.read()

In [94]:
# Insert two empty fields before the last number of every row, moving it two
# columns to the right. The pattern is anchored with `$` under re.MULTILINE
# (instead of requiring a literal "\n") so the final row is also patched when
# the file does not end with a newline.
data_str_io = StringIO(
    re.sub(
        r"([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$",
        r",,\1",
        data_str,
        flags=re.MULTILINE,
    )
)

In [95]:
# Parse the patched text, again labelling columns with the training names.
X_test = pd.read_csv(data_str_io, names=X_train.columns)

In [96]:
# Preview the re-parsed frame to check where each row's trailing value landed.
X_test.head()

Unnamed: 0,x,y,z,px,py,pz,x1,y1,z1,px1,...,z23,px23,py23,pz23,x24,y24,z24,px24,py24,pz24
0,0.877,1.322,65.0,-0.244,-0.053,2.414,-10.669,0.33,176.944,-0.254,...,,,,,,,,,,
1,0.786,-2.483,65.0,0.103,0.432,2.593,7.366,15.502,176.944,0.206,...,,,,,,,,,,
2,-13.134,-26.531,65.0,0.064,-0.021,0.953,-7.586,-30.687,176.944,0.027,...,,,,,,,,,,
3,18.454,2.805,65.0,-0.019,0.069,1.833,18.043,6.797,176.944,0.013,...,,,,,,,,,,
4,15.552,-19.196,65.0,-0.01,-0.011,2.366,15.068,-19.75,176.944,-0.014,...,341.28,-0.014,-0.002,2.351,,,343.405,,,


## This should be saved for later usage

In [97]:
import re
from io import StringIO

def load_test_data(filename, columns=None):
    """Load a test CSV, moving each row's trailing value two columns right.

    In the test file the last number of every row has to be shifted two
    columns to the right before parsing. This is done by inserting two empty
    fields in front of that number; the patched text is then parsed by
    pandas.

    Parameters
    ----------
    filename : str or path-like
        Path of the header-less test CSV.
    columns : sequence of str, optional
        Column names to assign. Defaults to ``X_train.columns`` from the
        enclosing namespace, which matches the original behaviour; pass it
        explicitly when using this function outside the notebook.

    Returns
    -------
    pandas.DataFrame
        The parsed test data; fields absent from a row are NaN.
    """
    if columns is None:
        # Backward-compatible default: fall back to the notebook-level
        # training frame (raises NameError if it has not been loaded).
        columns = X_train.columns

    with open(filename, 'r') as f:
        data_str = f.read()

    # Anchor on end-of-line with `$` + re.MULTILINE rather than a literal
    # "\n", so the last row is also patched when the file lacks a trailing
    # newline.
    patched_str = re.sub(
        r"([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?)$",
        r",,\1",
        data_str,
        flags=re.MULTILINE,
    )
    X_test = pd.read_csv(StringIO(patched_str), names=columns)

    return X_test