# Data exploration

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import h5py
import re
import pandas as pd

### Load code for this project

In [None]:
import data_processing.data as dp
%load_ext autoreload
%autoreload 1
%aimport data_processing.data

In [None]:
%matplotlib inline

### Investigate structure of data

In [None]:
# Open the simulated-event file read-only. The handle `hf` is deliberately left
# open because the cells below keep reading from it.
filename = "data_processing/data/Mg22_alphaalpha_digiSim.h5"
hf = h5py.File(filename, mode="r")

# Look up the "/get" entry; Group.get yields None when no such entry exists.
hf.get("/get")

In [None]:
# Read one event's dataset in full and show it as a table.
event_i = 1
event_key = "Event_[{}]".format(event_i)
event = hf[event_key][:]
display(pd.DataFrame(event))

### Length of events

In [None]:
# Number of entries stored under each top-level key of the file, as an array.
length = np.asarray([len(hf[key]) for key in hf.keys()])

In [None]:
# Histogram of the per-key lengths computed above.
fig, ax = plt.subplots()
ax.hist(length, bins=100)
ax.set_xlabel("Length (items)")
plt.show()

## Visualisation

In [None]:
%matplotlib inline

event_i = 1

# Fetch the event once and reuse it for both panels instead of re-reading it
# for every plotted column (assumes dp.get_event_by_index is a side-effect-free
# read of `hf` -- TODO confirm against data_processing.data).
event_hits = dp.get_event_by_index(hf, event_i)

fig = plt.figure(figsize=(12,6))

# Left panel: column "A" mapped linearly onto the grey scale.
ax = plt.subplot(131)
sc = plt.scatter(event_hits["x"], event_hits["y"], c=event_hits["A"], cmap='Greys')
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("XY projection, linear scale")

# Middle panel: same x/y points, coloured by log(A).
ax = plt.subplot(132)
sc = plt.scatter(event_hits["x"], event_hits["y"], c=np.log(event_hits["A"]), cmap='Greys')
ax.set_xlabel("x")
# This panel plots y on the vertical axis (it was previously mislabelled "z").
ax.set_ylabel("y")
ax.set_title("XY projection, log scale")

#cbar = fig.colorbar(sc, orientation='vertical', label='A')    

plt.show()

In [None]:
%matplotlib inline

event_i = 5

# Fetch the event and compute the colour values once; all three panels share
# them (assumes dp.get_event_by_index is a side-effect-free read of `hf` --
# TODO confirm against data_processing.data).
event_hits = dp.get_event_by_index(hf, event_i)
log_A = np.log(event_hits["A"])

fig = plt.figure(figsize=(12,6))

# One panel per coordinate pair: (horizontal column, vertical column).
projections = [("x", "y"), ("x", "z"), ("y", "z")]
for panel, (horiz, vert) in enumerate(projections, start=1):
    ax = plt.subplot(1, 3, panel)
    sc = plt.scatter(event_hits[horiz], event_hits[vert], c=log_A, cmap='Greys')
    ax.set_xlabel(horiz)
    ax.set_ylabel(vert)
    ax.set_title("{}{} projection".format(horiz.upper(), vert.upper()))

# The colour encodes log(A), not A itself, so the colorbar is labelled accordingly.
cbar = fig.colorbar(sc, orientation='vertical', label='log(A)')

plt.show()

In [None]:
# Grid of XY projections for the first n_rows*n_cols events, coloured by log(A).
n_rows, n_cols = 4, 4

fig = plt.figure(figsize=(12,12))

for i in range(n_rows*n_cols):
    ax = plt.subplot(n_rows, n_cols, i+1)
    # Read each event once per panel rather than once per plotted column
    # (assumes dp.get_event_by_index is a side-effect-free read -- TODO confirm).
    event_hits = dp.get_event_by_index(hf, i)
    sc = plt.scatter(event_hits["x"], event_hits["y"], c=np.log(event_hits["A"]), cmap='Greys')
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("Event: {}".format(i))
    # Optional: uncomment to force identical limits / aspect on every panel.
    #plt.xlim(-275.0, 275.0)
    #plt.ylim((-275.0, 275.0))
    #ax.set_aspect('equal', adjustable='box')
plt.tight_layout()
plt.show()

## Read and label data into a dictionary

* Even events: beam -> label = 0
* Odd events: beam -> label = 1

In [None]:
# Run the project's read-and-label helper on the data directory and keep the
# result as `dict_data`.
# NOTE(review): presumably a dict of labelled events, going by the section
# heading -- confirm against data_processing.data.
dict_data = dp.read_and_label_data("data_processing/data/")

## Investigate distributions of parameters

In [None]:
# Build a per-event summary table: one row per key in `hf`, holding the event
# id, its label, the number of rows and the mean / standard deviation of the
# x, y, z and A columns.
cols = ["Event", "Label", "length", 
        "x_mean", "y_mean", "z_mean", "A_mean",
        "x_std", "y_std", "z_std", "A_std",
       ]

# Collect plain dict records and build the frame once at the end.
# DataFrame.append copied the whole frame on every iteration and was removed
# in pandas 2.0.
records = []
for key in hf.keys():
    d = pd.DataFrame(hf[key][:])
    # Column-wise statistics; np.std forwards ddof=0 (population std-dev).
    means = np.mean(d, axis=0)
    std_devs = np.std(d, axis=0)

    records.append({"Event": dp.get_event_from_key(key), "Label": dp.get_label_name(key), "length": d.shape[0], 
                    "x_mean": means["x"], "y_mean": means["y"], "z_mean": means["z"], "A_mean": means["A"],
                    "x_std": std_devs["x"], "y_std": std_devs["y"], "z_std": std_devs["z"], "A_std": std_devs["A"],
                   })

# `columns=cols` fixes the column order and keeps the columns present even
# when the file holds no events.
df = pd.DataFrame(records, columns=cols)
df = df.sort_values("Event")
display(df.head())

In [None]:
import seaborn as sns

# Global plot styling for the pair plots below; fonts are doubled in size.
sns.set(
    context='notebook',
    style='darkgrid',
    palette='deep',
    font_scale=2.,
    color_codes=True,
    rc=None,
)

In [None]:
# Pairwise scatter plots of the per-event means, coloured by label,
# with histograms on the diagonal.
mean_columns = ["x_mean", "y_mean", "z_mean", "A_mean"]
sns.pairplot(df, vars=mean_columns, hue="Label", kind='scatter', diag_kind='hist')
plt.show()

In [None]:
# Pairwise scatter plots of the per-event standard deviations, coloured by
# label, with histograms on the diagonal.
std_columns = ["x_std", "y_std", "z_std", "A_std"]
sns.pairplot(df, vars=std_columns, hue="Label", kind='scatter', diag_kind='hist')
plt.show()