# Data Mining and Visualization 

In [None]:
# Import
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Global Variables
# Dataset locations; the relative path is resolved against the current
# working directory at the moment this cell runs.
DATA_DIR = os.path.abspath('../Data')
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
# Full path of every entry found directly inside the training directory.
TRAIN_FILES_LIST = list(
    map(lambda entry: os.path.join(TRAIN_DIR, entry), os.listdir(TRAIN_DIR))
)

# How many timestamps (from the start of the sorted list) are loaded.
NUM_OF_TIMESTAMPS = 50
# Values substituted for the -2e6 and -1e6 markers found in the raw data.
REP_2e6, REP_1e6 = 0, 0

In [None]:
# Preparing Data
# Map every top-level HDF5 key (used below as the timestamp identifier) to
# the file that contains it. If a key occurs in several files, the file that
# comes last in TRAIN_FILES_LIST wins.
timestamp_to_file = {}
for filename in TRAIN_FILES_LIST:
    # Open read-only: the files are only inspected here, never modified.
    with h5py.File(filename, 'r') as d:
        # Update in place rather than rebuilding the whole dict per file.
        timestamp_to_file.update((key, filename) for key in d.keys())

# np.unique already returns its result sorted, so no separate sorted() call
# is needed. The result stays a NumPy array because `timestamps.shape` is
# printed later on.
timestamps = np.unique(list(timestamp_to_file))

# Build one table with a row per pixel and a column per channel:
# intensity, events, then the slices of radial_velocity and reflectivity.
flat_size = 63504  # number of values in one flattened field
n_slices = 10      # slices stored per radial_velocity / reflectivity dataset
n_rows = 2 + 2 * n_slices

selected_timestamps = timestamps[:NUM_OF_TIMESTAMPS]

# Allocate the whole float64 matrix once and fill it by column slice.
# Growing it with np.append inside the loop would copy everything gathered
# so far on every iteration and needs a dummy seed column removed afterwards.
data_values = np.empty((n_rows, len(selected_timestamps) * flat_size))
for position, timestamp in enumerate(selected_timestamps):
    cols = slice(position * flat_size, (position + 1) * flat_size)
    # Open read-only: the data is only read here.
    with h5py.File(timestamp_to_file[timestamp], 'r') as d:
        group = d[timestamp]
        data_values[0, cols] = group["intensity"][:].reshape(flat_size)
        data_values[1, cols] = group["events"][:].reshape(flat_size)
        data_values[2:2 + n_slices, cols] = (
            group["radial_velocity"][:].reshape((n_slices, flat_size))
        )
        data_values[2 + n_slices:, cols] = (
            group["reflectivity"][:].reshape((n_slices, flat_size))
        )

# Replace the -2e6 / -1e6 markers with the configured substitutes
# (presumably "missing"/"invalid" sentinels -- confirm against the dataset docs).
data_values[data_values == -2e6] = REP_2e6
data_values[data_values == -1e6] = REP_1e6

# Transpose so that each row is one pixel and each column one channel.
data_values = pd.DataFrame(
    data_values.T,
    columns=[
        "intensity",
        "events",
        *[f'radial_velocity_{i}' for i in range(n_slices)],
        *[f'reflectivity_{i}' for i in range(n_slices)],
    ],
)

# Keep the first timestamp's fields in their stored array layout so they can
# be drawn as heatmaps later.
data_sample = {
    "intensity": None,
    "events": None,
    "reflectivity": None,
    "radial_velocity": None
}
first_timestamp = timestamps[0]
# Open read-only: the data is only read here.
with h5py.File(timestamp_to_file[first_timestamp], 'r') as d:
    group = d[first_timestamp]
    data_sample["intensity"] = group["intensity"][:]
    data_sample["events"] = group["events"][:]
    # Read each stacked dataset from disk once, then split off its first
    # 10 slices in memory (instead of re-reading the dataset per slice).
    radial_velocity_stack = group["radial_velocity"][:]
    reflectivity_stack = group["reflectivity"][:]
    data_sample["radial_velocity"] = [radial_velocity_stack[i] for i in range(10)]
    data_sample["reflectivity"] = [reflectivity_stack[i] for i in range(10)]

In [None]:
# Basic Information
print(timestamps.shape, '- timestamps shape')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only add a stray "None" line to the output.
data_values.info()
# Summary statistics for every column.
print(data_values.describe())

In [None]:
# Visualization
# Samples
# First figure: one row of three heatmaps (intensity, events, and the
# column-to-column correlation matrix of the loaded table).
fig_samples_single_row, ax_samples_single_row = plt.subplots(nrows=1, ncols=3)
fig_samples_single_row.set_figwidth(20)

single_row_panels = (
    ("intensity", data_sample["intensity"]),
    ("events", data_sample["events"]),
    ("correlation", data_values.corr()),
)
for panel_ax, (panel_title, panel_data) in zip(ax_samples_single_row, single_row_panels):
    panel_ax.set_title(panel_title)
    sns.heatmap(panel_data, ax=panel_ax)

# Second figure: a 2 x 10 grid, top row radial_velocity slices and bottom
# row reflectivity slices of the sample.
fig_samples_mult_row, ax_samples_mult_row = plt.subplots(nrows=2, ncols=10)
fig_samples_mult_row.set_figwidth(20)
fig_samples_mult_row.set_figheight(3)
fig_samples_mult_row.suptitle("radial_velocity and reflectivity")

for grid_row, field_name in enumerate(("radial_velocity", "reflectivity")):
    for i in range(10):
        g = sns.heatmap(data_sample[field_name][i], ax=ax_samples_mult_row[grid_row][i])
        # Tick labels are blanked to keep the small panels readable.
        g.set(xticklabels=[], yticklabels=[])

# histograms
# One row of four histograms, one per field. The count axis of each panel is
# clipped to a fixed cap (presumably so a dominant bin does not dwarf the
# rest -- confirm the intended caps).
#
# NOTE: set_ylim(value) with a single positional argument sets the *bottom*
# limit, which left the axis running from the cap down to the default top of
# 1 (inverted). The limits are therefore given explicitly as bottom=0 and
# top=cap, and applied after plotting so the final view is exactly 0..cap.
fig_hist, ax_hist = plt.subplots(nrows=1, ncols=4)
fig_hist.set_figwidth(20)

# intensity
ax_hist[0].set_title("intensity")
sns.histplot(data_values["intensity"], ax=ax_hist[0])
ax_hist[0].set_ylim(bottom=0, top=5e4)

# events
ax_hist[1].set_title("events")
sns.histplot(data_values["events"], ax=ax_hist[1], binwidth=1)
ax_hist[1].set_ylim(bottom=0, top=4e5)

# radial_velocity (all 10 slice columns pooled into a single sample)
ax_hist[2].set_title("radial_velocity")
sns.histplot(np.concatenate([data_values[f'radial_velocity_{i}'] for i in range(10)], axis=0), ax=ax_hist[2])
ax_hist[2].set_ylim(bottom=0, top=4e5)

# reflectivity (all 10 slice columns pooled into a single sample)
ax_hist[3].set_title("reflectivity")
sns.histplot(np.concatenate([data_values[f'reflectivity_{i}'] for i in range(10)], axis=0), ax=ax_hist[3])
ax_hist[3].set_ylim(bottom=0, top=4e5)