In [1]:
import numpy as np

from data_utils import normalize_data, \
                       flatten_data, \
                       split_fluxes, \
                       save_numpy_array, \
                       save_scaler
                      

# 0. Objective

With this notebook we will process and save the flux, amplitude and phase data, dividing them into train, validation and test datasets.
The dataset sizes are the following:
- Train: 70000
- Validation: 10000
- Test: 10000

# 1. Flux Data

In [2]:
from constants import ORIGINAL_FLUXES_FILE

In [3]:
# Number of samples assigned to each split (70000 + 10000 + 10000 = 90000 in total)
TRAIN_SIZE = 70_000
VALIDATION_SIZE = 10_000
TEST_SIZE = 10_000

## 1.1 Flux Data for Fully Connected Architectures
For the FC Architectures we need to:
 - Normalize
 - Flatten

In [4]:
# Load the raw flux array from the file referenced by ORIGINAL_FLUXES_FILE
fluxes_array = np.load(ORIGINAL_FLUXES_FILE)

Check the shape of the data; there should be 90000 data points.

In [5]:
# The first entry should be 90000 (TRAIN_SIZE + VALIDATION_SIZE + TEST_SIZE)
fluxes_array.shape

(90000, 55, 24)

Process data

In [6]:
# Normalize data; normalize_data returns the normalized array together with
# the scaler object, which is kept so it can be saved alongside the splits
fc_normalized_fluxes_array, fc_flux_scaler = normalize_data(fluxes_array)

In [7]:
# Flatten data for the fully connected models
# (the resulting splits have 1320 values per sample, i.e. 55 * 24)
fc_flattened_normalized_fluxes_array = flatten_data(fc_normalized_fluxes_array)

In [8]:
# Split the flattened, normalized fluxes into train / validation / test sets
# NOTE(review): whether split_fluxes shuffles before slicing is not visible here - confirm
(train_fc_fluxes,
 val_fc_fluxes,
 test_fc_fluxes) = split_fluxes(
    fc_flattened_normalized_fluxes_array,
    TRAIN_SIZE,
    VALIDATION_SIZE,
    TEST_SIZE,
)

In [9]:
# Training split: expect TRAIN_SIZE rows
train_fc_fluxes.shape

(70000, 1320)

In [10]:
# Validation split: expect VALIDATION_SIZE rows
val_fc_fluxes.shape

(10000, 1320)

In [11]:
# Test split: expect TEST_SIZE rows
test_fc_fluxes.shape

(10000, 1320)

Save data and scalers

In [12]:
# Output locations for the fully connected (FC) flux splits and their scaler
from constants import (
    TRAIN_FC_FLUXES_PATH,
    VALIDATION_FC_FLUXES_PATH,
    TEST_FC_FLUXES_PATH,
    FC_FLUX_SCALER_PATH,
)

# Save the train, validation and test fluxes (in that order)
for fc_split_array, fc_split_path in (
    (train_fc_fluxes, TRAIN_FC_FLUXES_PATH),
    (val_fc_fluxes, VALIDATION_FC_FLUXES_PATH),
    (test_fc_fluxes, TEST_FC_FLUXES_PATH),
):
    save_numpy_array(fc_split_array, fc_split_path)

# Save the scaler returned by normalize_data for the FC pipeline
save_scaler(fc_flux_scaler, FC_FLUX_SCALER_PATH)

## 1.2 Flux Data for CNN Architectures

For the CNN Architectures we need to:
- Normalize

In [13]:
# Reload the raw flux data so this section starts from the original file
# NOTE(review): the reload only matters if normalize_data modifies its input in place - confirm
fluxes_array = np.load(ORIGINAL_FLUXES_FILE)

Process data

In [14]:
# Normalize data; the CNN pipeline keeps its own scaler object and does not flatten
conv_normalized_fluxes_array, conv_flux_scaler = normalize_data(fluxes_array)

In [15]:
# Split the normalized (unflattened) fluxes into train / validation / test sets
(train_conv_fluxes,
 val_conv_fluxes,
 test_conv_fluxes) = split_fluxes(
    conv_normalized_fluxes_array,
    TRAIN_SIZE,
    VALIDATION_SIZE,
    TEST_SIZE,
)

In [9]:
train_fc_fluxes.shape

(70000, 1320)

In [10]:
val_fc_fluxes.shape

(10000, 1320)

In [11]:
test_fc_fluxes.shape

(10000, 1320)

Save data and scalers

In [16]:
# Output locations for the CNN flux splits and their scaler
from constants import (
    TRAIN_CNN_FLUXES_PATH,
    VALIDATION_CNN_FLUXES_PATH,
    TEST_CNN_FLUXES_PATH,
    CNN_FLUX_SCALER_PATH,
)

# Save the train, validation and test fluxes (in that order)
for conv_split_array, conv_split_path in (
    (train_conv_fluxes, TRAIN_CNN_FLUXES_PATH),
    (val_conv_fluxes, VALIDATION_CNN_FLUXES_PATH),
    (test_conv_fluxes, TEST_CNN_FLUXES_PATH),
):
    save_numpy_array(conv_split_array, conv_split_path)

# Save the CNN flux scaler (the original comment wrongly called it the fully connected scaler)
save_scaler(conv_flux_scaler, CNN_FLUX_SCALER_PATH)

## 1.3 Flux Data for Autoencoder Architecture

For the Autoencoder Architecture we need to:
- Add padding
- Normalize