In [1]:
import numpy as np

from data_utils import normalize_data, \
                       flatten_data, \
                       add_row_padding, \
                       split_fluxes, \
                       fuse_amplitude_and_phase, \
                       save_numpy_array, \
                       save_scaler
                      

# 0. Objective

With this notebook we will process and save the flux, amplitude and phase data, dividing them into train, validation and test datasets.
The dataset sizes are the following:
- Train: 70000
- Validation: 10000
- Test: 10000

# 1. Flux Data

In [2]:
from constants import ORIGINAL_FLUXES_FILE

In [3]:
# Number of samples assigned to the train, validation and test splits
# (90,000 samples in total).
TRAIN_SIZE, VALIDATION_SIZE, TEST_SIZE = 70_000, 10_000, 10_000

## 1.1 Flux Data for Fully Connected Architectures
For the FC Architectures we need to:
 - Normalize
 - Flatten

In [4]:
# Load flux data
# Reads the raw flux array from ORIGINAL_FLUXES_FILE; at this point nothing
# has been normalized, flattened or split yet.
fluxes_array = np.load(ORIGINAL_FLUXES_FILE)

Check the shape of the data; there should be 90000 data points.

In [5]:
# The split sizes used below must account for every sample in the raw array,
# so fail early if the file does not hold the expected number of data points.
assert fluxes_array.shape[0] == TRAIN_SIZE + VALIDATION_SIZE + TEST_SIZE, (
    f"Expected {TRAIN_SIZE + VALIDATION_SIZE + TEST_SIZE} samples, "
    f"found {fluxes_array.shape[0]}"
)
fluxes_array.shape

(90000, 55, 24)

Process data

In [6]:
# Normalize data
# normalize_data returns the normalized array together with the scaler that
# is saved later alongside the data.
# NOTE(review): normalization happens before the train/validation/test split,
# so normalize_data sees every sample. If it fits its scaler on the input,
# validation and test data influence the scaler — confirm this is intended.
fc_normalized_fluxes_array, fc_flux_scaler = normalize_data(fluxes_array)

In [7]:
# Flatten data
# Fully connected models take one vector per sample: the recorded shapes go
# from 55 x 24 values per sample before this step to 1320 (= 55 * 24) after it.
fc_flattened_normalized_fluxes_array = flatten_data(fc_normalized_fluxes_array)

In [8]:
# Divide the flattened, normalized fluxes into the three datasets,
# using the sizes configured at the top of the notebook.
train_fc_fluxes, val_fc_fluxes, test_fc_fluxes = split_fluxes(
    fc_flattened_normalized_fluxes_array,
    TRAIN_SIZE,
    VALIDATION_SIZE,
    TEST_SIZE,
)

In [9]:
# The training split must contain exactly TRAIN_SIZE samples.
assert train_fc_fluxes.shape[0] == TRAIN_SIZE
train_fc_fluxes.shape

(70000, 1320)

In [10]:
# The validation split must contain exactly VALIDATION_SIZE samples.
assert val_fc_fluxes.shape[0] == VALIDATION_SIZE
val_fc_fluxes.shape

(10000, 1320)

In [11]:
# The test split must contain exactly TEST_SIZE samples.
assert test_fc_fluxes.shape[0] == TEST_SIZE
test_fc_fluxes.shape

(10000, 1320)

Save data and scalers

In [12]:
# Load data paths
from constants import TRAIN_FC_FLUXES_PATH, \
                      VALIDATION_FC_FLUXES_PATH, \
                      TEST_FC_FLUXES_PATH, \
                      FC_FLUX_SCALER_PATH

# Write the train, validation and test fluxes, each to its own file
for split_array, split_path in (
    (train_fc_fluxes, TRAIN_FC_FLUXES_PATH),
    (val_fc_fluxes, VALIDATION_FC_FLUXES_PATH),
    (test_fc_fluxes, TEST_FC_FLUXES_PATH),
):
    save_numpy_array(split_array, split_path)

# Write the scaler returned when the fully connected fluxes were normalized
save_scaler(fc_flux_scaler, FC_FLUX_SCALER_PATH)

## 1.2 Flux data for CNN Architectures

For the CNN Architectures we need to:
- Normalize

In [13]:
# Load flux data
# The raw fluxes are read from disk again, so this section starts from
# unprocessed data.
# NOTE(review): presumably the reload keeps this section independent of the
# processing done in 1.1 — confirm whether normalize_data modifies its input.
fluxes_array = np.load(ORIGINAL_FLUXES_FILE)

Process data

In [14]:
# Normalize data
# The array keeps its 2D-per-sample layout here (no flattening), and the
# scaler returned for it is saved separately from the fully connected one.
# NOTE(review): normalization runs on all samples before the split — if
# normalize_data fits its scaler on the input, validation/test data influence
# the scaler; confirm this is intended.
conv_normalized_fluxes_array, conv_flux_scaler = normalize_data(fluxes_array)

In [15]:
# Divide the normalized (unflattened) fluxes into the three datasets,
# using the sizes configured at the top of the notebook.
train_conv_fluxes, val_conv_fluxes, test_conv_fluxes = split_fluxes(
    conv_normalized_fluxes_array,
    TRAIN_SIZE,
    VALIDATION_SIZE,
    TEST_SIZE,
)

In [16]:
# The training split must contain exactly TRAIN_SIZE samples.
assert train_conv_fluxes.shape[0] == TRAIN_SIZE
train_conv_fluxes.shape

(70000, 55, 24)

In [17]:
# The validation split must contain exactly VALIDATION_SIZE samples.
assert val_conv_fluxes.shape[0] == VALIDATION_SIZE
val_conv_fluxes.shape

(10000, 55, 24)

In [18]:
# The test split must contain exactly TEST_SIZE samples.
assert test_conv_fluxes.shape[0] == TEST_SIZE
test_conv_fluxes.shape

(10000, 55, 24)

Save data and scalers

In [19]:
# Load data paths
from constants import TRAIN_CNN_FLUXES_PATH, \
                      VALIDATION_CNN_FLUXES_PATH, \
                      TEST_CNN_FLUXES_PATH, \
                      CNN_FLUX_SCALER_PATH

# Write the train, validation and test fluxes, each to its own file
for split_array, split_path in (
    (train_conv_fluxes, TRAIN_CNN_FLUXES_PATH),
    (val_conv_fluxes, VALIDATION_CNN_FLUXES_PATH),
    (test_conv_fluxes, TEST_CNN_FLUXES_PATH),
):
    save_numpy_array(split_array, split_path)

# Write the scaler returned when the CNN fluxes were normalized
save_scaler(conv_flux_scaler, CNN_FLUX_SCALER_PATH)

## 1.3 Flux data for Autoencoder Architecture

For the Autoencoder Architecture we need to:
- Add padding
- Normalize

In [20]:
# Load flux data
# The raw fluxes are read from disk again, so padding and normalization in
# this section start from unprocessed data.
# NOTE(review): presumably the reload keeps this section independent of the
# earlier ones — confirm whether normalize_data modifies its input.
fluxes_array = np.load(ORIGINAL_FLUXES_FILE)

Process data

In [21]:
# Add padding to data
# One row is added at the top of every sample and none at the bottom; the
# recorded shapes go from 55 rows per sample to 56.
# NOTE(review): the fill value is chosen inside add_row_padding — confirm it
# is appropriate, because the padded array is what gets normalized next.
padded_fluxes = add_row_padding(fluxes_array, top_rows=1, bottom_rows=0)

In [22]:
# Normalize data
# Normalization is applied to the padded array, so the padding rows are part
# of the input that normalize_data and its returned scaler work with.
# NOTE(review): normalization runs on all samples before the split — if
# normalize_data fits its scaler on the input, validation/test data influence
# the scaler; confirm this is intended.
normalized_padded_fluxes_array, autoencoder_flux_scaler = normalize_data(padded_fluxes)

In [23]:
# Divide the padded, normalized fluxes into the three datasets,
# using the sizes configured at the top of the notebook.
train_autoencoder_fluxes, val_autoencoder_fluxes, test_autoencoder_fluxes = split_fluxes(
    normalized_padded_fluxes_array,
    TRAIN_SIZE,
    VALIDATION_SIZE,
    TEST_SIZE,
)

In [24]:
# The training split must contain exactly TRAIN_SIZE samples.
assert train_autoencoder_fluxes.shape[0] == TRAIN_SIZE
train_autoencoder_fluxes.shape

(70000, 56, 24)

In [25]:
# The validation split must contain exactly VALIDATION_SIZE samples.
assert val_autoencoder_fluxes.shape[0] == VALIDATION_SIZE
val_autoencoder_fluxes.shape

(10000, 56, 24)

In [26]:
# The test split must contain exactly TEST_SIZE samples.
assert test_autoencoder_fluxes.shape[0] == TEST_SIZE
test_autoencoder_fluxes.shape

(10000, 56, 24)

Save data and scalers

In [27]:
# Load data paths
from constants import TRAIN_AUTOENCODER_FLUXES_PATH, \
                      VALIDATION_AUTOENCODER_FLUXES_PATH, \
                      TEST_AUTOENCODER_FLUXES_PATH, \
                      AUTOENCODER_FLUX_SCALER_PATH

# Write the train, validation and test fluxes, each to its own file
for split_array, split_path in (
    (train_autoencoder_fluxes, TRAIN_AUTOENCODER_FLUXES_PATH),
    (val_autoencoder_fluxes, VALIDATION_AUTOENCODER_FLUXES_PATH),
    (test_autoencoder_fluxes, TEST_AUTOENCODER_FLUXES_PATH),
):
    save_numpy_array(split_array, split_path)

# Write the scaler returned when the autoencoder fluxes were normalized
save_scaler(autoencoder_flux_scaler, AUTOENCODER_FLUX_SCALER_PATH)

# 2. Amplitude and Phase Data

## 2.1 Amplitude and Phase for Fully Connected Architecture

For the FC architectures we need to:
- Normalize

In [28]:
from constants import ORIGINAL_SLM_FOLDER, \
                      ORIGINAL_AMPLITUDE_FILENAME, \
                      ORIGINAL_PHASE_FILENAME, \
                      TRAIN_AMP_PHASE_FILE_SUFFIXES, \
                      VAL_AMP_PHASE_FILE_SUFFIX, \
                      TEST_AMP_PHASE_FILE_SUFFIX, \
                      NUMPY_SUFFIX

In [29]:
# Load data
# Files are visited in train -> validation -> test order; the concatenated
# arrays keep that order along their first axis.
file_suffixes = TRAIN_AMP_PHASE_FILE_SUFFIXES + [VAL_AMP_PHASE_FILE_SUFFIX] + [TEST_AMP_PHASE_FILE_SUFFIX]

# Each file is cast to float32 as it is read, then all files are joined
# along the first axis.
amplitudes = np.concatenate(
    [
        np.float32(np.load(f"{ORIGINAL_SLM_FOLDER}{file_number}/{ORIGINAL_AMPLITUDE_FILENAME}"))
        for file_number in file_suffixes
    ],
    axis=0,
)
phases = np.concatenate(
    [
        np.float32(np.load(f"{ORIGINAL_SLM_FOLDER}{file_number}/{ORIGINAL_PHASE_FILENAME}"))
        for file_number in file_suffixes
    ],
    axis=0,
)


Now process the data

In [30]:
# Normalize data
# Amplitudes and phases are normalized in two independent calls, so each one
# gets its own scaler object (both are saved at the end of this section).
# NOTE(review): both calls see every sample (train, validation and test). If
# normalize_data fits its scaler on the input, validation/test data influence
# the scalers — confirm this is intended.
normalized_amplitudes, fc_amplitude_scaler = normalize_data(amplitudes)
normalized_phases, fc_phase_scaler = normalize_data(phases)

In [31]:
# Only the last expression of a cell is displayed, so a bare
# `normalized_amplitudes.shape` on its own line would be silently discarded.
# Compare the two shapes explicitly, then display the common shape.
assert normalized_amplitudes.shape == normalized_phases.shape, (
    f"Amplitude shape {normalized_amplitudes.shape} does not match "
    f"phase shape {normalized_phases.shape}"
)
normalized_phases.shape

(90000, 96, 96)

In [32]:
# Stack amplitude and phase
# Combines the two normalized arrays into a single array; the recorded output
# shows a new axis of size 2 right after the sample axis.
# NOTE(review): presumably index 0 on that axis is amplitude and index 1 is
# phase (argument order) — confirm in fuse_amplitude_and_phase.
amp_phase_array = fuse_amplitude_and_phase(normalized_amplitudes, normalized_phases)
amp_phase_array.shape

(90000, 2, 96, 96)

Save data

In [33]:
from constants import TRAIN_FC_AMP_PHASE_PATH, \
                      VALIDATION_FC_AMP_PHASE_PATH, \
                      TEST_FC_AMP_PHASE_PATH, \
                      FC_AMP_SCALER_PATH, \
                      FC_PHASE_SCALER_PATH

In [34]:
# The slicing below assumes every original amplitude/phase file contributed
# this many consecutive samples to amp_phase_array.
SAMPLES_PER_FILE = 10000

# Slicing past the end of an array silently yields a short (or empty) result,
# so verify up front that there is one full chunk for each train file plus
# one for validation and one for test.
expected_samples = (len(TRAIN_AMP_PHASE_FILE_SUFFIXES) + 2) * SAMPLES_PER_FILE
assert amp_phase_array.shape[0] == expected_samples, (
    f"Expected {expected_samples} samples, found {amp_phase_array.shape[0]}"
)

# [start, end) is the window of samples written by the next save call
start = 0
end = SAMPLES_PER_FILE

# Save train data: one output file per original train file
for train_file in TRAIN_AMP_PHASE_FILE_SUFFIXES:
    filename = f"{TRAIN_FC_AMP_PHASE_PATH}{train_file}{NUMPY_SUFFIX}"
    save_numpy_array(amp_phase_array[start:end], filename)
    start += SAMPLES_PER_FILE
    end += SAMPLES_PER_FILE

# Save validation data (the chunk right after the last train chunk)
save_numpy_array(amp_phase_array[start:end], VALIDATION_FC_AMP_PHASE_PATH)
start += SAMPLES_PER_FILE
end += SAMPLES_PER_FILE

# Save test data (the final chunk)
save_numpy_array(amp_phase_array[start:end], TEST_FC_AMP_PHASE_PATH)

# Save scalers
save_scaler(fc_amplitude_scaler, FC_AMP_SCALER_PATH)
save_scaler(fc_phase_scaler, FC_PHASE_SCALER_PATH)