In [1]:
# Progress marker: printed before the imports below are executed.
print("Importing...")

# Standard library: filesystem helpers used throughout the notebook.
import sys
import os
from os import mkdir
from os.path import join
from os.path import isdir
from os.path import isfile
from os.path import abspath
from shutil import rmtree

# Third-party numerical / tabular libraries.
import numpy as np
import pandas as pd

# Put "../scs" at the front of the module search path before importing the
# modules below. The path is relative, so it is resolved against the kernel's
# current working directory.
# NOTE(review): presumably these are the project's own modules located in
# "../scs" -- confirm.
sys.path.insert(0, "../scs")
import data_degrading as dd
import data_preparation as dp
import data_augmentation as da
from prepare_datasets_for_training import extract
from learn import compile_model, get_callbacks, train
from lr_schedules import get_lr_schedule
import data_plotting as dplt

# Same approach for "../scs/models".
# NOTE(review): because this directory is searched first, a module named
# `dash` found there takes precedence over any installed package of the same
# name -- confirm which one is intended.
sys.path.insert(0, "../scs/models")
import feed_forward
import transformer_encoder
import dash

from importlib import reload

Importing...


2023-07-28 16:43:17.808583: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Value the dataset is degraded to; it is passed to `dd.degrade_dataframe` and
# is also used as the name of the per-R output subdirectory.
# NOTE(review): presumably the spectral resolving power -- confirm.
R = 100
# When True, the per-R output directory is deleted and recreated empty.
complete_overwrite = False
# When True, the degraded data is regenerated even if its parquet files exist.
overwrite_degraded_data = False  # Degraded data should not have to be made more than once per R.

# These parameters are more or less set and I personally should not be tweaking these very much anymore.
# Phase range, in days, passed to the preprocessing step.
phase_range = (-20, 50)
# Peak-to-peak range, in spectral units, passed to the preprocessing step.
ptp_range = (0.1, 100)
# Wavelength range, in Angstroms, passed to preprocessing and augmentation.
wvl_range = (4500, 7000)

# Fraction of SNe placed in the training set by the train-test split.
train_frac = 0.65
# Scale of the noise used when augmenting the training data.
noise_scale = 0.25
# Scale of the spikes used when augmenting the training data.
spike_scale = 3.0
# Maximum number of spikes used when augmenting the training data.
max_spikes = 5
# Seed for the `np.random.RandomState` used by the split and the augmentation.
random_state = 1415

In [3]:
# Directory that holds the original (undegraded) dataset.
# The checks below fail early, and each message names the path that is missing
# so that a bad mount or a moved file is immediately identifiable.
data_dir_original = "/home/2649/repos/SCS/data/"
assert isdir(data_dir_original), f"Raw data directory not found: '{data_dir_original}'"

# Parquet file containing the raw data that the rest of the notebook starts from.
file_raw_data = join(data_dir_original, "sn_data.parquet")
assert isfile(file_raw_data), f"Raw data file not found: '{file_raw_data}'"

In [4]:
# Report the absolute location of the raw parquet file, then read it into a DataFrame.
raw_data_path = abspath(file_raw_data)
print(f"Reading in raw data file: {raw_data_path}")
df_raw = pd.read_parquet(file_raw_data)

Reading in raw data file: /home/2649/repos/SCS/data/sn_data.parquet


In [5]:
# Root directory under which all of the degraded spectral data is stored.
data_dir_degraded = "/lustre/lrspec/users/2649/spectralib"
assert isdir(data_dir_degraded)

# Per-R subdirectory: holds the parquet files written at each of the relevant
# steps of the data preparation process.
data_dir_degraded_R = join(data_dir_degraded, f"{R}")
r_dir_is_missing = not isdir(data_dir_degraded_R)
if r_dir_is_missing:
    print(f"Creating '{data_dir_degraded_R}'.")
    mkdir(data_dir_degraded_R)
assert isdir(data_dir_degraded_R)

# A complete overwrite removes everything previously written for this R and
# leaves an empty directory in its place.
if complete_overwrite:
    print(f"Deleting and remaking '{data_dir_degraded_R}'.")
    rmtree(data_dir_degraded_R)
    mkdir(data_dir_degraded_R)

In [6]:
# Parquet files that cache the two frames produced by the degradation step.
file_df_C = join(data_dir_degraded_R, "sn_data.C.parquet")
file_df_R = join(data_dir_degraded_R, "sn_data.R.parquet")

# The cached files are reused only when no overwrite was requested and both of
# them are present; in every other case the degradation is (re)computed.
if not overwrite_degraded_data and isfile(file_df_C) and isfile(file_df_R):
    print(f"Loading data previously degraded to R = {R}: '{file_df_C}' and '{file_df_R}'.")

    assert isfile(file_df_C)
    assert isfile(file_df_R)
    df_C = pd.read_parquet(file_df_C)
    df_R = pd.read_parquet(file_df_R)

else:
    print(f"Degrading the dataset to R = {R}.")
    df_C, df_R = dd.degrade_dataframe(R, df_raw)

    # Cache both frames so that later runs can take the loading branch.
    df_C.to_parquet(file_df_C)
    df_R.to_parquet(file_df_R)

print("Done.")

Loading data previously degraded to R = 100: '/lustre/lrspec/users/2649/spectralib/100/sn_data.C.parquet' and '/lustre/lrspec/users/2649/spectralib/100/sn_data.R.parquet'.
Done.


In [7]:
# Preprocess both degraded frames using the same selection ranges.
print("Preprocessing the dataset.")
print(f"Phase Range (in days): {phase_range}")
print(f"Peak-to-Peak Range (in spectral units): {ptp_range}")
print(f"Wavelength Range (in Angstroms): {wvl_range}")

# One shared set of keyword arguments guarantees that the R and C frames are
# preprocessed with identical settings.
preprocess_kwargs = dict(
    phase_range=phase_range,
    ptp_range=ptp_range,
    wvl_range=wvl_range,
)
df_RP = dp.preproccess_dataframe(df_R, **preprocess_kwargs)
df_CP = dp.preproccess_dataframe(df_C, **preprocess_kwargs)

# Write the preprocessed frames into the per-R directory.
file_df_CP = join(data_dir_degraded_R, "sn_data.CP.parquet")
file_df_RP = join(data_dir_degraded_R, "sn_data.RP.parquet")
df_CP.to_parquet(file_df_CP)
df_RP.to_parquet(file_df_RP)

print("Done.")

Preprocessing the dataset.
Phase Range (in days): (-20, 50)
Peak-to-Peak Range (in spectral units): (0.1, 100)
Wavelength Range (in Angstroms): (4500, 7000)
Done.


In [8]:
# Split each preprocessed frame into a training part and a testing part.
print("Perform a special train-test split on the dataset.")
print("This train-test split splits the dataset by SNe, not by spectra.")
print(f"Fraction of SNe in the training set: {train_frac}")
rng = np.random.RandomState(random_state)

# NOTE(review): both calls consume the same `rng` one after the other, so the
# second split may start from a different generator state than the first --
# confirm whether the RP and CP splits are expected to select the same SNe.
df_RP_trn, df_RP_tst = dp.split_data(df_RP, train_frac, rng)
df_CP_trn, df_CP_tst = dp.split_data(df_CP, train_frac, rng)

# Output locations for the four resulting frames.
file_df_RP_trn, file_df_RP_tst = (
    join(data_dir_degraded_R, "sn_data_trn.RP.parquet"),
    join(data_dir_degraded_R, "sn_data_tst.RP.parquet"),
)
file_df_CP_trn, file_df_CP_tst = (
    join(data_dir_degraded_R, "sn_data_trn.CP.parquet"),
    join(data_dir_degraded_R, "sn_data_tst.CP.parquet"),
)

# Write every split to its parquet file (RP first, then CP).
for split_frame, split_file in (
    (df_RP_trn, file_df_RP_trn),
    (df_RP_tst, file_df_RP_tst),
    (df_CP_trn, file_df_CP_trn),
    (df_CP_tst, file_df_CP_tst),
):
    split_frame.to_parquet(split_file)

print("Done.")

Perform a special train-test split on the dataset.
This train-test split splits the dataset by SNe, not by spectra.
Fraction of SNe in the training set: 0.65
Done.


In [9]:
# Augment the training frames only; the testing frames are left as they are.
print("Augmenting the dataset.")
print(f"Scale of noise to augment the data with: {noise_scale}")
print(f"Scale of the spikes to augment the data with: {spike_scale}")
print(f"Maximum spikes to augment the dataset with: {max_spikes}")

# Settings shared by both augmentation calls.
augment_kwargs = dict(
    wvl_range=wvl_range,
    noise_scale=noise_scale,
    spike_scale=spike_scale,
    max_spikes=max_spikes,
)

# NOTE(review): `rng` is the generator created for the train-test split, so the
# result presumably depends on how much of it has already been consumed --
# re-running this cell on its own may not reproduce the same augmentation.
df_RPA_trn = da.augment(df_RP_trn, rng, **augment_kwargs)
df_CPA_trn = da.augment(df_CP_trn, rng, **augment_kwargs)

# Write the augmented training frames into the per-R directory.
file_df_RPA_trn = join(data_dir_degraded_R, "sn_data_trn.RPA.parquet")
file_df_CPA_trn = join(data_dir_degraded_R, "sn_data_trn.CPA.parquet")
df_RPA_trn.to_parquet(file_df_RPA_trn)
df_CPA_trn.to_parquet(file_df_CPA_trn)

print("Done.")
print("Data preparation complete.")

Augmenting the dataset.
Scale of noise to augment the data with: 0.25
Scale of the spikes to augment the data with: 3.0
Maximum spikes to augment the dataset with: 5
Done.
Data preparation complete.
