In [1]:
import sys
# import json
# import argparse
# from copy import deepcopy
# from os import mkdir
# from os.path import join
# from os.path import isfile
# from os.path import isdir
# from os.path import abspath
# from os.path import basename
from icecream import ic
# from glob import glob

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# from scipy import stats
# from scipy.signal import savgol_filter
# from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import ParameterGrid

sys.path.insert(0, "../scs")

# from tensorflow import keras
# from keras import layers
# from keras import callbacks
# from keras.regularizers import L1L2
# from keras.losses import CategoricalCrossentropy
# from keras.metrics import CategoricalAccuracy
# from keras.optimizers import Nadam
# from keras.utils import to_categorical
# from tensorflow_addons.metrics import F1Score

import scs_config
import data_degrading as dd
import data_preparation as dp
import data_augmentation as da
# import data_plotting as dplt
import squarify

In [2]:
def load_original_dataset(file_df_raw="/home/2649/repos/SCS/data/raw/sn_data.parquet"):
    """Read the raw dataset from a parquet file into a DataFrame.

    Parameters
    ----------
    file_df_raw : str or path-like, optional
        Location of the parquet file. The default is the path this notebook
        has always read from, so calling the function with no arguments
        behaves exactly as before; pass another path to run the notebook on
        a different machine or checkout.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_parquet`` loads from ``file_df_raw``.
    """
    # Bare ic() emits a call-site trace line (file, line, function, time).
    ic()
    df_raw = pd.read_parquet(file_df_raw)
    return df_raw


def load_R100_data(
    file_df_R="/home/2649/repos/SCS/data/R100/df_R.parquet",
    file_df_C="/home/2649/repos/SCS/data/R100/df_C.parquet",
):
    """Read the two R100 parquet files and return them as DataFrames.

    Parameters
    ----------
    file_df_R : str or path-like, optional
        Location of the ``df_R`` parquet file.
    file_df_C : str or path-like, optional
        Location of the ``df_C`` parquet file.

    Both defaults are the paths this notebook has always read from, so a
    call with no arguments behaves exactly as before; override them to run
    against data stored elsewhere.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_C, df_R)`` -- note the C frame comes first in the return value
        even though the R path is the first parameter.
    """
    # Bare ic() emits a call-site trace line (file, line, function, time).
    ic()
    df_R = pd.read_parquet(file_df_R)
    df_C = pd.read_parquet(file_df_C)
    return df_C, df_R


def degrade_data(df_raw, R):
    """Pass ``df_raw`` through ``dd.degrade_dataframe`` for the given ``R``.

    The helper is called as ``dd.degrade_dataframe(R, df_raw)`` (``R`` first)
    and its two-element result is returned as the tuple ``(df_C, df_R)``.
    """
    ic()  # call-site trace
    degraded_C, degraded_R = dd.degrade_dataframe(R, df_raw)
    return (degraded_C, degraded_R)


def clean_data(df_C, df_R, phase_range, ptp_range, wvl_range):
    """Run ``dp.preproccess_dataframe`` on both frames with identical ranges.

    ``df_C`` is processed first, then ``df_R``; each call receives the same
    ``phase_range``, ``ptp_range`` and ``wvl_range`` keyword arguments.

    Returns
    -------
    tuple
        ``(df_CP, df_RP)`` -- the processed C frame followed by the processed
        R frame.
    """
    ic()  # call-site trace

    # One shared set of keyword arguments keeps the two calls in lock-step.
    shared_ranges = {
        "phase_range": phase_range,
        "ptp_range": ptp_range,
        "wvl_range": wvl_range,
    }

    cleaned_C = dp.preproccess_dataframe(df_C, **shared_ranges)
    cleaned_R = dp.preproccess_dataframe(df_R, **shared_ranges)
    return cleaned_C, cleaned_R


def split_train_test(df_CP, df_RP, train_frac, rng):
    """Split each frame into train/test parts with ``dp.split_data``.

    Returns
    -------
    tuple
        ``(df_CP_trn, df_CP_tst, df_RP_trn, df_RP_tst)``.
    """
    ic()  # call-site trace

    # The same ``rng`` object is handed to both calls, C frame first.
    # NOTE(review): if dp.split_data draws from rng, the second split starts
    # from an advanced generator state -- confirm the two splits are meant
    # to be independent rather than row-matched.
    parts = []
    for frame in (df_CP, df_RP):
        trn_part, tst_part = dp.split_data(frame, train_frac, rng)
        parts.extend((trn_part, tst_part))
    return tuple(parts)


def augment_training_set(df_CP_trn, df_RP_trn, rng, wvl_range, noise_scale, spike_scale, max_spikes):
    """Apply ``da.augment`` to both training frames with the same settings.

    ``df_CP_trn`` is augmented first and ``df_RP_trn`` second, both with the
    same ``rng`` object and the same ``wvl_range`` / ``noise_scale`` /
    ``spike_scale`` / ``max_spikes`` keyword arguments.

    Returns
    -------
    tuple
        ``(df_CPA_trn, df_RPA_trn)``.
    """
    ic()  # call-site trace

    augment_settings = dict(
        wvl_range=wvl_range,
        noise_scale=noise_scale,
        spike_scale=spike_scale,
        max_spikes=max_spikes,
    )

    # Evaluated left to right, so the C frame is still augmented before R.
    augmented = [
        da.augment(training_frame, rng, **augment_settings)
        for training_frame in (df_CP_trn, df_RP_trn)
    ]
    return tuple(augmented)

In [3]:
# Start/end bounds for the three ranges that are later packed into tuples
# and passed to clean_data (and, for the wavelength range, to
# augment_training_set).
phase_range_start, phase_range_end = -20, 50
ptp_range_start, ptp_range_end = 0.1, 100
wvl_range_start, wvl_range_end = 4500, 7000

# Argument forwarded to split_train_test.
train_frac = 0.50

# Arguments forwarded to augment_training_set.
noise_scale = 0.1
spike_scale = 1.0
max_spikes = 3

In [4]:
# Pack each start/end pair into the 2-tuple form the pipeline functions take.
phase_range = phase_range_start, phase_range_end
ptp_range = ptp_range_start, ptp_range_end
wvl_range = wvl_range_start, wvl_range_end

In [5]:
# Fixed seed so the split and augmentation below are repeatable.
rng = np.random.RandomState(seed=1415)

# 1. Load the stored R100 frames.
df_C, df_R = load_R100_data()

# 2. Apply the phase / ptp / wavelength range cuts to both frames.
df_CP, df_RP = clean_data(
    df_C,
    df_R,
    phase_range,
    ptp_range,
    wvl_range,
)

# 3. Train/test split of each cleaned frame.
df_CP_trn, df_CP_tst, df_RP_trn, df_RP_tst = split_train_test(df_CP, df_RP, train_frac, rng)

# 4. Augment the training halves only; the test halves are left as they are.
df_CPA_trn, df_RPA_trn = augment_training_set(
    df_CP_trn,
    df_RP_trn,
    rng,
    wvl_range,
    noise_scale,
    spike_scale,
    max_spikes,
)

ic| 1730223752.py:9 in load_R100_data() at 14:06:42.076
ic| 1730223752.py:24 in clean_data() at 14:06:42.685
ic| 1730223752.py:41 in split_train_test() at 14:06:44.157
ic| 1730223752.py:48 in augment_training_set() at 14:06:44.242


In [41]:
def papertablething(df, df_P, df_P_trn, df_PA_trn, df_P_tst, subtype_labels=None):
    """Build the per-subtype sample-count table used in the paper.

    For each of the five input frames, rows are grouped by ``"SN Subtype ID"``
    and the non-null ``"SN Subtype"`` entries in each group are counted. Every
    cell of the result is a string of the form ``"<count> (<pct>%)"``, where
    ``pct`` is the count's share of that frame's total, to one decimal place.

    Parameters
    ----------
    df, df_P, df_P_trn, df_PA_trn, df_P_tst : pandas.DataFrame
        Frames with ``"SN Subtype ID"`` and ``"SN Subtype"`` columns. They
        fill the columns "Before PP", "After PP", "Trn Set",
        "Trn Set (w/ Aug)" and "Tst Set", in that order.
    subtype_labels : sequence of str, optional
        Row labels, one per subtype ID in ascending ID order. Defaults to
        ``scs_config.SN_Stypes_str``.

    Returns
    -------
    pandas.DataFrame
        String-valued table indexed by subtype label (index name
        ``"SN Subtype"``).

    Raises
    ------
    ValueError
        Raised by pandas if the number of labels differs from the number of
        distinct subtype IDs found across the five frames.
    """
    columns = [
        "Before PP",
        "After PP",
        "Trn Set",
        "Trn Set (w/ Aug)",
        "Tst Set",
    ]
    frames = [df, df_P, df_P_trn, df_PA_trn, df_P_tst]

    # One Series of counts per frame, indexed by subtype ID.
    counts_per_frame = [
        frame.groupby("SN Subtype ID")["SN Subtype"].count() for frame in frames
    ]

    # A subtype can vanish from some frames (e.g. a class with a single
    # member does not survive the train/test split). Instead of appending a
    # zero at the end -- which is only right when exactly one class is
    # missing and it has the largest ID -- align every column on the union
    # of IDs seen in any frame and fill the gaps with 0.
    all_ids = counts_per_frame[0].index
    for counts in counts_per_frame[1:]:
        all_ids = all_ids.union(counts.index)

    cells = {}
    for header, counts in zip(columns, counts_per_frame):
        values = counts.reindex(all_ids, fill_value=0).to_numpy()
        total = values.sum()
        # An empty frame would otherwise produce "nan%" via 0/0.
        shares = values / total if total else np.zeros(len(values))
        cells[header] = [
            f"{val} ({pct*100:.1f}%)" for val, pct in zip(values, shares)
        ]

    if subtype_labels is None:
        subtype_labels = scs_config.SN_Stypes_str

    table = pd.DataFrame(data=cells, index=subtype_labels, columns=columns)
    table.index.name = "SN Subtype"

    return table

# Build the summary from the R-side frames at each pipeline stage.
table = papertablething(
    df=df_R,
    df_P=df_RP,
    df_P_trn=df_RP_trn,
    df_PA_trn=df_RPA_trn,
    df_P_tst=df_RP_tst,
)
table

Unnamed: 0_level_0,Before PP,After PP,Trn Set,Trn Set (w/ Aug),Tst Set
SN Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ia-norm,2387 (47.7%),2114 (52.8%),966 (50.4%),966 (6.1%),1148 (56.2%)
Ia-91T,398 (8.0%),348 (8.7%),192 (10.0%),1152 (7.2%),156 (7.6%)
Ia-91bg,264 (5.3%),232 (5.8%),115 (6.0%),1035 (6.5%),117 (5.7%)
Ia-csm,30 (0.6%),16 (0.4%),14 (0.7%),966 (6.1%),2 (0.1%)
Iax,68 (1.4%),62 (1.5%),24 (1.3%),984 (6.2%),38 (1.9%)
Ia-pec,141 (2.8%),111 (2.8%),55 (2.9%),990 (6.2%),56 (2.7%)
Ib-norm,270 (5.4%),211 (5.3%),109 (5.7%),981 (6.1%),102 (5.0%)
Ibn,31 (0.6%),27 (0.7%),22 (1.1%),968 (6.1%),5 (0.2%)
IIb,328 (6.6%),233 (5.8%),116 (6.1%),1044 (6.5%),117 (5.7%)
Ib-pec,15 (0.3%),12 (0.3%),7 (0.4%),966 (6.1%),5 (0.2%)


In [44]:
# print() rather than rich display: the raw LaTeX source is what gets pasted
# into the paper.
latex_source = table.to_latex()
print(latex_source)

\begin{tabular}{llllll}
\toprule
{} &     Before PP &      After PP &      Trn Set & Trn Set (w/ Aug) &       Tst Set \\
SN Subtype &               &               &              &                  &               \\
\midrule
Ia-norm    &  2387 (47.7\%) &  2114 (52.8\%) &  966 (50.4\%) &       966 (6.1\%) &  1148 (56.2\%) \\
Ia-91T     &    398 (8.0\%) &    348 (8.7\%) &  192 (10.0\%) &      1152 (7.2\%) &    156 (7.6\%) \\
Ia-91bg    &    264 (5.3\%) &    232 (5.8\%) &   115 (6.0\%) &      1035 (6.5\%) &    117 (5.7\%) \\
Ia-csm     &     30 (0.6\%) &     16 (0.4\%) &    14 (0.7\%) &       966 (6.1\%) &      2 (0.1\%) \\
Iax        &     68 (1.4\%) &     62 (1.5\%) &    24 (1.3\%) &       984 (6.2\%) &     38 (1.9\%) \\
Ia-pec     &    141 (2.8\%) &    111 (2.8\%) &    55 (2.9\%) &       990 (6.2\%) &     56 (2.7\%) \\
Ib-norm    &    270 (5.4\%) &    211 (5.3\%) &   109 (5.7\%) &       981 (6.1\%) &    102 (5.0\%) \\
Ibn        &     31 (0.6\%) &     27 (0.7\%) &    22 (1.1\%) &     

  print(table.to_latex())
