In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from icecream import ic
from importlib import reload

figdir = "./"

In [2]:
subtypes_subset = np.array([
    "Ia-norm", "Ia-91T", "Ia-91bg", "Iax", #"Ia-csm", "Ia-pec",
    "Ib-norm", "Ibn", "IIb", #"Ib-pec",
    "Ic-norm", "Ic-broad", #"Ic-pec",
    "IIP", #"IIL", "IIn", "II-pec",
])
subtypes_subset

array(['Ia-norm', 'Ia-91T', 'Ia-91bg', 'Iax', 'Ib-norm', 'Ibn', 'IIb',
       'Ic-norm', 'Ic-broad', 'IIP'], dtype='<U8')

In [3]:
df_beforePP = pd.read_parquet("../data/original_resolution_parquet/original_data.parquet")
df_afterPP = pd.read_parquet("../data/original_resolution_parquet/df_preprocessed.parquet")
df_trn_set = pd.read_parquet("../data/original_resolution_parquet/df_P_trn.parquet")
df_trn_set_aug = pd.read_parquet("../data/original_resolution_parquet/df_PA_trn.parquet")
df_tst = pd.read_parquet("../data/original_resolution_parquet/df_P_tst.parquet")

In [4]:
def get_counts(df, subtypes_subset):
    counts = np.array([0 for subtype in subtypes_subset])
    for i, subtype in enumerate(subtypes_subset):
        ind = (df["SN Subtype"] == subtype)
        num_spec = df[ind].shape[0]
        counts[i] = num_spec

    return counts

In [5]:
columns = [
    "Before PP",
    "After PP",
    "Trn Set",
    "Trn Set (w/ Aug)",
    "Tst Set",
]

col1 = get_counts(df_beforePP, subtypes_subset)
col2 = get_counts(df_afterPP, subtypes_subset)
col3 = get_counts(df_trn_set, subtypes_subset)
col4 = get_counts(df_trn_set_aug, subtypes_subset)
col5 = get_counts(df_tst, subtypes_subset)

pct1 = col1 / col1.sum()
pct2 = col2 / col2.sum()
pct3 = col3 / col3.sum()
pct4 = col4 / col4.sum()
pct5 = col5 / col5.sum()

str_col1 = [f"{val:} ({pct*100:.1f}%)" for val, pct in zip(col1, pct1)]
str_col2 = [f"{val:} ({pct*100:.1f}%)" for val, pct in zip(col2, pct2)]
str_col3 = [f"{val:} ({pct*100:.1f}%)" for val, pct in zip(col3, pct3)]
str_col4 = [f"{val:} ({pct*100:.1f}%)" for val, pct in zip(col4, pct4)]
str_col5 = [f"{val:} ({pct*100:.1f}%)" for val, pct in zip(col5, pct5)]

data = np.array([
    str_col1,
    str_col2,
    str_col3,
    str_col4,
    str_col5,
]).T

data

array([['2387 (53.5%)', '2114 (56.2%)', '1058 (56.2%)', '1058 (9.6%)',
        '1056 (56.2%)'],
       ['398 (8.9%)', '348 (9.2%)', '163 (8.7%)', '1141 (10.3%)',
        '185 (9.8%)'],
       ['264 (5.9%)', '232 (6.2%)', '101 (5.4%)', '1111 (10.1%)',
        '131 (7.0%)'],
       ['68 (1.5%)', '62 (1.6%)', '28 (1.5%)', '1064 (9.6%)',
        '34 (1.8%)'],
       ['270 (6.0%)', '211 (5.6%)', '99 (5.3%)', '1089 (9.9%)',
        '112 (6.0%)'],
       ['31 (0.7%)', '27 (0.7%)', '9 (0.5%)', '1062 (9.6%)', '18 (1.0%)'],
       ['328 (7.3%)', '232 (6.2%)', '139 (7.4%)', '1112 (10.1%)',
        '93 (4.9%)'],
       ['263 (5.9%)', '206 (5.5%)', '112 (5.9%)', '1120 (10.2%)',
        '94 (5.0%)'],
       ['279 (6.2%)', '228 (6.1%)', '117 (6.2%)', '1170 (10.6%)',
        '111 (5.9%)'],
       ['176 (3.9%)', '104 (2.8%)', '58 (3.1%)', '1102 (10.0%)',
        '46 (2.4%)']], dtype='<U12')

In [6]:
table = pd.DataFrame(data=data, index=subtypes_subset, columns=columns)
table.index.name = "SN Subtype"
table

Unnamed: 0_level_0,Before PP,After PP,Trn Set,Trn Set (w/ Aug),Tst Set
SN Subtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ia-norm,2387 (53.5%),2114 (56.2%),1058 (56.2%),1058 (9.6%),1056 (56.2%)
Ia-91T,398 (8.9%),348 (9.2%),163 (8.7%),1141 (10.3%),185 (9.8%)
Ia-91bg,264 (5.9%),232 (6.2%),101 (5.4%),1111 (10.1%),131 (7.0%)
Iax,68 (1.5%),62 (1.6%),28 (1.5%),1064 (9.6%),34 (1.8%)
Ib-norm,270 (6.0%),211 (5.6%),99 (5.3%),1089 (9.9%),112 (6.0%)
Ibn,31 (0.7%),27 (0.7%),9 (0.5%),1062 (9.6%),18 (1.0%)
IIb,328 (7.3%),232 (6.2%),139 (7.4%),1112 (10.1%),93 (4.9%)
Ic-norm,263 (5.9%),206 (5.5%),112 (5.9%),1120 (10.2%),94 (5.0%)
Ic-broad,279 (6.2%),228 (6.1%),117 (6.2%),1170 (10.6%),111 (5.9%)
IIP,176 (3.9%),104 (2.8%),58 (3.1%),1102 (10.0%),46 (2.4%)


In [7]:
print(table.to_latex())

\begin{tabular}{llllll}
\toprule
 & Before PP & After PP & Trn Set & Trn Set (w/ Aug) & Tst Set \\
SN Subtype &  &  &  &  &  \\
\midrule
Ia-norm & 2387 (53.5%) & 2114 (56.2%) & 1058 (56.2%) & 1058 (9.6%) & 1056 (56.2%) \\
Ia-91T & 398 (8.9%) & 348 (9.2%) & 163 (8.7%) & 1141 (10.3%) & 185 (9.8%) \\
Ia-91bg & 264 (5.9%) & 232 (6.2%) & 101 (5.4%) & 1111 (10.1%) & 131 (7.0%) \\
Iax & 68 (1.5%) & 62 (1.6%) & 28 (1.5%) & 1064 (9.6%) & 34 (1.8%) \\
Ib-norm & 270 (6.0%) & 211 (5.6%) & 99 (5.3%) & 1089 (9.9%) & 112 (6.0%) \\
Ibn & 31 (0.7%) & 27 (0.7%) & 9 (0.5%) & 1062 (9.6%) & 18 (1.0%) \\
IIb & 328 (7.3%) & 232 (6.2%) & 139 (7.4%) & 1112 (10.1%) & 93 (4.9%) \\
Ic-norm & 263 (5.9%) & 206 (5.5%) & 112 (5.9%) & 1120 (10.2%) & 94 (5.0%) \\
Ic-broad & 279 (6.2%) & 228 (6.1%) & 117 (6.2%) & 1170 (10.6%) & 111 (5.9%) \\
IIP & 176 (3.9%) & 104 (2.8%) & 58 (3.1%) & 1102 (10.0%) & 46 (2.4%) \\
\bottomrule
\end{tabular}

