In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline
import random
import h5py
import os

from scipy import stats
def scale_data(data):
    """Min-max scale ``data`` so its values span [0, 1].

    Args:
        data: numeric array-like (e.g. a NumPy array or pandas Series).

    Returns:
        ``(data - min) / (max - min)``, element-wise. If every value is the
        same the range is zero; zeros (as floats) are returned in that case
        instead of the NaNs and divide-by-zero warning a plain division gives.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    shifted = data - lowest
    if np.all(span == 0):
        # Constant input: `shifted` is already all zeros, only force a float dtype.
        return shifted * 1.0
    return shifted / span
    
# Test-set table: start from the sample submission and replace every id with
# the path of the matching HDF5 file under ../data/test/.
df_test = pd.read_csv('../data/sample_submission.csv').assign(
    id=lambda frame: frame['id'].map(
        lambda sample_id: Path("../data/test/") / f"{sample_id}.hdf5"
    )
)
def read_file(filename):
    """Load one HDF5 sample and return its normalised power image.

    The sample group is looked up under the file's stem. For each detector
    ("H1", then "L1") the first 4096 SFT columns are scaled by 1e22, turned
    into power (real**2 + imag**2), divided by their mean and averaged in
    runs of 32 columns.

    Args:
        filename: path (str or Path) of the HDF5 file.

    Returns:
        np.ndarray, dtype float32, shape (2, 360, 128); index 0 holds H1 and
        index 1 holds L1.
    """
    detectors = ("H1", "L1")
    key = Path(filename).stem
    out = np.empty((len(detectors), 360, 128), dtype=np.float32)

    with h5py.File(filename, "r") as handle:
        sample = handle[key]
        for slot, detector in enumerate(detectors):
            coeffs = sample[detector]["SFTs"][:, :4096] * 1e22  # rescaled Fourier coefficients
            power = coeffs.real**2 + coeffs.imag**2
            power /= np.mean(power)  # unit mean power
            # 4096 columns -> 128 bins, each the mean of 32 neighbouring columns
            out[slot] = power.reshape(360, 128, 32).mean(axis=2)
    return out


class DatasetVisualizer():
    """
    Index-based access to the samples listed in a DataFrame, for plotting.

    dataset = DatasetVisualizer(df)

    img, y = dataset[i]
      img (np.float32): 2 x 360 x 128 (channel 0 = H1, channel 1 = L1)
      y (NumPy int): value of the `target` column for that row

    `df` must provide an `id` column (path of the sample's HDF5 file) and a
    `target` column.
    """
    def __init__(self, df):
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, i):
        """
        i (int): positional index of the row to load

        Returns (img, y) as described in the class docstring.
        """
        row = self.df.iloc[i]
        # Same float32 -> int round trip as before, so the label keeps its type.
        label = np.float32(row.target).astype('int')
        # Reuse the module-level loader instead of keeping a second copy of it.
        img = read_file(row.id)
        return img, label
    
def plot_sft(img):
    """Show the two detector channels of one sample side by side.

    Args:
        img: array-like indexed by channel; img[0] is drawn in the H1 panel
            and img[1] in the L1 panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(5, 5))
    # Each panel gets its own channel (the L1 panel used to repeat img[0]).
    for axis, channel, detector in zip(axes, (img[0], img[1]), ('H1', 'L1')):
        axis.imshow(channel)
        axis.set_title(f'Detector={detector}')
    plt.tight_layout()
    plt.show()

In [3]:
def get_custom_dataset(version="DATA_V10"):
    """Load one generated dataset version and attach its per-sample metadata.

    Reads ``../data/custom_data/{version}/train.csv`` and every CSV located
    one directory below that folder, then joins the two on the file stem.

    Args:
        version: name of the dataset folder under ``../data/custom_data``.

    Returns:
        pd.DataFrame: inner merge of train.csv with the concatenated metadata
        on ``uniq_id`` (stem of the ``id`` path on the train side, stem of the
        metadata CSV on the other). ``id_csv`` holds the metadata CSV path.

    Raises:
        ValueError: if no metadata CSV is found for ``version``.
    """
    root = Path(f"../data/custom_data/{version}/")
    df = pd.read_csv(root / "train.csv")

    # Sorted so the metadata row order does not depend on the filesystem.
    fns = sorted(root.glob("*/*.csv"))
    if not fns:
        raise ValueError(f"no metadata CSVs found under {root}")

    frames = [pd.read_csv(fn) for fn in fns]
    meta = pd.concat(frames, ignore_index=True)
    # Repeat each path once per row it contributed, so a CSV holding more (or
    # fewer) than one row no longer breaks the column assignment.
    meta['id_csv'] = [fn for fn, frame in zip(fns, frames) for _ in range(len(frame))]
    meta['uniq_id'] = meta['id_csv'].apply(lambda x: x.stem)
    df['uniq_id'] = df['id'].apply(lambda x: Path(x).stem)
    return pd.merge(df, meta, on='uniq_id')

In [4]:
#name = 'V_10'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#df_10 = get_custom_dataset()
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12")
#
#
#df_comb = (
#    pd.concat([df_10, df_11, df_12], ignore_index=True)
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#
#index_100 = list(df_comb.query("snr>60 & snr<100").sample(n=500).index)
#index_60 =  list(df_comb.query("snr>50 & snr<60").sample(n=500).index)
#index_50 =  list(df_comb.query("snr>40 & snr<50").sample(n=500).index)
#index_40 =  list(df_comb.query("snr>0 & snr<40").sample(n=1500).index)
#index_neg = list(df_comb.query("snr==0").sample(n=4000).index)
#
#val_idx = index_100 + index_60 + index_50 + index_40 + index_neg
#val_df = df_comb.iloc[val_idx].sample(frac=1.).reset_index(drop=True)
#trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1.).reset_index(drop=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [5]:
#name = 'V_11'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#df_10 = get_custom_dataset()
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12")
#
#
#df_comb = (
#    pd.concat([df_10, df_11, df_12], ignore_index=True)
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#
#index_40 =  list(df_comb.query("snr>0 & snr<45").sample(n=2200).index)
#index_neg = list(df_comb.query("snr==0").sample(n=5000).index)
#val_idx = index_40 + index_neg
#val_df = df_comb.iloc[val_idx].sample(frac=1.).reset_index(drop=True)
#trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1.).reset_index(drop=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [6]:
#name = 'V_12'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#df_10 = get_custom_dataset()
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12")
#df_13 = get_custom_dataset("DATA_V13")
#df_14 = get_custom_dataset("DATA_V14")
#df_15 = get_custom_dataset("DATA_V15")
#df_16 = get_custom_dataset("DATA_V16")
#
#
#
#df_comb = (
#    pd.concat([df_10, df_11, df_12, df_13, df_14, df_15, df_16], ignore_index=True)
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#
#index_100 = list(df_comb.query("snr>50 & snr<100").sample(n=700).index)
#index_50 =  list(df_comb.query("snr>30 & snr<50").sample(n=1250).index)
#index_30 =  list(df_comb.query("snr>0 & snr<30").sample(n=1250).index)
#index_neg = list(df_comb.query("snr==0").sample(n=4000).index)
#
#val_idx = index_100 + index_50 + index_30 + index_neg
#val_df = df_comb.iloc[val_idx].sample(frac=1.).reset_index(drop=True)
#trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1.).reset_index(drop=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [7]:
#name = 'V_14'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#
#df_13 = get_custom_dataset("DATA_V13")
#df_14 = get_custom_dataset("DATA_V14")
#df_15 = get_custom_dataset("DATA_V15")
#df_16 = get_custom_dataset("DATA_V16")
#df_17 = get_custom_dataset("DATA_V17")
#df_18 = get_custom_dataset("DATA_V18")
#
#
#
#
#df_comb = (
#   pd.concat([df_13, df_14, df_15, df_16, df_17, df_18], ignore_index=True)
#   .sample(frac=1.0)
#   .reset_index(drop=True))
#    
#val_df = pd.read_csv('../data/SPLITS/V_10/val_df.csv')
#trn_df = pd.concat([pd.read_csv('../data/SPLITS/V_10/trn_df.csv'), df_comb], ignore_index=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)
#

In [8]:
#name = 'V_15'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#
#df_10 = get_custom_dataset("DATA_V10")
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12").query('snr==0').reset_index(drop=True)
#df_13 = get_custom_dataset("DATA_V13")
#df_14 = get_custom_dataset("DATA_V14")
#df_16 = get_custom_dataset("DATA_V16")
#df_17 = get_custom_dataset("DATA_V17")
#
#
#df_comb = (
#    pd.concat(
#        [df_10, df_11, df_12, df_13, df_14, df_16, df_17],
#        ignore_index=True,
#    )
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#
#index_100 = list(df_comb.query("snr>50 & snr<100").sample(n=1250).index)
#index_50 =  list(df_comb.query("snr>30 & snr<50").sample(n=1250).index)
#index_30 =  list(df_comb.query("snr>0 & snr<30").sample(n=1250).index)
#index_neg = list(df_comb.query("snr==0").sample(n=4000).index)
#
#
#val_idx = index_100 + index_50 + index_30 + index_neg
#val_df = df_comb.iloc[val_idx].sample(frac=1.).reset_index(drop=True)
#trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1.).reset_index(drop=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [9]:
#name = 'V_16'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#
#df_10 = get_custom_dataset("DATA_V10")
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12").query('snr==0').reset_index(drop=True)
#df_13 = get_custom_dataset("DATA_V13")
#df_14 = get_custom_dataset("DATA_V14")
#df_16 = get_custom_dataset("DATA_V16")
#df_17 = get_custom_dataset("DATA_V17")
#
#
#df_comb = (
#    pd.concat(
#        [df_10, df_11, df_12, df_13, df_14, df_16, df_17],
#        ignore_index=True,
#    )
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#
#index_100 = list(df_comb.query("snr>50 & snr<100").sample(n=1500).index)
#index_50 =  list(df_comb.query("snr>0 & snr<50").sample(n=1500).index)
#index_neg = list(df_comb.query("snr==0").sample(n=4000).index)
#
#
#val_idx = index_100 + index_50 + index_neg
#val_df = df_comb.iloc[val_idx].sample(frac=1.).reset_index(drop=True)
#trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1.).reset_index(drop=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [10]:
#name = 'V_17'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#
#df_10 = get_custom_dataset("DATA_V10")
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12").query('snr==0').reset_index(drop=True)
#df_13 = get_custom_dataset("DATA_V13")
#df_14 = get_custom_dataset("DATA_V14")
#df_16 = get_custom_dataset("DATA_V16")
#df_17 = get_custom_dataset("DATA_V17")
#df_19 = get_custom_dataset("DATA_V19")
#df_20 = get_custom_dataset("DATA_V20")
#
#
#df_comb = (
#    pd.concat(
#        [df_10, df_11, df_12, df_13, df_14, df_16, df_17, df_19, df_20],
#        ignore_index=True,
#    )
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#
#index_100 = list(df_comb.query("snr>50 & snr<100").sample(n=1500).index)
#index_50 =  list(df_comb.query("snr>0 & snr<50").sample(n=2000).index)
#index_neg = list(df_comb.query("snr==0").sample(n=4000).index)
#
#
#val_idx = index_100 + index_50 + index_neg
#val_df = df_comb.iloc[val_idx].sample(frac=1.).reset_index(drop=True)
#trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1.).reset_index(drop=True)
#val_df.to_csv(save_path/'val_df.csv', index=False)
#trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [11]:
#from sklearn.model_selection import StratifiedKFold
#name = 'V_18'
#save_path = Path("../data/SPLITS")/name
#os.makedirs(save_path, exist_ok=True)
#
#
#df_10 = get_custom_dataset("DATA_V10")
#df_11 = get_custom_dataset("DATA_V11")
#df_12 = get_custom_dataset("DATA_V12").query('snr==0').reset_index(drop=True)
#df_13 = get_custom_dataset("DATA_V13")
#df_14 = get_custom_dataset("DATA_V14")
#df_16 = get_custom_dataset("DATA_V16")
#df_17 = get_custom_dataset("DATA_V17")
#df_19 = get_custom_dataset("DATA_V19")
#df_20 = get_custom_dataset("DATA_V20")
#
#
#df_comb = (
#    pd.concat(
#        [df_10, df_11, df_12, df_13, df_14, df_16, df_17, df_19, df_20],
#        ignore_index=True,
#    )
#    .sample(frac=1.0)
#    .reset_index(drop=True)
#)
#df_comb['t'] = df_comb['snr'].round().astype('int')
#out = []
#skf = StratifiedKFold(n_splits=5)
#for k, (trn, val) in enumerate(skf.split(df_comb['id'], df_comb['t'])):
#    temp = df_comb.iloc[val].copy().reset_index(drop=True)
#    temp['fold']  = k
#    out.append(temp)
#df_comb = pd.concat(out, ignore_index=True)
#df_comb.to_csv(save_path/'trn_df.csv', index=False)

In [13]:
from sklearn.model_selection import StratifiedKFold
# Build the V_19 train/validation split and write both halves to
# ../data/SPLITS/V_19/.
name = 'V_19'
save_path = Path("../data/SPLITS")/name
os.makedirs(save_path, exist_ok=True)

# Fixed seed: every shuffle/sample below is random, and without it each run
# would overwrite the saved CSVs with a different split.
SEED = 42

df_10 = get_custom_dataset("DATA_V10")
df_11 = get_custom_dataset("DATA_V11")
df_13 = get_custom_dataset("DATA_V13")
df_14 = get_custom_dataset("DATA_V14")
df_16 = get_custom_dataset("DATA_V16")
df_20 = get_custom_dataset("DATA_V20")


# Pool all versions, shuffle, and renumber the rows 0..n-1.
df_comb = (
    pd.concat(
        [df_10, df_11, df_13, df_14, df_16, df_20],
        ignore_index=True,
    )
    .sample(frac=1.0, random_state=SEED)
    .reset_index(drop=True)
)

# Validation rows are drawn per snr band.
# NOTE(review): rows with snr exactly 50, or snr >= 100, match none of these
# queries and therefore always end up in the training set; confirm intended.
index_100 = list(df_comb.query("snr>50 & snr<100").sample(n=1500, random_state=SEED).index)
index_50 =  list(df_comb.query("snr>0 & snr<50").sample(n=2000, random_state=SEED).index)
index_neg = list(df_comb.query("snr==0").sample(n=4000, random_state=SEED).index)


val_idx = index_100 + index_50 + index_neg
# val_idx holds index labels; after the reset above labels equal positions, and
# .loc keeps this lookup label-based like the isin() filter on the next line.
val_df = df_comb.loc[val_idx].sample(frac=1., random_state=SEED).reset_index(drop=True)
trn_df = df_comb.loc[~df_comb.index.isin(val_idx)].sample(frac=1., random_state=SEED).reset_index(drop=True)
val_df.to_csv(save_path/'val_df.csv', index=False)
trn_df.to_csv(save_path/'trn_df.csv', index=False)

In [None]:
#df = get_custom_dataset('DATA_V19')

In [21]:
#ds = DatasetVisualizer(val_df.query('snr>50').reset_index(drop=True)[:50])
#for i in range(50):
#    img, _ = ds[i]
#    plot_sft(img)