In [None]:
import os
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import random
import torch
import pickle
from scipy.stats import binned_statistic
from pathlib import Path

In [None]:
# Machine-specific absolute locations of the competition data. No cell in this
# part of the notebook reads LABELS, DATA or OUT; the values are kept verbatim
# for any code that still expects them.
LABELS = "/media/iafoss/New Volume/ML/G2Net2022/data/_labels.csv"
DATA = "/media/iafoss/New Volume/ML/G2Net2022/data/train"
OUT = "/media/iafoss/New Volume/ML/G2Net2022/data/train.pickle"

# Number of equal-width time bins passed to every binned_statistic call.
nbins = 128

In [None]:
# Collect every generated signal file (*.pth): all SIGNAL_V0 files first,
# followed by all SIGNAL_V1 files.
signal = [
    pth_file
    for folder in (
        "../data/custom_data/SIGNAL_V0/data",
        "../data/custom_data/SIGNAL_V1/data",
    )
    for pth_file in Path(folder).glob("*.pth")
]
#df['id'] = df['id'].apply(lambda x: Path(x))
#dataset = {}
#for index,row in tqdm(df.iterrows(),total=len(df)):
#    idx = row['id'].stem
#
#    target = row['target']
#    data = torch.load(row['id'])
#    time_ids = {'H1':data['H1_ts'], 'L1':data['L1_ts']}
#    
#    mean_statH = binned_statistic(time_ids['H1'], np.abs(data['H1_SFTs_amplitudes']*1e22)**2,
#        statistic='mean', bins=nbins, 
#        range=(max(time_ids['H1'].min(),time_ids['L1'].min()),
#        min(time_ids['H1'].max(),time_ids['L1'].max())))
#    mean_statL = binned_statistic(time_ids['L1'], np.abs(data['L1_SFTs_amplitudes']*1e22)**2,
#        statistic='mean', bins=nbins, 
#        range=(max(time_ids['H1'].min(),time_ids['L1'].min()),
#        min(time_ids['H1'].max(),time_ids['L1'].max())))
#    mean_statH = np.nan_to_num(np.transpose(mean_statH.statistic,(0,1)))
#    mean_statL = np.nan_to_num(np.transpose(mean_statL.statistic,(0,1)))
#    
#    n_statH = binned_statistic(time_ids['H1'], np.ones((1,len(time_ids['H1']))), statistic='sum', bins=nbins, 
#        range=(max(time_ids['H1'].min(),time_ids['L1'].min()),
#        min(time_ids['H1'].max(),time_ids['L1'].max())))
#    n_statL = binned_statistic(time_ids['L1'], np.ones((1,len(time_ids['L1']))), statistic='sum', bins=nbins, 
#        range=(max(time_ids['H1'].min(),time_ids['L1'].min()),
#        min(time_ids['H1'].max(),time_ids['L1'].max())))
#    n_statH = np.nan_to_num(n_statH.statistic)[0].astype(int)
#    n_statL = np.nan_to_num(n_statL.statistic)[0].astype(int)
#    
#    dataset[idx] = {'H1':mean_statH,'L1':mean_statL,'H1_ts':n_statH, 'L1_ts':n_statL}
#
#with open('data/gwaves_train_v5.pickle', 'wb') as handle:
#    pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def clean_gwaves(fn):
    """Condense one saved sample into time-binned power and SFT-count arrays.

    ``fn`` must expose ``.stem`` (e.g. a ``pathlib.Path``) and point to a file
    readable by ``torch.load`` whose payload provides ``H1_ts``, ``L1_ts``,
    ``H1_SFTs_amplitudes`` and ``L1_SFTs_amplitudes``. Both detectors are
    binned into the module-level ``nbins`` equal-width bins spanning the time
    range the two detectors share.

    Returns a one-entry dict ``{fn.stem: {...}}`` whose inner dict holds:

    * ``"H1"`` / ``"L1"``: per-bin mean of ``|amplitude * 1e22| ** 2``
      (NaNs from empty bins are replaced by 0 via ``np.nan_to_num``);
    * ``"H1_ts"`` / ``"L1_ts"``: integer number of timestamps per bin.
    """
    key = fn.stem

    # NOTE: torch.load unpickles the file; only use it on trusted data.
    sample = torch.load(fn)
    ts_h1, ts_l1 = sample["H1_ts"], sample["L1_ts"]

    # Latest start and earliest end across the two detectors: the common
    # window that every statistic below is binned over.
    overlap = (
        max(ts_h1.min(), ts_l1.min()),
        min(ts_h1.max(), ts_l1.max()),
    )

    def _binned(timestamps, values, how):
        # Single place that fixes the bin count and the shared window.
        result = binned_statistic(
            timestamps, values, statistic=how, bins=nbins, range=overlap
        )
        return result.statistic

    def _mean_power(timestamps, amplitudes):
        power = np.abs(amplitudes * 1e22) ** 2
        # transpose(0, 1) is the identity permutation for a 2-D array; it is
        # kept so the accepted input ranks stay exactly as before.
        return np.nan_to_num(_binned(timestamps, power, "mean").transpose(0, 1))

    def _counts(timestamps):
        # Summing a row of ones per bin yields the number of timestamps in it.
        ones = np.ones((1, len(timestamps)))
        return np.nan_to_num(_binned(timestamps, ones, "sum"))[0].astype(int)

    return {
        key: {
            "H1": _mean_power(ts_h1, sample["H1_SFTs_amplitudes"]),
            "L1": _mean_power(ts_l1, sample["L1_SFTs_amplitudes"]),
            "H1_ts": _counts(ts_h1),
            "L1_ts": _counts(ts_l1),
        }
    }

from joblib import Parallel, delayed

In [None]:
# Process every signal file in 16 worker processes; tqdm wraps the input list
# so the bar advances as tasks are handed to the pool.
jobs = (delayed(clean_gwaves)(fn) for fn in tqdm(signal))
res = Parallel(n_jobs=16)(jobs)

In [None]:
# Display the first per-file result to eyeball the structure clean_gwaves returns.
res[0]

In [None]:
from collections import ChainMap

In [None]:
# Merge the one-entry dicts returned per file into a single mapping.
# Walking the results back-to-front and updating reproduces ChainMap's
# behaviour: keys appear in the same order, and when two files share a stem
# the entry that comes first in ``res`` wins.
out = {}
for per_file in reversed(res):
    out.update(per_file)

In [None]:
# Persist the merged per-file dictionary with the newest pickle protocol.
train_pickle = 'data/gwaves_train_v5.pickle'
with open(train_pickle, 'wb') as fh:
    pickle.dump(out, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def extract_data_from_hdf5(path):
    """Load one HDF5 sample into a dict of NumPy arrays.

    Only the first top-level key of the file is read. Under it the function
    expects ``frequency_Hz`` plus ``L1`` and ``H1`` groups, each holding
    ``SFTs`` and ``timestamps_GPS``.

    Returns a dict with keys ``freq``, ``L1_SFTs_amplitudes``, ``L1_ts``,
    ``H1_SFTs_amplitudes`` and ``H1_ts`` (in that order).
    """
    with h5py.File(path, "r") as h5_file:
        top_key = list(h5_file.keys())[0]
        sample = h5_file[top_key]
        # np.array(...) copies each dataset into memory, so the returned
        # arrays stay valid after the file is closed.
        return {
            # Frequency axis
            'freq': np.array(sample['frequency_Hz']),
            # Livingston detector
            'L1_SFTs_amplitudes': np.array(sample['L1']['SFTs']),
            'L1_ts': np.array(sample['L1']['timestamps_GPS']),
            # Hanford detector
            'H1_SFTs_amplitudes': np.array(sample['H1']['SFTs']),
            'H1_ts': np.array(sample['H1']['timestamps_GPS']),
        }

In [None]:
# For every test file: flag detectors whose overall |SFT| spread lies outside
# the band observed on the training set, and for flagged files estimate a
# time-binned noise-std profile per detector.
test_dir = '../data/test'
std_flags, std_est = {}, []
for fname in tqdm(os.listdir(test_dir)):
    fidx = fname.split('.')[0]
    data = extract_data_from_hdf5(os.path.join(test_dir, fname))
    sft_H, sft_L = data['H1_SFTs_amplitudes'], data['L1_SFTs_amplitudes']

    # train min,max,mean: 0.69375944,0.6990975,0.6950916
    spread_H = np.abs(sft_H*1e22).std()
    spread_L = np.abs(sft_L*1e22).std()
    f0 = spread_H < 0.693 or spread_H > 0.700  # H1 outside the band -> bit 0
    f1 = spread_L < 0.693 or spread_L > 0.700  # L1 outside the band -> bit 1
    std_flags[fidx] = int(f0) + 2*int(f1)
    if not (f0 or f1):
        continue  # neither detector is flagged: no profile is stored

    ts_H, ts_L = data['H1_ts'], data['L1_ts']
    # All statistics use nbins bins over the span covered by both detectors.
    binning = dict(
        bins=nbins,
        range=(max(ts_H.min(), ts_L.min()), min(ts_H.max(), ts_L.max())),
    )

    # Average of the real-part and imaginary-part variances along axis 0,
    # giving one value per timestamp for binned_statistic.
    var_H = 0.5*((sft_H.real*1e22).std(0)**2 + (sft_H.imag*1e22).std(0)**2)
    var_L = 0.5*((sft_L.real*1e22).std(0)**2 + (sft_L.imag*1e22).std(0)**2)

    # Bin-average the variances, convert to std, and use 1.0 for empty bins.
    binned_var_H = binned_statistic(ts_H, var_H, statistic='mean', **binning)
    binned_var_L = binned_statistic(ts_L, var_L, statistic='mean', **binning)
    std_statH = np.nan_to_num(binned_var_H.statistic**0.5, nan=1.0)
    std_statL = np.nan_to_num(binned_var_L.statistic**0.5, nan=1.0)

    # Summing a row of ones per bin counts the timestamps that fall in it.
    count_H = binned_statistic(ts_H, np.ones((1, len(ts_H))), statistic='sum', **binning)
    count_L = binned_statistic(ts_L, np.ones((1, len(ts_L))), statistic='sum', **binning)
    n_statH = np.nan_to_num(count_H.statistic)[0].astype(int)
    n_statL = np.nan_to_num(count_L.statistic)[0].astype(int)

    std_est.append({
        'H1_std': std_statH,
        'L1_std': std_statL,
        'H1_ts': n_statH,
        'L1_ts': n_statL,
    })

# Store the noise-std profiles of the flagged test files.
with open('data/real_noise_std.pickle', 'wb') as fh:
    pickle.dump(std_est, fh, protocol=pickle.HIGHEST_PROTOCOL)

# One row per test file: std_flag is 0 (none), 1 (H1), 2 (L1) or 3 (both).
df_test_std = pd.DataFrame({
    'id': list(std_flags.keys()),
    'std_flag': list(std_flags.values()),
})
df_test_std.to_csv('data/test_std.csv', index=False)
df_test_std.head()

In [None]:
# Reload the noise-std profiles from disk. This pickle is written by this
# notebook; do not point this cell at files from untrusted sources, because
# unpickling can execute arbitrary code.
with open('data/real_noise_std.pickle', 'rb') as handle:
    std_est = pickle.load(handle)

# Plot the binned noise-std profile of one stored entry for both detectors.
# Labels, title and legend make the figure readable on its own.
i = 0  # index into std_est of the entry to inspect
fig, ax = plt.subplots()
ax.plot(std_est[i]['H1_std'], label='H1')
ax.plot(std_est[i]['L1_std'], label='L1')
ax.set_xlabel('time bin')
ax.set_ylabel('estimated noise std (amplitudes x 1e22)')
ax.set_title(f'Binned noise std, entry {i}')
ax.legend();

In [None]:
import os
import h5py
import pickle

# Directories scanned for HDF5 samples when collecting timestamps.
paths = ['../data/train', '../data/test']

def get_ts(idx):
    """Read the GPS timestamps of both detectors from one HDF5 sample file.

    Parameters
    ----------
    idx : str or path-like
        Path of the HDF5 file to open. Despite the name, callers pass a full
        file path rather than a bare sample id.

    Returns
    -------
    dict
        ``{'L1_ts': ndarray, 'H1_ts': ndarray}`` read from the
        ``timestamps_GPS`` datasets under the file's first top-level key.
    """
    data = {}
    # Open the file named by the argument, so the function does not depend on
    # a notebook-global loop variable being set by the caller.
    with h5py.File(idx, "r") as f:
        ID_key = list(f.keys())[0]
        data['L1_ts'] = np.array(f[ID_key]['L1']['timestamps_GPS'])
        data['H1_ts'] = np.array(f[ID_key]['H1']['timestamps_GPS'])
    return data

# Collect the timestamp pairs of every train/test file in which both
# detectors have at least 4000 timestamps, then store them in one pickle.
timestamps_all = []
for path in paths:
    entries = [os.path.join(path, name) for name in os.listdir(path)]
    for p in tqdm(entries):
        timestamps = get_ts(p)
        long_enough = (
            len(timestamps['L1_ts']) >= 4000
            and len(timestamps['H1_ts']) >= 4000
        )
        if long_enough:
            timestamps_all.append(timestamps)

with open('data/timestamps_all.pickle', 'wb') as fh:
    pickle.dump(timestamps_all, fh, protocol=pickle.HIGHEST_PROTOCOL)