In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

%matplotlib inline
import random
import h5py
import os
from tqdm import tqdm
from scipy import stats
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from scipy.stats import norm
from timm import create_model
import gc
import torch.nn.functional as F
from joblib import Parallel, delayed

def normalize(X):
    """Turn complex samples into power values rescaled at a high quantile.

    Each complex entry of ``X`` is replaced by ``real**2 + imag**2``.  The
    result is then divided by a single scalar chosen so that the value sitting
    at sorted position ``int(size * 0.99903)`` becomes the square of the
    standard-normal quantile ``norm.ppf((pos + 0.4) / (size + 0.215))``.

    Args:
        X: complex numpy array (any shape).  It is not modified.

    Returns:
        A new real-valued array with the same shape as ``X``.
    """
    # View every complex number as a trailing (real, imag) pair of floats,
    # then square and add the pair to obtain the squared magnitude.
    parts = X[..., None].view(X.real.dtype)
    power = np.square(parts).sum(axis=-1)

    n_total = power.size
    kth = int(n_total * 0.99903)
    # Normal quantile used as the target for the kth order statistic; the
    # 0.4 / 0.215 offsets are taken as-is from the original code.
    target = norm.ppf((kth + 0.4) / (n_total + 0.215))

    # np.partition places the kth smallest element at index kth without
    # fully sorting the data.
    reference = np.partition(power.flatten(), kth, -1)[kth]
    power /= reference / target.astype(reference.dtype) ** 2
    return power


def preprocess(num, input, H1, L1):
    """Build ``num`` copies of ``input`` with NaN entries replaced by random noise.

    For every copy a fresh noise field is drawn: the sum of two squared
    standard-normal samples per element, multiplied by ``H1 / 2`` on the first
    channel and ``L1 / 2`` on the second.  Positions where ``input`` is not NaN
    are then overwritten with the actual input values.

    Args:
        num: number of noisy copies to generate (size of the leading axis).
        input: numpy array whose first axis has length 2 (it is broadcast
            against the ``[H1, L1]`` pair); NaN marks missing entries.
        H1: noise scale applied to channel 0.
        L1: noise scale applied to channel 1.

    Returns:
        float32 tensor of shape ``(num, *input.shape)`` on the CPU.
    """
    observed = torch.from_numpy(input).to("cpu", non_blocking=True)
    channel_scale = torch.tensor([[H1, L1]]).to("cpu", non_blocking=True)

    # Two Gaussian draws per output element; squaring in place and summing the
    # trailing axis yields one non-negative noise value per element.
    gaussians = torch.randn(
        [num, *observed.shape, 2], device=observed.device, dtype=torch.float32
    )
    filled = gaussians.square_().sum(-1)
    # (1, 2) -> (1, 2, 1, 1) so each channel gets its own scale.
    filled *= channel_scale[..., None, None] / 2

    # Wherever real data exists it wins over the synthetic noise, in every copy.
    has_data = torch.isnan(observed).logical_not()
    filled[:, has_data] = observed[has_data].float()
    return filled


def remove_nan(img, width=4096):
    """Squeeze the NaN gaps out of both detector planes and crop to ``width``.

    For ``img[0]`` and ``img[1]`` independently, all non-NaN values are
    collected in row-major order, reshaped back to the plane's original number
    of rows, and truncated to at most ``width`` columns.  This only preserves
    the row layout when NaNs occupy whole columns.

    Args:
        img: indexable holding two 2-D float arrays (e.g. an array of shape
            ``(2, rows, cols)``).  It is not modified.
        width: maximum number of columns kept per plane (default 4096).

    Returns:
        Array of shape ``(2, rows, min(width, valid_cols))``.

    Raises:
        ValueError: if the non-NaN count of a plane is not a multiple of its
            row count, or if the two planes end up with different widths
            (``np.stack`` needs equal shapes).
    """
    planes = []
    for plane in (img[0], img[1]):
        # Boolean indexing already returns a fresh 1-D array, so the input is
        # never touched and no explicit copy is needed.
        kept = plane[~np.isnan(plane)].reshape(plane.shape[0], -1)
        planes.append(kept[:, :width])
    return np.stack(planes)


def scale_and_norm(x):
    """Linearly rescale ``x`` so its minimum maps to -1 and its maximum to +1.

    The computation is done out-of-place, so the caller's tensor/array is left
    untouched (the previous in-place version left it half-normalised in
    ``[0, 1]`` and could not handle integer tensors).

    Args:
        x: torch tensor or numpy array.

    Returns:
        A new object of the same shape with values in ``[-1, 1]``.  If every
        element of ``x`` is identical the range is zero and the result is NaN
        (0 / 0), exactly as before.
    """
    shifted = x - x.min()  # lower bound -> 0
    unit = shifted / shifted.max()  # upper bound -> 1
    return 2 * unit - 1  # [0, 1] -> [-1, 1]


# Average along the last axis only: windows of 9 samples taken every 8 samples,
# with 4 samples of padding on each side that are excluded from the mean.
pool = nn.Sequential(
    nn.AvgPool2d(
        kernel_size=(1, 9),
        stride=(1, 8),
        padding=(0, 4),
        count_include_pad=False,
    )
)


def open_file(filepath):
    """Load one HDF5 sample into a fixed-size, NaN-padded power grid.

    The file is expected to contain a root group named after the file stem,
    with ``H1`` and ``L1`` sub-groups each holding ``timestamps_GPS`` and
    ``SFTs``.  Timestamps are divided by 1800 and rounded to integer slot
    indices (presumably seconds, i.e. 30-minute slots -- confirm against the
    data description), shifted so the earliest slot across both detectors is 0.
    Each detector's SFTs are passed through ``normalize`` and written into the
    columns given by its slot indices; slots at or beyond 5760 are dropped.

    Args:
        filepath: path (str or path-like) of the ``.hdf5`` file.

    Returns:
        float32 array of shape ``(2, 360, 5760)``; channel 0 is H1, channel 1
        is L1, and columns without data stay NaN.
    """
    n_slots = 5760
    detectors = ("H1", "L1")
    grid = np.full([2, 360, n_slots], np.nan, dtype=np.float32)

    with h5py.File(filepath, "r") as f:
        # The root group carries the same name as the file without extension.
        stem = os.path.splitext(os.path.split(filepath)[1])[0]
        group = f[stem]

        # Slot indices for both detectors are needed up front so they can
        # share a common origin.
        slots = [
            (np.asarray(group[det]["timestamps_GPS"]) / 1800).round().astype(np.int64)
            for det in detectors
        ]
        origin = min(slot.min() for slot in slots)

        for channel, (det, slot) in enumerate(zip(detectors, slots)):
            slot -= origin
            power = normalize(np.asarray(group[det]["SFTs"], np.complex128))
            in_range = slot < n_slots
            grid[channel][:, slot[in_range]] = power[:, in_range]

    gc.collect()
    return grid


def read_file(fn):
    """Read one sample file and return its pooled, rescaled two-channel tensor.

    Pipeline: ``open_file`` -> ``remove_nan`` -> average pooling via ``pool``
    -> per-channel ``scale_and_norm``.

    Args:
        fn: path of the HDF5 file to load.

    Returns:
        Tensor with two channels stacked on the first axis, each rescaled
        independently by ``scale_and_norm``.
    """
    gap_free = remove_nan(open_file(fn))
    # Add a leading batch axis for the pooling layer, then strip it again.
    batch = torch.tensor(np.expand_dims(gap_free, 0))
    pooled = pool(batch)[0]
    # Each channel is cloned before rescaling so the pooled tensor is not
    # altered through a view.
    channels = [scale_and_norm(pooled[idx].clone()) for idx in (0, 1)]
    return torch.stack(channels)

In [7]:

# Read the submission template; its "id" column lists the test samples.
df_test = pd.read_csv("../data/sample_submission.csv")
# Replace every bare id with the path of the matching HDF5 file under ../data/test/.
df_test["id"] = df_test["id"].map(
    lambda sample_id: Path("../data/test/") / f"{sample_id}.hdf5"
)

#split_voldf = Path("../data/SPLITS/V_19")
#trn_df = pd.read_csv(split_voldf/'trn_df.csv')

In [None]:
def save_pytorch_dict(fn):
    """Preprocess one sample file and save the tensor beside it as ``.pth``.

    The output path is the input path with its final extension swapped for
    ``.pth``.  Only the last suffix is changed, so an ``.hdf5`` substring
    elsewhere in the path is left alone, and an input that does not end in
    ``.hdf5`` is never overwritten by the output.

    Args:
        fn: path (str or ``Path``) of the source file passed to ``read_file``.

    Returns:
        None.  The tensor returned by ``read_file`` is written with
        ``torch.save``.
    """
    out = read_file(fn)
    save_str = str(Path(fn).with_suffix(".pth"))
    torch.save(out, save_str)
    
# Convert every listed test file with 16 worker processes.
fns = df_test["id"].to_list()
# The trailing semicolon keeps the notebook from echoing the returned list of
# None values (one per file) as the cell output.
Parallel(n_jobs=16)(
    delayed(save_pytorch_dict)(fn)
    for fn in tqdm(fns)
);

 52%|████████████████████████████████████████▊                                     | 4176/7975 [02:23<02:04, 30.40it/s]