In [1]:
from einops import rearrange
import copy
import h5py
from pathlib import Path
import numpy as np
import pandas as pd
import torch
torch.cuda.set_device(1)
from pdb import set_trace
import matplotlib.pyplot as plt
from torch import nn
from x_transformers import  Encoder, Decoder
from x_transformers.autoregressive_wrapper import exists
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score
from fastai.vision.all import BCEWithLogitsLossFlat
from transformers.optimization import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)
from fastprogress.fastprogress import master_bar, progress_bar
import os
from timm import create_model
import random
from tqdm import tqdm

import os
# NOTE(review): torch.cuda.set_device(1) is called earlier in this cell, before this
# variable is set. CUDA_VISIBLE_DEVICES is normally only honoured when set before CUDA is
# initialised, and "0" contradicts device index 1 -- confirm which GPU is intended.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from torch import Tensor
import torch.nn.functional as F
from typing import Tuple
import pickle

In [2]:
class CFG:
    """Static settings for this notebook, read as plain class attributes."""

    # --- experiment identity -------------------------------------------------
    model_name = "convnext_large_in22k"  # architecture name handed to timm.create_model
    folder = "EXP_200_BASELINE_CASHE_V3"
    exp_name = f"{folder}_{model_name}"  # "<folder>_<model_name>"

    # --- data loading --------------------------------------------------------
    bs = 32  # DataLoader batch_size
    nw = 4   # DataLoader num_workers

    # --- model / optimisation ------------------------------------------------
    # Not read anywhere in the visible notebook; presumably training
    # hyper-parameters kept in sync with the training script.
    num_classes = 1
    dropout_rate = 0.3
    lr = 1e-4
    wd = 1e-4
    epoch = 12
    warmup_pct = 0.1
    mixup = False

In [3]:
def get_snr(left, right, df):
    """Return the rows of ``df`` with ``left < snr < right`` followed by all rows with ``snr == 0``.

    :param left: exclusive lower bound on the ``snr`` column
    :param right: exclusive upper bound on the ``snr`` column
    :param df: DataFrame with an ``snr`` column
    :return: new DataFrame; in-band rows first, then the ``snr == 0`` rows,
        each keeping its original index label
    """
    in_band = df.query(f"snr>{left} & snr<{right}")
    zero_snr = df.query("snr==0")
    return pd.concat([in_band, zero_snr])


def generate_report(df):
    """Compute ROC AUC on the whole frame and on the 30 < snr < 50 subset.

    :param df: DataFrame with ``target``, ``pred`` and ``snr`` columns
    :return: dict with keys ``"roc_all"`` (all rows) and ``"roc_30_50"``
        (rows selected by ``get_snr(30, 50, df)``, i.e. in-band rows plus ``snr == 0`` rows)
    """
    roc_all = roc_auc_score(df["target"], df["pred"])

    # Select the SNR band once; the frame is only read, so no defensive copy is needed.
    band_30_50 = get_snr(30, 50, df)
    roc_30_50 = roc_auc_score(band_30_50["target"], band_30_50["pred"])

    return {
        "roc_all": roc_all,
        "roc_30_50": roc_30_50,
    }



In [4]:
def time_mask(spec, T=10):
    """Zero out a random number of random spans along dimension 2 of ``spec``.

    Between 3 and 7 masks are attempted. For each one a width budget ``t`` is drawn
    from ``[0, T)``, a start ``t_zero`` from ``[0, len - t)`` and an end from
    ``[t_zero, t_zero + t)``, so the zeroed span is ``t - 1`` wide at most and may be empty.

    :param spec: tensor indexed as ``spec[:, :, i]``; it is not modified
    :param T: exclusive upper bound for the per-mask width budget
    :return: a detached clone of ``spec`` with the selected spans set to 0
    """
    cloned = spec.clone().detach()
    len_spectro = cloned.shape[2]
    num_masks = np.random.randint(3, 8)

    # Never draw a width the axis cannot hold: randrange(0, len_spectro - t) raises
    # ValueError on an empty range when t >= len_spectro.
    max_width = min(T, len_spectro)
    if max_width < 1:
        return cloned

    for _ in range(num_masks):
        t = random.randrange(0, max_width)

        # A zero width would make randrange(t_zero, t_zero + t) an empty range.
        # Skip just this mask; returning here would drop all the remaining masks.
        if t == 0:
            continue

        t_zero = random.randrange(0, len_spectro - t)
        mask_end = random.randrange(t_zero, t_zero + t)
        cloned[:, :, t_zero:mask_end] = 0
    return cloned




def freq_mask(spec, F=30):
    """Zero out a random number of random bands along dimension 1 of ``spec``.

    Between 3 and 7 masks are attempted. For each one a width budget ``f`` is drawn
    from ``[0, F)``, a start ``f_zero`` from ``[0, n - f)`` and an end from
    ``[f_zero, f_zero + f)``, so the zeroed band is ``f - 1`` wide at most and may be empty.

    :param spec: tensor indexed as ``spec[:, i, :]``; it is not modified
    :param F: exclusive upper bound for the per-mask width budget. Inside this
        function the name shadows the module-level ``F`` alias of ``torch.nn.functional``.
    :return: a detached clone of ``spec`` with the selected bands set to 0
    """
    cloned = spec.clone().detach()
    num_mel_channels = cloned.shape[1]
    num_masks = np.random.randint(3, 8)

    # Never draw a width the axis cannot hold: randrange(0, num_mel_channels - f)
    # raises ValueError on an empty range when f >= num_mel_channels.
    max_width = min(F, num_mel_channels)
    if max_width < 1:
        return cloned

    for _ in range(num_masks):
        f = random.randrange(0, max_width)

        # A zero width would make randrange(f_zero, f_zero + f) an empty range.
        # Skip just this mask; returning here would drop all the remaining masks.
        if f == 0:
            continue

        f_zero = random.randrange(0, num_mel_channels - f)
        mask_end = random.randrange(f_zero, f_zero + f)
        cloned[:, f_zero:mask_end, :] = 0

    return cloned

In [5]:
def read_pkl(filename):
    """Load one pickled sample and return its two spectrograms stacked, plus timestamps.

    The pickle must hold a mapping with ``"H1"`` and ``"L1"`` entries, each providing
    ``'spectrogram'`` (indexable as ``[:, :4096]``) and ``'timestamps'``.

    :param filename: path of the pickle file
    :return: ``{"sft": array of shape (2, rows, <=4096), "timestamps": {"H1": ..., "L1": ...}}``
    """
    # SECURITY: pickle.load can execute arbitrary code; only open files from a trusted source.
    with open(filename, 'rb') as file1:
        k = pickle.load(file1)

    h1, l1 = k["H1"], k["L1"]
    # Keep at most the first 4096 columns of each spectrogram, stacked as H1 then L1.
    # The stored 'frequency' entry was read but never used, so it is no longer required.
    return {
        "sft": np.stack([h1['spectrogram'][:, :4096], l1['spectrogram'][:, :4096]]),
        "timestamps": {"H1": h1['timestamps'], "L1": l1['timestamps']},
    }

In [6]:
def preprocess(sft):
    """Scale ``sft`` by 1e22 and return its squared magnitude (real**2 + imag**2)."""
    scaled = sft * 1e22
    re_part, im_part = scaled.real, scaled.imag
    return re_part**2 + im_part**2


def normalize(data):
    """Normalise a two-channel array and average its last axis from 4096 to 128 bins.

    Steps: divide each of the two channels by its own mean, average groups of 32
    along the last axis, then standardise the whole array to zero mean and unit
    (population) standard deviation.

    :param data: NumPy array with ``2 * 360 * 4096`` elements laid out as
        ``(2, 360, 4096)``; it is left unmodified
    :return: ``torch.Tensor`` of shape ``(2, 360, 128)``
    """
    # Build a new array rather than assigning into data[0] / data[1]: that would
    # overwrite the caller's buffer and truncate the result for integer input.
    data = np.stack([data[0] / data[0].mean(), data[1] / data[1].mean()])
    data = data.reshape(2, 360, 128, 32).mean(-1)  # compress 4096 -> 128
    data = data - data.mean()
    data = data / data.std()
    return torch.tensor(data)


def read_h5(file):
    """Read the H1/L1 SFTs and GPS timestamps from one HDF5 sample.

    The file is expected to contain a top-level group named after the file stem,
    with ``H1`` and ``L1`` sub-groups that each hold ``SFTs`` and ``timestamps_GPS``.

    :param file: path of the HDF5 file (str or Path)
    :return: ``{"sft": stacked [H1, L1] SFTs limited to the first 4096 columns,
        "timestamps": {"H1": ..., "L1": ...}}``
    """
    path = Path(file)
    with h5py.File(path, "r") as handle:
        root = handle[path.stem]
        h1_group, l1_group = root["H1"], root["L1"]
        # `[()]` reads each dataset fully into memory, so nothing below needs the open file.
        h1_sft = h1_group["SFTs"][()]
        h1_times = h1_group["timestamps_GPS"][()]
        l1_sft = l1_group["SFTs"][()]
        l1_times = l1_group["timestamps_GPS"][()]

    stacked = np.stack([h1_sft[:, :4096], l1_sft[:, :4096]])
    return {"sft": stacked, "timestamps": {"H1": h1_times, "L1": l1_times}}
    

    
class ValLoader(torch.utils.data.Dataset):
    """
    Dataset over HDF5 samples listed in a DataFrame.

    dataset = ValLoader(df)

    img, y = dataset[i]
      img (torch.float32 tensor): output of normalize(preprocess(read_h5(id)['sft']))
      y (np.float32): value of the row's `target` column
    """
    def __init__(self, df, freq_tfms=False):
        """
        df: DataFrame with `id` (path of the HDF5 file) and `target` columns
        freq_tfms: stored as `self.tfms`; not used by `__getitem__`
        """
        self.df = df
        self.tfms = freq_tfms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        """
        i (int): positional index of the row to load
        """
        r = self.df.iloc[i]
        y = np.float32(r.target)
        img = normalize(preprocess(read_h5(r.id)['sft']))
        # Cast to float32, as ValLoaderPickle does, so the batch dtype does not
        # depend on the dtype of the stored SFTs.
        return img.float(), y
    
    
class ValLoaderPickle(torch.utils.data.Dataset):
    """
    Dataset over pickled samples listed in a DataFrame.

    dataset = ValLoaderPickle(df)

    img, y = dataset[i]
      img (torch.float32 tensor): normalize(read_pkl(id)['sft']) cast with .float()
      y (np.float32): value of the row's `target` column
    """
    def __init__(self, df, freq_tfms=False):
        # `freq_tfms` is only stored; `__getitem__` never reads it.
        self.df, self.tfms = df, freq_tfms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        """
        i (int): positional index of the row to load
        """
        row = self.df.iloc[i]
        label = np.float32(row.target)
        sft = read_pkl(str(row.id))['sft']
        return normalize(sft).float(), label




In [7]:
def torch_rot90_cw(x):
    """Rotate ``x`` by one quarter turn with ``k=-1`` in the plane of dims 2 and 3."""
    return torch.rot90(x, k=-1, dims=(2, 3))


def torch_fliplr(x: Tensor):
    """
    Reverse a 4D image tensor along dim 3 (horizontal flip)
    :param x: tensor with at least 4 dimensions
    :return: flipped copy of ``x``
    """
    return torch.flip(x, dims=(3,))


def torch_flipud(x: Tensor):
    """
    Reverse a 4D image tensor along dim 2 (vertical flip)
    :param x: tensor with at least 4 dimensions
    :return: flipped copy of ``x``
    """
    return torch.flip(x, dims=(2,))


def tencrop_image2label(model: nn.Module, image: Tensor) -> Tensor:
    """Two-view test-time augmentation: the input and its flip along dim 2.

    Despite the name, no crops are taken. The model is run on ``image`` and on
    ``image`` flipped along dim 2; each output goes through a sigmoid and the two
    results are averaged. The horizontal-flip views are disabled.
    :param model: Classification model returning logits
    :param image: Input image tensor (4D)
    :return: Mean of the two sigmoid outputs (probabilities, not logits)
    """
    views = (image, image.flip(2))
    prob_plain, prob_flipped = [torch.sigmoid(model(view)) for view in views]
    return (prob_plain + prob_flipped) / 2.

In [8]:
def predict_tta(dl, model):
    """Run ``tencrop_image2label`` over every batch of ``dl`` without gradients.

    :param dl: iterable yielding ``(x, y)`` batches; ``y`` is ignored
    :param model: model handed to ``tencrop_image2label``; inputs are moved with ``.cuda()``
    :return: list with one CPU tensor of predictions per batch
    """
    res = []
    with torch.no_grad():
        # Iterate the loader that was passed in. The previous version read the
        # global `vld_dl`, so the `dl` argument was silently ignored.
        for x, _ in tqdm(dl):
            out = tencrop_image2label(model, x.cuda()).detach().cpu()
            res.append(out)
    return res

In [None]:

# File lists for the two custom noise sets, ordered by the second-to-last
# "_"-separated token of each path.
real_noise_fns = sorted(
    Path("../data/custom_data/DATA_V33/data/").glob("*.pth"),
    key=lambda x: str(x).split("_")[-2],
)

fake_noise_fns = sorted(
    Path("../data/custom_data/DATA_V34/data/").glob("*.pth"),
    key=lambda x: str(x).split("_")[-2],
)

# NOTE(review): `noise` and `cashe_fns` are not read anywhere in the visible part of
# this notebook; kept in case other cells rely on them.
noise = (
    list(Path("../data/custom_data/DATA_V31_V32_NOISE").glob("*.pth"))
    + real_noise_fns[:1100]
    + fake_noise_fns
)
cashe_fns = list(Path("cashe_dataset").glob("*.pth"))

val_df = pd.read_csv("../data/SPLITS/V_22/val_df.csv")
comp_train = pd.read_csv("../data/train_labels.csv")
comp_train.columns = ["fn", "target"]
# .copy() so the column assignments below act on an independent frame instead of a
# filtered view (avoids pandas' SettingWithCopyWarning).
comp_train = comp_train.query("target>=0").copy()
comp_train["fn"] = comp_train["fn"].apply(lambda x: Path("../data/train") / f"{x}.hdf5")
comp_train.columns = ["id", "target"]
comp_train["data_type"] = "comp_train"
# Real-noise files from index 1100 onwards are added as negatives (target 0, snr 0).
real_noise_df = pd.DataFrame({"id": real_noise_fns[1100:], "target": 0.0, "snr": 0})
real_noise_df["id"] = real_noise_df["id"].apply(
    lambda x: Path(str(x).replace(".pth", ".h5"))
)

val_df = pd.concat([val_df, comp_train, real_noise_df], ignore_index=True)
val_df['id']= val_df['id'].apply(lambda x: Path(x))

fns = ["EXP_200_BASELINE_CASHE_V4/EXP_200_BASELINE_CASHE_V4_convnext_large_in22k_0_8.pth"]

# pretrained=False: every weight is overwritten by load_state_dict below, so fetching
# the ImageNet checkpoint first is wasted work and an unnecessary network dependency.
custom_model = create_model(
                    CFG.model_name,
                    pretrained=False,
                    num_classes=1,
                    in_chans=2,
                )

# Load onto CPU first (as the V23 evaluation cell does) so the checkpoint does not depend
# on the GPU index it was saved from; .cuda() then moves the model.
custom_model.load_state_dict(torch.load(fns[0], map_location=torch.device('cpu')))
custom_model.cuda();
custom_model.eval();
sub_ds = ValLoader(val_df)
vld_dl = DataLoader(
    sub_ds,
    batch_size=CFG.bs,
    shuffle=False,
    num_workers=CFG.nw,
    pin_memory=True,
    drop_last=False
)

res = predict_tta(vld_dl, custom_model)


In [None]:
# Flatten the per-batch predictions into one vector; row order matches val_df because
# the loader was built with shuffle=False and drop_last=False.
val_df['pred'] = torch.cat(res).view(-1).numpy()

In [None]:
# ROC AUC over every row of val_df.
roc_auc_score(val_df['target'], val_df['pred'])

In [None]:
# ROC AUC restricted to the rows tagged data_type == "comp_train".
comp_train_rows = val_df.query('data_type == "comp_train"')
roc_auc_score(comp_train_rows['target'], comp_train_rows['pred'])

In [None]:
fns = ["EXP_200_BASELINE_CASHE_V4/EXP_200_BASELINE_CASHE_V4_convnext_large_in22k_0_8.pth"]

# pretrained=False: every weight is overwritten by load_state_dict below, so fetching
# the ImageNet checkpoint first is wasted work and an unnecessary network dependency.
custom_model = create_model(
                    CFG.model_name,
                    pretrained=False,
                    num_classes=1,
                    in_chans=2,
                )

# Load onto CPU first (as the V23 evaluation cell does), then move the model to the GPU.
custom_model.load_state_dict(torch.load(fns[0], map_location=torch.device('cpu')))
custom_model.cuda();
# A freshly created model is in training mode; switch to eval mode before inference,
# as the other inference cells do.
custom_model.eval();
sub = pd.read_csv('../data/sample_submission.csv')
sub['id'] = sub['id'].apply(lambda x: Path(f'../data/test/{x}.hdf5'))
sub_ds = ValLoader(sub)
# The loader was previously built twice with identical arguments; once is enough.
vld_dl = DataLoader(
    sub_ds,
    batch_size=CFG.bs,
    shuffle=False,
    num_workers=CFG.nw,
    pin_memory=True,
    drop_last=False
)

res = predict_tta(vld_dl, custom_model)

In [None]:
# Flatten the per-batch predictions into the submission's target column.
sub['target'] = torch.cat(res).view(-1).numpy()
# Reduce each id to the bare file stem. Wrapping in Path() keeps this cell re-runnable:
# after the first run the ids are plain strings, which have no `.stem` attribute.
sub['id'] = sub['id'].apply(lambda x: Path(x).stem)

In [None]:
# Write the submission file into the current working directory, without the index column.
sub.to_csv('EXP_200_BASELINE_CASHE_V4.csv', index=False)

In [None]:
# Ad-hoc directory listing (shell escape); nothing later in the notebook reads its output.
!ls ../../val/

In [9]:
# Checkpoint paths consumed by the evaluation loop in the next cell.
fns = ["EXP_200_BASELINE_CASHE_V4/EXP_200_BASELINE_CASHE_V4_convnext_large_in22k_0_8.pth"]

In [30]:
#predict train

# Only the first checkpoint is evaluated: the loop used to end in an unconditional
# `break`, so it never ran for more than fns[0]. The slice makes that explicit and
# removes the unreachable statements that followed, including a print of `dict_res`,
# which was never assigned.
# TODO: the per-frequency-band reports built with generate_report were commented out
# here; restore them once `pred` (and `snr`) are known to be free of NaNs.
for mw in fns[:1]:
    print('_______')
    print(mw)
    df_eval = pd.read_csv('../../val_v23/v23v.csv')
    df_eval.id = df_eval.id.apply(lambda x: Path(f"../../val_v23/v23_val/{x}.pickle"))
    sub_ds = ValLoaderPickle(df_eval)
    vld_dl = DataLoader(
        sub_ds,
        batch_size=CFG.bs,
        shuffle=False,
        num_workers=CFG.nw,
        pin_memory=True,
        drop_last=False
    )

    # pretrained=False: every weight is overwritten by load_state_dict below, so
    # fetching the ImageNet checkpoint first is wasted work.
    custom_model = create_model(
                        CFG.model_name,
                        pretrained=False,
                        num_classes=1,
                        in_chans=2,
                    )

    custom_model.load_state_dict(torch.load(mw, map_location=torch.device('cpu')))
    custom_model.cuda();
    custom_model.eval();

    res = predict_tta(vld_dl, custom_model)
    # Row order matches df_eval because the loader uses shuffle=False and drop_last=False.
    df_eval['pred'] = torch.cat(res).view(-1).numpy()



_______
EXP_200_BASELINE_CASHE_V4/EXP_200_BASELINE_CASHE_V4_convnext_large_in22k_0_8.pth


  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  data[1] = data[1] / data[1].mean()
  data[0] = data[0] / data[0].mean()
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
  data[1] = data[1] / data[1].mean()
100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [02:39<00:00,  1.57it/s]


In [31]:
#df_eval = df_eval.dropna(subset='pred')
#roc_auc_score(df_eval['target'], df_eval['pred'])

In [32]:
# Inspect the label column of the evaluation frame.
df_eval['target']

0       0
1       1
2       0
3       1
4       0
       ..
7970    1
7971    0
7972    1
7973    1
7974    0
Name: target, Length: 7975, dtype: int64

In [33]:
# Inspect the predicted probabilities written by the evaluation loop.
df_eval['pred']

0       0.265349
1       0.297388
2       0.421611
3       0.999995
4       0.244278
          ...   
7970    0.999996
7971    0.300561
7972    0.246879
7973    0.999998
7974    0.324846
Name: pred, Length: 7975, dtype: float32

In [34]:
# Reduce each id to the bare file stem before saving. Wrapping in Path() keeps this cell
# re-runnable: after the first run the ids are plain strings, which have no `.stem`.
df_eval.id = df_eval.id.apply(lambda x: Path(x).stem)
df_eval.to_csv('EXP_200_BASELINE_CASHE_V4_EVAL_V23_CORRECT.csv', index=False)