# Task 1: CVD - Unimodality (Pretrained Encoder)

## Setting Environment

In [None]:
# IPython magics: enable the autoreload extension in mode 2 so edited project
# modules are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

from IPython.core.magic import register_cell_magic

@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: ignore both the magic line and the cell body.

    The function never touches ``cell``, so a cell marked with ``%%skip``
    is not executed.
    """
    return None

In [None]:
import sys
import os

# Placeholder (Ellipsis). It must be replaced with the project root path as a
# string: a later cell builds the k-folds cache path with
# root_path + "datasets/CVD/kfolds.pkl", which fails on Ellipsis.
root_path = ...

from config import device, data_folder, log_folder
import pickle
# Narrow the configured data/log folders to this task's sub-directory.
# Both now end with "/"; later cells append "/<name>" again, giving a harmless "//".
task_dir = "CVD"
data_folder+=task_dir+"/"
log_folder+=task_dir+"/"

from itertools import combinations,product
from models.unimodal import create_unimodal_model
from models.multimodal import create_multimodal_model
from training_evaluation import run_kfolds
import torch

In [None]:
from datasets.CVD.static import StaticLoader
# Load the static-modality loader from its pickle cache, building the cache on first use.
static_dataset_path = data_folder+"/static.pkl"
if os.path.exists(static_dataset_path):
    # NOTE: pickle.load executes arbitrary code; only load cache files written by this notebook.
    with open(static_dataset_path, "rb") as f:
        static = pickle.load(f)
else:
    static = StaticLoader()
    with open(static_dataset_path, "wb") as f:
        # Same protocol as the kfolds/labs/ecg caches in this notebook.
        pickle.dump(static, f, protocol=pickle.HIGHEST_PROTOCOL)

# Identifiers and labels shared by every modality below.
ids = static.get_ids()
targets_df = static.get_targets()
targets = targets_df.labels.values
targets_dict = static.get_labels_dict()
# Number of classes; used as "num_class" by every decoder configuration.
targets_num = len(targets_dict)
# Passed to the k-fold builder (presumably for subject-level grouping -- TODO confirm).
subject_ids = targets_df.subject_id.values

In [None]:
import src.utils.data_selection as stool
import pickle
# The train/valid/test folds are computed once and then reused from disk.
kfolds_fpath = root_path+"datasets/CVD/kfolds.pkl"
if not os.path.exists(kfolds_fpath):
    # No cache yet: build the folds and persist them.
    kfolds = stool.train_valid_test_kfolds_for_task1(ids, targets, subject_ids)
    with open(kfolds_fpath, "wb") as f:
        pickle.dump(kfolds, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Reuse the cached folds.
    with open(kfolds_fpath, "rb") as f:
        kfolds = pickle.load(f)

In [None]:
from models.unimodal import create_unimodal_model
from models.multimodal import create_multimodal_model

In [None]:
import numpy as np
def get_loss_weights(targets):
    """Return the relative frequency of each distinct label in ``targets``.

    The distinct labels and their weights are printed, then the weights are
    returned as a list of Python floats ordered like ``np.unique(targets)``.

    NOTE(review): the weights are class proportions, so the most frequent
    class gets the largest weight. Confirm this is intended rather than
    inverse-frequency weighting.
    """
    labels, counts = np.unique(targets, return_counts=True)
    loss_weights = (counts / counts.sum()).tolist()
    print(labels, loss_weights)
    return loss_weights

# Per-class weights computed from all task labels.
# NOTE(review): not referenced by any training configuration visible in this notebook.
LOSS_WEIGHTS = get_loss_weights(targets)

In [None]:
import numpy as np
import torch 
def data_normalization(data):
    """Standardize ``data`` along its first axis.

    Mean and standard deviation are computed over axis/dim 0, and the result
    is ``(data - mean) / (std + 1e-8)``. The epsilon avoids division by zero
    for constant columns.

    Args:
        data: a ``np.ndarray`` or ``torch.Tensor`` (subclasses accepted).

    Returns:
        The standardized data, in the same container type as the input.

    Raises:
        TypeError: if ``data`` is neither an ndarray nor a Tensor.
    """
    if isinstance(data, np.ndarray):
        _mean = np.mean(data, axis=0)
        _std = np.std(data, axis=0)
    elif isinstance(data, torch.Tensor):
        _mean = torch.mean(data, dim=0)
        # torch.std defaults to the Bessel-corrected estimator, np.std to the
        # population one. Use the population estimator here too, so both
        # containers normalize identically and a single-row tensor gives 0
        # instead of NaN.
        _std = torch.std(data, dim=0, unbiased=False)
    else:
        raise TypeError(
            "data_normalization expects np.ndarray or torch.Tensor, got %s"
            % type(data).__name__
        )
    return (data - _mean) / (_std + 1e-8)

## Static

In [None]:
# Choose which static feature set to train on.
static_type = "extended" # core or extended
static_dataset = static.get_dataset(type=static_type)
# Dataset metadata stored with the training parameters.
static_dataset_param = dict(
    path=static_dataset_path,
    num=len(static_dataset),
    type=static_type,
)
# Any value other than "core" falls back to the extended feature list.
static_feats = static.feats_core if static_type == "core" else static.feats_extended

In [None]:
from blocks.mlp import  MLP, MLPDecoder

# Hyper-parameters for the static (tabular) unimodal model.
# NOTE: EMBED_DIM, LR, MLP_param, MLP_decoder_param, train_param, model and log
# are notebook globals that every later training cell rebinds.
FEATS_NUM = len(static_feats)
EMBED_DIM = 128
LR = 0.001
ENCODER_DROPOUT = 0.1
DECODER_DROPOUT = 0.5

# Encoder: MLP whose input width is the number of static features.
MLP_param = {
    "in_dim": FEATS_NUM,
    "hidden_dim": [EMBED_DIM, EMBED_DIM],
    "drop_prob": ENCODER_DROPOUT
}

# Decoder: MLPDecoder taking the EMBED_DIM-wide embedding and producing
# targets_num classes.
MLP_decoder_param = {
    "in_dim": EMBED_DIM,
    "num_class": targets_num,
    "hidden_dim": [EMBED_DIM//2],
    "drop_prob": DECODER_DROPOUT
}

# Training configuration: models are referenced by class name, with their
# parameter dicts alongside.
train_param = {
    "DATASET": static_dataset_param,
    "MODEL_NAME": "static_unimodal",
    "ENCODER_PARAM": [MLP_param],
    "ENCODER_MODEL": [MLP.__name__],
    "DECODER_PARAM": MLP_decoder_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "BATCH_SIZE": 128,
    "LR": LR,
    "MAX_EPOCHS": 20,
    "OPTIMIZER": "Adam"
}

# Build the encoder/decoder model from the names and parameters above.
model = create_unimodal_model(train_param["ENCODER_MODEL"], 
                            train_param["ENCODER_PARAM"], 
                            train_param["DECODER_MODEL"], 
                            train_param["DECODER_PARAM"], device)

# Run the project's k-fold routine on the full folds and keep its returned log.
log = run_kfolds(train_param, model, static_dataset, kfolds, log_folder=log_folder)

## Labs

In [None]:
from datasets.CVD.labs import LabsLoader
# The labs loader is built once and afterwards restored from its pickle cache.
labs_dataset_path = data_folder+"/labs.pkl"
if not os.path.exists(labs_dataset_path):
    labs = LabsLoader(ids, targets)
    with open(labs_dataset_path, "wb") as f:
        pickle.dump(labs, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open(labs_dataset_path, "rb") as f:
        labs = pickle.load(f)

# Keep only the entries the loader marks as valid.
labs_dataset = labs.get_dataset(only_valid=True)
labs_ids = labs.get_ids(only_valid=True)

# Dataset metadata stored with the training parameters.
labs_dataset_param = dict(
    path=labs_dataset_path,
    only_valid=True,
    num=len(labs_dataset),
)

# Restrict the global folds to the ids that have valid labs.
labs_kfolds = stool.get_sub_kfolds(ids, labs_ids, kfolds)

In [None]:
from blocks.embedding import TimeWinEmbedding
from blocks.rnn import LSTM
from blocks.mlp import  MLP, MLPDecoder
from datasets.collate_fun import CreateCustomDataset, time_win_tokens_batch

# Vocabulary sizes for lab values and sources: the loader's "None" label index plus one
# (assumes the None label is the highest index -- TODO confirm against LabsLoader).
labs_val_vsize = labs.values_None_label+1
labs_src_vsize = labs.sources_None_label+1 

# Hyper-parameters for the labs unimodal model (these rebind the notebook globals).
EMBED_DIM = 512
LR = 7.5e-05
ENCODER_DROPOUT = 0.05
DECODER_DROPOUT = 0.1

# Encoder stage 1: TimeWinEmbedding over labs.win_num time windows.
TWEmbed_param = {
    "value_vocab_size":labs_val_vsize, 
    "source_vocab_size":labs_src_vsize, 
    "win_size":labs.win_num, 
    "embed_dim":EMBED_DIM, 
    "device":device, 
    "temporal_weighted":False, 
    "shared_embedding":True
}

# Encoder stage 2: bidirectional LSTM. hidden_size is EMBED_DIM//2, so the two
# directions together give EMBED_DIM features, matching the MLP's in_dim below.
LSTM_param = {
    "input_size": EMBED_DIM,
    "hidden_size": EMBED_DIM//2,
    "num_layers": 2,
    "bidirectional":True
}

# Encoder stage 3: MLP on top of the LSTM output.
MLP_param = {
    "in_dim": EMBED_DIM,
    "hidden_dim": [EMBED_DIM, EMBED_DIM],
    "drop_prob": ENCODER_DROPOUT
}

# Decoder: MLPDecoder producing targets_num classes.
MLP_decoder_param = {
    "in_dim": EMBED_DIM,
    "num_class": targets_num,
    "hidden_dim": [EMBED_DIM//2],
    "drop_prob": DECODER_DROPOUT
}

# Two identical collate specs (presumably one per token stream, values and
# sources -- TODO confirm against the labs dataset item layout).
collate_fn_params = [
    {"name":time_win_tokens_batch.__name__, "param":{"accum":True,"onset":False}},
    {"name":time_win_tokens_batch.__name__, "param":{"accum":True,"onset":False}},
]

# Training configuration: models are referenced by class name, with their
# parameter dicts alongside.
train_param = {
    "DATASET": labs_dataset_param,
    "MODEL_NAME": "labs_unimodal",
    "ENCODER_PARAM": [TWEmbed_param, LSTM_param, MLP_param],
    "ENCODER_MODEL": [TimeWinEmbedding.__name__, LSTM.__name__, MLP.__name__],
    "DECODER_PARAM": MLP_decoder_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "BATCH_SIZE": 128,
    "LR": LR,
    "MAX_EPOCHS": 20,
    "OPTIMIZER": "Adam",
    "COLLATE_FN_PARAMS": collate_fn_params
}

# Build the encoder/decoder model from the names and parameters above.
model = create_unimodal_model(train_param["ENCODER_MODEL"], 
                            train_param["ENCODER_PARAM"], 
                            train_param["DECODER_MODEL"], 
                            train_param["DECODER_PARAM"], device)

# Collate callable built from the specs above, then the k-fold run on the labs-only folds.
collate_batch = CreateCustomDataset(len(collate_fn_params), train_param["COLLATE_FN_PARAMS"])
log = run_kfolds(train_param, model, labs_dataset, labs_kfolds, collate_fun=collate_batch, log_folder=log_folder)

## ECG

In [None]:
from datasets.CVD.ecg import ECGLoader
# The ECG loader is built once and afterwards restored from its pickle cache.
ecg_dataset_path = data_folder+"/ecg.pkl"
if not os.path.exists(ecg_dataset_path):
    ecg = ECGLoader(ids, targets)
    with open(ecg_dataset_path, "wb") as f:
        pickle.dump(ecg, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open(ecg_dataset_path, "rb") as f:
        ecg = pickle.load(f)

# Restrict the global folds to the ids that have valid ECG data.
ecg_ids = ecg.get_ids(only_valid=True)
ecg_kfolds = stool.get_sub_kfolds(ids, ecg_ids, kfolds)
# Translate each fold's train/valid/test indices with ecg.get_recordi2ecgi
# (record index -> ECG indices, judging by the name -- TODO confirm).
ecg_sig_kfolds = [
    [
        ecg.get_recordi2ecgi(_train_idx, only_valid=True),
        ecg.get_recordi2ecgi(_valid_idx, only_valid=True),
        ecg.get_recordi2ecgi(_test_idx, only_valid=True),
    ]
    for _train_idx, _valid_idx, _test_idx in ecg_kfolds
]

#### Note(statement)

In [None]:
# "tokens" view of the ECG data, plus the metadata stored with the training parameters.
ecg_note_dataset = ecg.get_ecg_dataset(type="tokens")
ecg_note_dataset_param = dict(path=ecg_dataset_path, num=len(ecg_note_dataset))

In [None]:
from blocks.embedding import Embedding
from blocks.mlp import MLP, MLPDecoder
from datasets.collate_fun import CreateCustomDataset, tokens_batch
# Vocabulary size for ECG statements: the loader's "None" label index plus one
# (assumes the None label is the highest index -- TODO confirm against ECGLoader).
ecg_vocab_size = int(ecg.ecg_statement_None_label+1)
# Hyper-parameters for the ECG note model (these rebind the notebook globals).
EMBED_DIM = 512
LR = 0.0005

# Encoder stage 1: token embedding.
Embedding_param = {
    "vocab_size": ecg_vocab_size,
    "embed_size": EMBED_DIM,
}

# Encoder stage 2: MLP on top of the embedding.
MLP_param = {
    "in_dim": EMBED_DIM,
    "hidden_dim": [EMBED_DIM, EMBED_DIM],
    "drop_prob": 0.25
}

# Decoder: MLPDecoder producing targets_num classes.
MLP_decoder_param = {
    "in_dim": EMBED_DIM,
    "num_class": targets_num,
    "hidden_dim": [EMBED_DIM//2],
    "drop_prob": 0.25
}

# A single collate spec for the token input.
collate_fn_params = [{"name": tokens_batch.__name__, "param": {"accum": False, "onset": True}}]

# Training configuration: models are referenced by class name, with their
# parameter dicts alongside.
train_param = {
    "DATASET": ecg_note_dataset_param,
    "MODEL_NAME": "ecg_note_unimodal",
    "ENCODER_PARAM": [Embedding_param, MLP_param],
    "ENCODER_MODEL": [Embedding.__name__, MLP.__name__],
    "DECODER_PARAM": MLP_decoder_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "BATCH_SIZE": 128,
    "LR": LR,
    "MAX_EPOCHS": 20,
    "OPTIMIZER": "Adam",
    "COLLATE_FN_PARAMS": collate_fn_params
}

# Build the encoder/decoder model from the names and parameters above.
model = create_unimodal_model(train_param["ENCODER_MODEL"],
                            train_param["ENCODER_PARAM"],
                            train_param["DECODER_MODEL"],
                            train_param["DECODER_PARAM"], device)

# Collate callable built from the spec above, then the k-fold run on the
# ECG-level folds (ecg_sig_kfolds).
collate_batch = CreateCustomDataset(len(collate_fn_params), train_param["COLLATE_FN_PARAMS"])   
log = run_kfolds(train_param, model, ecg_note_dataset, ecg_sig_kfolds, log_folder=log_folder, collate_fun=collate_batch)

#### Signal

In [None]:
# "sig" view of the ECG data, plus the metadata stored with the training parameters.
ecg_sig_dataset = ecg.get_ecg_dataset(type="sig")
ecg_sig_dataset_param = dict(path=ecg_dataset_path, num=len(ecg_sig_dataset))

In [None]:
from blocks.resnet import ResNet1d
from blocks.mlp import MLP, MLPDecoder
# Hyper-parameters for the ECG signal model (these rebind the notebook globals).
EMBED_DIM = 640
LR = 0.0005
ECG_CHANNEL = 12
SIG_LEN = 640
# One (filters, sequence length) pair per ResNet block; zipped into blocks_dim below.
FILTER_SIZE = [64, 128, 196, 256, 320]
SEQ_LEN = [640, 320, 160, 40, 20]

# Encoder stage 1: 1-D ResNet over an (ECG_CHANNEL, SIG_LEN) input.
ResNet_param = {
    "input_dim": (ECG_CHANNEL, SIG_LEN),
    "blocks_dim": list(zip(FILTER_SIZE, SEQ_LEN)),
    "kernel_size": 5,
    "dropout_rate": 0.3
}

# Encoder stage 2: MLP whose in_dim is last filters * last sequence length
# (presumably the flattened ResNet output -- TODO confirm against ResNet1d).
MLP_param = {
    "in_dim": FILTER_SIZE[-1] * SEQ_LEN[-1],
    "hidden_dim": [EMBED_DIM],
    "drop_prob": 0.1
}

# Decoder: in_dim follows the encoder MLP's last hidden width.
MLP_decoder_param = {
    "in_dim": MLP_param["hidden_dim"][-1],
    "num_class": targets_num,
    "hidden_dim": [EMBED_DIM//2],
    "drop_prob": 0.1
}

# NOTE(review): MAX_EPOCHS is 1 here while the other training cells use 20 --
# confirm this is not a leftover debug setting.
train_param = {
    "DATASET": ecg_sig_dataset_param,
    "MODEL_NAME": "ecg_sig_unimodal",
    "ENCODER_PARAM": [ResNet_param, MLP_param],
    "ENCODER_MODEL": [ResNet1d.__name__, MLP.__name__],
    "DECODER_PARAM": MLP_decoder_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "BATCH_SIZE": 128,
    "LR": LR,
    "MAX_EPOCHS": 1,
    "OPTIMIZER": "Adam"
}

# Build the encoder/decoder model from the names and parameters above.
model = create_unimodal_model(train_param["ENCODER_MODEL"],
                                train_param["ENCODER_PARAM"],
                                train_param["DECODER_MODEL"],
                                train_param["DECODER_PARAM"], device)

# K-fold run on the ECG-level folds (ecg_sig_kfolds); no custom collate function is passed.
log = run_kfolds(train_param, model, ecg_sig_dataset, ecg_sig_kfolds, log_folder=log_folder)

#### Feats

In [None]:
# "feats" view of the ECG data, plus the metadata stored with the training parameters.
ecg_feats_dataset = ecg.get_ecg_dataset(type="feats")
ecg_feats_dataset_param = dict(path=ecg_dataset_path, num=len(ecg_feats_dataset))

In [None]:
from blocks.mlp import MLP, MLPDecoder

# Hyper-parameters for the ECG features model (these rebind the notebook globals).
# The input width is the number of columns in ecg.ecg_feats.
FEATS_NUM = ecg.ecg_feats.shape[1]
EMBED_DIM = 64
LR = 0.005

# Encoder: MLP over the feature vector.
MLP_param = {
    "in_dim": FEATS_NUM,
    "hidden_dim": [EMBED_DIM],
    "drop_prob": 0.05
}

# Decoder: in_dim follows the encoder MLP's last hidden width.
MLP_decoder_param = {
    "in_dim": MLP_param["hidden_dim"][-1],
    "num_class": targets_num,
    "hidden_dim": [EMBED_DIM//2],
    "drop_prob": 0.05
}

# Training configuration: models are referenced by class name, with their
# parameter dicts alongside.
train_param = {
    "DATASET": ecg_feats_dataset_param,
    "MODEL_NAME": "ecg_feats_unimodal",
    "ENCODER_PARAM": [MLP_param],
    "ENCODER_MODEL": [MLP.__name__],
    "DECODER_PARAM": MLP_decoder_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "BATCH_SIZE": 128,
    "LR": LR,
    "MAX_EPOCHS": 20,
    "OPTIMIZER": "Adam"
    # "LOSS_WEIGHT": [0.1,0.9]
}

# Build the encoder/decoder model from the names and parameters above.
model = create_unimodal_model(train_param["ENCODER_MODEL"],
                                train_param["ENCODER_PARAM"],
                                train_param["DECODER_MODEL"],
                                train_param["DECODER_PARAM"], device)


# ecg_feats_dataset comes from ecg.get_ecg_dataset(...), like the tokens, sig
# and fusion views, which are all indexed with ecg_sig_kfolds. The record-level
# ecg_kfolds used here before would select the wrong rows, so this now uses
# ecg_sig_kfolds as well.
# NOTE(review): confirm against ECGLoader.get_ecg_dataset(type="feats").
log = run_kfolds(train_param, model, ecg_feats_dataset, ecg_sig_kfolds, log_folder=log_folder)

#### Fusion

In [None]:
import numpy as np
from datasets.CVD.ecg import ECGFusionDataset

# "fusion" view of the ECG data, plus the metadata stored with the training parameters.
ecg_fusion_dataset = ecg.get_ecg_dataset(type="fusion")

ecg_fusion_dataset_param = dict(path=ecg_dataset_path, num=len(ecg_fusion_dataset))

In [None]:
from models.multimodal import BiModalAttn
from blocks.mlp import MLPDecoder, MLP
from datasets.collate_fun import CreateCustomDataset, tokens_batch, basic_collate_fn
import math

# Hyper-parameters for the ECG fusion model (these rebind the notebook globals).
# k_models is forwarded to create_multimodal_model; its meaning is defined there.
k_models = True
EMBED_DIM = 256
BATCH_SIZE = 128
LR = 0.00005

# Indices of the encoders to fuse: all three entries of ENCODERS_PARAM below.
encoders_i = [0,1,2]

# Pretrained unimodal encoders. The "..." in each model_path is a placeholder
# for the run directory and must be filled in before this cell can run.
# NOTE(review): earlier cells in this notebook use EMBED_DIM = 64 for the feats
# model and 512 for the note model -- confirm the out_dim values below match the
# checkpoints actually loaded.
ecg_sig_model_param = {
    "model_path": log_folder+"/ecg_sig_unimodal/.../",
    "out_dim": 640
}

ecg_feats_model_param = {
    "model_path": log_folder+"/ecg_feats_unimodal/.../",
    "out_dim": 512
}

ecg_note_model_param = {
    "model_path": log_folder+"/ecg_note_unimodal/.../",
    "out_dim": 256
}

# Interaction model (BiModalAttn) configuration.
BiModelAttn_param = {
    "embed_size": EMBED_DIM,
    "num_blocks": 1,
    "num_heads": 64,
    "drop_prob": 0.1,
    "fusion_type": "add"
}

# Shared layer configuration (EMBED_DIM in, EMBED_DIM hidden).
shared_layer_param = {
    "in_dim": EMBED_DIM,
    "hidden_dim": [EMBED_DIM],
    "drop_prob": 0.05,
    "BatchNorm": False
}

# Decoder input width: EMBED_DIM per encoder plus EMBED_DIM per unordered
# encoder pair (3 + 3 = 6 blocks for three encoders).
DECODER_IN_DIM = int(EMBED_DIM*len(encoders_i) + EMBED_DIM*math.comb(len(encoders_i), 2))

MLP_decoder_param = {
    "in_dim": DECODER_IN_DIM,
    "num_class": targets_num,
    "hidden_dim": [DECODER_IN_DIM//2],
    "drop_prob": 0.1
}

# One collate spec per input, in the same order as ENCODERS_PARAM (sig, feats, note).
collate_fn_params = [
    {"name": basic_collate_fn.__name__},
    {"name": basic_collate_fn.__name__},
    {"name": tokens_batch.__name__, "param": {"accum": False, "onset": True}}
]

# Training configuration consumed by create_multimodal_model and run_kfolds.
train_param = {
    "DATASET": ecg_fusion_dataset_param,
    "MODEL_NAME": "ecg_fusion_unimodal",
    "ENCODERS_I": encoders_i,
    "ENCODERS_PARAM": [ecg_sig_model_param, ecg_feats_model_param, ecg_note_model_param],
    "INTER_MODEL": BiModalAttn.__name__,
    "INTER_MODEL_PARAM": BiModelAttn_param,
    "SHARED_LAYER_PARAM": shared_layer_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "DECODER_PARAM": MLP_decoder_param,
    "EMBED_DIM": EMBED_DIM,
    "BATCH_SIZE": BATCH_SIZE,
    "LR": LR,
    "MAX_EPOCHS": 20,
    "OPTIMIZER": "Adam",
    "COLLATE_FN_PARAMS": collate_fn_params
}

# Build the fusion model from the configuration above.
model = create_multimodal_model(train_param, device, k_models=k_models)

# Collate callable built from the specs above, then the k-fold run on the
# ECG-level folds (ecg_sig_kfolds).
collate_batch = CreateCustomDataset(len(collate_fn_params), train_param["COLLATE_FN_PARAMS"])
log = run_kfolds(train_param, model, ecg_fusion_dataset, ecg_sig_kfolds, log_folder=log_folder, collate_fun = collate_batch)

#### Hierarchical Time-Aware Fusion

In [None]:
# Valid-only dataset from ecg.get_dataset, plus the metadata stored with the training parameters.
ecg_dataset = ecg.get_dataset(only_valid=True)
ecg_dataset_param = dict(
    path=ecg_dataset_path,
    only_valid=True,
    num=len(ecg_dataset),
)
# Remove the notebook's reference to the loader. Earlier ECG cells that use
# `ecg` cannot be re-run after this without reloading it.
del ecg

In [None]:
from torch import nn
from datasets.collate_fun import CreateCustomDataset, ecg_bags_batch
from blocks.mlp import MLP, MLPDecoder
from models.unimodal import TemporalPooling
from blocks.resnet import ResNet1d

# Hyper-parameters for the hierarchical time-aware ECG model (these rebind the notebook globals).
WIN_NUM = 5
EMBED_DIM = 256
LR = 0.00001
BATCH_SIZE = 32
# Number of per-fold checkpoints (model_0.pth ... model_{k-1}.pth) to wrap;
# None builds a single model instead.
k_models = 5

# TemporalPooling wraps a pretrained ECG fusion model stored under model_path.
TemporalPooling_param = {
    "model_path": log_folder+"/ecg_fusion_unimodal/250522013858/",
    "win_size": WIN_NUM,
    "device": device
}


def _load_embeds_model(model_i):
    """Load fusion checkpoint ``model_<model_i>.pth`` with its last two decoder layers removed.

    The last two entries of ``shared_decoder.mlp`` are replaced by
    ``nn.Identity`` so the model returns the preceding layer's output
    instead of its final output.
    """
    # NOTE: weights_only=False unpickles arbitrary objects; only load
    # checkpoints produced by this project.
    # map_location lets a checkpoint saved on another device load on `device`.
    _loaded = torch.load(
        TemporalPooling_param["model_path"] + "model_%i.pth" % (model_i),
        map_location=device,
        weights_only=False,
    )
    for _layer_i in (-1, -2):
        _loaded.shared_decoder.mlp[_layer_i] = nn.Identity()
    return _loaded


# Checkpoint 0 is only used to read the embedding width; mlp[-6] is the layer
# whose out_features is taken as that width.
embeds_model = _load_embeds_model(0)
embeds_dim = embeds_model.shared_decoder.mlp[-6].out_features
TemporalPooling_param["embeds_model"] = embeds_model

# ResNet over an (embeds_dim, WIN_NUM) input; the single block keeps the
# window length and halves the channel count.
ResNet_param = {
    "input_dim": (embeds_dim, WIN_NUM),
    "blocks_dim": [(embeds_dim//2, WIN_NUM)],
    "kernel_size": 3,
    "dropout_rate": 0.3
}

# MLP in_dim is last block filters * last block length.
MLP_param = {
    "in_dim": ResNet_param["blocks_dim"][-1][0] * ResNet_param["blocks_dim"][-1][1],
    "hidden_dim": [EMBED_DIM],
    "drop_prob": 0.05
}

# Decoder: MLPDecoder producing targets_num classes.
MLP_decoder_param = {
    "in_dim": EMBED_DIM,
    "num_class": targets_num,
    "hidden_dim": [EMBED_DIM//2],
    "drop_prob": 0.1
}

collate_fn_params = [{"name":ecg_bags_batch.__name__}]

# Training configuration: models are referenced by class name, with their
# parameter dicts alongside.
train_param = {
    "DATASET": ecg_dataset_param,
    "MODEL_NAME": "ecg_unimodal",
    "ENCODER_PARAM": [TemporalPooling_param, ResNet_param, MLP_param],
    "ENCODER_MODEL": [TemporalPooling.__name__, ResNet1d.__name__, MLP.__name__],
    "DECODER_PARAM": MLP_decoder_param,
    "DECODER_MODEL": MLPDecoder.__name__,
    "BATCH_SIZE": BATCH_SIZE,
    "LR": LR,
    "MAX_EPOCHS": 20,
    "OPTIMIZER": "Adam",
    "COLLATE_FN_PARAMS": collate_fn_params
}

model = []
if k_models is not None:
    # One model per checkpoint: swap that checkpoint into the shared parameter
    # dict before building.
    for mi in range(k_models):
        embeds_model = _load_embeds_model(mi)
        TemporalPooling_param["embeds_model"] = embeds_model

        _model = create_unimodal_model(train_param["ENCODER_MODEL"],
                                    train_param["ENCODER_PARAM"],
                                    train_param["DECODER_MODEL"],
                                    train_param["DECODER_PARAM"], device)
        model.append(_model)
else:
    # Single model built on checkpoint 0, which is already in TemporalPooling_param.
    model = create_unimodal_model(train_param["ENCODER_MODEL"],
                                    train_param["ENCODER_PARAM"],
                                    train_param["DECODER_MODEL"],
                                    train_param["DECODER_PARAM"], device)

# Remove the live model object from the parameter dict, which train_param still
# references (presumably so run_kfolds can serialize the parameters -- TODO confirm).
del TemporalPooling_param["embeds_model"]

collate_batch = CreateCustomDataset(len(collate_fn_params), train_param["COLLATE_FN_PARAMS"])

# K-fold run on the record-level folds (ecg_kfolds).
log = run_kfolds(train_param, model, ecg_dataset, ecg_kfolds, collate_fun=collate_batch, log_folder=log_folder)