In [1]:
from pathlib import Path
import sys
sys.path.append(str(Path.cwd() / "../code"))
from os import environ
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import logging
from sklearn.preprocessing import LabelEncoder

from training import training_loop
from utils import balanced_data_shuffle, get_df_raw_data
from models import LinearLayerShared
## Data path ##
# Resolve the "DATA" folder that sits next to the current working directory.
# DATA_PATH is bound once, directly as a string, instead of being a Path
# first and a str afterwards.
DATA_PATH = str((Path.cwd().parent / "DATA").resolve())
print(f"Data path: {DATA_PATH}")
logging.basicConfig(level=logging.INFO)

# set deterministic behavior
import random  # only needed here, to seed Python's built-in generator

seed = 53498298
# Seed every random number generator in play: Python's `random` was
# previously left unseeded, so any helper relying on it was not reproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Export the cuBLAS workspace setting before turning on deterministic mode so
# it is already in the environment when CUDA libraries are first used.
environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.use_deterministic_algorithms(True)

Data path: C:\Users\Cyril\Desktop\Code\MIPLab-TeamCEE-DeepLearningforBiomed\DATA


## Config

In [2]:
# Run configuration: one flat mapping, read by the cells below and also handed
# to training_loop as a whole.
# NOTE(review): the saved training output further down reports 50 epochs while
# "epochs" is 30 here — confirm which value the stored results correspond to.
config = dict(
    # -- data: how the training frame is split into train / validation --
    stratify=True,
    validation_split=0.2,
    # -- general: run length, batching, learning rate, stopping --
    epochs=30,
    batch_size=32,
    lr=1e-4,
    use_scheduler=False,
    do_early_stopping=False,
    patience=10,
    best_loss=10,
    # -- model: layer sizes and regularisation --
    d_model_input=400,
    d_model_intermediate=[1024],
    d_model_task_output=8,
    d_model_fingerprint_output=None,  # needs to be determined from data
    dropout=0.1,
    attention_dropout=0.1,
    num_heads=1,
    # -- optimizer: loss weights (presumably SI / TD terms — confirm in
    #    training.py) and AdamW weight decay --
    lambda_si=0.5,
    lambda_td=0.5,
    weight_decay=1,
)

In [3]:
# Identifiers passed to get_df_raw_data to select which subjects are loaded
# (95 entries). The order is unchanged; only the layout is compacted to five
# values per row.
# fmt: off
IDs = [
    100307, 117122, 131722, 153025, 211720,
    100408, 118528, 133019, 154734, 212318,
    101107, 118730, 133928, 156637, 214423,
    101309, 118932, 135225, 159340, 221319,
    101915, 120111, 135932, 160123, 239944,
    103111, 122317, 136833, 161731, 245333,
    103414, 122620, 138534, 162733, 280739,
    103818, 123117, 139637, 163129, 298051,
    105014, 123925, 140925, 176542, 366446,
    105115, 124422, 144832, 178950, 397760,
    106016, 125525, 146432, 188347, 414229,
    108828, 126325, 147737, 189450, 499566,
    110411, 127630, 148335, 190031, 654754,
    111312, 127933, 148840, 192540, 672756,
    111716, 128127, 149337, 196750, 751348,
    113619, 128632, 149539, 198451, 756055,
    113922, 129028, 149741, 199655, 792564,
    114419, 130013, 151223, 201111, 856766,
    115320, 130316, 151526, 208226, 857263,
]
# fmt: on


In [4]:
def _show_df_distribution(df):
    """Print a short size summary of a dataframe.

    Writes three lines to stdout (row count, number of distinct values in
    the ``subject_id`` column, number of distinct values in the ``task``
    column) followed by a separator line of 50 asterisks.

    Args:
        df: dataframe exposing ``subject_id`` and ``task`` columns.

    Returns:
        None.
    """
    print(f"Number of samples: {len(df)}")
    print(f"Unique subjects: {df['subject_id'].nunique()}")
    print(f"Unique tasks: {df['task'].nunique()}")
    print(50 * "*")

## Loading train and test dataframes for all subjects, splitting and label encoding

In [5]:
# Load the train and test dataframes for every listed ID; the helper receives
# its own copy of the ID list.
data_df_train, data_df_test = get_df_raw_data(DATA_PATH, list(IDs))

# Carve a validation part out of the training data as configured.
train_dataframe, valid_dataframe = balanced_data_shuffle(
    data_df_train,
    stratify=config["stratify"],
    val_frac=config["validation_split"],
)

# Class counts, taken from the full (unsplit) training dataframe.
NUM_SUBJECTS = len(data_df_train["subject_id"].unique())
print(f"Number of subjects: {NUM_SUBJECTS}")
NUM_TASKS = data_df_train["task"].nunique()
print(f"Number of tasks: {NUM_TASKS}")

#
###-------------------------------------------------------------------------------------------------------------------
#         label encoding
###-------------------------------------------------------------------------------------------------------------------
# Both encoders are fitted on the full training dataframe, so every subject
# and task value seen there gets an integer code.
enc_labels = LabelEncoder().fit(data_df_train["subject_id"].tolist())
enc_tasks = LabelEncoder().fit(data_df_train["task"].tolist())

# Encode subjects, then tasks, for the (train, test) pair of frames.
enc_train_label_encodings, enc_test_label_encodings = (
    enc_labels.transform(frame["subject_id"].tolist())
    for frame in (train_dataframe, data_df_test)
)
enc_train_task_encodings, enc_test_task_encodings = (
    enc_tasks.transform(frame["task"].tolist())
    for frame in (train_dataframe, data_df_test)
)

# Store the integer codes next to the raw values.
train_dataframe["enc_label_id"] = enc_train_label_encodings
train_dataframe["enc_task"] = enc_train_task_encodings
data_df_test["enc_label_id"] = enc_test_label_encodings
data_df_test["enc_task"] = enc_test_task_encodings

# Sanity check: despite its name, `overlap_set` holds the set difference
# (train subjects that never appear in the test frame).
print("Subjects present in train set but not in test set:")
overlap_set = set(train_dataframe["subject_id"].unique()).difference(
    data_df_test["subject_id"].unique()
)
print(overlap_set)
if overlap_set:
    print("WARNING: subjects present in train set but not in test set")

print("Train set:")
_show_df_distribution(train_dataframe)
print("Test set:")
_show_df_distribution(data_df_test)

Number of subjects: 95
Number of tasks: 8
Subjects present in train set but not in test set:
set()
Train set:
Number of samples: 608
Unique subjects: 95
Unique tasks: 8
**************************************************
Test set:
Number of samples: 758
Unique subjects: 95
Unique tasks: 8
**************************************************


## Dataloaders

In [6]:
def _frame_to_dataset(frame):
    """Build a TensorDataset from one encoded dataframe.

    The ``mat`` column is stacked into a single float32 array; the
    ``enc_label_id`` and ``enc_task`` columns become the two target tensors.
    Returned tensors are ordered (features, subject codes, task codes).
    """
    features = np.array(frame["mat"].tolist()).astype(np.float32)
    subject_codes = frame["enc_label_id"].to_numpy()
    task_codes = frame["enc_task"].to_numpy()
    return TensorDataset(
        torch.tensor(features),
        torch.tensor(subject_codes),
        torch.tensor(task_codes),
    )


# Training batches are reshuffled; test batches keep their order.
train_dataset = _frame_to_dataset(train_dataframe)
train_loader = DataLoader(
    train_dataset, shuffle=True, batch_size=config["batch_size"]
)

test_dataset = _frame_to_dataset(data_df_test)
test_loader = DataLoader(
    test_dataset, shuffle=False, batch_size=config["batch_size"]
)

# The validation loader only exists when the split produced a validation frame.
valid_loader = None
if valid_dataframe is not None:
    # Encode with the encoders already fitted on the training data.
    enc_valid_label_encodings = enc_labels.transform(
        valid_dataframe["subject_id"].tolist()
    )
    enc_valid_task_encodings = enc_tasks.transform(
        valid_dataframe["task"].tolist()
    )
    valid_dataframe["enc_label_id"] = enc_valid_label_encodings
    valid_dataframe["enc_task"] = enc_valid_task_encodings

    # Set difference: validation subjects that are missing from the train frame.
    print("Subjects present in validation set but not in train set:")
    overlap_set = set(valid_dataframe["subject_id"].unique()).difference(
        train_dataframe["subject_id"].unique()
    )
    print(overlap_set)
    if overlap_set:
        print(
            "WARNING: subjects present in validation set but not in train set"
        )

    valid_dataset = _frame_to_dataset(valid_dataframe)
    valid_loader = DataLoader(
        valid_dataset, shuffle=False, batch_size=config["batch_size"]
    )
    print("Validation set:")
    _show_df_distribution(valid_dataframe)


Subjects present in validation set but not in train set:
set()
Validation set:
Number of samples: 152
Unique subjects: 76
Unique tasks: 8
**************************************************


## Training

In [7]:
###-------------------------------------------------------------------------------------------------------------------
#         Model
###-------------------------------------------------------------------------------------------------------------------

# Devices visible to torch: CPU first, then each CUDA device by index.
device_list = [
    "cpu",
    *(f"cuda:{idx}" for idx in range(torch.cuda.device_count())),
]
# With CUDA available the last listed device is used; otherwise the CPU.
device = device_list[-1] if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Output sizes come from the class counts measured on the training data.
model = LinearLayerShared(
    input_size=config["d_model_input"],
    intermediate_size=config["d_model_intermediate"],
    output_size_tasks=NUM_TASKS,
    output_size_subjects=NUM_SUBJECTS,
    dropout=config["dropout"],
)
model = model.to(device)
wandb_run_name = "LinearShared_best_repro_W_VAL"


###-------------------------------------------------------------------------------------------------------------------
#         training
###-------------------------------------------------------------------------------------------------------------------

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    weight_decay=config["weight_decay"],
    lr=config["lr"],
)
# The scheduler is always built, but only handed to the loop when enabled in
# the config (see the `scheduler=` argument below).
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, gamma=0.1, step_size=20
)

# Positional arguments are kept in their original order because the
# signature of training_loop is defined in the project's training module.
training_loop(
    config["epochs"],
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    device,
    config,
    scheduler=scheduler if config["use_scheduler"] else None,
    save_model=False,
    save_attention_weights=False,
    test_loader=test_loader,
    run_name=wandb_run_name,  # presumably the W&B run name — confirm in training.py
    job_name="LinearShared",
    use_deeplift=True,
    use_early_stopping=config["do_early_stopping"],
)


Using device: cuda:0
Using cuda:0


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mc-achard[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch: 1/50 - loss_total: 3.3018 - acc: SI 2.14% / TD 35.03%
 - val-loss_total: 3.1710 - val-acc: SI 1.88% / TD 60.42% - val-f1: SI 0.0161 / TD 0.5466 - (1.59s/epoch)
Epoch: 2/50 - loss_total: 2.9354 - acc: SI 17.76% / TD 76.48%
 - val-loss_total: 2.9712 - val-acc: SI 2.50% / TD 80.21% - val-f1: SI 0.0232 / TD 0.7546 - (0.96s/epoch)
Epoch: 3/50 - loss_total: 2.6332 - acc: SI 30.76% / TD 85.36%
 - val-loss_total: 2.6970 - val-acc: SI 10.42% / TD 84.79% - val-f1: SI 0.0767 / TD 0.8208 - (0.98s/epoch)
Epoch: 4/50 - loss_total: 2.1215 - acc: SI 54.61% / TD 90.62%
 - val-loss_total: 2.2663 - val-acc: SI 25.00% / TD 83.75% - val-f1: SI 0.1769 / TD 0.8076 - (0.99s/epoch)
Epoch: 5/50 - loss_total: 1.3998 - acc: SI 80.92% / TD 95.39%
 - val-loss_total: 1.9589 - val-acc: SI 45.42% / TD 86.46% - val-f1: SI 0.3792 / TD 0.8421 - (0.98s/epoch)
Epoch: 6/50 - loss_total: 0.8155 - acc: SI 97.70% / TD 98.68%
 - val-loss_total: 1.4635 - val-acc: SI 73.54% / TD 91.25% - val-f1: SI 0.6495 / TD 0.9074 - (0.

               activations. The hooks and attributes will be removed
            after the attribution is finished


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Epoch/Epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Test/acc_si,▁
Test/acc_td,▁
Test/f1_si,▁
Test/f1_td,▁
Test/loss_si,▁
Test/loss_td,▁
Test/total_loss,▁
Train/Epoch-acc_si,▁▂▃▅████████████████████████████████████
Train/Epoch-acc_td,▁▅▆▇████████████████████████████████████

0,1
Epoch/Epoch,50.0
Test/acc_si,99.08854
Test/acc_td,94.35369
Test/f1_si,0.95296
Test/f1_td,0.94203
Test/loss_si,0.46356
Test/loss_td,0.18779
Test/total_loss,0.32568
Train/Epoch-acc_si,100.0
Train/Epoch-acc_td,100.0


Finished Training.


{'epoch': 50,
 'loss_total': [3.30184080726222,
  2.935366618005853,
  2.633178761130885,
  2.121483878085488,
  1.399751901626587,
  0.8154524470630445,
  0.3835297399445584,
  0.19859100486102857,
  0.11484789377764652,
  0.07171060105687693,
  0.054259371012449265,
  0.04486438141841637,
  0.03937306804092307,
  0.03533292679410232,
  0.03218385930124082,
  0.02965838530738103,
  0.027402092163500032,
  0.025566780057392623,
  0.023889921997722826,
  0.022500808105656977,
  0.021258727892449026,
  0.020121032274083087,
  0.01906267132021879,
  0.01811728695113408,
  0.017216212361266737,
  0.01642622102640177,
  0.01571830523837554,
  0.015060148299916795,
  0.014444554371661261,
  0.013857873352734666,
  0.013302491683708994,
  0.012808495348221377,
  0.012351369730343944,
  0.01190065112161009,
  0.011490144080629474,
  0.011116717304838332,
  0.010740474258598528,
  0.010386343310145955,
  0.010058236700531683,
  0.00974701104783698,
  0.00944216917023847,
  0.00915094009159427,
