In [1]:
from pathlib import Path
import sys
sys.path.append(str(Path.cwd() / "../code"))
from os import environ
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import logging
from sklearn.preprocessing import LabelEncoder

from training import training_loop
from utils import balanced_data_shuffle, get_df_raw_data
from models import LinearLayerShared
# --- Data location ---------------------------------------------------------
# The DATA directory is a sibling of the current working directory.
data_dir = (Path.cwd().parent / "DATA").resolve()
print(f"Data path: {data_dir}")
DATA_PATH = str(data_dir)  # the data-loading helper below is given a plain string

logging.basicConfig(level=logging.INFO)

# --- Reproducibility -------------------------------------------------------
# Seed both NumPy's and torch's global generators with the same value.
seed = 53498298
np.random.seed(seed)
torch.manual_seed(seed)

# cuBLAS workspace configuration paired with torch's deterministic mode
# (see the PyTorch reproducibility notes).
environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.use_deterministic_algorithms(True)

Data path: C:\Users\Cyril\Desktop\Code\MIPLab-TeamCEE-DeepLearningforBiomed\DATA


## Config

In [2]:
# Run configuration. The whole dict is also handed to `training_loop` further
# down, so entries not read directly in this notebook may be consumed there
# (NOTE(review): not verifiable from this file).
config = dict(
    # --- data ---------------------------------------------------------
    stratify=True,  # forwarded to balanced_data_shuffle(stratify=...)
    validation_split=0,  # forwarded to balanced_data_shuffle(val_frac=...)
    # --- general ------------------------------------------------------
    epochs=30,
    batch_size=32,  # used by every DataLoader
    lr=1e-4,  # AdamW learning rate
    use_scheduler=False,  # when False, no scheduler is passed to training_loop
    do_early_stopping=False,  # passed as use_early_stopping
    patience=10,
    best_loss=10,
    # --- model --------------------------------------------------------
    d_model_input=400,  # model input_size
    d_model_intermediate=[1000],  # model intermediate_size
    d_model_task_output=8,
    d_model_fingerprint_output=None,  # needs to be determined from data
    dropout=0,  # model dropout
    attention_dropout=0.1,
    num_heads=1,
    # --- optimizer ----------------------------------------------------
    lambda_si=0.5,
    lambda_td=0.5,
    weight_decay=1,  # AdamW weight decay
)

In [3]:
# Subject identifiers to load (95 entries), kept in their original order and
# laid out five per row.
IDs = [
    100307, 117122, 131722, 153025, 211720,
    100408, 118528, 133019, 154734, 212318,
    101107, 118730, 133928, 156637, 214423,
    101309, 118932, 135225, 159340, 221319,
    101915, 120111, 135932, 160123, 239944,
    103111, 122317, 136833, 161731, 245333,
    103414, 122620, 138534, 162733, 280739,
    103818, 123117, 139637, 163129, 298051,
    105014, 123925, 140925, 176542, 366446,
    105115, 124422, 144832, 178950, 397760,
    106016, 125525, 146432, 188347, 414229,
    108828, 126325, 147737, 189450, 499566,
    110411, 127630, 148335, 190031, 654754,
    111312, 127933, 148840, 192540, 672756,
    111716, 128127, 149337, 196750, 751348,
    113619, 128632, 149539, 198451, 756055,
    113922, 129028, 149741, 199655, 792564,
    114419, 130013, 151223, 201111, 856766,
    115320, 130316, 151526, 208226, 857263,
]  # fmt: skip


In [4]:
def _show_df_distribution(df):
    """Print a short summary of ``df``.

    Reports the number of rows and the number of distinct values in the
    ``subject_id`` and ``task`` columns, then prints a line of 50 asterisks
    as a separator. Returns nothing.
    """
    print("Number of samples:", len(df))
    per_column = (("Unique subjects:", "subject_id"), ("Unique tasks:", "task"))
    for caption, column in per_column:
        print(caption, df[column].nunique())
    print("*" * 50)

## Joining train and test dataframes from all subjects

In [5]:
# Load the raw train / test dataframes for every subject in IDs.
data_df_train, data_df_test = get_df_raw_data(DATA_PATH, IDs[:])

# Split the training frame; fraction and stratification come from config.
train_dataframe, valid_dataframe = balanced_data_shuffle(
    data_df_train,
    val_frac=config["validation_split"],
    stratify=config["stratify"],
)

# Class counts, taken from the full (pre-split) training frame.
NUM_SUBJECTS = len(data_df_train["subject_id"].unique())
print(f"Number of subjects: {NUM_SUBJECTS}")
NUM_TASKS = data_df_train["task"].nunique()
print(f"Number of tasks: {NUM_TASKS}")

# ---------------------------------------------------------------------------
# Label encoding
# ---------------------------------------------------------------------------
# Both encoders are fitted on the full training frame, then applied to the
# train split and to the test frame.
enc_labels = LabelEncoder().fit(data_df_train["subject_id"].tolist())
enc_tasks = LabelEncoder().fit(data_df_train["task"].tolist())

# Train split: encoded subject / task columns.
enc_train_label_encodings = enc_labels.transform(
    train_dataframe["subject_id"].tolist()
)
enc_train_task_encodings = enc_tasks.transform(train_dataframe["task"].tolist())
train_dataframe["enc_label_id"] = enc_train_label_encodings
train_dataframe["enc_task"] = enc_train_task_encodings

# Test frame: same encoders, same column names.
enc_test_label_encodings = enc_labels.transform(
    data_df_test["subject_id"].tolist()
)
enc_test_task_encodings = enc_tasks.transform(data_df_test["task"].tolist())
data_df_test["enc_label_id"] = enc_test_label_encodings
data_df_test["enc_task"] = enc_test_task_encodings

# Sanity check: subjects that appear in the train split but not in the test
# frame (despite its name, `overlap_set` holds this set difference).
print("Subjects present in train set but not in test set:")
train_subjects = set(train_dataframe["subject_id"].unique())
test_subjects = set(data_df_test["subject_id"].unique())
overlap_set = train_subjects - test_subjects
print(overlap_set)
if overlap_set:
    print("WARNING: subjects present in train set but not in test set")

print("Train set:")
_show_df_distribution(train_dataframe)
print("Test set:")
_show_df_distribution(data_df_test)

Number of subjects: 95
Number of tasks: 8
Subjects present in train set but not in test set:
set()
Train set:
Number of samples: 760
Unique subjects: 95
Unique tasks: 8
**************************************************
Test set:
Number of samples: 758
Unique subjects: 95
Unique tasks: 8
**************************************************


## Dataloaders

In [6]:
def _build_dataset_and_loader(frame, shuffle, batch_size):
    """Wrap one encoded dataframe into a ``TensorDataset`` and a ``DataLoader``.

    Args:
        frame: dataframe providing the columns ``mat``, ``enc_label_id`` and
            ``enc_task``.
        shuffle: forwarded to ``DataLoader(shuffle=...)``.
        batch_size: forwarded to ``DataLoader(batch_size=...)``.

    Returns:
        ``(dataset, loader)``. Each dataset item is the triple
        ``(mat as float32, enc_label_id, enc_task)``.
    """
    dataset = TensorDataset(
        # Stack the per-row "mat" entries into one array, cast to float32.
        torch.tensor(np.array(frame["mat"].tolist()).astype(np.float32)),
        torch.tensor(frame["enc_label_id"].to_numpy()),
        torch.tensor(frame["enc_task"].to_numpy()),
    )
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataset, loader


# Only the training loader is shuffled.
train_dataset, train_loader = _build_dataset_and_loader(
    train_dataframe, shuffle=True, batch_size=config["batch_size"]
)
test_dataset, test_loader = _build_dataset_and_loader(
    data_df_test, shuffle=False, batch_size=config["batch_size"]
)

# The validation loader stays None unless a validation frame exists.
# NOTE(review): presumably balanced_data_shuffle returns None for the
# validation frame when the split fraction is 0 - confirm in utils.
valid_loader = None
if valid_dataframe is not None:
    # Encode the validation split with the encoders fitted on the training data.
    enc_valid_label_encodings = enc_labels.transform(
        valid_dataframe["subject_id"].tolist()
    )
    enc_valid_task_encodings = enc_tasks.transform(
        valid_dataframe["task"].tolist()
    )
    valid_dataframe["enc_label_id"] = enc_valid_label_encodings
    valid_dataframe["enc_task"] = enc_valid_task_encodings

    # Sanity check: validation subjects that are missing from the train split.
    print("Subjects present in validation set but not in train set:")
    overlap_set = set(valid_dataframe["subject_id"].unique()) - set(
        train_dataframe["subject_id"].unique()
    )
    print(overlap_set)
    if overlap_set:
        print(
            "WARNING: subjects present in validation set but not in train set"
        )

    valid_dataset, valid_loader = _build_dataset_and_loader(
        valid_dataframe, shuffle=False, batch_size=config["batch_size"]
    )
    print("Validation set:")
    _show_df_distribution(valid_dataframe)


## Training

In [7]:
# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------

# Every torch device: the CPU first, then one entry per CUDA device.
device_list = ["cpu"]
device_list.extend(f"cuda:{idx}" for idx in range(torch.cuda.device_count()))

# With CUDA available the last listed device is used, otherwise the CPU.
if torch.cuda.is_available():
    device = device_list[-1]
else:
    device = device_list[0]
print(f"Using device: {device}")

# Output sizes come from the data (NUM_TASKS / NUM_SUBJECTS); the remaining
# sizes come from config.
model = LinearLayerShared(
    input_size=config["d_model_input"],
    intermediate_size=config["d_model_intermediate"],
    output_size_tasks=NUM_TASKS,
    output_size_subjects=NUM_SUBJECTS,
    dropout=config["dropout"],
)
model = model.to(device)
wandb_run_name = "LinearShared_best_repro_1000"


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    weight_decay=config["weight_decay"],
    lr=config["lr"],
)
# The scheduler is always constructed; it is only handed to the training loop
# when config["use_scheduler"] is truthy.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=20)
active_scheduler = scheduler if config["use_scheduler"] else None

# Kept as the cell's last expression so the notebook displays the returned value.
training_loop(
    config["epochs"],
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    device,
    config,
    scheduler=active_scheduler,
    save_model=False,
    save_attention_weights=False,
    test_loader=test_loader,
    run_name=wandb_run_name,
    job_name="LinearShared",
    use_deeplift=True,
    use_early_stopping=config["do_early_stopping"],
)


Using device: cuda:0
Using cuda:0


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mc-achard[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch: 1/30 - loss_total: 3.3751 - acc: SI 1.87% / TD 33.29%
 - (8.05s/epoch)
Epoch: 2/30 - loss_total: 3.0493 - acc: SI 12.80% / TD 76.82%
 - (2.43s/epoch)
Epoch: 3/30 - loss_total: 2.8336 - acc: SI 27.60% / TD 82.12%
 - (2.40s/epoch)
Epoch: 4/30 - loss_total: 2.4820 - acc: SI 45.57% / TD 84.64%
 - (2.37s/epoch)
Epoch: 5/30 - loss_total: 1.8355 - acc: SI 73.39% / TD 93.36%
 - (2.39s/epoch)
Epoch: 6/30 - loss_total: 1.0819 - acc: SI 93.97% / TD 97.40%
 - (2.38s/epoch)
Epoch: 7/30 - loss_total: 0.5439 - acc: SI 99.18% / TD 99.35%
 - (2.41s/epoch)
Epoch: 8/30 - loss_total: 0.2682 - acc: SI 100.00% / TD 99.87%
 - (2.32s/epoch)
Epoch: 9/30 - loss_total: 0.1474 - acc: SI 100.00% / TD 100.00%
 - (2.32s/epoch)
Epoch: 10/30 - loss_total: 0.0902 - acc: SI 100.00% / TD 100.00%
 - (2.35s/epoch)
Epoch: 11/30 - loss_total: 0.0640 - acc: SI 100.00% / TD 100.00%
 - (2.33s/epoch)
Epoch: 12/30 - loss_total: 0.0509 - acc: SI 100.00% / TD 100.00%
 - (2.39s/epoch)
Epoch: 13/30 - loss_total: 0.0430 - acc: 

               activations. The hooks and attributes will be removed
            after the attribution is finished


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Epoch/Epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
Test/acc_si,▁
Test/acc_td,▁
Test/f1_si,▁
Test/f1_td,▁
Test/loss_si,▁
Test/loss_td,▁
Test/total_loss,▁
Train/Epoch-acc_si,▁▂▃▄▆█████████████████████████
Train/Epoch-acc_td,▁▆▆▆▇█████████████████████████

0,1
Epoch/Epoch,30.0
Test/acc_si,99.86979
Test/acc_td,96.10559
Test/f1_si,0.99259
Test/f1_td,0.95984
Test/loss_si,0.37053
Test/loss_td,0.15122
Test/total_loss,0.26088
Train/Epoch-acc_si,100.0
Train/Epoch-acc_td,100.0


Finished Training.


{'epoch': 30,
 'loss_total': [3.37506032983462,
  3.049334943294525,
  2.8336475988229117,
  2.4820052285989127,
  1.8355181366205215,
  1.0818914845585823,
  0.5439204735060533,
  0.26824660412967205,
  0.14743680010239282,
  0.09020197112113237,
  0.06398172847305734,
  0.05091770893583695,
  0.04299478077640136,
  0.038361032803853355,
  0.034443633475651346,
  0.031444006211434804,
  0.028880763954172533,
  0.026703542020792764,
  0.024795867114638288,
  0.023199283595507342,
  0.02167311031371355,
  0.02047078249355157,
  0.019255402963608503,
  0.018233076203614473,
  0.01735010075693329,
  0.016448276466690004,
  0.01565183113173892,
  0.014926119125448167,
  0.014252072937476138,
  0.013665597303770483],
 'loss_si': [4.6769729653994245,
  4.4445816079775495,
  4.310614089171092,
  4.019453247388204,
  3.228326062361399,
  1.9565102408329647,
  0.9222585732738177,
  0.4004078308741252,
  0.19587869103997946,
  0.11293818522244692,
  0.08047678228467703,
  0.06416000192984939,
  