In [1]:
from pathlib import Path
import sys
sys.path.append(str(Path.cwd() / "../code"))
from os import environ
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import logging
from sklearn.preprocessing import LabelEncoder

from training import training_loop
from utils import balanced_data_shuffle, get_df_raw_data
from models import LinearLayer
## Data path ##
DATA_PATH = (Path.cwd().parent / "DATA").resolve()
print(f"Data path: {DATA_PATH}")
DATA_PATH = str(DATA_PATH)
logging.basicConfig(level=logging.INFO)

# set deterministic behavior
seed = 53498298
torch.manual_seed(seed)
np.random.seed(seed)

torch.use_deterministic_algorithms(True)
environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# environ["WANDB_MODE"] = "disabled"

Data path: C:\Users\Cyril\Desktop\Code\MIPLab-TeamCEE-DeepLearningforBiomed\DATA


## Config

In [2]:
config = {
    # data
    "stratify": True,
    "validation_split": 0,
    # general
    "epochs": 30,
    "batch_size": 32,
    "lr": 1e-4,
    "use_scheduler": False,
    "do_early_stopping": False,
    "patience": 10,
    "best_loss": 10,
    # model
    "d_model_input": 400,
    "d_model_intermediate": [1000],
    "d_model_task_output": 8,
    "d_model_fingerprint_output": None,  # needs to be determined from data
    "dropout": 0,
    "attention_dropout": 0.1,
    "num_heads": 1,
    # optimizer
    "lambda_si": 0.5,
    "lambda_td": 0.5,
    "weight_decay": 1,
}

In [3]:
IDs = [
    100307,
    117122,
    131722,
    153025,
    211720,
    100408,
    118528,
    133019,
    154734,
    212318,
    101107,
    118730,
    133928,
    156637,
    214423,
    101309,
    118932,
    135225,
    159340,
    221319,
    101915,
    120111,
    135932,
    160123,
    239944,
    103111,
    122317,
    136833,
    161731,
    245333,
    103414,
    122620,
    138534,
    162733,
    280739,
    103818,
    123117,
    139637,
    163129,
    298051,
    105014,
    123925,
    140925,
    176542,
    366446,
    105115,
    124422,
    144832,
    178950,
    397760,
    106016,
    125525,
    146432,
    188347,
    414229,
    108828,
    126325,
    147737,
    189450,
    499566,
    110411,
    127630,
    148335,
    190031,
    654754,
    111312,
    127933,
    148840,
    192540,
    672756,
    111716,
    128127,
    149337,
    196750,
    751348,
    113619,
    128632,
    149539,
    198451,
    756055,
    113922,
    129028,
    149741,
    199655,
    792564,
    114419,
    130013,
    151223,
    201111,
    856766,
    115320,
    130316,
    151526,
    208226,
    857263,
]


In [4]:
def _show_df_distribution(df):
    print("Number of samples:", len(df))
    print("Unique subjects:", df["subject_id"].nunique())
    print("Unique tasks:", df["task"].nunique())
    print("*" * 50)

## Joining train and test dataframes from all subjects

In [5]:
# data_dict_train, data_dict_test = get_dict_raw_data(DATA_PATH, IDs[0:3])
data_df_train, data_df_test = get_df_raw_data(DATA_PATH, IDs[:])

train_dataframe, valid_dataframe = balanced_data_shuffle(
    data_df_train,
    val_frac=config["validation_split"],
    stratify=config["stratify"],
)
NUM_SUBJECTS = len(data_df_train["subject_id"].unique())
print(f"Number of subjects: {NUM_SUBJECTS}")
NUM_TASKS = data_df_train["task"].nunique()
print(f"Number of tasks: {NUM_TASKS}")

#
###-------------------------------------------------------------------------------------------------------------------
#         label encoding
###-------------------------------------------------------------------------------------------------------------------
enc_labels = LabelEncoder()
enc_tasks = LabelEncoder()

enc_labels.fit(data_df_train["subject_id"].tolist())
enc_tasks.fit(data_df_train["task"].tolist())

enc_train_label_encodings = enc_labels.transform(
    train_dataframe["subject_id"].tolist()
)
enc_train_task_encodings = enc_tasks.transform(
    train_dataframe["task"].tolist()
)

enc_test_label_encodings = enc_labels.transform(
    data_df_test["subject_id"].tolist()
)
enc_test_task_encodings = enc_tasks.transform(
    data_df_test["task"].tolist()
)

train_dataframe["enc_label_id"] = enc_train_label_encodings
train_dataframe["enc_task"] = enc_train_task_encodings
data_df_test["enc_label_id"] = enc_test_label_encodings
data_df_test["enc_task"] = enc_test_task_encodings

print("Subjects present in train set but not in test set:")
overlap_set = set(train_dataframe["subject_id"].unique()) - set(
    data_df_test["subject_id"].unique()
)
print(overlap_set)
if len(overlap_set) != 0:
    print("WARNING: subjects present in train set but not in test set")

print("Train set:")
_show_df_distribution(train_dataframe)
print("Test set:")
_show_df_distribution(data_df_test)

Number of subjects: 95
Number of tasks: 8
Subjects present in train set but not in test set:
set()
Train set:
Number of samples: 760
Unique subjects: 95
Unique tasks: 8
**************************************************
Test set:
Number of samples: 758
Unique subjects: 95
Unique tasks: 8
**************************************************


## Dataloaders

In [6]:
train_dataset = TensorDataset(
    torch.tensor(
        np.array(train_dataframe["mat"].tolist()).astype(np.float32)
    ),
    torch.tensor(train_dataframe["enc_label_id"].to_numpy()),
    torch.tensor(train_dataframe["enc_task"].to_numpy()),
)
train_loader = DataLoader(
    train_dataset, batch_size=config["batch_size"], shuffle=True
)

test_dataset = TensorDataset(
    torch.tensor(
        np.array(data_df_test["mat"].tolist()).astype(np.float32)
    ),
    torch.tensor(data_df_test["enc_label_id"].to_numpy()),
    torch.tensor(data_df_test["enc_task"].to_numpy()),
)
test_loader = DataLoader(
    test_dataset, batch_size=config["batch_size"], shuffle=False
)

valid_loader = None
if valid_dataframe is not None:
    enc_valid_label_encodings = enc_labels.transform(
        valid_dataframe["subject_id"].tolist()
    )
    enc_valid_task_encodings = enc_tasks.transform(
        valid_dataframe["task"].tolist()
    )
    valid_dataframe["enc_label_id"] = enc_valid_label_encodings
    valid_dataframe["enc_task"] = enc_valid_task_encodings
    print("Subjects present in validation set but not in train set:")
    overlap_set = set(valid_dataframe["subject_id"].unique()) - set(
        train_dataframe["subject_id"].unique()
    )
    print(overlap_set)
    if len(overlap_set) != 0:
        print(
            "WARNING: subjects present in validation set but not in train set"
        )
    valid_dataset = TensorDataset(
        torch.tensor(
            np.array(valid_dataframe["mat"].tolist()).astype(np.float32)
        ),
        torch.tensor(valid_dataframe["enc_label_id"].to_numpy()),
        torch.tensor(valid_dataframe["enc_task"].to_numpy()),
    )
    valid_loader = DataLoader(
        valid_dataset, batch_size=config["batch_size"], shuffle=False
    )
    print("Validation set:")
    _show_df_distribution(valid_dataframe)


## Training

In [7]:
###-------------------------------------------------------------------------------------------------------------------
#         Model
###-------------------------------------------------------------------------------------------------------------------

# list all available torch devices
device_list = ["cpu"] + [
    f"cuda:{i}" for i in range(torch.cuda.device_count())
]
device = device_list[-1] if torch.cuda.is_available() else device_list[0]
print(f"Using device: {device}")
model = LinearLayer(
    output_size_tasks=NUM_TASKS,
    output_size_subjects=NUM_SUBJECTS,
    input_size=config["d_model_input"],
    intermediate_size=config["d_model_intermediate"],
    dropout=config["dropout"],
).to(device)
wandb_run_name = (
    "LinearSplit_best_repro_FINAL"
)


###-------------------------------------------------------------------------------------------------------------------
#         training
###-------------------------------------------------------------------------------------------------------------------

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config["lr"],
    weight_decay=config["weight_decay"],
)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, step_size=20, gamma=0.1
)

training_loop(
    config["epochs"],
    model,
    train_loader,
    valid_loader,
    criterion,
    optimizer,
    device,
    config,
    scheduler=scheduler if config["use_scheduler"] else None,
    save_model=True,
    save_attention_weights=False,
    test_loader=test_loader,
    run_name=wandb_run_name,
    job_name="LinearSplit",
    use_deeplift=True,
    use_early_stopping=config["do_early_stopping"],
)


Using device: cuda:0
Using cuda:0


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
INFO:wandb:Watching


Epoch: 1/30 - loss_total: 3.2544 - acc: SI 1.48% / TD 43.53%
 - (2.44s/epoch)
Epoch: 2/30 - loss_total: 2.7205 - acc: SI 32.60% / TD 76.17%
 - (1.94s/epoch)
Epoch: 3/30 - loss_total: 2.2973 - acc: SI 45.79% / TD 89.45%
 - (1.93s/epoch)
Epoch: 4/30 - loss_total: 1.4951 - acc: SI 80.86% / TD 93.36%
 - (1.91s/epoch)
Epoch: 5/30 - loss_total: 0.5752 - acc: SI 98.70% / TD 97.35%
 - (1.92s/epoch)
Epoch: 6/30 - loss_total: 0.3768 - acc: SI 99.87% / TD 98.57%
 - (1.91s/epoch)
Epoch: 7/30 - loss_total: 0.2257 - acc: SI 100.00% / TD 99.35%
 - (1.91s/epoch)
Epoch: 8/30 - loss_total: 0.1332 - acc: SI 100.00% / TD 99.87%
 - (1.91s/epoch)
Epoch: 9/30 - loss_total: 0.0801 - acc: SI 100.00% / TD 100.00%
 - (1.91s/epoch)
Epoch: 10/30 - loss_total: 0.0614 - acc: SI 100.00% / TD 100.00%
 - (1.91s/epoch)
Epoch: 11/30 - loss_total: 0.0501 - acc: SI 100.00% / TD 100.00%
 - (1.91s/epoch)
Epoch: 12/30 - loss_total: 0.0432 - acc: SI 100.00% / TD 100.00%
 - (1.91s/epoch)
Epoch: 13/30 - loss_total: 0.0384 - acc:

               activations. The hooks and attributes will be removed
            after the attribution is finished


Running DeepLIFT for task REST1
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task EMOTION
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task GAMBLING
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task LANGUAGE
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task MOTOR
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task RELATIONAL
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task SOCIAL
TD attributions shape :  torch.Size([24, 400, 400])
Running DeepLIFT for task WM
TD attributions shape :  torch.Size([24, 400, 400])
Finished Training.


{'epoch': 30,
 'loss_total': [3.254369010527929,
  2.720523029565811,
  2.2972715298334756,
  1.4951352725426357,
  0.5752105688055357,
  0.3767905669907729,
  0.225746746485432,
  0.13317152919868627,
  0.08012466412037611,
  0.061437145651628576,
  0.05007062417765459,
  0.04319208969051639,
  0.03835362164924542,
  0.03474711145584782,
  0.03170205761368076,
  0.029246449160079162,
  0.027028008131310344,
  0.025143034057691693,
  0.02350838699688514,
  0.022005045398448903,
  0.02062260825186968,
  0.019507000300412376,
  0.018324597040191293,
  0.01712477017038812,
  0.016329101674879592,
  0.015474456634062031,
  0.014706248766742647,
  0.014063984815341731,
  0.013315659947693348,
  0.012716010639754435],
 'loss_si': [4.669316093126933,
  4.215941846370697,
  3.8209771116574607,
  2.5727484623591104,
  0.9793021803100904,
  0.673474540313085,
  0.39477939158678055,
  0.24367517046630383,
  0.15120556764304638,
  0.1165308402851224,
  0.09486217331141233,
  0.0816636960953474,
  