In [33]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Prepare the SCARFLightning Module
from ts3l.pl_modules import SCARFLightning
from ts3l.utils.scarf_utils import SCARFDataset
from ts3l.utils import TS3LDataModule
from ts3l.utils.scarf_utils import SCARFConfig
from ts3l.utils.embedding_utils import IdentityEmbeddingConfig
from ts3l.utils.backbone_utils import MLPBackboneConfig
from pytorch_lightning import Trainer


# Evaluation
from sklearn.metrics import accuracy_score
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, SequentialSampler

In [2]:
# CSV locations, relative to the directory the notebook is run from.
# Labelled splits:
TRAIN_DATA_PATH = "../data/train_data.csv"
TEST_DATA_PATH = "../data/test_data.csv"
# Samples without a 'Label' column:
UNLABELLED_DATA_PATH = "../data/unlabelled_data.csv"
# NOTE(review): not referenced in the cells visible here; presumably where
# SCARF pseudo-labels are stored -- confirm against the rest of the notebook.
PSEUDO_LABELLED_DATA_PATH = "../data/pseudo_labelled_scarf.csv"

In [20]:
def get_dataframes(test_path, train_path, unlabelled_path, with_clinical=False):
    """Load the test, train and unlabelled CSVs and pick their feature columns.

    'DssTime' and 'Event' are removed from the test and train frames. The
    first 21 columns of the test frame plus 'Size' are reported as the
    numerical features, so the result depends on the CSV column order.

    Args:
        test_path: CSV path of the test set (must contain 'Label' and 'Size').
        train_path: CSV path of the training set.
        unlabelled_path: CSV path of the unlabelled set.
        with_clinical: When False (default), every test-frame column after the
            first 21, other than 'Label' and 'Size', is dropped from all three
            frames and no categorical columns are reported. When True, a fixed
            list of six clinical columns is reported as categorical and
            'Neoplasm Histologic Grade' and 'Cellularity' are dropped instead.

    Returns:
        Tuple ``(test_df, train_df, unlabelled_df, numerical_cols,
        categorical_cols)``. Note that the test frame comes first.

    Side effects:
        Prints the three frame shapes and the selected column lists.
    """
    # Read in the same order as the parameters: test, train, unlabelled.
    test_df, train_df, unlabelled_df = [
        pd.read_csv(csv_path)
        for csv_path in (test_path, train_path, unlabelled_path)
    ]

    # Columns that are not needed; only the test and train frames are stripped.
    unused_cols = ['DssTime', 'Event']
    test_df = test_df.drop(columns=unused_cols)
    train_df = train_df.drop(columns=unused_cols)

    # Numerical features: the leading 21 columns (genes + Age) and 'Size'.
    numerical_cols = [*test_df.columns[:21], 'Size']

    # Everything after the leading 21 columns, apart from 'Label' and 'Size',
    # is treated as a clinical column.
    clinical_cols = test_df.drop(columns=['Label', 'Size']).columns[21:].tolist()

    if with_clinical:
        categorical_cols = [
            'Chemotherapy',
            'Menopausal State',
            'Radio Therapy',
            'Hormone Therapy',
            'Surgery-breast conserving',
            'Surgery-mastectomy',
        ]
        # The model has problems with these columns, so they are removed.
        cols_to_drop = ['Neoplasm Histologic Grade', 'Cellularity']
    else:
        categorical_cols = []
        cols_to_drop = clinical_cols

    test_df, train_df, unlabelled_df = [
        frame.drop(columns=cols_to_drop)
        for frame in (test_df, train_df, unlabelled_df)
    ]

    print(f'Train data shape: {train_df.shape}')
    print(f'Test data shape: {test_df.shape}')
    print(f'Unlabelled data shape: {unlabelled_df.shape}')
    print(f'Numerical columns: {numerical_cols}')
    if with_clinical:
        print(f'Categorical columns: {categorical_cols}')

    return test_df, train_df, unlabelled_df, numerical_cols, categorical_cols

In [38]:
# Load all three splits, keeping the clinical (categorical) columns.
# The loader returns the test frame first, then train, then unlabelled.
(
    test_data,
    train_data,
    unlabelled_data,
    numerical_cols,
    categorical_cols,
) = get_dataframes(
    test_path=TEST_DATA_PATH,
    train_path=TRAIN_DATA_PATH,
    unlabelled_path=UNLABELLED_DATA_PATH,
    with_clinical=True,
)

Train data shape: (465, 29)
Test data shape: (117, 29)
Unlabelled data shape: (1168, 28)
Numerical columns: ['ESR1', 'PGR', 'ERBB2', 'MKI67', 'PLAU', 'ELAVL1', 'EGFR', 'BTRC', 'FBXO6', 'SHMT2', 'KRAS', 'SRPK2', 'YWHAQ', 'PDHA1', 'EWSR1', 'ZDHHC17', 'ENO1', 'DBN1', 'PLK1', 'GSK3B', 'Age', 'Size']
Categorical columns: ['Chemotherapy', 'Menopausal State', 'Radio Therapy', 'Hormone Therapy', 'Surgery-breast conserving', 'Surgery-mastectomy']


In [39]:
# Separate the feature columns from the 'Label' target for both labelled splits.
full_X_train, full_y_train = train_data.drop(columns=['Label']), train_data['Label']
X_test, y_test = test_data.drop(columns=['Label']), test_data['Label']

In [40]:
# Hold out 20% of the labelled training data for validation.
# The split is stratified on the label and seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    full_X_train, full_y_train,
    stratify=full_y_train,
    test_size=0.2,
    random_state=42,
)

print(f'Training data shape: {X_train.shape}')
print(f'Validation data shape: {X_val.shape}')

Training data shape: (372, 28)
Validation data shape: (93, 28)


In [41]:
# --- Task / model settings (all forwarded to the ts3l config objects) ---
metric = "accuracy_score"          # metric name passed to SCARFConfig
input_dim = len(X_train.columns)   # one input per feature column
output_dim = 2                     # passed to SCARFConfig as output_dim
pretraining_head_dim = 1024
head_depth = 2
dropout_rate = 0.04

# --- SCARF-specific setting ---
corruption_rate = 0.6              # passed to SCARFConfig as corruption_rate

# --- Optimisation settings shared by both training phases ---
batch_size = 128
max_epochs = 10

In [42]:
# The embedding config is sized by the number of input features; the MLP
# backbone config is sized by the embedding's reported output width.
embedding_config = IdentityEmbeddingConfig(input_dim=input_dim)
backbone_config = MLPBackboneConfig(input_dim=embedding_config.output_dim)

In [43]:
# Assemble the full SCARF configuration from the settings defined above.
config = SCARFConfig(
    # Task definition
    task="classification",
    loss_fn="CrossEntropyLoss",
    metric=metric,
    metric_hparams={},
    output_dim=output_dim,
    # Network components
    embedding_config=embedding_config,
    backbone_config=backbone_config,
    pretraining_head_dim=pretraining_head_dim,
    head_depth=head_depth,
    dropout_rate=dropout_rate,
    # SCARF feature-corruption setting
    corruption_rate=corruption_rate,
)

In [44]:
# Build the Lightning module from the SCARF config; this same object is fitted
# in both training phases below. In the recorded run, constructing it printed
# "Seed set to 42".
pl_scarf = SCARFLightning(config)

Seed set to 42


In [45]:
### First Phase Learning
# Datasets are built without labels here. The training set additionally
# receives the unlabelled frame.
train_ds = SCARFDataset(
    X_train,
    unlabeled_data=unlabelled_data,
    config=config,
    category_cols=categorical_cols,
    continuous_cols=numerical_cols,
)

valid_ds = SCARFDataset(
    X_val,
    config=config,
    category_cols=categorical_cols,
    continuous_cols=numerical_cols,
)

# Training batches are drawn with the "random" sampler in this phase.
datamodule = TS3LDataModule(
    train_ds,
    valid_ds,
    batch_size=batch_size,
    train_sampler="random",
)


In [None]:
# First-phase training run on CPU.
trainer = Trainer(
    accelerator = 'cpu',
    max_epochs = max_epochs,
    num_sanity_val_steps = 2,
    # The recorded run had only 13 training batches per epoch, fewer than
    # Lightning's default logging interval of 50 steps, which triggered a
    # warning that training-epoch logs would not be shown. Log every step.
    log_every_n_steps = 1,
    )

trainer.fit(pl_scarf, datamodule)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | task_loss_fn     | CrossEntropyLoss | 0      | train
1 | contrastive_loss | NTXentLoss       | 0      | train
2 | model            | SCARF            | 1.2 M  | train
--------------------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.819     Total estimated model params size (MB)
23        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/sonk/envs/jupyter/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (13) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [None]:
### Second Phase Learning
# Switch the already-fitted module to its second phase before it is fitted
# again on the labelled datasets.
# NOTE(review): presumably this swaps the contrastive objective for the
# supervised task loss -- confirm against the ts3l documentation.

pl_scarf.set_second_phase()


In [None]:
# Rebuild the datasets for the second phase, this time with the labels
# attached (as NumPy arrays) and the second-phase flag switched on.
train_ds = SCARFDataset(
    X_train,
    y_train.values,
    is_second_phase=True,
    category_cols=categorical_cols,
    continuous_cols=numerical_cols,
)

valid_ds = SCARFDataset(
    X_val,
    y_val.values,
    is_second_phase=True,
    category_cols=categorical_cols,
    continuous_cols=numerical_cols,
)

# Training batches are drawn with the "weighted" sampler in this phase.
datamodule = TS3LDataModule(
    train_ds,
    valid_ds,
    batch_size=batch_size,
    train_sampler="weighted",
)


In [None]:
# Second-phase training run on CPU, using a fresh Trainer.
trainer = Trainer(
    accelerator = 'cpu',
    max_epochs = max_epochs,
    num_sanity_val_steps = 2,
    # The recorded run had only 3 training batches per epoch, fewer than
    # Lightning's default logging interval of 50 steps, which triggered a
    # warning that training-epoch logs would not be shown. Log every step.
    log_every_n_steps = 1,
)

trainer.fit(pl_scarf, datamodule)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | task_loss_fn     | CrossEntropyLoss | 0      | train
1 | contrastive_loss | NTXentLoss       | 0      | train
2 | model            | SCARF            | 1.2 M  | train
--------------------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.816     Total estimated model params size (MB)
23        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/sonk/envs/jupyter/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (3) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
# Test features only (no labels), built with the second-phase flag.
test_ds = SCARFDataset(
    X_test,
    is_second_phase=True,
    category_cols=categorical_cols,
    continuous_cols=numerical_cols,
)

# Iterate in the original row order so predictions line up with y_test.
test_dl = DataLoader(
    test_ds,
    batch_size=batch_size,
    sampler=SequentialSampler(test_ds),
    shuffle=False,
    num_workers=4,
)


In [None]:
# Run inference batch by batch, then stitch the per-batch outputs together.
batch_outputs = trainer.predict(pl_scarf, test_dl)
stacked_outputs = torch.concat([batch.cpu() for batch in batch_outputs])

# Turn the raw outputs into per-class probabilities along dim 1.
preds = F.softmax(stacked_outputs.squeeze(), dim=1)

# The predicted class is the one with the highest probability.
predicted_labels = preds.argmax(dim=1)
accuracy = accuracy_score(y_test, predicted_labels)

print("Accuracy %.2f" % accuracy)



Predicting: |          | 0/? [00:00<?, ?it/s]

Accuracy 0.55
