## Geneformer Fine-Tuning for Cell Type Classification (HSC Dataset, iHSC Labels Excluded)
Please note that, as usual with deep learning models, we **highly** recommend tuning learning hyperparameters for all fine-tuning applications, as this can significantly improve model performance. The example below uses previously optimized hyperparameters, but one can optimize hyperparameters with the argument `n_hyperopt_trials=n` in `cc.validate()`, where `n > 0` is the number of hyperparameter optimization trials.

In [1]:
import sys
import torch
import os
import pandas as pd
from geneformer import Classifier

from datasets import Dataset, load_from_disk
from datasets import load_dataset
from geneformer import EmbExtractor

# local imports
sys.path.insert(0, '../../scripts/')
import geneformer_utils as gtu

# Release unused cached GPU memory held by PyTorch before the notebook starts.
torch.cuda.empty_cache()

# Record the hardware visible to this kernel (reused below as nproc / ngpu
# when the Classifier is built) and echo it into the cell output.
GPUS = torch.cuda.device_count()
CORES = os.cpu_count()
print(f"{CORES=}", f"{GPUS=}", sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


CORES=40
GPUS=3


# Load the data



In [2]:
# Location of the dataset previously saved to disk in Hugging Face `datasets` format.
data_path = (
    "/scratch/indikar_root/indikar1/shared_data/"
    "geneformer/fine_tune/hsc.dataset"
)
data = load_from_disk(data_path)

# Distinct labels found in the column used below as the classification target.
cell_types = data.unique("standardized_cell_type")

# Keep the dataset as the last expression so the notebook displays its summary.
data

Dataset({
    features: ['input_ids', 'cell_type', 'dataset', 'length', 'ignore', 'standardized_cell_type', 'broad_type', '__index_level_0__'],
    num_rows: 214715
})

In [3]:
# Preview the labels that remain once every cell type containing "iHSC" is dropped.
# Uses the idiomatic `not in` membership test (PEP 8) instead of `not ... in`.
[cell_type for cell_type in cell_types if "iHSC" not in cell_type]

['B Cell',
 'Common Myeloid Progenitor',
 'Granulocyte-Macrophage Progenitor',
 'HSC',
 'T Cell',
 'Megakaryocyte-Erythroid Progenitor',
 'Plasma Cell',
 'Monocyte',
 'Multipotent Progenitor',
 'Dendritic Cell',
 'Common Lymphoid Progenitor',
 'NK Cell',
 'Multi-Lymphoid Progenitor',
 'Fibroblast',
 'Macrophage',
 'Endothelial Cell',
 'Smooth Muscle Cell',
 'Mast Cell',
 'Erythrocyte',
 'Neutrophil',
 'NK T Cell',
 'Granulocyte']

# Set up the classifier

In [4]:
# Keep only cells whose standardized label does not contain "iHSC".
# (`"iHSC" not in x` is the idiomatic spelling of the original `not "iHSC" in x`.)
filter_data_dict = {
    "standardized_cell_type": [x for x in cell_types if "iHSC" not in x],
}

# Fine-tuning hyperparameters handed to the Classifier as `training_args`.
# NOTE(review): the notebook intro calls these "previously optimized" values;
# confirm they were tuned for this dataset rather than copied from another task.
training_args = {
    "num_train_epochs": 3,
    "lr_scheduler_type": "polynomial",
    "per_device_train_batch_size": 20,
    "seed": 73,
    "learning_rate": 0.000804,
    "warmup_steps": 1812,
    "weight_decay": 0.258828,
}

# Classification target: the label column, with "all" requesting every value
# of that column as a class (see the Geneformer Classifier documentation).
cell_state_dict = {
    "state_key": "standardized_cell_type",
    "states": "all",
}

# Release unused cached GPU memory before the classifier is constructed.
torch.cuda.empty_cache()

# Passed as `max_ncells`; None means no explicit cell cap is requested here.
sample_size = None

cc = Classifier(
    classifier="cell",
    cell_state_dict=cell_state_dict,
    training_args=training_args,
    filter_data=filter_data_dict,
    max_ncells=sample_size,
    freeze_layers=2,
    num_crossval_splits=1,
    forward_batch_size=200,
    nproc=CORES,  # CPU count detected in the setup cell
    ngpu=GPUS,  # GPU count detected in the setup cell
)

# Prepare data

In [5]:
# Input dataset for the split. Despite the "output" in its name, this path is
# passed to prepare_data as `input_data_file`.
data_output_path = (
    "/scratch/indikar_root/indikar1/shared_data/"
    "geneformer/fine_tune/hsc.dataset"
)
# Directory and file-name prefix under which the prepared splits are written.
output_dir = (
    "/scratch/indikar_root/indikar1/shared_data/"
    "geneformer/fine_tune/"
)
output_prefix = "no_induced"

# Release unused cached GPU memory before the (long-running) preparation step.
torch.cuda.empty_cache()

# Label the data and hold out 30% of it as a test set.
cc.prepare_data(
    input_data_file=data_output_path,
    output_directory=output_dir,
    output_prefix=output_prefix,
    test_size=0.3,
)

print("done")

Map (num_proc=40): 100%|██████████| 206141/206141 [04:56<00:00, 696.22 examples/s]  
Saving the dataset (2/2 shards): 100%|██████████| 144298/144298 [1:25:12<00:00, 28.22 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 61843/61843 [20:57<00:00, 49.19 examples/s]

done





In [6]:
# break

# Train

In [None]:
# Release unused cached GPU memory before training starts.
torch.cuda.empty_cache()

model_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer-12L-30M/"
output_dir = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/"
output_prefix = "no_induced"

# 0 trials: train with the fixed `training_args` given to the Classifier.
# Set to n > 0 to run n hyperparameter-optimization trials instead.
n_hyperopt_trials = 0

# Files expected from the earlier cc.prepare_data(...) call that used the same
# output_dir / output_prefix. os.path.join is used because output_dir already
# ends with "/", so plain f-string concatenation produced a doubled "//".
train_data_file = os.path.join(output_dir, f"{output_prefix}_labeled_train.dataset")
id_class_dict_file = os.path.join(output_dir, f"{output_prefix}_id_class_dict.pkl")

all_metrics = cc.validate(
    model_directory=model_path,
    prepared_input_data_file=train_data_file,
    id_class_dict_file=id_class_dict_file,
    output_directory=output_dir,
    n_hyperopt_trials=n_hyperopt_trials,
    output_prefix=output_prefix,
)

print('done')

mkdir: cannot create directory ‘/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/240715_geneformer_cellClassifier_no_induced/’: File exists
  0%|          | 0/1 [00:00<?, ?it/s]mkdir: cannot create directory ‘/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/240715_geneformer_cellClassifier_no_induced/ksplit1’: File exists


****** Validation split: 1/1 ******



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer-12L-30M/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,0.3294,0.297368,0.897468,0.799918
2,0.2382,0.23816,0.917613,0.845888


  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


In [None]:
# Deliberate stop point so that "Run All" halts at this cell.
# A bare `break` outside a loop is a SyntaxError, so raise an explicit,
# self-describing error instead of relying on that failure.
raise RuntimeError("Intentional stop point: execution halted here on purpose.")