## Geneformer Fine-Tuning for Classification of Cardiomyopathy Disease States
Please note that, as usual with deep learning models, we **highly** recommend tuning the learning hyperparameters for every fine-tuning application, as this can significantly improve model performance. The example below uses previously optimized hyperparameters, but you can optimize them yourself by passing `n_hyperopt_trials=n` to `cc.validate()`, where `n > 0` is the number of trials for hyperparameter optimization.

In [1]:
import sys
import torch
import os
import pandas as pd
from geneformer import Classifier

from datasets import Dataset, load_from_disk
from datasets import load_dataset
from geneformer import EmbExtractor

# local imports
sys.path.insert(0, '../../scripts/')
import geneformer_utils as gtu

# Ask PyTorch to release any cached GPU memory before the run starts.
torch.cuda.empty_cache()

# Hardware visible to this session; both values are passed to Classifier later on.
CORES, GPUS = os.cpu_count(), torch.cuda.device_count()
print(f"{CORES=}", f"{GPUS=}", sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


CORES=40
GPUS=3


In [2]:
# NOTE: this entire cell is commented out and does nothing when run.
# It is the code that loaded every dataset found under `dpath`, derived a
# `cell_type` column whose source depends on the file name, and concatenated
# the results into a single DataFrame `df`. Kept for reference only.

# dpath = "/scratch/indikar_root/indikar1/shared_data/geneformer/datasets/"


# def load_data(path, sample_size=None):
#     """
#     Loads data from a file, processes cell types, and returns a DataFrame.

#     Args:
#         path (str): The path to the dataset file.
#         sample_size (int, optional): Number of cells to sample. Defaults to None.
#     Returns:
#         pandas.DataFrame: The processed DataFrame.
#     """

#     df = gtu.load_data_as_dataframe(path, num_cells=sample_size, shuffle=True)

#     # The column holding the cell-type label differs per source dataset.
#     if "iHSC" in path:
#         df['cell_type'] = "iHSC"
#     elif "pellin" in path:
#         df['cell_type'] = df['dataset']
#     elif "weng" in path:
#         df['cell_type'] = df['STD.CellType']
#     else:
#         df['cell_type'] = df['free_annotation']

#     # Extract basename without extension and assign to 'dataset' column
#     df['dataset'] = os.path.splitext(os.path.basename(path))[0]
#     df = df[['input_ids', 'cell_type', 'dataset', 'length']]

#     return df

# sample_size = None

# df = []

# for dataset in os.listdir(dpath):
#     print(f"{dataset=}")
#     data_path = f"{dpath}{dataset}"
#     tmp = load_data(data_path, sample_size)
    
#     df.append(tmp)
    
# df = pd.concat(df)
# df.head()

In [3]:
# NOTE: this entire cell is commented out and does nothing when run.
# It left-joins `df` against the lookup table in ihsc_cell_types.csv
# (df.cell_type == cell_map.label) and keeps only rows that received a
# non-null `standardized_cell_type`.

# fpath = "../ihsc_cell_types.csv"
# cell_map = pd.read_csv(fpath, comment="#")

# df = pd.merge(df, 
#               cell_map,
#               how='left',
#               left_on='cell_type',
#               right_on='label',
# )

# df = df.rename(columns={'label' : 'ignore'})

# # Drop cells whose raw label has no standardized equivalent.
# df = df[df['standardized_cell_type'].notna()]
# print(f"{df.shape=}")
# print()
# print(df['standardized_cell_type'].value_counts())
# print()

# df.head()

# Save the data to disk to make it easier to reuse



In [4]:
# NOTE: this entire cell is commented out and does nothing when run.
# It converts the DataFrame `df` into a `datasets.Dataset` and writes it to
# `data_output_path`.

# data_output_path = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/hsc.dataset"

# data = Dataset.from_pandas(df)
# data.save_to_disk(data_output_path)
# print('done')

# Set up the classifier

In [5]:
# Fine-tuning hyperparameters, handed to Classifier as `training_args`.
training_args = dict(
    num_train_epochs=0.9,
    lr_scheduler_type="polynomial",
    per_device_train_batch_size=20,
    seed=73,
    learning_rate=0.000804,
    warmup_steps=1812,
    weight_decay=0.258828,
)

# Column that holds the class label. "states": "all" presumably keeps every
# label found in that column -- confirm against the Geneformer docs.
cell_state_dict = dict(state_key="standardized_cell_type", states="all")

torch.cuda.empty_cache()

# Forwarded as `max_ncells`; None here means no explicit cap is requested.
sample_size = None

# Cell-level classifier configured with the settings above and the CPU/GPU
# counts detected in the first cell.
cc = Classifier(
    classifier="cell",
    cell_state_dict=cell_state_dict,
    training_args=training_args,
    max_ncells=sample_size,
    freeze_layers=2,
    num_crossval_splits=1,
    forward_batch_size=200,
    nproc=CORES,
    ngpu=GPUS,
)

# prepare data

In [6]:
# NOTE: this entire cell is commented out and does nothing when run.
# It calls cc.prepare_data on the saved dataset with test_size=0.3.
# Presumably this is what produced the "prepared_hsc_*" files that
# cc.validate() reads further down -- confirm before re-running.

# data_output_path = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/hsc.dataset"
# output_dir = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/"
# output_prefix = "prepared_hsc"

# cc.prepare_data(input_data_file=data_output_path,
#                 output_directory=output_dir,
#                 output_prefix=output_prefix,
#                 test_size=0.3)

# print('done')

In [7]:
# break

In [None]:
# NOTE: this entire cell is commented out and does nothing when run.
# It is an unfinished torch.profiler scaffold: the body of the
# `with record_function(...)` block is only a placeholder comment, so the
# cell would not parse if it were simply uncommented -- real code has to be
# placed inside that block first.

# from torch.profiler import profile, record_function, ProfilerActivity

# with profile(
#     activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], 
#     record_shapes=True,
#     profile_memory=True,  
#     with_stack=True  
# ) as prof:
#     with record_function("model_inference"):
#         # Your training loop code here 

# print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))  # View top 10 GPU operations



In [8]:
# Pretrained checkpoint to fine-tune, and the directory/prefix under which the
# prepared inputs live and the results are written.
model_path = "/nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer-12L-30M/"
output_dir = "/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/"
output_prefix = "prepared_hsc"

# 0 => no hyperparameter-optimization trials; the fixed `training_args` are used.
n_hyperopt_trials = 0

# Fine-tune and validate on the prepared training split.
all_metrics = cc.validate(
    model_directory=model_path,
    output_directory=output_dir,
    output_prefix=output_prefix,
    prepared_input_data_file=f"{output_dir}/{output_prefix}_labeled_train.dataset",
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    n_hyperopt_trials=n_hyperopt_trials,
)

print("done")

mkdir: cannot create directory ‘/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/240711_geneformer_cellClassifier_prepared_hsc/’: File exists
  0%|          | 0/1 [00:00<?, ?it/s]

****** Validation split: 1/1 ******



mkdir: cannot create directory ‘/scratch/indikar_root/indikar1/shared_data/geneformer/fine_tune/240711_geneformer_cellClassifier_prepared_hsc/ksplit1’: File exists
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /nfs/turbo/umms-indikar/shared/projects/geneformer/geneformer-12L-30M/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
0,0.3101,0.237481,0.915389,0.84341


  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}

  0%|          | 0/84 [00:00<?, ?it/s][A
  1%|          | 1/84 [00:22<31:09, 22.53s/it][A
  2%|▏         | 2/84 [00:32<20:46, 15.20s/it][A
  4%|▎         | 3/84 [00:39<15:08, 11.22s/it][A
  5%|▍         | 4/84 [00:44<12:04,  9.06s/it][A
  6%|▌         | 5/84 [00:50<10:11,  7.73s/it][A
  7%|▋         | 6/84 [00:55<09:03,  6.97s/it][A
  8%|▊         | 7/84 [01:01<08:21,  6.51s/it][A
 10%|▉         | 8/84 [01:06<07:45,  6.13s/it][A
 11%|█         | 9/84 [01:11<07:20,  5.87s/it][A
 12%|█▏        | 10/84 [01:17<07:01,  5.70s/it][A
 13%|█▎        | 11/84 [01:22<06:48,  5.59s/it][A
 14%|█▍        | 12/84 [01:27<06:37,  5.53s/it][A
 15%|█▌        | 13/84 [01:33<06:29,  5.48s/it][A
 17%|█▋        | 14/84 [01:38<06:21,  5.44s/it][A
 18%|█▊        | 15/84 [01:44<06:14,  5.42s/it][A
 19%|█▉        | 16/84 [01:49<06:08,  5.43s/it][A
 20%|██        | 17/84 [01:54<06:02,  5.41s/it][A
 21%|██▏       | 18/84 

done





In [9]:
# NOTE(review): `break` is only valid inside a loop, so this cell raises a
# SyntaxError (see the output below). It appears to be used deliberately as a
# stop marker so that "Run All" halts before the template cells that follow --
# confirm before removing or replacing it.
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# Explicit train/eval split keyed on the "individual" attribute.
# NOTE(review): `train_ids` and `eval_ids` are not defined in any cell visible
# in this notebook, and "/path/to/Geneformer" below is a placeholder -- this
# cell looks like an unmodified template and will not run as-is.
train_valid_id_split_dict = dict(
    attr_key="individual",
    train=train_ids,
    eval=eval_ids,
)

# 6 layer Geneformer: https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors
# to optimize hyperparameters, set n_hyperopt_trials=100 (or alternative desired # of trials)
all_metrics = cc.validate(
    model_directory="/path/to/Geneformer",
    output_directory=output_dir,
    output_prefix=output_prefix,
    prepared_input_data_file=f"{output_dir}/{output_prefix}_labeled_train.dataset",
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    split_id_dict=train_valid_id_split_dict,
)

### Evaluate the model

In [None]:
# Classifier instance used for evaluating the saved model.
# It reuses the `cell_state_dict` defined for training (state_key
# "standardized_cell_type") instead of the "disease" key left over from the
# cardiomyopathy example, so training and evaluation describe the same labels.
cc = Classifier(classifier="cell",
                cell_state_dict=cell_state_dict,
                forward_batch_size=200,
                nproc=16)

In [None]:
# `datestamp_min` was never defined in this notebook, so the original cell
# failed with a NameError. Recover it from the run directory on disk instead:
# the training log shows run directories named
# "<datestamp>_geneformer_cellClassifier_<output_prefix>".
run_suffix = f"_geneformer_cellClassifier_{output_prefix}"
run_dirs = sorted(
    entry for entry in os.listdir(output_dir)
    if entry.endswith(run_suffix) and os.path.isdir(os.path.join(output_dir, entry))
)
if not run_dirs:
    raise FileNotFoundError(
        f"No '*{run_suffix}' run directory found in {output_dir}; run cc.validate() first."
    )
# Lexicographic order picks the most recent run, assuming the prefix is a
# fixed-width date stamp such as "240711" -- TODO confirm.
datestamp_min = run_dirs[-1][: -len(run_suffix)]

# Score the fine-tuned model from the first (only) split on the held-out test set.
all_metrics_test = cc.evaluate_saved_model(
        model_directory=f"{output_dir}/{datestamp_min}_geneformer_cellClassifier_{output_prefix}/ksplit1/",
        id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
        test_data_file=f"{output_dir}/{output_prefix}_labeled_test.dataset",
        output_directory=output_dir,
        output_prefix=output_prefix,
    )

In [None]:
# Plot the test-set confusion matrix.
# The original passed custom_class_order=["nf","hcm","dcm"], which are the
# classes of the cardiomyopathy example rather than the
# `standardized_cell_type` labels used in this notebook. The argument is
# omitted so the classifier's default ordering is used; pass an explicit list
# of this dataset's labels if a particular order is wanted.
cc.plot_conf_mat(
        conf_mat_dict={"Geneformer": all_metrics_test["conf_matrix"]},
        output_directory=output_dir,
        output_prefix=output_prefix,
)

In [None]:
# Plot the per-class predictions saved during evaluation.
# The title and class order were left over from the cardiomyopathy example
# ("disease", ["nf","hcm","dcm"]) and do not match the cell-type labels this
# notebook classifies. The title is corrected and the class order is left to
# the classifier's default.
cc.plot_predictions(
    predictions_file=f"{output_dir}/{output_prefix}_pred_dict.pkl",
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    title="cell type",
    output_directory=output_dir,
    output_prefix=output_prefix,
)