In [1]:
from gears import PertData
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import datetime
from geneformer import Classifier
from geneformer import EmbExtractor

In [2]:
## Load Norman data
pert_data = PertData("data/")
pert_data.load(data_name="norman")
pert_data.prepare_split(split = 'simulation', seed = 1) # train/test 나누기
pert_adata = pert_data.adata # 전체 데이터

Found local copy...
Found local copy...
Found local copy...
These perturbations are not in the GO graph and their perturbation can thus not be predicted
['RHOXF2BB+ctrl' 'LYL1+IER5L' 'ctrl+IER5L' 'KIAA1804+ctrl' 'IER5L+ctrl'
 'RHOXF2BB+ZBTB25' 'RHOXF2BB+SET']
Local copy of pyg dataset is detected. Loading...
Done!
Local copy of split is detected. Loading...


here1


Simulation split test composition:
combo_seen0:9
combo_seen1:43
combo_seen2:19
unseen_single:36
Done!


In [19]:
## 필요한 column 생성
adata = pert_adata.copy()
adata.X = csr_matrix(adata.layers["counts"])
adata.obs["n_counts"] = adata.X.sum(axis=1)
adata.obs["perturbation"] = adata.obs["condition"].astype(str)
adata.obs["state"] = np.where(adata.obs["control"] == 1, "ctrl", "perturbed")
adata.var["ensembl_id"] = adata.var.index

In [20]:
adata.write_h5ad("./input_data/norman_geneformer.h5ad")

In [21]:
custom_attr = {
    "cell_type": "cell_type",
    "perturbation": "perturbation",
    "state": "state",
    "split": "split",
}

In [22]:
## Tokenize data
from geneformer import TranscriptomeTokenizer

tk = TranscriptomeTokenizer(
  custom_attr_name_dict=custom_attr,
  #chunk_size=512, # can adjust based on memory available
  nproc=1,
  model_version="V2" # default is V2, here set to V1 model to fit into Colab 40G GPU resources
)

tk.tokenize_data(
    data_directory="./input_data",
    output_directory="./input_data",
    output_prefix="cm_tokenized",
    file_format="h5ad"
)

Tokenizing input_data/norman_geneformer.h5ad


100%|██████████████████████████████████████████| 175/175 [00:38<00:00,  4.49it/s]


input_data/norman_geneformer.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


In [29]:
## Fine-tuning
!mkdir ./finetuned_models

mkdir: cannot create directory './finetuned_models': File exists


In [7]:
# Generate a unique date-based identifier for each experiment
current_date = datetime.datetime.now()
datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}{current_date.hour:02d}{current_date.minute:02d}{current_date.second:02d}"
datestamp_min = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
print(datestamp)
print(datestamp_min)

251207030439
251207


In [2]:
datestamp = '251207030439'
datestamp_min = '251207'
output_prefix = "cm_dz_classifier"
output_dir = f"./finetuned_models/{datestamp}"

In [32]:
!mkdir $output_dir

In [71]:
training_args = {
    "num_train_epochs": 0.9,
    "learning_rate": 0.000804,
    "lr_scheduler_type": "polynomial",
    "warmup_steps": 1812,
    "weight_decay":0.258828,
    "per_device_train_batch_size": 12,
    "seed": 73,
}

cc = Classifier(classifier="cell",
                cell_state_dict={"state_key": "perturbation", "states": "all"},
                #filter_data={"split": ["train"]},
                training_args=training_args,
                max_ncells=None,
                freeze_layers=2,
                num_crossval_splits=1,
                forward_batch_size=32,
                nproc=8,
                model_version="V2" # default is V2, here set to V1 model to fit into Colab 40G GPU resources
  )

In [72]:
train_test_id_split_dict = {
    "attr_key": "split",
    "train": ["train"],
    "test": ["test"],
}

In [73]:
cc.prepare_data(input_data_file="./input_data/cm_tokenized.dataset",
                output_directory=output_dir,
                output_prefix=output_prefix,
                split_id_dict=train_test_id_split_dict)

In [80]:
# Perform fine-tuning with the training dataset, validating performance with held-out patients in the eval dataset.
os.environ["WANDB_DISABLED"] = "true"
all_metrics = cc.validate(
    model_directory=os.path.abspath("/home/c22200541/Geneformer/Geneformer-V2-104M/"), # set to V1 model to fit into Colab 40G GPU resources
    prepared_input_data_file=f"{output_dir}/{output_prefix}_labeled_train.dataset",
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    output_directory=os.path.abspath(output_dir),
    output_prefix=output_prefix,
    #attr_to_split="individual",
    #attr_to_balance=["disease","age","sex"],
    n_hyperopt_trials=0 	# Number of trials to run for hyperparameter optimization. Set it to 0 for direct training without hyperparameter optimization.
)

mkdir: cannot create directory '/home/c22200541/Geneformer/examples/finetuned_models/251207030439/251207_geneformer_cellClassifier_cm_dz_classifier/': File exists


  0%|          | 0/1 [00:00<?, ?it/s]

mkdir: cannot create directory '/home/c22200541/Geneformer/examples/finetuned_models/251207030439/251207_geneformer_cellClassifier_cm_dz_classifier/ksplit1': File exists


****** Validation split: 1/1 ******



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/c22200541/Geneformer/Geneformer-V2-104M and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
0,4.5962,4.565081,0.144595,0.001805


  0%|          | 0/178 [00:00<?, ?it/s]

In [81]:
# Evaluate the final model on held-out patients in the test dataset.
all_metrics_test = cc.evaluate_saved_model(
    model_directory=f"{output_dir}/{datestamp_min}_geneformer_cellClassifier_{output_prefix}/ksplit1/",
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    test_data_file=f"{output_dir}/{output_prefix}_labeled_test.dataset",
    output_directory=output_dir,
    output_prefix=output_prefix,
)

  0%|          | 0/1071 [00:00<?, ?it/s]

In [82]:
## Embedding extraction
!mkdir ./embs

In [3]:
embex = EmbExtractor(
    model_type="CellClassifier", # set to GeneClassifier or Pretrained for those model types
    num_classes=277, # number of classes of fine-tuned model
    #filter_data={"cell_type":["Cardiomyocyte"]}, # optionally can extract embeddings from a subset of cells based on the chosen attributes
    max_ncells=10000,
    emb_layer=0, # extracts embeddings from last layer
    emb_label=["state", "perturbation"],
    labels_to_plot=["state", "perturbation"],
    forward_batch_size=128,
    nproc=1,
    model_version="V2" # default is V2, here set to V1 model to fit into Colab 40G GPU resources
)

embs = embex.extract_embs(f"{output_dir}/{datestamp_min}_geneformer_cellClassifier_{output_prefix}/ksplit1/",
                          "./input_data/cm_tokenized.dataset",
                          "./embs",
                          "cm_finetuned_embs")

  0%|          | 0/79 [00:00<?, ?it/s]