## 组织数据成GeneFormer支持的格式


Geneformer 需要 loom 格式的单细胞数据来进行 tokenize。自己修改代码也可以，但比较麻烦，所以这里选择直接套用它的代码。

- loom 格式：行代表基因，列代表细胞；这里每一列是一个人（为了符合 GeneFormer 的需求）
- ra：记录蛋白信息的 df；必须包含 `ensembl_id`
- ca：记录每个人的各种信息；需要 `n_counts`；`filter_pass` 可选，用来表示该列是否保留，为 0 则会被删除


>其他需要的信息可以通过 传入一个字典来保留：`{"cell_type": "cell_type", "organ_major": "organ"}`




In [2]:
import pandas as pd

pd.set_option("display.max_columns", 500)
from pathlib import Path

tmpdir = "tmp/"

## load_data

save to loom 

In [3]:
# TODO: eid information is missing, and some proteins were filled with the mean;
# later the imputation may not be needed at all, because those proteins can
# simply be absent when the data is passed in.

train_imputed = pd.read_pickle("result/part1/train_imputed.pkl")
test_imputed = pd.read_pickle("result/part1/test_imputed.pkl")


# Every column of the test table from "C3" to the last one is taken as a
# protein column.
_test_columns = test_imputed.columns.tolist()
protein_cols = _test_columns[_test_columns.index("C3") :]

In [4]:
# Cast the outcome label to int in both splits.
for _split in (train_imputed, test_imputed):
    _split["incident_cad"] = _split["incident_cad"].astype(int)

In [5]:
from ppp_prediction.geneformer.in_silico_perturber_stats import (
    GENE_NAME_ID_DICTIONARY_FILE,
)
import loompy


def proteomics_to_loom(
    data,
    protein_cols,
    ca_cols=None,
    gene_name_id_dict_path=GENE_NAME_ID_DICTIONARY_FILE,
):
    """
    Split a samples-by-columns table into the three pieces passed to
    ``loompy.create``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per individual; contains the protein columns plus any
        metadata columns.
    protein_cols : sequence of str
        Columns of ``data`` holding protein values, named by gene name.
        Names missing from the gene-name dictionary are reported and dropped.
    ca_cols : str or sequence of str, optional
        Columns of ``data`` to export as column attributes. ``None``
        (default) exports every column that is not in ``protein_cols``.
    gene_name_id_dict_path : path-like
        Pickle file with the gene-name -> Ensembl-ID dictionary.

    Return: main_matrix, ra, ca
        main_matrix : 2-D array, one row per mapped protein and one column
            per row of ``data``.
        ra : ``{"ensembl_id": [...]}``, one entry per matrix row.
        ca : ``{column name: [...]}``, one entry per matrix column.
    """
    gene_name_id_dict = pd.read_pickle(gene_name_id_dict_path)

    ## check olink proteins in geneformer dict
    in_geneforer_proteins = [
        gene for gene in protein_cols if gene in gene_name_id_dict
    ]
    out_geneforer_proteins = [
        gene for gene in protein_cols if gene not in gene_name_id_dict
    ]
    print(f"out_geneformer_proteins: {out_geneforer_proteins}")
    print(
        f"Successly found {len(in_geneforer_proteins)} proteins in geneformer, only {len(out_geneforer_proteins)} proteins are not found in geneformer"
    )

    ## ca_attr_cols
    if ca_cols is None:
        # Set lookup keeps the filter linear in the number of columns.
        protein_set = set(protein_cols)
        ca_attr_cols = [col for col in data.columns if col not in protein_set]
    elif isinstance(ca_cols, str):
        # A bare string is one column name, not a sequence of characters.
        ca_attr_cols = [ca_cols]
    else:
        ca_attr_cols = list(ca_cols)
    ca_df = data[ca_attr_cols]

    ## gene_name => ensembl_id, then transpose so that rows are proteins
    main_df = data[in_geneforer_proteins].rename(columns=gene_name_id_dict).T

    ra_df = main_df.index.to_frame().reset_index(drop=True)
    ra_df.columns = ["ensembl_id"]

    main_df = main_df.values
    # Sanity checks: attribute tables must line up with the matrix axes.
    assert len(ra_df) == main_df.shape[0]
    assert len(ca_df) == main_df.shape[1]
    print(f"finnal main_df shape: {main_df.shape}")

    return main_df, ra_df.to_dict("list"), ca_df.to_dict("list")

2024-04-24 11:51:51.572214: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
Path(tmpdir).mkdir(parents=True, exist_ok=True)

# Convert each split and write it as a .loom file under tmpdir
# (train first, then test).
for _frame, _loom_path in (
    (train_imputed, f"{tmpdir}/2_train_imputed.loom"),
    (test_imputed, f"{tmpdir}/2_test_imputed.loom"),
):
    main_matrix, ra, ca = proteomics_to_loom(_frame, protein_cols)
    loompy.create(_loom_path, main_matrix, ra, ca)

out_geneformer_proteins: ['DEFB103A_DEFB103B', 'BTNL10', 'ARNTL', 'SKIV2L', 'DEFB104A_DEFB104B', 'GBA', 'MYLPF', 'BOLA2_BOLA2B', 'MENT', 'CTAG1A_CTAG1B', 'GPR15L', 'SARG', 'PALM2', 'BAP18', 'SPACA5_SPACA5B', 'CGB3_CGB5_CGB8', 'DDX58', 'CERT', 'DEFA1_DEFA1B', 'DEFB4A_DEFB4B', 'IL12A_IL12B', 'CKMT1A_CKMT1B', 'LGALS7_LGALS7B', 'MICB_MICA', 'NTproBNP', 'WARS', 'EBI3_IL27', 'FUT3_FUT5']
Successly found 2883 proteins in geneformer, only 28 proteins are not found in geneformer
finnal main_df shape: (2883, 36007)
out_geneformer_proteins: ['DEFB103A_DEFB103B', 'BTNL10', 'ARNTL', 'SKIV2L', 'DEFB104A_DEFB104B', 'GBA', 'MYLPF', 'BOLA2_BOLA2B', 'MENT', 'CTAG1A_CTAG1B', 'GPR15L', 'SARG', 'PALM2', 'BAP18', 'SPACA5_SPACA5B', 'CGB3_CGB5_CGB8', 'DDX58', 'CERT', 'DEFA1_DEFA1B', 'DEFB4A_DEFB4B', 'IL12A_IL12B', 'CKMT1A_CKMT1B', 'LGALS7_LGALS7B', 'MICB_MICA', 'NTproBNP', 'WARS', 'EBI3_IL27', 'FUT3_FUT5']
Successly found 2883 proteins in geneformer, only 28 proteins are not found in geneformer
finnal main_df

## tokenize + embedding => datasets

In [6]:
from geneformer.tokenizer import TOKEN_DICTIONARY_FILE

from __future__ import annotations

import logging
import pickle
import warnings
from pathlib import Path
from typing import Literal

import anndata as ad
import numpy as np
import scipy.sparse as sp
from datasets import Dataset

warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")  # noqa
import loompy as lp  # noqa

logger = logging.getLogger(__name__)

import loompy as lp
import numpy as np


def rank_genes(gene_vector, gene_tokens):
    """
    Return ``gene_tokens`` reordered so that the entry with the largest
    value in ``gene_vector`` comes first.
    """
    # argsort is ascending, so sort the negated values to get descending order
    descending_order = np.argsort(np.negative(gene_vector))
    ranked_tokens = gene_tokens[descending_order]
    return ranked_tokens


def tokenize_ind(gene_vector, gene_tokens):
    """
    Build the rank value encoding for one individual: drop entries whose
    value is exactly zero, then order the remaining tokens with
    ``rank_genes``.
    """
    # positions of the entries that were actually measured (non-zero)
    detected = np.nonzero(gene_vector)[0]
    detected_values = gene_vector[detected]
    detected_tokens = gene_tokens[detected]
    return rank_genes(detected_values, detected_tokens)


class ProteomicsTokenizer:
    """
    Tokenizer that turns the columns of a .loom file into rank value
    encodings using the Geneformer token dictionary.

    Unlike the commented-out median normalization, values are ranked as
    stored in the file (no per-gene scaling is applied).
    """

    def __init__(
        self,
        custom_attr_name_dict=None,
        nproc=1,
        chunk_size=512,
        model_input_size=2048,
        special_token=False,
        # gene_median_file=GENE_MEDIAN_FILE,
        token_dictionary_file=TOKEN_DICTIONARY_FILE,
    ):
        """
        Initialize tokenizer.

        **Parameters:**

        custom_attr_name_dict : None, dict
            | Dictionary of custom attributes to be added to the dataset.
            | Keys are the names of the attributes in the loom file.
            | Values are the names of the attributes in the dataset.
        nproc : int
            | Number of processes to use for dataset mapping.
        chunk_size : int = 512
            | Number of loom columns read per scan batch.
        model_input_size : int = 2048
            | Max input size of model to truncate input to.
        special_token : bool = False
            | Adds CLS token before and SEP token after rank value encoding.
        # gene_median_file : Path
        #     | Path to pickle file containing dictionary of non-zero median
        #     | gene expression values across Genecorpus-30M.
        token_dictionary_file : Path
            | Path to pickle file containing token dictionary (Ensembl IDs:token).

        """
        # dictionary of custom attributes {input .loom column attribute name: output dataset column name}
        self.custom_attr_name_dict = custom_attr_name_dict

        # number of processes for dataset mapping
        self.nproc = nproc

        # number of loom columns read per scan batch
        self.chunk_size = chunk_size

        # input size for tokenization
        self.model_input_size = model_input_size

        # add CLS and SEP tokens
        self.special_token = special_token

        # load dictionary of gene normalization factors
        # (non-zero median value of expression across Genecorpus-30M)
        # with open(gene_median_file, "rb") as f:
        #     self.gene_median_dict = pickle.load(f)

        # load token dictionary (Ensembl IDs:token)
        # NOTE: unpickling can execute arbitrary code; only pass trusted files.
        with open(token_dictionary_file, "rb") as f:
            self.gene_token_dict = pickle.load(f)

        # gene keys for full vocabulary
        self.gene_keys = list(self.gene_token_dict)

        # lookup used to select the .loom rows that have a token
        self.genelist_dict = dict.fromkeys(self.gene_keys, True)

    def tokenize_loom(self, loom_file_path, target_sum=10_000):
        """
        Tokenize every selected column of a .loom file.

        **Parameters:**

        loom_file_path : Path or str
            | Loom file with row attribute ``ensembl_id``. If the column
            | attribute ``filter_pass`` exists, only columns where it equals 1
            | are tokenized; otherwise all columns are.
        target_sum : int
            | Not used by this implementation; kept so the signature stays
            | compatible with existing callers.

        **Returns:**

        tokenized_ind : list
            | One token array per tokenized column, produced by ``tokenize_ind``.
        file_ind_metadata : None, dict
            | ``None`` when no custom attributes were requested, otherwise
            | ``{loom attribute name: list of values}`` aligned with
            | ``tokenized_ind``.
        """
        # Bind the metadata container up front so the return value is defined
        # even when the scan below yields no batch at all.
        file_ind_metadata = None
        if self.custom_attr_name_dict is not None:
            file_ind_metadata = {
                attr_key: [] for attr_key in self.custom_attr_name_dict
            }

        with lp.connect(str(loom_file_path)) as data:
            ensembl_ids = data.ra["ensembl_id"]

            # row coordinates of the genes that are present in the token dictionary
            coding_miRNA_loc = np.where(
                [self.genelist_dict.get(i, False) for i in ensembl_ids]
            )[0]

            # norm_factor_vector = np.array(
            #     [
            #         self.gene_median_dict[i]
            #         for i in data.ra["ensembl_id"][coding_miRNA_loc]
            #     ]
            # )
            coding_miRNA_ids = ensembl_ids[coding_miRNA_loc]

            not_in_gene_ids = set(ensembl_ids) - set(self.gene_keys)
            print(
                f"{len(not_in_gene_ids)} genes not in gene token dictionary, skipping them, some are: {list(not_in_gene_ids)[:5]}"
            )

            coding_miRNA_tokens = np.array(
                [self.gene_token_dict[i] for i in coding_miRNA_ids]
            )

            # define coordinates of individuals passing filters for inclusion (e.g. QC)
            try:
                filter_pass = data.ca["filter_pass"]
            except AttributeError:
                print(
                    f"{loom_file_path} has no column attribute 'filter_pass'; tokenizing all inds."
                )
                filter_pass_loc = np.arange(data.shape[1])
            else:
                filter_pass_loc = np.where([i == 1 for i in filter_pass])[0]

            # scan through the .loom file and tokenize inds
            tokenized_ind = []
            for _ix, _selection, view in data.scan(
                items=filter_pass_loc, axis=1, batch_size=self.chunk_size
            ):
                # select subview with the rows that have a token
                subview = view.view[coding_miRNA_loc, :]
                # Currently do not norm, as the values are NPX by UKB

                # tokenize subview gene vectors
                tokenized_ind += [
                    tokenize_ind(subview[:, i], coding_miRNA_tokens)
                    for i in range(subview.shape[1])
                ]

                # add custom attributes for subview to dict
                if file_ind_metadata is not None:
                    for k in file_ind_metadata:
                        file_ind_metadata[k] += subview.ca[k].tolist()

        return tokenized_ind, file_ind_metadata

    def create_dataset(
        self,
        tokenized_inds,
        ind_metadata,
        use_generator=False,
        keep_uncropped_input_ids=False,
    ):
        """
        Build a ``datasets.Dataset`` from tokenized individuals.

        **Parameters:**

        tokenized_inds : list
            | Token sequences, e.g. the first value returned by ``tokenize_loom``.
        ind_metadata : None, dict
            | Per-individual metadata keyed by loom attribute name, e.g. the
            | second value returned by ``tokenize_loom``. Required when
            | ``custom_attr_name_dict`` is set. Columns are renamed to the
            | values of ``custom_attr_name_dict``; keys not found there keep
            | their name.
        use_generator : bool = False
            | Build the dataset with ``Dataset.from_generator`` instead of
            | ``Dataset.from_dict``.
        keep_uncropped_input_ids : bool = False
            | Also store the untruncated sequence and its length.

        **Returns:**

        Dataset with ``input_ids`` truncated to ``model_input_size`` (including
        CLS/SEP when ``special_token`` is set), a ``length`` column, and the
        metadata columns.

        **Raises:**

        ValueError if ``special_token`` is set but ``<cls>`` or ``<sep>`` is
        missing from the token dictionary.
        TypeError if custom attributes are configured but ``ind_metadata`` is None.
        """
        # Resolve the special tokens once and fail loudly: inserting a missing
        # token (None) would silently corrupt every sequence.
        cls_token = sep_token = None
        if self.special_token:
            cls_token = self.gene_token_dict.get("<cls>")
            sep_token = self.gene_token_dict.get("<sep>")
            if cls_token is None or sep_token is None:
                raise ValueError(
                    "special_token=True requires '<cls>' and '<sep>' in the token dictionary"
                )

        print("Creating dataset.")
        # create dict for dataset creation
        dataset_dict = {"input_ids": tokenized_inds}
        if self.custom_attr_name_dict is not None:
            if ind_metadata is None:
                raise TypeError(
                    "ind_metadata is required when custom_attr_name_dict is set"
                )
            # apply the loom-name -> dataset-name mapping
            dataset_dict.update(
                {
                    self.custom_attr_name_dict.get(loom_name, loom_name): values
                    for loom_name, values in ind_metadata.items()
                }
            )

        # create dataset
        if use_generator:

            def dict_generator():
                for i in range(len(tokenized_inds)):
                    yield {k: column[i] for k, column in dataset_dict.items()}

            output_dataset = Dataset.from_generator(dict_generator, num_proc=self.nproc)
        else:
            output_dataset = Dataset.from_dict(dataset_dict)

        def format_ind_features(example):
            # Store original uncropped input_ids in separate feature
            if keep_uncropped_input_ids:
                example["input_ids_uncropped"] = example["input_ids"]
                example["length_uncropped"] = len(example["input_ids"])

            # Truncate/Crop input_ids to input size
            if self.special_token:
                example["input_ids"] = example["input_ids"][
                    0 : self.model_input_size - 2
                ]  # truncate to leave space for CLS and SEP token
                example["input_ids"] = np.insert(example["input_ids"], 0, cls_token)
                example["input_ids"] = np.insert(
                    example["input_ids"],
                    len(example["input_ids"]),
                    sep_token,
                )
            else:
                example["input_ids"] = example["input_ids"][0 : self.model_input_size]
            example["length"] = len(example["input_ids"])

            return example

        output_dataset_truncated = output_dataset.map(
            format_ind_features, num_proc=self.nproc
        )
        return output_dataset_truncated

In [6]:
import torch

import sys
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("ctheodoris/Geneformer")

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
model = AutoModelForMaskedLM.from_pretrained("ctheodoris/Geneformer").to(device)


def get_embeddings(example):
    """
    Add the model's last hidden state to a batch of tokenized examples.

    Written for ``Dataset.map(..., batched=True)``: reads
    ``example["input_ids"]``, runs the module-level ``model.bert`` on the
    module-level ``device`` and stores the result as a numpy array under
    ``example["embeddings"]``.

    NOTE(review): assumes every sequence in the batch has the same length,
    since they are packed into one tensor without padding — confirm.
    """
    # Build an integer tensor directly; torch.Tensor(...) would create a
    # float32 tensor first and only then cast it to long.
    inputs = torch.tensor(example["input_ids"], dtype=torch.long, device=device)

    # Inference only: skip autograd bookkeeping so activations are not kept.
    with torch.no_grad():
        hidden_states = model.bert(inputs).last_hidden_state

    example["embeddings"] = hidden_states.cpu().numpy()

    return example

loom_dir = ""

# Tokenize each split, attach embeddings and save the dataset.
# Each entry: (input loom file, output directory, worker count for the embedding map).
# NOTE(review): the train split maps get_embeddings with 4 worker processes
# while the test split uses 1; the captured run of this cell ended in a
# KeyboardInterrupt — confirm whether the multi-process map is the cause.
for loom_file_path, outputpath, _embed_workers in (
    (f"{tmpdir}/2_train_imputed.loom", f"{tmpdir}/imputed_200/train", 4),  # train
    (f"{tmpdir}/2_test_imputed.loom", f"{tmpdir}/imputed_200/test", 1),  # test
):
    Path(outputpath).mkdir(parents=True, exist_ok=True)

    # TODO: model_input_size may be larger if it is ok; special_token=True if we want to add CLS and SEP tokens
    proteomics_tokenizer = ProteomicsTokenizer(
        {"incident_cad": "incident_cad", "eid": "eid"},
        model_input_size=200,
        special_token=False,  # TODO: <cls> not in the dictionary
    )

    # tokenize
    tokenized_ind, file_ = proteomics_tokenizer.tokenize_loom(loom_file_path)

    # create dataset
    output_dataset_truncated = proteomics_tokenizer.create_dataset(
        tokenized_ind, file_
    )

    # add embeddings batch by batch
    output_dataset_truncated = output_dataset_truncated.map(
        get_embeddings, batched=True, batch_size=256, num_proc=_embed_workers
    )

    # save to disk
    output_dataset_truncated.save_to_disk(outputpath)

KeyboardInterrupt: 

In [8]:
import datasets

train_Dataset = datasets.load_from_disk(f"{tmpdir}/imputed_200/train")
train_Dataset

Dataset({
    features: ['input_ids', 'incident_cad', 'eid', 'length'],
    num_rows: 36007
})

## training 

In [None]:

output_dataset_truncated.save_to_disk(outputpath)  # save to disk

In [45]:
# Tokenize the test split (keeping only the incident_cad label) and save it.
loom_file_path = f"{tmpdir}/2_test_imputed.loom"
outputpath = f"{tmpdir}/imputed_200/test"
Path(outputpath).mkdir(parents=True, exist_ok=True)

# TODO: model_input_size may be larger if it is ok; special_token=True if we want to add CLS and SEP tokens
proteomics_tokenizer = ProteomicsTokenizer(
    custom_attr_name_dict={"incident_cad": "incident_cad"},
    model_input_size=200,
    special_token=False,
)

# tokenize
tokenized_ind, file_ = proteomics_tokenizer.tokenize_loom(loom_file_path)

# create dataset
output_dataset_truncated = proteomics_tokenizer.create_dataset(
    tokenized_inds=tokenized_ind,
    ind_metadata=file_,
)

# save to disk
output_dataset_truncated.save_to_disk(outputpath)

13 genes not in gene token dictionary, skipping them, some are: ['ENSG00000275841', 'ENSG00000291237', 'ENSG00000228789', 'ENSG00000266200', 'ENSG00000248546']
tmp//2_test_imputed.loom has no column attribute 'filter_pass'; tokenizing all inds.
Creating dataset.


Map:   0%|          | 0/15432 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15432 [00:00<?, ? examples/s]

In [46]:
output_dataset_truncated

Dataset({
    features: ['input_ids', 'incident_cad', 'length'],
    num_rows: 15432
})

## get embedding

In [None]:
# !source ~/vpn.sh


# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("ctheodoris/Geneformer")
model = AutoModelForMaskedLM.from_pretrained("ctheodoris/Geneformer")

In [None]:
from datasets import load_from_disk

dataset = load_from_disk(f"{tmpdir}/imputed_200/test")
dataset[0]

In [38]:
import torch

import sys
import numpy as np


def get_embeddings(example):
    """
    Add the model's last hidden state to a batch of tokenized examples.

    Written for ``Dataset.map(..., batched=True)``: reads
    ``example["input_ids"]``, runs the module-level ``model.bert`` and stores
    the result as a numpy array under ``example["embeddings"]``.

    NOTE(review): assumes every sequence in the batch has the same length,
    since they are packed into one tensor without padding — confirm.
    """
    # Build an integer tensor directly (torch.Tensor(...) would create a
    # float32 tensor first) and place it on the device the model lives on.
    inputs = torch.tensor(
        example["input_ids"], dtype=torch.long, device=model.device
    )

    # Inference only: skip autograd bookkeeping so activations are not kept.
    with torch.no_grad():
        hidden_states = model.bert(inputs).last_hidden_state

    example["embeddings"] = hidden_states.cpu().numpy()

    return example


test = dataset.map(get_embeddings, batched=True, batch_size=128, num_proc=4)

Map (num_proc=4):   0%|          | 0/15432 [00:00<?, ? examples/s]

In [None]:
test[0]["embeddings"]

: 

: 

In [36]:
test[0]["a"]

[[2.443294683019979,
  -0.28211410571610945,
  1.2625037373664558,
  -1.4698392975720649,
  0.412663431492597,
  -0.6720643058804685,
  0.6370625505509272,
  0.4383611692883055,
  -0.9469268261102411,
  0.1866431795129508],
 [2.018076132952118,
  -1.396561747879193,
  0.3363108759124171,
  -0.9575563130223151,
  0.06698016665688715,
  -0.30448374389087024,
  -0.390177235468002,
  0.996180871902987,
  0.6756530429210247,
  1.483116023083148],
 [2.213912828257484,
  -0.9047191237407438,
  -0.16463655938118052,
  0.5839528701611253,
  -1.2072897546288042,
  1.1303511706142206,
  0.29084594055370894,
  2.5040704770002056,
  -1.330320759409782,
  -0.2883352069137411],
 [-0.1312388478695491,
  0.365759316566558,
  -1.289969055516189,
  1.6630244192019943,
  0.1718978327084064,
  -1.3203740999948013,
  0.23104629638514015,
  -0.6317994372823559,
  2.6457231413562967,
  1.3812916242164794],
 [-0.9094756971678113,
  0.7200806249987156,
  -0.21279409621144432,
  1.0593773664464585,
  -0.79217097

In [None]:
# import torch


# def get_embeddings(example):
#     inputs = torch.Tensor(example["input_ids"]).unsqueeze(0).long()
#     outputs = model.bert(inputs)
#     embeddings = outputs.last_hidden_state.cpu().detach().numpy()

#     example["embeddings"] = embeddings
#     print(example)
#     return example


# dataset.map(get_embeddings, batched=True, batch_size=32, num_proc=4)

In [None]:


# o = model(torch.Tensor(dataset[0]["input_ids"]).unsqueeze(0).long())
# o

In [None]:
model.eval()
o = model.bert(torch.Tensor(dataset[0]["input_ids"]).unsqueeze(0).long())
o

In [None]:
o.last_hidden_state

In [None]:
data = test_imputed
protein_cols = protein_cols

In [None]:
gene_name_id_dict_geneformer = pd.read_pickle(GENE_NAME_ID_DICTIONARY_FILE)

## split the olink proteins by whether the geneformer dict contains them
in_geneforer_proteins = []
out_geneforer_proteins = []
for gene in protein_cols:
    if gene in gene_name_id_dict_geneformer:
        in_geneforer_proteins.append(gene)
    else:
        out_geneforer_proteins.append(gene)
print(f"out_geneformer_proteins: {out_geneforer_proteins}")
print(
    f"Successly found {len(in_geneforer_proteins)} proteins in geneformer, only {len(out_geneforer_proteins)} proteins are not found in geneformer"
)


## column attributes: every column that is not a protein column
ca_attr_cols = [col for col in data.columns if col not in protein_cols]
ca_df = data[ca_attr_cols]

## gene_name => ensembl_id on the columns, then transpose (rows become proteins)
_mapped = data[in_geneforer_proteins].rename(columns=gene_name_id_dict_geneformer)
main_df = _mapped.T

## row attributes: one "ensembl_id" column taken from the matrix row labels
ra_df = main_df.index.to_frame().reset_index(drop=True)
ra_df.columns = ["ensembl_id"]

main_df = main_df.values
# the attribute tables must line up with the matrix axes
assert main_df.shape[0] == len(ra_df)
assert main_df.shape[1] == len(ca_df)
print(f"finnal main_df shape: {main_df.shape}")

In [None]:
import loompy

loompy.create(
    "2_test_imputed.loom", main_df, ra_df.to_dict("list"), ca_df.to_dict("list")
)

In [None]:
"ENSG00000175164" in list(gene_name_id_dict_geneformer.values())

In [None]:
train_imputed[protein_cols]

In [None]:
import pyranges as pr

ensemble_anno_hs_path = "/home/xutingfeng/ukb/externel/Homo_sapiens.GRCh38.111.gff3.gz"
ensemble_anno_hs = pr.read_gff3(ensemble_anno_hs_path)  # load gff3
ensemble_anno_hs_gene = ensemble_anno_hs[ensemble_anno_hs.Feature == "gene"]

In [None]:
ensemble_anno_hs_gene[ensemble_anno_hs_gene.Name == "ABO"]

In [None]:
gene_name_id_dict_geneformer = pd.read_pickle(
    "/home/xutingfeng/github_code/others/Geneformer/geneformer/gene_name_id_dict.pkl"
)

gene_name_id_dict_geneformer["ABO"]

In [None]:
ENSG00000175164