## 组织数据成GeneFormer支持的格式


Geneformer 的 tokenizer 需要 loom 格式的单细胞数据。自己修改它的代码也可以，但比较麻烦，所以这里把数据整理成它要求的格式，直接复用它的代码。

- loom 格式：行代表基因，列代表细胞；这里每一列是一个人（为了符合 GeneFormer 的输入要求，把人当作“细胞”）
- ra：记录蛋白信息的 df；必须包含 `ensembl_id`
- ca：记录人的各种信息；`n_counts` 必须有；`filter_pass` 可选，用来表示该列是否保留，为 0 则会被删除


>其他需要的信息可以通过 传入一个字典来保留：`{"cell_type": "cell_type", "organ_major": "organ"}`




In [1]:
import pandas as pd

# Allow up to 500 columns when DataFrames are displayed.
pd.options.display.max_columns = 500

In [2]:
# proteomics = pd.read_pickle(
#     "/home/xutingfeng/ukb/ukbData/omics/proteomics/Olink_v1.pkl"
# )
# proteomics
# ukbData = "/home/xutingfeng/ukb/ukbData/"

# all_pheno_path = ukbData + "phenotypes/tf/all_pheno.pkl"
# all_pheno = pd.read_pickle(all_pheno_path)

In [3]:
# TODO: the eid information is missing, and some proteins were filled with the
# mean; the imputation may not be needed at all later on, because those
# proteins can simply be left out when the data is passed in.

# Read both pickled splits in one pass (train first, then test).
train_imputed, test_imputed = map(
    pd.read_pickle, ("1_train_imputed.pkl", "1_test_imputed.pkl")
)

In [4]:
import json

# Parse the JSON file and keep the entry stored under "all_protein".
with open("1_X_combination_dict.json", "r") as f:
    combination_json = json.loads(f.read())

protein_cols = combination_json["all_protein"]

In [5]:
# Move the current index of each split into a regular column named "eid".
train_imputed = train_imputed.reset_index(drop=False, names="eid")
test_imputed = test_imputed.reset_index(drop=False, names="eid")

In [7]:
# Cast the "incident_cad" column of each split to int.
train_imputed = train_imputed.astype({"incident_cad": int})
test_imputed = test_imputed.astype({"incident_cad": int})

In [9]:
from geneformer.in_silico_perturber_stats import GENE_NAME_ID_DICTIONARY_FILE
import loompy


def proteomics_to_loom(
    data,
    protein_cols,
    ca_cols=None,
    gene_name_id_dict_path=GENE_NAME_ID_DICTIONARY_FILE,
):
    """Build the matrix, row attributes and column attributes for a loom file.

    Protein columns whose names are keys of the pickled name -> ID dictionary
    are selected from ``data``, renamed to the mapped IDs and transposed, so
    the resulting matrix has one row per protein and one column per row of
    ``data``. Proteins that are not keys of the dictionary are reported with
    ``print`` and left out of the matrix.

    Args:
        data: pandas DataFrame with one row per sample.
        protein_cols: iterable of protein column names to look up.
        ca_cols: column name or list of column names of ``data`` to export as
            column attributes. ``None`` (the default) exports every column of
            ``data`` that is not listed in ``protein_cols``.
        gene_name_id_dict_path: path of the pickled name -> ID mapping.

    Returns:
        A tuple ``(main_matrix, ra, ca)``:
        ``main_matrix`` is the 2-D array of shape (n_found_proteins, n_samples),
        ``ra`` is ``{"ensembl_id": [...]}`` with one entry per matrix row,
        ``ca`` maps each exported column name to its list of per-sample values.

    Raises:
        KeyError: if a column requested through ``ca_cols`` (or a found
            protein column) is not present in ``data``.
    """
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    gene_name_id_dict = pd.read_pickle(gene_name_id_dict_path)

    # Materialise once so generators work and membership tests are O(1).
    protein_cols = list(protein_cols)
    protein_set = set(protein_cols)

    ## check olink proteins in geneformer dict
    in_geneforer_proteins = [g for g in protein_cols if g in gene_name_id_dict]
    out_geneforer_proteins = [g for g in protein_cols if g not in gene_name_id_dict]
    print(f"out_geneformer_proteins: {out_geneforer_proteins}")
    print(
        f"Successly found {len(in_geneforer_proteins)} proteins in geneformer, only {len(out_geneforer_proteins)} proteins are not found in geneformer"
    )

    ## column attributes: honour ca_cols when given, otherwise keep the
    ## previous behaviour of exporting every non-protein column.
    if ca_cols is None:
        ca_attr_cols = [col for col in data.columns if col not in protein_set]
    elif isinstance(ca_cols, str):
        ca_attr_cols = [ca_cols]
    else:
        ca_attr_cols = list(ca_cols)
    ca_df = data[ca_attr_cols]

    ## gene_name => ensembl_id, then transpose to proteins x samples
    main_df = data[in_geneforer_proteins].rename(columns=gene_name_id_dict).T

    # The transposed index now holds the mapped IDs; expose it as one column.
    ra_df = main_df.index.to_frame(index=False, name="ensembl_id")

    main_matrix = main_df.values
    assert len(ra_df) == main_matrix.shape[0]
    assert len(ca_df) == main_matrix.shape[1]
    print(f"finnal main_df shape: {main_matrix.shape}")

    return main_matrix, ra_df.to_dict("list"), ca_df.to_dict("list")

In [10]:
# Convert the training split and write the result to a loom file.
main_matrix, ra, ca = proteomics_to_loom(data=train_imputed, protein_cols=protein_cols)

loompy.create(
    filename="2_train_imputed.loom",
    layers=main_matrix,
    row_attrs=ra,
    col_attrs=ca,
)

out_geneformer_proteins: ['DEFB103A_DEFB103B', 'BTNL10', 'ARNTL', 'SKIV2L', 'DEFB104A_DEFB104B', 'GBA', 'MYLPF', 'BOLA2_BOLA2B', 'MENT', 'CTAG1A_CTAG1B', 'GPR15L', 'SARG', 'PALM2', 'BAP18', 'SPACA5_SPACA5B', 'CGB3_CGB5_CGB8', 'DDX58', 'CERT', 'DEFA1_DEFA1B', 'DEFB4A_DEFB4B', 'IL12A_IL12B', 'CKMT1A_CKMT1B', 'LGALS7_LGALS7B', 'MICB_MICA', 'NTproBNP', 'WARS', 'EBI3_IL27', 'FUT3_FUT5']
Successly found 2883 proteins in geneformer, only 28 proteins are not found in geneformer
finnal main_df shape: (2883, 40806)


In [11]:
# Convert the test split and write the result to a loom file.
main_matrix, ra, ca = proteomics_to_loom(data=test_imputed, protein_cols=protein_cols)

loompy.create(
    filename="2_test_imputed.loom",
    layers=main_matrix,
    row_attrs=ra,
    col_attrs=ca,
)

out_geneformer_proteins: ['DEFB103A_DEFB103B', 'BTNL10', 'ARNTL', 'SKIV2L', 'DEFB104A_DEFB104B', 'GBA', 'MYLPF', 'BOLA2_BOLA2B', 'MENT', 'CTAG1A_CTAG1B', 'GPR15L', 'SARG', 'PALM2', 'BAP18', 'SPACA5_SPACA5B', 'CGB3_CGB5_CGB8', 'DDX58', 'CERT', 'DEFA1_DEFA1B', 'DEFB4A_DEFB4B', 'IL12A_IL12B', 'CKMT1A_CKMT1B', 'LGALS7_LGALS7B', 'MICB_MICA', 'NTproBNP', 'WARS', 'EBI3_IL27', 'FUT3_FUT5']
Successly found 2883 proteins in geneformer, only 28 proteins are not found in geneformer
finnal main_df shape: (2883, 10195)


In [121]:
# Display the row-attribute dict currently bound to `ra`.
# NOTE(review): this cell's execution count is out of sequence, so which call
# produced `ra` cannot be told from here — confirm on a fresh run.
ra

{'ensembl_id': ['ENSG00000125730',
  'ENSG00000169035',
  'ENSG00000137880',
  'ENSG00000188811',
  'ENSG00000189058',
  'ENSG00000111640',
  'ENSG00000115129',
  'ENSG00000128510',
  'ENSG00000182718',
  'ENSG00000132463',
  'ENSG00000166090',
  'ENSG00000072571',
  'ENSG00000172590',
  'ENSG00000124374',
  'ENSG00000137492',
  'ENSG00000170345',
  'ENSG00000102678',
  'ENSG00000057757',
  'ENSG00000136114',
  'ENSG00000148334',
  'ENSG00000101892',
  'ENSG00000166347',
  'ENSG00000133958',
  'ENSG00000198569',
  'ENSG00000144834',
  'ENSG00000119705',
  'ENSG00000074054',
  'ENSG00000165916',
  'ENSG00000240403',
  'ENSG00000166681',
  'ENSG00000101132',
  'ENSG00000110987',
  'ENSG00000108055',
  'ENSG00000156222',
  'ENSG00000151465',
  'ENSG00000121634',
  'ENSG00000077009',
  'ENSG00000107485',
  'ENSG00000145920',
  'ENSG00000058335',
  'ENSG00000140285',
  'ENSG00000164331',
  'ENSG00000119707',
  'ENSG00000151033',
  'ENSG00000170312',
  'ENSG00000107175',
  'ENSG00000137504',

In [52]:
# Bind the frame used by the step-by-step cells that follow. `protein_cols` is
# already defined, so the former `protein_cols = protein_cols` no-op is dropped.
data = test_imputed

In [81]:
# Step-by-step version of the conversion, run on `data` at notebook level.
gene_name_id_dict_geneformer = pd.read_pickle(GENE_NAME_ID_DICTIONARY_FILE)

## split the protein names by whether they are keys of the dictionary
in_geneforer_proteins = [
    gene for gene in protein_cols if gene in gene_name_id_dict_geneformer
]
out_geneforer_proteins = [
    gene for gene in protein_cols if gene not in gene_name_id_dict_geneformer
]
print(f"out_geneformer_proteins: {out_geneforer_proteins}")
print(
    f"Successly found {len(in_geneforer_proteins)} proteins in geneformer, only {len(out_geneforer_proteins)} proteins are not found in geneformer"
)

## column attributes: every column of `data` that is not a protein column
ca_attr_cols = data.columns[~data.columns.isin(protein_cols)].tolist()
ca_df = data.loc[:, ca_attr_cols]

## gene_name => ensembl_id, then transpose so rows are proteins
main_df = data[in_geneforer_proteins].rename(columns=gene_name_id_dict_geneformer).T

## the transposed index holds the mapped IDs; expose it as one column
ra_df = main_df.index.to_frame(index=False, name="ensembl_id")

main_df = main_df.to_numpy()
assert main_df.shape[0] == len(ra_df)
assert main_df.shape[1] == len(ca_df)
print(f"finnal main_df shape: {main_df.shape}")

out_geneformer_proteins: ['DEFB103A_DEFB103B', 'BTNL10', 'ARNTL', 'SKIV2L', 'DEFB104A_DEFB104B', 'GBA', 'MYLPF', 'BOLA2_BOLA2B', 'MENT', 'CTAG1A_CTAG1B', 'GPR15L', 'SARG', 'PALM2', 'BAP18', 'SPACA5_SPACA5B', 'CGB3_CGB5_CGB8', 'DDX58', 'CERT', 'DEFA1_DEFA1B', 'DEFB4A_DEFB4B', 'IL12A_IL12B', 'CKMT1A_CKMT1B', 'LGALS7_LGALS7B', 'MICB_MICA', 'NTproBNP', 'WARS', 'EBI3_IL27', 'FUT3_FUT5']
Successly found 2883 proteins in geneformer, only 28 proteins are not found in geneformer
finnal main_df shape: (2883, 10195)


In [83]:
import loompy

# Write the step-by-step result to a loom file; the file name is the same one
# used by the earlier test-split export.
loompy.create(
    filename="2_test_imputed.loom",
    layers=main_df,
    row_attrs=ra_df.to_dict("list"),
    col_attrs=ca_df.to_dict("list"),
)

In [87]:
"ENSG00000175164" in list(gene_name_id_dict_geneformer.values())

True

In [14]:
# Display the protein columns of the training frame.
train_imputed[protein_cols]

Unnamed: 0,C3,KLK7,GCHFR,NHLRC3,APOD,GAPDH,TP53I3,CPA4,ANXA2,GRSF1,...,EGFR,TGFBR3,CRTAC1,IGFBP7,SELE,VWF,NOTCH3,CNTN1,ENG,ICAM2
0,-0.099300,1.1771,0.0168,0.0085,0.64210,0.30385,0.7117,0.5761,-0.52325,1.555400,...,-0.004284,-0.0087,-0.029539,0.022568,-0.027118,0.008048,0.004249,0.000619,0.001707,-0.026825
1,0.260000,-0.3965,-0.5041,0.5323,-1.03430,0.10195,-0.0746,0.0584,0.83135,0.367100,...,0.061000,0.0200,-0.129250,0.767800,0.475400,-0.998200,-0.191000,0.257100,-0.035500,-0.062800
2,0.116000,0.3652,0.4443,0.1803,-0.30610,-0.39415,-0.1995,0.3213,0.60255,0.141800,...,0.192700,0.4141,0.617050,0.255300,0.516900,-0.778000,0.754700,0.518900,0.202900,-0.106700
3,-0.334800,-0.0388,0.1682,-0.1796,0.02491,-0.00610,0.6901,0.0147,-0.15590,1.034800,...,-0.100150,0.0859,-0.321950,0.610200,-1.247300,1.797250,0.728800,-0.612500,0.098050,0.265000
4,0.098900,0.9519,0.4342,-0.1683,0.02610,0.36655,0.7529,0.7505,0.04335,0.082175,...,0.131400,0.0614,0.043250,-0.606100,-0.295200,1.646800,-0.164000,0.155000,-0.083700,-0.376600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40801,0.861200,0.1095,0.0088,-0.0773,-0.17580,-0.05135,-0.3694,0.1716,-0.02755,-0.288100,...,0.245600,-0.4848,-0.479450,0.000000,-0.328900,0.679200,-0.450600,0.027300,-0.152600,-0.679800
40802,-0.599800,0.0836,-0.1788,0.0775,0.04360,0.17520,0.1326,-0.0604,0.09250,1.196700,...,0.122300,0.0860,0.010300,-0.043300,-0.684200,0.162300,0.486000,0.251500,0.034900,-0.082700
40803,0.506400,0.3565,-0.0931,0.1316,0.26370,-0.41960,-0.5433,0.1315,-0.10240,-0.501700,...,-0.025950,0.2433,0.594750,0.013200,-0.094450,0.558400,0.444300,0.083100,-0.033300,0.026500
40804,-0.350000,0.3262,-0.5348,0.3078,-0.11740,-0.01050,0.2912,-0.0875,-0.17940,-0.039700,...,-0.338600,0.4135,0.283050,0.005200,0.213900,0.341400,0.746300,0.238200,-0.541400,-0.195800


In [22]:
import pyranges as pr

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
ensemble_anno_hs_path = "/home/xutingfeng/ukb/externel/Homo_sapiens.GRCh38.111.gff3.gz"
ensemble_anno_hs = pr.read_gff3(ensemble_anno_hs_path)  # load gff3
# Keep only the records whose Feature column equals "gene".
ensemble_anno_hs_gene = ensemble_anno_hs[ensemble_anno_hs.Feature == "gene"]

In [35]:
# Show the gene record(s) whose Name equals "ABO".
ensemble_anno_hs_gene[ensemble_anno_hs_gene.Name == "ABO"]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,ID,Alias,external_name,logic_name,Name,biotype,description,gene_id,version,Parent,tag,transcript_id,transcript_support_level,constitutive,ensembl_end_phase,ensembl_phase,exon_id,rank,ccdsid,protein_id
9839,9,ensembl_havana,gene,133233277,133276024,.,-,.,gene:ENSG00000175164,,,ensembl_havana_gene_homo_sapiens,ABO,protein_coding,ABO%2C alpha 1-3-N-acetylgalactosaminyltransfe...,ENSG00000175164,16,,,,,,,,,,,


In [36]:
# Load the pickled name -> ID dictionary and look up the entry for "ABO".
# NOTE(review): hard-coded absolute path; presumably the same file that
# GENE_NAME_ID_DICTIONARY_FILE points to — confirm and reuse that constant.
gene_name_id_dict_geneformer = pd.read_pickle(
    "/home/xutingfeng/github_code/others/Geneformer/geneformer/gene_name_id_dict.pkl"
)

gene_name_id_dict_geneformer["ABO"]

'ENSG00000175164'

In [None]:
# ENSG00000175164 is the ID returned for "ABO" by the dictionary lookup in the
# preceding cell. Kept as a comment: a bare identifier here raises NameError.