In [1]:
import scanpy as sc
# Path of the input .h5ad file. Despite the name, this is a file path, not a directory.
data_dir = '../../dataset/uniport_imputed_Xenium_breast_cancer_sample1_replicate1.h5ad'
# Load the AnnData object; later cells read adata.X, adata.var and adata.obs from it.
adata = sc.read_h5ad(filename=data_dir)


In [2]:
import torch
import lightning.pytorch as pl
from self_supervision.models.lightning_modules.cellnet_autoencoder import MLPAutoEncoder
from self_supervision.estimator.cellnet import EstimatorAutoEncoder

# Path to the pretrained checkpoint (.ckpt) file.
ckpt_path = "../../sc_pretrained/Pretrained Models/GPMask.ckpt"

# Layer widths passed to the autoencoder's encoder and decoder MLPs.
units_encoder = [512, 512, 256, 256, 64]
units_decoder = [256, 256, 512, 512]

# Estimator wrapper; data_path=None because no estimator-managed dataset is used here.
estim = EstimatorAutoEncoder(data_path=None)

# Restore the pretrained autoencoder from the checkpoint.
estim.model = MLPAutoEncoder.load_from_checkpoint(
    ckpt_path,
    gene_dim=19331,  # input width; adjust to the gene panel the checkpoint expects
    batch_size=2048,  # adjust as needed
    units_encoder=units_encoder,
    units_decoder=units_decoder,
    masking_strategy="random",  # assumes the checkpoint was trained with random masking -- TODO confirm
    masking_rate=0.5,  # adjust as needed
)

# Choose the accelerator from what is actually available. The original always requested
# accelerator="gpu" (which fails on a CPU-only machine) and fell back to devices=None
# instead of a concrete device count.
estim.trainer = pl.Trainer(
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
)
estim.model

  warn(f"Tensorflow dtype mappings did not load successfully due to an error: {exc.msg}")
  warn(f"Triton dtype mappings did not load successfully due to an error: {exc.msg}")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


HPU available: False, using: 0 HPUs


MLPAutoEncoder(
  (train_metrics): MetricCollection(
    (explained_var_uniform): ExplainedVariance()
    (explained_var_weighted): ExplainedVariance()
    (mse): MeanSquaredError(),
    prefix=train_
  )
  (val_metrics): MetricCollection(
    (explained_var_uniform): ExplainedVariance()
    (explained_var_weighted): ExplainedVariance()
    (mse): MeanSquaredError(),
    prefix=val_
  )
  (test_metrics): MetricCollection(
    (explained_var_uniform): ExplainedVariance()
    (explained_var_weighted): ExplainedVariance()
    (mse): MeanSquaredError(),
    prefix=test_
  )
  (encoder): MLP(
    (0): Linear(in_features=19331, out_features=512, bias=True)
    (1): SELU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=512, out_features=512, bias=True)
    (4): SELU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): SELU()
    (8): Dropout(p=0.1, inplace=False)
    (9): Linear(in_features=256, out_features=256, b

In [3]:
import pandas as pd
# Gene table stored next to the pretrained models; the 'feature_name' column is used below
# as the reference gene list.
var_df = pd.read_parquet(path='../../sc_pretrained/var.parquet')
var_df

Unnamed: 0,feature_id,feature_name
0,ENSG00000186092,OR4F5
1,ENSG00000284733,OR4F29
2,ENSG00000284662,OR4F16
3,ENSG00000187634,SAMD11
4,ENSG00000188976,NOC2L
...,...,...
19326,ENSG00000288702,UGT1A3
19327,ENSG00000288705,UGT1A5
19328,ENSG00000182484,WASH6P
19329,ENSG00000288622,PDCD6-AHRR


In [4]:
# Reference gene symbols in table order; a later cell maps each position to a column index.
all_genes = var_df.loc[:, 'feature_name'].to_list()
all_genes

['OR4F5',
 'OR4F29',
 'OR4F16',
 'SAMD11',
 'NOC2L',
 'KLHL17',
 'PLEKHN1',
 'PERM1',
 'HES4',
 'ISG15',
 'AGRN',
 'RNF223',
 'C1orf159',
 'TTLL10',
 'TNFRSF18',
 'TNFRSF4',
 'SDF4',
 'B3GALT6',
 'C1QTNF12',
 'UBE2J2',
 'SCNN1D',
 'ACAP3',
 'PUSL1',
 'INTS11',
 'CPTP',
 'TAS1R3',
 'DVL1',
 'MXRA8',
 'AURKAIP1',
 'CCNL2',
 'MRPL20',
 'ANKRD65',
 'TMEM88B',
 'VWA1',
 'ATAD3C',
 'ATAD3B',
 'ATAD3A',
 'TMEM240',
 'SSU72',
 'FNDC10',
 'MIB2',
 'MMP23B',
 'CDK11B',
 'SLC35E2B',
 'CDK11A',
 'NADK',
 'GNB1',
 'CALML6',
 'TMEM52',
 'CFAP74',
 'GABRD',
 'PRKCZ',
 'FAAP20',
 'SKI',
 'MORN1',
 'RER1',
 'PEX10',
 'PLCH2',
 'PANK4',
 'HES5',
 'TNFRSF14',
 'PRXL2B',
 'MMEL1',
 'TTC34',
 'ACTRT2',
 'PRDM16',
 'ARHGEF16',
 'MEGF6',
 'TPRG1L',
 'WRAP73',
 'TP73',
 'CCDC27',
 'SMIM1',
 'LRRC47',
 'CEP104',
 'DFFB',
 'C1orf174',
 'AJAP1',
 'NPHP4',
 'KCNAB2',
 'CHD5',
 'RPL22',
 'RNF207',
 'ICMT',
 'HES3',
 'GPR153',
 'ACOT7',
 'HES2',
 'ESPN',
 'TNFRSF25',
 'PLEKHG5',
 'NOL9',
 'TAS1R1',
 'ZBTB48',
 'KLH

In [5]:
# Copy the var index (the dataset's gene identifiers) into an explicit 'gene_name' column.
adata.var['gene_name'] = adata.var_names
adata.var['gene_name']

SAMD11      SAMD11
NOC2L        NOC2L
KLHL17      KLHL17
PLEKHN1    PLEKHN1
PERM1        PERM1
            ...   
MT-ND4L    MT-ND4L
MT-ND4      MT-ND4
MT-ND5      MT-ND5
MT-ND6      MT-ND6
MT-CYB      MT-CYB
Name: gene_name, Length: 18079, dtype: object

In [6]:
import numpy as np
# Zero-filled float32 matrix: one row per observation in adata, one column per reference gene.
# Columns for genes that are never copied in from adata stay at zero.
new_data = np.zeros(shape=(adata.n_obs, len(all_genes)), dtype=np.float32)


In [7]:
# Gene names present in the dataset, in adata.var order.
existing_genes = adata.var.loc[:, 'gene_name']
existing_genes

SAMD11      SAMD11
NOC2L        NOC2L
KLHL17      KLHL17
PLEKHN1    PLEKHN1
PERM1        PERM1
            ...   
MT-ND4L    MT-ND4L
MT-ND4      MT-ND4
MT-ND5      MT-ND5
MT-ND6      MT-ND6
MT-CYB      MT-CYB
Name: gene_name, Length: 18079, dtype: object

In [8]:
# Lower-case both gene lists so that the comparison below is case-insensitive.
all_genes_lower = list(map(str.lower, all_genes))
adata_genes_lower = list(map(str.lower, existing_genes))

# Set views of the two lists for membership arithmetic.
all_genes_set = set(all_genes_lower)
adata_genes_set = set(adata_genes_lower)

# Genes shared by the reference list and the dataset.
matching_genes = all_genes_set.intersection(adata_genes_set)
matching_count = len(matching_genes)

# Dataset genes that have no counterpart in the reference list.
non_matching_genes = adata_genes_set.difference(matching_genes)
non_matching_count = len(non_matching_genes)

# Report the overlap; the cell's displayed value is the set of unmatched dataset genes.
print(f"匹配的基因数量: {matching_count}")
print(f"匹配的基因列表: {matching_genes}")
non_matching_genes


匹配的基因数量: 17768
匹配的基因列表: {'adam20', 'tram1', 'srbd1', 'amer3', 'alox5', 'gpat3', 'fam241b', 'klhl33', 'nagpa', 'mfsd6l', 'zcchc8', 'kif19', 'clpp', 'trabd2b', 'prr5', 'hand2', 'stox2', 'mei4', 'atp1a2', 'clvs1', 'xylb', 'thrap3', 'sae1', 'gadd45a', 'syngap1', 'myh2', 'otop3', 'stt3b', 'dnaja1', 'vwc2', 'vps33b', 'ube2d3', 'hibadh', 'oxsr1', 'zmym3', 'epha8', 'ptpn13', 'reln', 'fbln5', 'plpp6', 'prr12', 'ints1', 'eps8l2', 'pkn2', 'ppp4r4', 'matn3', 'fabp1', 'sun5', 'chd6', 'nup205', 'ms4a6a', 'map1lc3c', 'ctrb1', 'ctdsp2', 'ctdnep1', 'gls', 'chst12', 'c11orf16', 'gps1', 'ifi30', 'rpa3', 'usp15', 'c12orf29', 'tcf21', 'ccdc96', 'slc38a8', 'gxylt2', 'or4b1', 'nosip', 'faf1', 'dcun1d4', 'tax1bp1', 'lmx1b', 'gabpb1', 'ghr', 'etfa', 'cdk5rap3', 'obp2b', 'capn9', 'spink5', 'lrrc1', 'slc41a1', 'zbtb10', 'atp6v1c2', 'krt1', 'astl', 'abcg5', 'muc3a', 's100a8', 'tmem92', 'pdcd11', 'npas2', 'iyd', 'il4', 'lcor', 'tmem150c', 'traf6', 'ccdc125', 'syde2', 'rho', 'smarca4', 'slc36a3', 'gpr26', 'adnp2', 

{'aars',
 'ac004593.3',
 'ac005551.1',
 'ac007244.1',
 'ac007906.2',
 'ac008397.1',
 'ac008687.4',
 'ac010255.3',
 'ac010325.1',
 'ac011005.1',
 'ac011195.2',
 'ac013470.2',
 'ac015871.1',
 'ac021072.1',
 'ac021097.2',
 'ac025283.2',
 'ac025287.4',
 'ac067752.1',
 'ac072022.2',
 'ac073111.4',
 'ac087498.1',
 'ac090360.1',
 'ac091057.6',
 'ac092835.1',
 'ac093323.1',
 'ac099489.1',
 'ac104389.5',
 'ac106774.4',
 'ac113348.1',
 'ac115220.1',
 'ac118549.1',
 'ac119396.1',
 'ac132217.2',
 'ac134684.8',
 'ac135068.1',
 'ac138647.1',
 'ac187653.1',
 'ac233723.1',
 'ac236972.4',
 'acpp',
 'adprhl2',
 'adss',
 'adssl1',
 'al032819.3',
 'al109810.2',
 'al121578.2',
 'al135905.2',
 'al160269.1',
 'al162231.1',
 'al162596.1',
 'al353572.3',
 'al354761.1',
 'al391650.1',
 'al445238.1',
 'al451007.3',
 'al590560.2',
 'al603764.2',
 'al772284.2',
 'al845331.2',
 'alg1l',
 'ap000552.4',
 'ap002495.1',
 'arih2os',
 'armc4',
 'arse',
 'atp5md',
 'atp5mpl',
 'atp6ap1l',
 'b3gnt10',
 'bhmg1',
 'bx255925.

In [9]:
# Map each lower-cased reference gene to its position in the reference list
# (if a name repeats, the last position wins).
gene_to_index = dict(zip(all_genes_lower, range(len(all_genes_lower))))
gene_to_index

{'or4f5': 0,
 'or4f29': 1,
 'or4f16': 2,
 'samd11': 3,
 'noc2l': 4,
 'klhl17': 5,
 'plekhn1': 6,
 'perm1': 7,
 'hes4': 8,
 'isg15': 9,
 'agrn': 10,
 'rnf223': 11,
 'c1orf159': 12,
 'ttll10': 13,
 'tnfrsf18': 14,
 'tnfrsf4': 15,
 'sdf4': 16,
 'b3galt6': 17,
 'c1qtnf12': 18,
 'ube2j2': 19,
 'scnn1d': 20,
 'acap3': 21,
 'pusl1': 22,
 'ints11': 23,
 'cptp': 24,
 'tas1r3': 25,
 'dvl1': 26,
 'mxra8': 27,
 'aurkaip1': 28,
 'ccnl2': 29,
 'mrpl20': 30,
 'ankrd65': 31,
 'tmem88b': 32,
 'vwa1': 33,
 'atad3c': 34,
 'atad3b': 35,
 'atad3a': 36,
 'tmem240': 37,
 'ssu72': 38,
 'fndc10': 39,
 'mib2': 40,
 'mmp23b': 41,
 'cdk11b': 42,
 'slc35e2b': 43,
 'cdk11a': 44,
 'nadk': 45,
 'gnb1': 46,
 'calml6': 47,
 'tmem52': 48,
 'cfap74': 49,
 'gabrd': 50,
 'prkcz': 51,
 'faap20': 52,
 'ski': 53,
 'morn1': 54,
 'rer1': 55,
 'pex10': 56,
 'plch2': 57,
 'pank4': 58,
 'hes5': 59,
 'tnfrsf14': 60,
 'prxl2b': 61,
 'mmel1': 62,
 'ttc34': 63,
 'actrt2': 64,
 'prdm16': 65,
 'arhgef16': 66,
 'megf6': 67,
 'tprg1l': 68

In [10]:
# Genes found in exactly one of the two lower-cased gene sets.
only_in_all_genes = all_genes_set.difference(adata_genes_set)
only_in_adata_genes = adata_genes_set.difference(all_genes_set)

# Report both one-sided differences.
print(f"仅在 all_genes 中存在的基因数量: {len(only_in_all_genes)}")
print(f"仅在 all_genes 中存在的基因: {only_in_all_genes}")

print(f"仅在 adata_genes 中存在的基因数量: {len(only_in_adata_genes)}")
print(f"仅在 adata_genes 中存在的基因: {only_in_adata_genes}")


仅在 all_genes 中存在的基因数量: 1563
仅在 all_genes 中存在的基因: {'adss2', 'nanog', 'ighv3or16-17', 'cryaa', 'cgb7', 'mtrnr2l6', 'agap6', 'hla-dra', 'rps6ka1', 'scygr6', 'myt1l', 'rbmy1b', 'jmjd7', 'urgcp-mrps24', 'zfp91-cntf', 'sting1', 'rpl8', 'trappc2', 'tas2r13', 'zhx1-c8orf76', 'znf91', 'gage12g', 'defb107a', 'f8a2', 'hla-dqb1', 'hla-g', 'cacna1a', 'tvp23b', 'fbxl14', 'smn2', 'poteb', 'hsfx1', 'h3c12', 'mbd3l5', 'ct47a8', 'h4c1', 'etda', 'palm2akap2', 'taf11l2', 'mrpl10', 'rpl27a', 'tmdd1', 'npipa2', 'hla-e', 'opn1mw3', 'ice1', 'wtap', 'golga6d', 'spring1', 'tubb3', 'duox1', 'slc35e2b', 'defb104b', 'borcs7-asmt', 'csag2', 'odad3', 'c1qtnf5', 'npipb4', 'mrpl23', 'h4c13', 'sdhd', 'grapl', 'skp1', 'fam90a17p', 'aqp12b', 'pwwp4', 'rgpd1', 'defb103b', 'peds1-ube2v1', 'bola2-smg1p6', 'mrps9', 'lyg2', 'ct45a2', 'acp3', 'ankrd20a3p', 'cdy2a', 'prr5-arhgap8', 'hnrnpa1l2', 'tubb8b', 'spcs2', 'rhoxf2', 'npipb3', 'sult1a3', 'ifna13', 'h3-3a', 'fam90a18p', 'irag2', 'mrpl51', 'rex1bd', 'rps19', 'c1orf50', 'ami

In [11]:
from scipy import sparse

# Make sure the matrix really is dense before slicing columns out of it. A SciPy sparse
# adata.X would return (n, 1) sparse column slices that cannot be assigned into a 1-D
# column of new_data; a dense adata.X is used unchanged.
dense_adata_X = adata.X.toarray() if sparse.issparse(adata.X) else adata.X

# Copy each dataset gene into the column of the same (lower-cased) gene in the reference
# layout. Dataset genes absent from the reference list are reported and skipped, so their
# values are dropped; reference genes absent from the dataset keep their zero column.
for i, gene in enumerate(adata_genes_lower):
    if gene in gene_to_index:
        new_data[:, gene_to_index[gene]] = dense_adata_X[:, i]
    else:
        print(f'Gene {gene} not found in all_genes list')


Gene al391650.1 not found in all_genes list


Gene yars not found in all_genes list
Gene adprhl2 not found in all_genes list


Gene tctex1d4 not found in all_genes list


Gene tctex1d1 not found in all_genes list
Gene wdr78 not found in all_genes list
Gene hhla3 not found in all_genes list
Gene ac118549.1 not found in all_genes list
Gene wdr63 not found in all_genes list


Gene kiaa1324 not found in all_genes list
Gene sars not found in all_genes list


Gene hist2h2be not found in all_genes list


Gene al162596.1 not found in all_genes list
Gene lor not found in all_genes list


Gene c1orf61 not found in all_genes list


Gene al590560.2 not found in all_genes list


Gene rgs5 not found in all_genes list
Gene dusp27 not found in all_genes list


Gene eprs not found in all_genes list
Gene marc2 not found in all_genes list
Gene marc1 not found in all_genes list
Gene h3f3a not found in all_genes list
Gene hist3h3 not found in all_genes list
Gene hist3h2a not found in all_genes list
Gene hist3h2bb not found in all_genes list


Gene al109810.2 not found in all_genes list
Gene tbce not found in all_genes list
Gene adss not found in all_genes list
Gene al451007.3 not found in all_genes list


Gene gcsaml-as1 not found in all_genes list


Gene mycnos not found in all_genes list
Gene fam49a not found in all_genes list


Gene c2orf91 not found in all_genes list


Gene igkc not found in all_genes list
Gene al845331.2 not found in all_genes list
Gene ac092835.1 not found in all_genes list
Gene kiaa1211l not found in all_genes list


Gene dars not found in all_genes list
Gene march7 not found in all_genes list


Gene pde11a not found in all_genes list
Gene dirc1 not found in all_genes list


Gene march4 not found in all_genes list


Gene ccdc140 not found in all_genes list
Gene c2orf83 not found in all_genes list


Gene arih2os not found in all_genes list
Gene qars not found in all_genes list
Gene ccdc36 not found in all_genes list
Gene cyb561d2 not found in all_genes list


Gene c3orf67 not found in all_genes list


Gene maats1 not found in all_genes list
Gene alg1l not found in all_genes list


Gene kiaa1257 not found in all_genes list
Gene h1fx not found in all_genes list
Gene h1foo not found in all_genes list
Gene acpp not found in all_genes list


Gene slc66a1l not found in all_genes list
Gene terc not found in all_genes list


Gene ccdc39 not found in all_genes list


Gene ac072022.2 not found in all_genes list
Gene tctex1d2 not found in all_genes list


Gene ac093323.1 not found in all_genes list


Gene kiaa1211 not found in all_genes list


Gene h2afz not found in all_genes list


Gene tmem155 not found in all_genes list


Gene march1 not found in all_genes list
Gene fam218a not found in all_genes list


Gene march6 not found in all_genes list
Gene march11 not found in all_genes list
Gene ac106774.4 not found in all_genes list
Gene h3.y not found in all_genes list
Gene tars not found in all_genes list


Gene c5orf67 not found in all_genes list


Gene atp6ap1l not found in all_genes list
Gene c5orf30 not found in all_genes list


Gene ac010255.3 not found in all_genes list
Gene march3 not found in all_genes list


Gene h2afy not found in all_genes list
Gene tmem173 not found in all_genes list


Gene hars not found in all_genes list


Gene lars not found in all_genes list


Gene rars not found in all_genes list


Gene ac113348.1 not found in all_genes list
Gene c5orf60 not found in all_genes list


Gene c6orf201 not found in all_genes list


Gene hist1h2aa not found in all_genes list
Gene hist1h2ba not found in all_genes list
Gene hist1h1a not found in all_genes list
Gene hist1h4b not found in all_genes list
Gene hist1h2bb not found in all_genes list
Gene hist1h1c not found in all_genes list
Gene hist1h4c not found in all_genes list
Gene hist1h2ac not found in all_genes list
Gene hist1h1e not found in all_genes list
Gene hist1h4e not found in all_genes list
Gene hist1h2bg not found in all_genes list
Gene hist1h2ae not found in all_genes list
Gene hist1h3e not found in all_genes list
Gene hist1h1d not found in all_genes list
Gene hist1h4g not found in all_genes list
Gene hist1h2bh not found in all_genes list
Gene hist1h3g not found in all_genes list
Gene hist1h2ag not found in all_genes list
Gene hist1h4i not found in all_genes list
Gene hist1h2ai not found in all_genes list
Gene hist1h3h not found in all_genes list
Gene hist1h4j not found in all_genes list
Gene hist1h2bn not found in all_genes list
Gene hist1h2ak not found

Gene znrd1 not found in all_genes list


Gene vars not found in all_genes list
Gene snhg32 not found in all_genes list


Gene c6orf223 not found in all_genes list
Gene defb133 not found in all_genes list


Gene ick not found in all_genes list
Gene al135905.2 not found in all_genes list


Gene fgfr1op not found in all_genes list
Gene tcte3 not found in all_genes list
Gene ac187653.1 not found in all_genes list


Gene ac013470.2 not found in all_genes list
Gene twistnb not found in all_genes list


Gene ac004593.3 not found in all_genes list
Gene gars not found in all_genes list
Gene trgc2 not found in all_genes list
Gene trgjp2 not found in all_genes list
Gene trgc1 not found in all_genes list
Gene trgjp1 not found in all_genes list


Gene ac115220.1 not found in all_genes list


Gene kiaa1324l not found in all_genes list


Gene castor3 not found in all_genes list


Gene c7orf77 not found in all_genes list
Gene ac011005.1 not found in all_genes list


Gene trbc1 not found in all_genes list
Gene trbc2 not found in all_genes list
Gene sspo not found in all_genes list
Gene ac073111.4 not found in all_genes list


Gene ac021097.2 not found in all_genes list
Gene wdr60 not found in all_genes list
Gene ac134684.8 not found in all_genes list


Gene pinx1 not found in all_genes list


Gene impad1 not found in all_genes list


Gene wdyhv1 not found in all_genes list
Gene fam49b not found in all_genes list
Gene ac138647.1 not found in all_genes list


Gene tsta3 not found in all_genes list
Gene dock8-as1 not found in all_genes list


Gene c9orf92 not found in all_genes list


Gene al162231.1 not found in all_genes list


Gene fam122a not found in all_genes list
Gene al353572.3 not found in all_genes list


Gene iars not found in all_genes list
Gene c9orf129 not found in all_genes list
Gene al160269.1 not found in all_genes list


Gene tmem246 not found in all_genes list
Gene palm2-akap2 not found in all_genes list
Gene znf883 not found in all_genes list


Gene dec1 not found in all_genes list
Gene b3gnt10 not found in all_genes list


Gene wdr34 not found in all_genes list


Gene al354761.1 not found in all_genes list
Gene bx255925.3 not found in all_genes list


Gene mir1915hg not found in all_genes list
Gene armc4 not found in all_genes list


Gene c10orf142 not found in all_genes list
Gene march8 not found in all_genes list


Gene ac067752.1 not found in all_genes list
Gene kif1bp not found in all_genes list
Gene h2afy2 not found in all_genes list


Gene c10orf55 not found in all_genes list
Gene dupd1 not found in all_genes list


Gene march5 not found in all_genes list


Gene atp5md not found in all_genes list


Gene al603764.2 not found in all_genes list


Gene pano1 not found in all_genes list
Gene ac132217.2 not found in all_genes list
Gene cars not found in all_genes list


Gene c11orf40 not found in all_genes list
Gene ac104389.5 not found in all_genes list


Gene st5 not found in all_genes list
Gene mrvi1 not found in all_genes list


Gene c11orf74 not found in all_genes list


Gene or5r1 not found in all_genes list


Gene ap002495.1 not found in all_genes list


Gene card16 not found in all_genes list
Gene card17 not found in all_genes list
Gene c11orf88 not found in all_genes list


Gene ccdc84 not found in all_genes list
Gene h2afx not found in all_genes list


Gene hist4h4 not found in all_genes list
Gene h2afj not found in all_genes list
Gene lrmp not found in all_genes list
Gene casc1 not found in all_genes list


Gene h3f3c not found in all_genes list
Gene h1fnt not found in all_genes list


Gene c12orf81 not found in all_genes list
Gene grasp not found in all_genes list
Gene ac021072.1 not found in all_genes list
Gene c12orf10 not found in all_genes list


Gene mars not found in all_genes list
Gene slc26a10 not found in all_genes list
Gene march9 not found in all_genes list


Gene cllu1os not found in all_genes list
Gene c12orf74 not found in all_genes list


Gene c12orf49 not found in all_genes list
Gene wdr66 not found in all_genes list


Gene spata13 not found in all_genes list


Gene spert not found in all_genes list
Gene al445238.1 not found in all_genes list


Gene trdc not found in all_genes list
Gene trac not found in all_genes list


Gene sfta3 not found in all_genes list


Gene elmsan1 not found in all_genes list


Gene c14orf177 not found in all_genes list
Gene wars not found in all_genes list
Gene atp5mpl not found in all_genes list
Gene adssl1 not found in all_genes list


Gene igha2 not found in all_genes list
Gene ighe not found in all_genes list
Gene igha1 not found in all_genes list
Gene ighg1 not found in all_genes list
Gene ighg3 not found in all_genes list
Gene ighd not found in all_genes list
Gene ighm not found in all_genes list
Gene fam30a not found in all_genes list
Gene ac135068.1 not found in all_genes list
Gene golga8m not found in all_genes list
Gene ac091057.6 not found in all_genes list
Gene c15orf41 not found in all_genes list


Gene linc02694 not found in all_genes list


Gene casc4 not found in all_genes list


Gene ct62 not found in all_genes list


Gene ac015871.1 not found in all_genes list


Gene spata8 not found in all_genes list
Gene fam169b not found in all_genes list
Gene tarsl2 not found in all_genes list
Gene tmem8a not found in all_genes list


Gene al032819.3 not found in all_genes list


Gene ac025283.2 not found in all_genes list


Gene ac099489.1 not found in all_genes list
Gene fopnl not found in all_genes list


Gene kiaa0556 not found in all_genes list


Gene c16orf58 not found in all_genes list
Gene ac007906.2 not found in all_genes list


Gene fam192a not found in all_genes list


Gene lrrc29 not found in all_genes list


Gene aars not found in all_genes list
Gene kars not found in all_genes list
Gene ac025287.4 not found in all_genes list


Gene fam92b not found in all_genes list
Gene cenpbd1 not found in all_genes list


Gene ac087498.1 not found in all_genes list
Gene ac233723.1 not found in all_genes list


Gene trim16l not found in all_genes list
Gene linc02693 not found in all_genes list


Gene slfn12l not found in all_genes list


Gene tmem99 not found in all_genes list
Gene ttc25 not found in all_genes list


Gene g6pc not found in all_genes list
Gene c17orf53 not found in all_genes list


Gene ac011195.2 not found in all_genes list
Gene march10 not found in all_genes list


Gene h3f3b not found in all_genes list


Gene eloa3 not found in all_genes list
Gene nars not found in all_genes list


Gene ac090360.1 not found in all_genes list


Gene ac005551.1 not found in all_genes list


Gene ac119396.1 not found in all_genes list
Gene march2 not found in all_genes list


Gene ccdc151 not found in all_genes list


Gene c19orf57 not found in all_genes list


Gene ac008397.1 not found in all_genes list


Gene kiaa0355 not found in all_genes list


Gene cntd2 not found in all_genes list


Gene cd3eap not found in all_genes list
Gene bhmg1 not found in all_genes list
Gene ppp5d1 not found in all_genes list


Gene ccdc114 not found in all_genes list
Gene ac008687.4 not found in all_genes list
Gene ccdc155 not found in all_genes list


Gene ac010325.1 not found in all_genes list
Gene c19orf48 not found in all_genes list
Gene siglec5 not found in all_genes list


Gene gdf5os not found in all_genes list


Gene tmem189 not found in all_genes list


Gene fp565260.1 not found in all_genes list


Gene ap000552.4 not found in all_genes list
Gene iglc1 not found in all_genes list
Gene iglc7 not found in all_genes list
Gene lrp5l not found in all_genes list


Gene elfn2 not found in all_genes list
Gene h1f0 not found in all_genes list
Gene z82206.1 not found in all_genes list


Gene arse not found in all_genes list


Gene cxorf21 not found in all_genes list
Gene hypm not found in all_genes list
Gene al121578.2 not found in all_genes list


Gene bx276092.9 not found in all_genes list


Gene nxf5 not found in all_genes list
Gene glra4 not found in all_genes list
Gene tmsb15b not found in all_genes list
Gene h2bfwt not found in all_genes list
Gene h2bfm not found in all_genes list
Gene pih1d3 not found in all_genes list


Gene al772284.2 not found in all_genes list
Gene cxorf56 not found in all_genes list


Gene fam122b not found in all_genes list
Gene fam122c not found in all_genes list
Gene cxorf40a not found in all_genes list


Gene ac236972.4 not found in all_genes list


Gene prky not found in all_genes list
Gene ac007244.1 not found in all_genes list


In [12]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split


# Encode the cell-type annotations as integer class ids once, before splitting.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(adata.obs['cell_type'])

# Hold out 15% of the cells as the test set.
random_seed = 42
X_train_val, X_test, y_train_val, y_test = train_test_split(
    new_data, labels_encoded, test_size=0.15, random_state=random_seed)

# Split the remainder again: 0.1765 of the remaining 85% is roughly 15% of all cells,
# which becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, random_state=random_seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Put the model on the same device as the inputs explicitly, rather than relying on where
# the checkpoint loader happened to place it, and switch to inference mode.
estim.model.to(device)
estim.model.eval()

# Number of rows encoded per forward pass; bounds accelerator memory use.
ENCODE_BATCH_SIZE = 4096


def _encode_in_batches(features, batch_size=ENCODE_BATCH_SIZE):
    """Run the pretrained encoder over ``features`` and return the embeddings.

    Args:
        features: 2-D NumPy array with one row per cell.
        batch_size: number of rows sent to ``device`` per forward pass.

    Returns:
        NumPy array of encoder outputs, one row per input row, in input order.
    """
    encoded = []
    with torch.no_grad():
        for start in range(0, features.shape[0], batch_size):
            batch = torch.as_tensor(
                features[start:start + batch_size], dtype=torch.float32
            ).to(device)
            encoded.append(estim.model.encoder(batch).cpu())
    return torch.cat(encoded, dim=0).numpy()


# Only the train and test splits are embedded; the validation split is not used below.
train_embeddings = _encode_in_batches(X_train)
test_embeddings = _encode_in_batches(X_test)


cuda


In [13]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

    

    # 初始化和训练KNN分类器
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_embeddings, y_train)

# Predict cell types for the held-out embeddings.
predictions = knn.predict(test_embeddings)

# Accuracy plus weighted and macro F1.
accuracy = accuracy_score(y_test, predictions)
print(f"KNN Accuracy on Test Data: {accuracy}")
f1 = f1_score(y_test, predictions, average='weighted')
print(f"Weighted F1 Score: {f1}")

macro_f1 = f1_score(y_test, predictions, average='macro')
print(f'Macro F1 Score: {macro_f1}')

# Baseline: expected accuracy of a guesser that draws labels from the test-set class
# frequencies, i.e. the sum of squared class proportions.
class_probabilities = np.bincount(y_test) / len(y_test)
random_accuracy = np.sum(class_probabilities ** 2)
print(f"Random Guess Accuracy: {random_accuracy}")

# Per-class report. `labels` is passed explicitly so the rows always line up with
# `target_names`, even if a rare class appears in neither y_test nor predictions
# (without it, scikit-learn rejects the mismatched name list). zero_division=0 reports
# undefined precision/recall as 0 without emitting warnings.
report = classification_report(
    y_test,
    predictions,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(report)

KNN Accuracy on Test Data: 0.6209024605273704
Weighted F1 Score: 0.6111620015004593
Macro F1 Score: 0.4513135878186737
Random Guess Accuracy: 0.13330938159201633
                         precision    recall  f1-score   support

                B_Cells       0.73      0.70      0.72       770
           CD4+_T_Cells       0.50      0.68      0.58      1264
           CD8+_T_Cells       0.50      0.51      0.51      1069
                 DCIS_1       0.41      0.42      0.41      1927
                 DCIS_2       0.39      0.30      0.34      1739
            Endothelial       0.77      0.75      0.76      1347
              IRF7+_DCs       0.86      0.51      0.64        73
         Invasive_Tumor       0.59      0.72      0.65      5153
             LAMP3+_DCs       0.50      0.04      0.08        45
          Macrophages_1       0.71      0.71      0.71      1646
          Macrophages_2       0.51      0.31      0.39       256
             Mast_Cells       1.00      0.07      0.13   

In [14]:
# Keep the model and inputs on the same device and make sure dropout is disabled.
estim.model.to(device)
estim.model.eval()

# Encode every cell in fixed-size batches so the full expression matrix never has to sit
# on the accelerator as a single tensor.
ENCODE_BATCH_SIZE = 4096
embedding_batches = []
with torch.no_grad():
    for start in range(0, new_data.shape[0], ENCODE_BATCH_SIZE):
        batch = torch.as_tensor(
            new_data[start:start + ENCODE_BATCH_SIZE], dtype=torch.float32
        ).to(device)
        embedding_batches.append(estim.model.encoder(batch).cpu())
SSL_embeddings = torch.cat(embedding_batches, dim=0).numpy()

# Re-read the dataset from disk so that only the entries added below are persisted,
# not the in-memory changes made to `adata` earlier in the notebook.
new_adata = sc.read_h5ad(data_dir)
new_adata.obsm[f'SSL_GP_ZS_{random_seed}'] = SSL_embeddings
new_adata.uns[f'GP_ZS_y_test_{random_seed}'] = y_test
new_adata.uns[f'GP_ZS_predictions_{random_seed}'] = predictions
new_adata.uns[f'GP_ZS_target_names_{random_seed}'] = label_encoder.classes_
# NOTE(review): this overwrites the input file in place; an interrupted write could leave
# the dataset unreadable. Consider writing to a separate output path.
new_adata.write_h5ad(data_dir)

In [15]:

import pandas as pd
import os
import re

# Name of this notebook; the method label and the results file name are derived from it.
notebook_name = "uniport_imputed_Xenium_breast_cancer_sample1_replicate1_GP_mask_zero_shot_42.ipynb"


def _fmt3(value):
    """Format a metric with three decimals, or 'NA' when it is unavailable."""
    return "NA" if value is None else f"{value:.3f}"


# Training-history values only exist if earlier cells defined them. Each name is checked
# on its own so that a partially defined history yields None instead of a NameError.
has_training_history = 'train_losses' in globals()
init_train_loss = train_losses[0] if has_training_history else None
init_val_loss = val_losses[0] if 'val_losses' in globals() else None
converged_epoch = (
    len(train_losses) - patience
    if has_training_history and 'patience' in globals()
    else None
)
converged_val_loss = best_val_loss if 'best_val_loss' in globals() else None

# Print all requested metrics as tab-separated columns.
print("Metrics Summary:")
if has_training_history:
    print(f"init_train_loss\tinit_val_loss\tconverged_epoch\tconverged_val_loss\tmacro_f1\tweighted_f1\tmicro_f1")
    print(f"{_fmt3(init_train_loss)}\t{_fmt3(init_val_loss)}\t{converged_epoch}\t{_fmt3(converged_val_loss)}\t{macro_f1:.3f}\t{f1:.3f}\t{accuracy:.3f}")
else:
    print(f"macro_f1\tweighted_f1\tmicro_f1")
    print(f"{macro_f1:.3f}\t{f1:.3f}\t{accuracy:.3f}")

# The method label is the text between "replicate1_" and the trailing "_<digits>" in the
# notebook name. Fail with a clear message if the name does not follow that pattern.
method_match = re.search(r'replicate1_(.*?)_\d+', notebook_name)
if method_match is None:
    raise ValueError(f"Cannot derive the method name from notebook_name={notebook_name!r}")

# One-row results table; unavailable training metrics are stored as empty strings.
# The 'micro_f1' column holds the accuracy value computed earlier.
output_data = {
    'dataset_split_random_seed': [int(random_seed)],
    'dataset': ['uniport_imputed_xenium_breast_cancer_sample1_replicate1'],
    'method': [method_match.group(1)],
    'init_train_loss': [init_train_loss if init_train_loss is not None else ''],
    'init_val_loss': [init_val_loss if init_val_loss is not None else ''],
    'converged_epoch': [converged_epoch if converged_epoch is not None else ''],
    'converged_val_loss': [converged_val_loss if converged_val_loss is not None else ''],
    'macro_f1': [macro_f1],
    'weighted_f1': [f1],
    'micro_f1': [accuracy]
}
output_df = pd.DataFrame(output_data)

# Save into a 'results' folder under the current directory, creating it if needed.
os.makedirs('results', exist_ok=True)

csv_filename = f"results/{os.path.splitext(notebook_name)[0]}_results.csv"
output_df.to_csv(csv_filename, index=False)


Metrics Summary:
macro_f1	weighted_f1	micro_f1
0.451	0.611	0.621
