In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
from scipy.stats import spearmanr
import scanpy as sc
import pandas as pd
from scTEL.scTEL_API import scTEL_API
import random,torch,numpy


In [None]:
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is pushed into Python's ``random`` module, NumPy's global
    generator and PyTorch (CPU generator plus all CUDA devices), so that a
    fresh "Restart & Run All" draws the same random numbers.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    # One unconditional call is enough: the original invoked this twice and
    # additionally seeded the current device under an availability guard,
    # which only repeated the same seeding with the same value.
    torch.cuda.manual_seed_all(seed)


setup_seed(2021)

In [None]:
"""Read in Raw Data"""
# Load the paired CITE-seq measurements (RNA and surface protein) from disk,
# drop cells annotated as doublets, and show how many cells each donor has.
adata_gene = sc.read(
    "./Data/pbmc/pbmc_gene.h5ad")  # gene expression of the CITE-seq dataset: 161764 cells x 20729 genes; cells from 8 donors (P1-P8), sequenced at three time points 0, 3, 7 (sizes per the original author's note)
adata_protein = sc.read("./Data/pbmc/pbmc_protein.h5ad")  # protein expression of the CITE-seq dataset: 161764 cells x 224 proteins (per the original author's note)
# True for every cell whose level-3 cell-type label is not 'Doublet'.
doublet_bool = (adata_gene.obs['celltype.l3'] != 'Doublet')
# The mask is derived from the gene object and applied to both objects;
# assumes both files list the same cells in the same order - TODO confirm.
adata_gene = adata_gene[doublet_bool].copy()  # 161159 x 20729 (original author's note)
adata_protein = adata_protein[doublet_bool].copy()  # 161159 x 224 (original author's note)

# Last expression of the cell: displayed as the cell output.
adata_gene.obs.donor.value_counts()

In [None]:
# Donor-level hold-out split: cells from `train_donors` are used for training,
# cells from every other donor form the test set.
train_donors = ['P1', 'P3', 'P4', 'P7']
# Boolean masks with one entry per cell of the *unsplit* objects.
train_bools = adata_gene.obs['donor'].isin(train_donors).to_numpy()
test_bools = ~train_bools

# Take the test subsets FIRST. Both masks have the length of the unsplit data,
# so they must be applied before `adata_gene` / `adata_protein` are rebound to
# the (shorter) training subsets; the previous order indexed the already
# reduced training objects with the full-length test mask.
adata_gene_test = adata_gene[test_bools].copy()  # test-set gene expression: 85663 x 20729 (original author's note)
adata_protein_test = adata_protein[test_bools].copy()  # test-set protein expression

adata_gene = adata_gene[train_bools].copy()  # training-set gene expression: 75496 x 20729 (original author's note)
adata_protein = adata_protein[train_bools].copy()  # training-set protein expression

Keep only relevant genes/proteins

In [None]:
# Drop lowly expressed cells from the test set: a cell is kept only when at
# least 200 genes have a value above 1e-8. The filtered copies are stored in
# tmp_gene / tmp_protein so the unfiltered test objects stay available.
genes_detected_per_cell = (adata_gene_test.X > 1e-8).sum(axis=1)
cell_filter = genes_detected_per_cell >= 200

tmp_gene = adata_gene_test[cell_filter].copy()  # 85660 x 20729 (original author's note)
tmp_gene.X = tmp_gene.X.toarray()  # sparse (csc_matrix per the original note) -> dense ndarray

tmp_protein = adata_protein_test[cell_filter].copy()  # 85660 x 224 (original author's note)
tmp_protein.X = tmp_protein.X.toarray()

In [None]:
# Per-cell total-count normalisation followed by log1p, applied in place to
# the filtered test-set copies (gene matrix first, then protein matrix).
# sc.pp.log1p also records its settings under .uns['log1p'] (original note).
for modality in (tmp_gene, tmp_protein):
    sc.pp.normalize_total(modality)
    sc.pp.log1p(modality)

In [None]:
# Z-score every feature separately within each donor, for both the gene and
# the protein matrix of the filtered test-set copies.
patients = pd.unique(tmp_gene.obs['donor'].values)
for patient in patients:
    # Cells of the current donor; the mask is taken from the gene object and
    # reused for the protein object.
    indices = (tmp_gene.obs['donor'] == patient).to_numpy()

    for modality in (tmp_gene, tmp_protein):
        # Scale an independent copy, then write the scaled values back into
        # the donor's rows of the parent object.
        sub_adata = modality[indices].copy()
        sc.pp.scale(sub_adata)
        modality[indices] = sub_adata.X.copy()


In [None]:
# Build the scTEL wrapper from the training gene/protein objects and the
# test gene object. Keyword meanings are defined by scTEL_API itself -
# NOTE(review): confirm against the scTEL package before tuning them.
scTEL = scTEL_API(
    [adata_gene],
    [adata_protein],
    adata_gene_test,
    train_batchkeys=['donor'],
    test_batchkey='donor',
    type_key='celltype.l3',
    batch_size=32,
    min_cells=30,
    min_genes=200,
    h_size=512,
    h=4,
    drop_rate=0.15,
)

In [None]:
# Train the model from scratch (load=False); weights are written under
# `weights_dir`. NOTE(review): ES_max / decay_max / decay_step look like
# early-stopping and learning-rate-decay settings - confirm in scTEL_API.
scTEL.train(
    n_epochs=1000,
    ES_max=50,
    decay_max=8,
    decay_step=0.1,
    lr=0.001,
    weights_dir="weights_dir/pbmc_to_pbmc_typel3_scTEL_Pro2",
    load=False,
)

In [None]:
# Run prediction on the test set and report label-transfer accuracy: the
# fraction of cells whose transferred label equals the annotated
# 'celltype.l3' label.
predicted_test = scTEL.predict()
label_is_correct = (
    predicted_test.obs['transfered cell labels'] == predicted_test.obs['celltype.l3']
)
acc = label_is_correct.mean()
print("ACC:", acc)

In [None]:
"""Get test data"""
# Densify the measured test-set protein matrix (85663 x 224 per the original
# author's note) and keep an untouched copy of it in the "raw" layer.
adata_protein_test.X = adata_protein_test.X.toarray()
# Explicit copy so "raw" can never alias the matrix normalised further down.
adata_protein_test.layers["raw"] = adata_protein_test.X.copy()

# Keep only the cells present in the prediction output (85660 x 224 per the
# original note). `.copy()` turns the subset into a standalone AnnData, like
# the other subsetting cells in this notebook, instead of normalising a view
# and relying on its implicit conversion.
adata_protein_test = adata_protein_test[predicted_test.obs.index].copy()
sc.pp.normalize_total(adata_protein_test)
sc.pp.log1p(adata_protein_test)

# Restrict to proteins that are both measured and predicted.
common_proteins = np.intersect1d(predicted_test.var.index, adata_protein_test.var.index)
adata_protein_test = adata_protein_test[:, common_proteins].copy()
adata_protein_test.layers['imputed'] = predicted_test[:, common_proteins].X  # predicted protein expression
adata_protein_test.layers.update(predicted_test[:, common_proteins].layers)  # adds q10, q90, q25 and q75 (original note)
patients = np.unique(adata_protein_test.obs['donor'].values)

In [None]:
# Z-score the measured test-set protein values separately within each donor.
for patient in patients:
    indices = (adata_protein_test.obs['donor'] == patient).to_numpy()
    # Scale an explicit copy (as the per-donor scaling of tmp_gene/tmp_protein
    # does) rather than modifying an AnnData view in place, then write the
    # scaled values back into the donor's rows.
    sub_adata = adata_protein_test[indices].copy()

    sc.pp.scale(sub_adata)
    adata_protein_test[indices] = sub_adata.X

In [None]:
def corr2_coeff(A, B, pearson=True):
    """Correlate matching rows of two equally shaped 2-D arrays.

    Row ``i`` of ``A`` is correlated with row ``i`` of ``B``.

    Args:
        A: 2-D array-like of shape ``(n_rows, n_obs)``.
        B: 2-D array-like with the same shape as ``A``.
        pearson: If True (default) compute Pearson correlations, otherwise
            Spearman rank correlations via ``scipy.stats.spearmanr``.

    Returns:
        Pearson: 1-D ``numpy.ndarray`` of length ``n_rows``.
        Spearman: ``list`` of length ``n_rows``.
        A row with zero variance in either input yields NaN in the Pearson
        branch.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    if A.ndim != 2 or A.shape != B.shape:
        raise ValueError(
            f"A and B must be 2-D arrays of identical shape, got {A.shape} and {B.shape}"
        )

    if pearson:
        # Centre every row on its own mean.
        A_mA = A - A.mean(axis=1, keepdims=True)
        B_mB = B - B.mean(axis=1, keepdims=True)

        # Only matching rows are needed, so compute the row-wise dot products
        # directly instead of building the full n_rows x n_rows correlation
        # matrix and reading its diagonal.
        cross = (A_mA * B_mB).sum(axis=1)
        ssA = (A_mA ** 2).sum(axis=1)
        ssB = (B_mB ** 2).sum(axis=1)

        # 0/0 for constant rows is an expected outcome (NaN), not an error.
        with np.errstate(divide="ignore", invalid="ignore"):
            return cross / np.sqrt(ssA * ssB)

    return [spearmanr(A[i], B[i])[0] for i in range(A.shape[0])]

In [None]:
"""Compute correlation across patients"""
# Per-protein Pearson correlation between imputed and measured values over
# all test cells (proteins become rows after the transpose). Proteins whose
# correlation is NaN are dropped before printing.
per_protein_corr = corr2_coeff(
    adata_protein_test.layers["imputed"].T,
    adata_protein_test.X.T,
)
corrs = pd.DataFrame(per_protein_corr, index=adata_protein_test.var.index).dropna()
print(corrs)
print(corrs.mean())

In [None]:
# Per-protein root-mean-squared error between measured and imputed values.
# The variable keeps its historical name `MSEs`, but the values are RMSEs
# (note the trailing square root).
MSEs = ((adata_protein_test.X - adata_protein_test.layers["imputed"]) ** 2).mean(axis=0) ** (1 / 2)

# Mean of the untouched "raw" protein values, one entry per protein.
mean_expression = np.asarray(adata_protein_test.layers["raw"].mean(axis=0)).ravel()

# `corrs` can hold fewer proteins than the AnnData object because rows with
# NaN correlation were dropped from it. Select the matching entries of the
# per-protein vectors so all columns line up; stacking them positionally
# fails as soon as a single protein was dropped.
kept_proteins = adata_protein_test.var.index.isin(corrs.index)

protein_table = pd.DataFrame(
    {
        "Correlations": corrs.iloc[:, 0].to_numpy(),
        "RMSE": np.asarray(MSEs).ravel()[kept_proteins],
        "Mean Expression": mean_expression[kept_proteins],
    },
    index=corrs.index,
)

protein_table["Log-Mean Expression"] = np.log(protein_table["Mean Expression"])

In [None]:
def sq(x, y):
    """Return the element-wise squared difference ``(x - y) ** 2``."""
    return (x - y) ** 2

In [None]:
"""Compute correlations within patient"""
# One column per donor, one row per protein: Pearson correlation and mean
# squared error between measured and imputed values, restricted to that
# donor's cells.
donor_ids = np.unique(adata_protein_test.obs["donor"])
corrs_table = np.zeros((adata_protein_test.shape[1], len(donor_ids)))
sq_table = corrs_table.copy()  # proteins x donors (224 x 4 per the original note)

for column, patient in enumerate(donor_ids):
    donor_mask = adata_protein_test.obs["donor"] == patient
    truth = adata_protein_test[donor_mask].X.copy()
    imputed = adata_protein_test.layers["imputed"][donor_mask].copy()

    corrs_table[:, column] = corr2_coeff(truth.T, imputed.T)
    sq_table[:, column] = sq(truth, imputed).mean(axis=0)

# Undefined (NaN) correlations are recorded as 0.
corrs_table[np.isnan(corrs_table)] = 0

In [None]:
# Wrap both per-donor result arrays in DataFrames labelled by protein name
# (rows) and donor id (columns).
protein_names = adata_protein_test.var.index
donor_labels = np.unique(adata_protein_test.obs["donor"])

corrs_table = pd.DataFrame(corrs_table, index=protein_names, columns=donor_labels)
sq_table = pd.DataFrame(sq_table, index=protein_names, columns=donor_labels)

In [None]:
# Fraction of (cell, protein) entries whose measured value lies strictly
# between the 'q25' and 'q75' layers - presumably the predicted 25% / 75%
# quantiles; confirm against the model output.
measured = adata_protein_test.X
inside_50 = (measured > adata_protein_test.layers['q25']) & (measured < adata_protein_test.layers['q75'])

print(f"Effective Coverage Probability for Nominal 50% PI: {inside_50.mean():.3f}")

In [None]:
# Fraction of (cell, protein) entries whose measured value lies strictly
# between the 'q10' and 'q90' layers - presumably the predicted 10% / 90%
# quantiles; confirm against the model output.
measured = adata_protein_test.X
inside_80 = (measured > adata_protein_test.layers['q10']) & (measured < adata_protein_test.layers['q90'])

print(f"Effective Coverage Probability for Nominal 80% PI: {inside_80.mean():.3f}")

In [None]:
# Persist the results to .h5ad files in the current working directory:
# tmp_gene holds the filtered, normalised and per-donor scaled test-set gene
# data; adata_protein_test holds the measured test-set protein data together
# with the layers added earlier in the notebook (e.g. "raw", "imputed").
tmp_gene.write('gene_pbmctopbmc_scTEL.h5ad')
adata_protein_test.write('protein_pbmctopbmc_scTEL.h5ad')