In [1]:
%load_ext autoreload
%autoreload 2

from time import time
import numpy as np
from scipy.stats import spearmanr
import scanpy as sc
import pandas as pd
from read_monocyte_data import read_data
from scTEL.scTEL_API import scTEL_API
import torch,numpy,random

In [2]:
def setup_seed(seed):
    """Seed every random number generator used by this notebook.

    The same seed is applied to Python's ``random`` module, NumPy's global
    generator and PyTorch (CPU generator and all CUDA devices).

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every CUDA device, so no separate per-device call or
    # availability check is needed; without CUDA the call is ignored.
    torch.cuda.manual_seed_all(seed)


setup_seed(2021)

In [None]:
# Load the unnormalised gene and protein data for the monocyte CITE-seq set.
adata_gene, adata_protein = read_data(
    cell_normalize = False,
    log_normalize = False,
    feature_normalize = False,
    dir_path = './Data/monocytes_mingyao/cite_seq',
    subset_hvg = False,
)

# Score proteins on a normalised scratch copy so adata_protein keeps its
# original values.
protein_scratch = adata_protein.copy()
sc.pp.normalize_total(protein_scratch)
sc.pp.log1p(protein_scratch)

# Per protein: sum of the transformed values divided by the number of cells
# whose transformed value exceeds 0.0001.
total_per_protein = protein_scratch.X.sum(axis = 0)
cells_above_floor = (protein_scratch.X > 0.0001).sum(axis = 0)
mean_over_detected = total_per_protein / cells_above_floor

# Keep only the proteins whose score exceeds 0.8.
adata_protein = adata_protein[:, mean_over_detected > 0.8].copy()
adata_protein

In [4]:
train_patientset = ['RPM211A', 'RPM211B', 'RPM232A', 'RPM232B']
test_patientset = ['RPM215A', 'RPM215B', 'RPM218A', 'RPM218B']

# One boolean per cell, derived from the gene data's patient labels.
patient_of_cell = adata_gene.obs['patient']
train_patients = patient_of_cell.isin(train_patientset).tolist()
test_patients = patient_of_cell.isin(test_patientset).tolist()

# The same masks are applied to the protein data, which assumes its cells are
# in the same order as adata_gene -- TODO confirm against read_data.
# Take each test subset first: the training subset rebinds the original name.
adata_gene_test = adata_gene[test_patients]
adata_gene = adata_gene[train_patients]

adata_protein_test = adata_protein[test_patients]
adata_protein = adata_protein[train_patients]

In [5]:
# Build the scTEL wrapper from the training gene/protein data and the test
# gene data, using the 'patient' column as batch key for both.
scTEL = scTEL_API(
    [adata_gene],
    [adata_protein],
    adata_gene_test,
    train_batchkeys=['patient'],
    test_batchkey='patient',
    batch_size=32,
    min_cells=30,
    min_genes=200,
    h_size=512,
    h=4,
    drop_rate=0.15,
)

Trying to set attribute `.obs` of view, copying.


Searching for GPU
GPU detected, using GPU


Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.



QC Filtering Training Cells
QC Filtering Testing Cells

QC Filtering Training Genes
QC Filtering Testing Genes

Normalizing Training Cells
Normalizing Testing Cells

Log-Normalizing Training Data
Log-Normalizing Testing Data

Finding HVGs


... storing 'patient' as categorical
... storing 'batch' as categorical
... storing 'Dataset' as categorical
... storing 'expression_type' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)



Normalizing Gene Training Data by Batch


100%|██████████| 4/4 [00:00<00:00, 10.16it/s]



Normalizing Protein Training Data by Batch


100%|██████████| 4/4 [00:00<00:00, 46.06it/s]



Normalizing Gene Testing Data by Batch


100%|██████████| 4/4 [00:00<00:00, 11.35it/s]


In [6]:
# Training configuration, gathered in one place.
# NOTE(review): load=True presumably reuses weights found in weights_dir, in
# which case the printed time is not a full training time -- confirm.
train_settings = dict(
    n_epochs = 1000,
    ES_max = 30,
    decay_max = 10,
    decay_step = 0.1,
    lr = 0.001,
    weights_dir = "weights_dir/monocyte_to_monocyte_1",
    load = True,
)

# Time train + predict together and report the wall-clock seconds.
start = time()
scTEL.train(**train_settings)
imputed_test = scTEL.predict()
elapsed_seconds = time() - start
print(elapsed_seconds)

20.066420793533325

In [7]:
# Compute the embedding and persist it as an .h5ad file.
embedding_path = "embedding/sctel_monocyteembedding.h5ad"
embedding = scTEL.embed()
embedding.write(embedding_path)

In [8]:
# Prepare the measured test proteins for comparison with the imputed values.
# NOTE: this cell rebinds and modifies adata_protein_test, so running it a
# second time would apply the normalisation steps again.

# Select the cells listed in imputed_test, in imputed_test's order.
adata_protein_test = adata_protein_test[imputed_test.obs.index]
sc.pp.normalize_total(adata_protein_test)
sc.pp.log1p(adata_protein_test)
# The min_counts filter runs after normalisation and log1p, so the threshold
# is applied to the transformed values.
sc.pp.filter_genes(adata_protein_test, min_counts = 1)

# Proteins present in both the imputed and the measured data.
common_proteins = np.intersect1d(imputed_test.var.index, adata_protein_test.var.index)

adata_protein_test = adata_protein_test[:, common_proteins]
# Store the imputed values next to the measured ones, then copy over every
# layer carried by imputed_test. Later cells read layers named 'raw', 'q10',
# 'q25', 'q75' and 'q90'; presumably they arrive here -- TODO confirm.
adata_protein_test.layers['imputed'] = imputed_test[:, common_proteins].X
adata_protein_test.layers.update(imputed_test[:, common_proteins].layers)

patients = np.unique(adata_protein_test.obs['patient'].values)

# Scale X separately within each patient and write the result back.
for patient in patients:
    # Boolean mask selecting this patient's cells.
    indices = [x == patient for x in adata_protein_test.obs['patient']]
    sub_adata = adata_protein_test[indices]

    sc.pp.scale(sub_adata)
    adata_protein_test[indices] = sub_adata.X

In [9]:
def corr2_coeff(A, B, pearson = True):
    """Correlate each row of ``A`` with the matching row of ``B``.

    Args:
        A: 2-D array of shape (n_rows, n_cols).
        B: 2-D array with the same shape as ``A``.
        pearson: If True (default), compute Pearson correlations with
            vectorised NumPy. Otherwise compute Spearman correlations one
            row at a time with ``scipy.stats.spearmanr``.

    Returns:
        With ``pearson=True``, a 1-D ``numpy.ndarray`` of length n_rows; a
        row with zero variance in either input yields NaN. With
        ``pearson=False``, a list of length n_rows.
    """
    if pearson:
        # Row-wise mean of the input arrays, subtracted from the arrays themselves
        A_mA = A - A.mean(1)[:, None]
        B_mB = B - B.mean(1)[:, None]

        # Sum of squares across rows
        ssA = (A_mA**2).sum(1)
        ssB = (B_mB**2).sum(1)

        # Only the matching-row pairs are needed, so compute them directly
        # instead of forming the full n_rows x n_rows correlation matrix and
        # reading its diagonal. Zero-variance rows give 0/0 = NaN on purpose,
        # so the floating-point warning is silenced.
        with np.errstate(divide = "ignore", invalid = "ignore"):
            return (A_mA * B_mB).sum(1) / np.sqrt(ssA * ssB)

    # Spearman has no vectorised form here; evaluate row by row.
    return [spearmanr(A[i], B[i])[0] for i in range(A.shape[0])]

In [10]:
# Pearson correlation between imputed and measured values, one per protein
# (the matrices are transposed so that each row is a protein).
per_protein_corr = corr2_coeff(adata_protein_test.layers["imputed"].T, adata_protein_test.X.T)

# Label by protein and drop the proteins whose correlation is undefined.
corrs = pd.DataFrame(per_protein_corr, index = adata_protein_test.var.index).dropna()

In [11]:
# Despite its name, MSEs holds the per-protein *root* mean squared error:
# the square root of the mean (over cells) of the squared residuals.
squared_residuals = (adata_protein_test.X - adata_protein_test.layers["imputed"]) ** 2
MSEs = squared_residuals.mean(axis = 0) ** 0.5

In [12]:
# corrs has been through dropna(), so it may cover fewer proteins than MSEs
# and the 'raw' layer, which still span every protein in adata_protein_test.
# Align both to the proteins that remain in corrs before combining them;
# otherwise the columns have different lengths and cannot be stacked.
kept_positions = adata_protein_test.var.index.get_indexer(corrs.index)
assert (kept_positions >= 0).all(), "corrs contains proteins missing from adata_protein_test"

rmse_per_protein = np.asarray(MSEs).ravel()[kept_positions]
raw_mean_per_protein = np.asarray(adata_protein_test.layers["raw"].mean(axis = 0)).ravel()[kept_positions]

protein_table = pd.DataFrame(
    np.column_stack((corrs.to_numpy()[:, 0], rmse_per_protein, raw_mean_per_protein)),
    index = corrs.index,
    columns = ["Correlations", "RMSE", "Mean Expression"],
)

protein_table["Log-Mean Expression"] = np.log(protein_table["Mean Expression"])

In [13]:
def sq(x, y):
    """Return the squared difference ``(x - y)**2`` (element-wise for arrays)."""
    return (x - y)**2

In [14]:
# One row per protein, one column per test patient.
test_patient_ids = np.unique(adata_protein_test.obs["patient"])
n_proteins = adata_protein_test.shape[1]

corrs_table = np.zeros((n_proteins, len(test_patient_ids)))
sq_table = corrs_table.copy()

for column, patient in enumerate(test_patient_ids):
    # Cells belonging to the current patient.
    in_patient = adata_protein_test.obs["patient"] == patient
    truth = adata_protein_test[in_patient].X.copy()
    imputed = adata_protein_test.layers["imputed"][in_patient].copy()

    # Per-protein correlation and mean squared error within this patient.
    corrs_table[:, column] = corr2_coeff(truth.T, imputed.T)
    sq_table[:, column] = sq(truth, imputed).mean(axis = 0)

# Undefined correlations are recorded as 0.
undefined = np.isnan(corrs_table)
if undefined.any():
    corrs_table[undefined] = 0

In [15]:
# Label both tables: rows are proteins, columns are test patients.
protein_labels = adata_protein_test.var.index
patient_labels = np.unique(adata_protein_test.obs["patient"])

corrs_table = pd.DataFrame(corrs_table, index = protein_labels, columns = patient_labels)
sq_table = pd.DataFrame(sq_table, index = protein_labels, columns = patient_labels)

In [16]:
# A notebook cell only echoes its final expression, and this cell ends with
# to_csv, so the summaries are printed explicitly.
mean_corr_per_patient = corrs_table.mean(axis = 0)
print(mean_corr_per_patient)
print(mean_corr_per_patient.mean())
corrs_table.to_csv('corrs_results/sctel_monocyte.csv')

In [None]:
# A notebook cell only echoes its final expression, and this cell ends with
# to_csv, so the summaries are printed explicitly.
mean_sq_per_patient = sq_table.mean(axis = 0)
print(mean_sq_per_patient)
print(mean_sq_per_patient.mean())
sq_table.to_csv('mse_results/sctel_monocyte.csv')

In [None]:
def interval_coverage(adata, lower_layer, upper_layer):
    """Fraction of entries of ``adata.X`` lying strictly inside an interval.

    Args:
        adata: Object exposing ``X`` and ``layers``.
        lower_layer: Name of the layer holding the lower bounds.
        upper_layer: Name of the layer holding the upper bounds.

    Returns:
        The mean of the element-wise test
        ``layers[lower_layer] < X < layers[upper_layer]``.
    """
    above_lower = adata.X > adata.layers[lower_layer]
    below_upper = adata.X < adata.layers[upper_layer]
    return (above_lower & below_upper).mean()


# (nominal level in percent, lower-bound layer, upper-bound layer)
for nominal, lower_layer, upper_layer in ((50, 'q25', 'q75'), (80, 'q10', 'q90')):
    coverage = interval_coverage(adata_protein_test, lower_layer, upper_layer)
    print(f"Effective Coverage Probability for Nominal {nominal}% PI: {coverage:.3f}")

In [None]:
# Persist the processed test proteins together with their layers.
features_path = "sctel_monocytefeatures.h5ad"
adata_protein_test.write(features_path)