In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot
import os
# os.chdir('/home/yychen/sciPENN_codes-master/Experiments')
from copy import deepcopy

from time import time

from math import ceil
from scipy.stats import spearmanr, gamma, poisson

from anndata import AnnData, read_h5ad
import scanpy as sc
from scanpy import read
import pandas as pd

from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from torch.cuda import is_available

from sciPENN.sciPENN_API import sciPENN_API

In [2]:
"""Read in Raw Data"""
# PBMC gene and protein AnnData files; both are handed to sciPENN_API as the
# training inputs in the next cell.
adata_gene = sc.read("../Data/pbmc/pbmc_gene.h5ad")
adata_protein = sc.read("../Data/pbmc/pbmc_protein.h5ad")

# H1N1 gene matrix (transposed right after loading), labelled with the first
# data column of gene_names.txt and annotated with the per-cell metadata table.
adata_gene_test = sc.read("../Data/H1N1/gene_data.mtx").T
adata_gene_test.var.index = pd.read_csv(
    "../Data/H1N1/gene_names.txt",
    index_col=0,
).iloc[:, 0]
adata_gene_test.obs = pd.read_csv(
    "../Data/H1N1/meta_data.txt",
    sep=',',
    index_col=0,
)

In [3]:
# Set up sciPENN with one training gene/protein pair and the H1N1 gene data as
# the test set; 'donor' and 'sample' are the obs columns used as batch keys.
sciPENN = sciPENN_API(
    [adata_gene],
    [adata_protein],
    adata_gene_test,
    train_batchkeys=['donor'],
    test_batchkey='sample',
)

Searching for GPU
GPU detected, using GPU

QC Filtering Training Cells
QC Filtering Testing Cells

QC Filtering Training Genes
QC Filtering Testing Genes

Normalizing Training Cells
Normalizing Testing Cells

Log-Normalizing Training Data
Log-Normalizing Testing Data

Finding HVGs


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)



Normalizing Gene Training Data by Batch


100%|██████████| 8/8 [00:09<00:00,  1.22s/it]



Normalizing Protein Training Data by Batch


100%|██████████| 8/8 [00:09<00:00,  1.15s/it]



Normalizing Gene Testing Data by Batch


100%|██████████| 20/20 [00:02<00:00,  7.88it/s]


In [4]:
# Measured H1N1 protein matrix (transposed after loading).
adata_protein_test = sc.read("../Data/H1N1/protein_data.mtx").T

# Protein labels: each name from protein_names.txt with its trailing five
# characters sliced off.
adata_protein_test.var.index = [
    x[:len(x) - 5]
    for x in pd.read_csv("../Data/H1N1/protein_names.txt", index_col=0).iloc[:, 0]
]

# Same per-cell metadata table that annotates the H1N1 gene data.
adata_protein_test.obs = pd.read_csv(
    "../Data/H1N1/meta_data.txt",
    sep=',',
    index_col=0,
)

In [5]:
# Train the model (load=False, so no previously saved weights are requested).
# NOTE(review): ES_max / decay_max / decay_step presumably control early
# stopping and learning-rate decay -- confirm against the sciPENN docs.
sciPENN.train(
    n_epochs=10000,
    ES_max=12,
    decay_max=8,
    decay_step=0.1,
    lr=10**(-3),
    weights_dir="weights_dir/pbmc_to_h1n1",
    load=False,
)

Epoch 0 prediction loss = 1.398
Epoch 1 prediction loss = 0.964
Epoch 2 prediction loss = 0.962
Epoch 3 prediction loss = 0.965
Epoch 4 prediction loss = 0.980
Epoch 5 prediction loss = 0.968
Epoch 6 prediction loss = 0.996
Epoch 7 prediction loss = 0.990
Epoch 8 prediction loss = 1.008
Decaying loss to 0.0005
Epoch 9 prediction loss = 0.950
Epoch 10 prediction loss = 0.943
Epoch 11 prediction loss = 0.933
Epoch 12 prediction loss = 0.951
Epoch 13 prediction loss = 0.950
Epoch 14 prediction loss = 0.940
Epoch 15 prediction loss = 0.975
Epoch 16 prediction loss = 0.936
Epoch 17 prediction loss = 0.947
Epoch 18 prediction loss = 0.978
Decaying loss to 0.00025
Epoch 19 prediction loss = 0.929
Epoch 20 prediction loss = 0.931
Epoch 21 prediction loss = 0.914
Epoch 22 prediction loss = 0.931
Epoch 23 prediction loss = 0.919
Epoch 24 prediction loss = 0.936
Epoch 25 prediction loss = 0.934
Epoch 26 prediction loss = 0.923
Epoch 27 prediction loss = 0.933
Epoch 28 prediction loss = 0.920
Deca

In [8]:
# Predict protein expression for the test cells. Later cells read the result's
# .obs.index, .var.index, .X and .layers, i.e. it is used as an AnnData.
imputed_test = sciPENN.predict()

  imputed_test = AnnData(zeros(shape=(len(cells), len(proteins.var))))


In [6]:
# Compute the sciPENN embedding and persist it as an .h5ad file in the current
# working directory.
embedding = sciPENN.embed()
embedding.write("scipenn_pbmctoh1n1embedding.h5ad")

  embedding = AnnData(zeros(shape=(len(cells_train) + len(cells_test), 512)))


In [9]:
"""Get test data"""

# Reload the measured H1N1 protein data from disk so this cell can be re-run
# on its own without inheriting an already-normalised object.
adata_protein_test = sc.read("../Data/H1N1/protein_data.mtx").T
adata_protein_test.var.index = [x[:len(x) - 5] for x in pd.read_csv("../Data/H1N1/protein_names.txt", index_col = 0).iloc[:,0]]
adata_protein_test.obs = pd.read_csv("../Data/H1N1/meta_data.txt", sep = ',', index_col = 0)

adata_protein_test.X = adata_protein_test.X.toarray()
# Keep the raw counts in their own array. Without the copy, "raw" and X are the
# same object and "raw" only survives the normalisation below because AnnData
# happens to copy when a view is modified.
adata_protein_test.layers["raw"] = adata_protein_test.X.copy()

# Restrict to (and order by) the cells that have predictions.
adata_protein_test = adata_protein_test[imputed_test.obs.index]

sc.pp.normalize_total(adata_protein_test)
sc.pp.log1p(adata_protein_test)

# Only proteins present in both the prediction and the measurement are scored.
common_proteins = np.intersect1d(imputed_test.var.index, adata_protein_test.var.index)

adata_protein_test = adata_protein_test[:, common_proteins]
adata_protein_test.layers['imputed'] = imputed_test[:, common_proteins].X
# Carry over every extra layer of the prediction (the coverage cells further
# down read 'q10', 'q25', 'q75' and 'q90' from here).
adata_protein_test.layers.update(imputed_test[:, common_proteins].layers)

patients = np.unique(adata_protein_test.obs['sample'].values)

# Scale the measured values within each sample separately; only X is rewritten.
for patient in patients:
    # Vectorised boolean mask instead of a per-cell Python comparison.
    indices = (adata_protein_test.obs['sample'] == patient).to_numpy()
    sub_adata = adata_protein_test[indices]

    sc.pp.scale(sub_adata)
    adata_protein_test[indices] = sub_adata.X

  view_to_actual(adata)
  view_to_actual(adata)


In [10]:
def corr2_coeff(A, B, pearson = True):
    """Correlate each row of ``A`` with the same-numbered row of ``B``.

    Parameters
    ----------
    A, B : array-like, 2-D, identical shape
        Row ``i`` of ``A`` is paired with row ``i`` of ``B``.
    pearson : bool, default True
        If True compute Pearson correlations, otherwise Spearman rank
        correlations via ``scipy.stats.spearmanr``.

    Returns
    -------
    numpy.ndarray (Pearson) or list of float (Spearman)
        One coefficient per row. A row with zero variance yields ``nan``.
    """
    if pearson:
        A = np.asarray(A)
        B = np.asarray(B)

        # Centre every row on its own mean.
        A_mA = A - A.mean(1)[:, None]
        B_mB = B - B.mean(1)[:, None]

        # Row-wise sums of squares.
        ssA = (A_mA**2).sum(1)
        ssB = (B_mB**2).sum(1)

        # Only matching rows are needed, so compute the row-wise covariance
        # directly instead of building the full row-by-row correlation matrix
        # and reading off its diagonal.
        return (A_mA * B_mB).sum(1) / np.sqrt(ssA * ssB)

    return [spearmanr(A[i], B[i])[0] for i in range(A.shape[0])]

In [11]:
"""Compute correlation across patients"""

# One Pearson correlation per protein over all cells pooled together (imputed
# vs. measured); proteins whose correlation is NaN are discarded.
corrs = pd.DataFrame(
    corr2_coeff(adata_protein_test.layers["imputed"].T, adata_protein_test.X.T),
    index=adata_protein_test.var.index,
).dropna()

In [12]:
# Per-protein error over all cells. The square root is applied here, so despite
# the variable name these values are RMSEs (and are labelled as such below).
MSEs= ((adata_protein_test.X - adata_protein_test.layers["imputed"])**2).mean(axis = 0)**(1/2)

# `corrs` had its NaN rows dropped, so it can hold fewer proteins than the
# AnnData. Select the matching rows of the other columns; otherwise the
# concatenation fails with a shape mismatch whenever a protein was dropped.
kept_proteins = adata_protein_test.var.index.isin(corrs.index)

protein_table = pd.DataFrame(
    np.concatenate(
        (
            corrs.to_numpy(),
            np.expand_dims(MSEs, axis = 1)[kept_proteins],
            adata_protein_test.layers["raw"].mean(axis = 0, keepdims = True).T[kept_proteins],
        ),
        axis = 1,
    ),
    index = corrs.index,
    columns = ["Correlations", "RMSE", "Mean Expression"],
)

# Natural log of the mean of the "raw" layer for each protein.
protein_table["Log-Mean Expression"] = np.log(protein_table["Mean Expression"])

In [13]:
def sq(x, y):
    """Return the squared difference ``(x - y)**2`` (elementwise for arrays)."""
    return (x - y)**2

In [14]:
"""Compute correlations within patient"""

# Sample labels are computed once and reused for sizing and iteration.
sample_ids = np.unique(adata_protein_test.obs["sample"])

# Rows are proteins, columns are samples.
corrs_table = np.zeros((adata_protein_test.shape[1], len(sample_ids)))
sq_table = corrs_table.copy()

for i, patient in enumerate(sample_ids):
    # One boolean mask per sample, shared by the measured and imputed slices.
    in_patient = (adata_protein_test.obs["sample"] == patient).to_numpy()
    truth = adata_protein_test[in_patient].X.copy()
    imputed = adata_protein_test.layers["imputed"][in_patient].copy()

    # Transposed so that each row handed to corr2_coeff is one protein.
    corrs_table[:, i] = corr2_coeff(truth.T, imputed.T)
    sq_table[:, i] = sq(truth, imputed).mean(axis = 0)

# Undefined correlations are scored as 0; this is a no-op when there are no
# NaNs. Note that these zeros enter every average taken over corrs_table.
corrs_table[np.isnan(corrs_table)] = 0

In [15]:
# Wrap both metric arrays as DataFrames: proteins on the rows, sample labels
# on the columns.
corrs_table = pd.DataFrame(
    corrs_table,
    index=adata_protein_test.var.index,
    columns=np.unique(adata_protein_test.obs["sample"]),
)

sq_table = pd.DataFrame(
    sq_table,
    index=adata_protein_test.var.index,
    columns=np.unique(adata_protein_test.obs["sample"]),
)

In [16]:
# Mean within-sample correlation for each sample, averaged over proteins (rows).
corrs_table.mean(axis = 0)

200_d0    0.523797
201_d0    0.490595
205_d0    0.502941
207_d0    0.491734
209_d0    0.507680
212_d0    0.503582
215_d0    0.519087
229_d0    0.487023
233_d0    0.493979
234_d0    0.528349
236_d0    0.483877
237_d0    0.499614
245_d0    0.486768
250_d0    0.503890
256_d0    0.534520
261_d0    0.500027
268_d0    0.503700
273_d0    0.469704
277_d0    0.519644
279_d0    0.497677
dtype: float64

In [None]:
# Single summary number: the average of the per-sample column means.
# NaN correlations were replaced by 0 when the table was built, so they count
# as zeros here.
corrs_table.mean().mean()

In [19]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps a fresh checkout runnable end to end).
os.makedirs('corrs_results', exist_ok=True)
corrs_table.to_csv('corrs_results/scipenn_pbmctoh1n1.csv')

In [None]:
# Mean squared error for each sample, averaged over proteins (rows).
sq_table.mean(axis = 0)

In [21]:
# Single summary number: the average of the per-sample mean squared errors.
sq_table.mean().mean()

0.694491784375603

In [22]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps a fresh checkout runnable end to end).
os.makedirs('mse_results', exist_ok=True)
sq_table.to_csv('mse_results/scipenn_pbmctoh1n1.csv')

In [None]:
# Empirical coverage of the interval bounded by the 'q25' and 'q75' layers:
# the fraction of (cell, protein) entries of X lying strictly inside it.
# NOTE(review): the names r95/l95 are misleading here -- r95 is the
# upper-bound check (q75) and l95 the lower-bound check (q25).
# NOTE(review): assumes the 'q25'/'q75' layers were copied over from the
# prediction's layers -- confirm the keys exist.
r95 = (adata_protein_test.X < adata_protein_test.layers['q75'])
l95 = (adata_protein_test.X > adata_protein_test.layers['q25'])

# Multiplying the two boolean arrays acts as an elementwise AND.
print(f"Effective Coverage Probability for Nominal 50% PI: {(r95*l95).mean():.3f}")

In [None]:
# Empirical coverage of the interval bounded by the 'q10' and 'q90' layers:
# the fraction of (cell, protein) entries of X lying strictly inside it.
# NOTE(review): the names r95/l95 are misleading here -- r95 is the
# upper-bound check (q90) and l95 the lower-bound check (q10). This cell also
# overwrites the r95/l95 computed for the 50% interval.
r95 = (adata_protein_test.X < adata_protein_test.layers['q90'])
l95 = (adata_protein_test.X > adata_protein_test.layers['q10'])

# Multiplying the two boolean arrays acts as an elementwise AND.
print(f"Effective Coverage Probability for Nominal 80% PI: {(r95*l95).mean():.3f}")

In [25]:
# Persist the evaluation AnnData (scaled X plus the 'raw', 'imputed' and
# copied prediction layers) to the current working directory.
adata_protein_test.write("scipenn_pbmctoh1n1features.h5ad")