In [1]:
import functools
import operator
import os
import sys

import anndata as ad
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scanpy as sc
import scipy.stats
import seaborn as sns
import sklearn.cluster
import torch

import sklearn.linear_model
import sklearn.preprocessing
import sklearn.metrics
from matplotlib import rcParams
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue
import matplotlib.pyplot as plt

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances


from utils.test_util import test_data_multiome, Siamese_Test_multiome
from utils.feature_util import feature_selection_multiome
import utils.cis_infer_util as cis_infer_util


  data = yaml.load(f.read()) or {}
  import pandas.util.testing as tm


### Sparsity Check

In [4]:
# Load the unsmoothed PBMC multiome data; the GEX matrix is replaced by its
# 'counts' layer before anything is measured.
input_mod1 = ad.read_h5ad('data/10XMultiome_GEX.PBMC.h5ad')
input_mod1.X = input_mod1.layers['counts'].copy()
input_mod2 = ad.read_h5ad('data/10XMultiome_ATAC.PBMC.h5ad')

# The 'Sparsity' column is the column-wise mean of (X > 0), i.e. the fraction
# of rows of X in which each feature has a value above zero.
gene_spar = pd.DataFrame(
    np.mean(input_mod1.X > 0, axis=0), columns=['Sparsity']
).assign(
    Smoothing='No',
    Feature='GEX',
    ID=input_mod1.var_names.to_numpy(),
)

peak_spar = pd.DataFrame(
    np.mean(input_mod2.X > 0, axis=0), columns=['Sparsity']
).assign(
    Smoothing='No',
    Feature='ATAC',
    ID=input_mod2.var_names.to_numpy(),
)

# Baseline (unsmoothed) rows for both modalities, genes first.
spar_df = pd.concat([gene_spar, peak_spar])


In [7]:
# Per-feature non-zero fraction for every kNN-smoothed dataset, appended to the
# unsmoothed baseline rows already held in `spar_df`.
#
# Only the baseline ('No' smoothing) rows are carried over, so executing this
# cell again rebuilds the same table instead of stacking the smoothed rows a
# second time.
spar_frames = [spar_df.loc[spar_df['Smoothing'] == 'No']]

for num_nn in [5,10,15,20]:
    input_mod1_sm = ad.read_h5ad('data/10XMultiome_GEX.PBMC.smoothed.%sNN.h5ad'%(num_nn))
    input_mod1_sm.X = input_mod1_sm.layers['smoothed_counts'].copy()

    input_mod2_sm = ad.read_h5ad('data/10XMultiome_ATAC.PBMC.smoothed.%sNN.h5ad'%(num_nn))
    input_mod2_sm.X = input_mod2_sm.layers['smoothed_counts'].copy()

    gene_spar_sm = pd.DataFrame(
        np.mean(input_mod1_sm.X > 0, axis=0), columns=['Sparsity']
    ).assign(
        Smoothing='%sNN'%(num_nn),
        Feature='GEX',
        ID=input_mod1_sm.var_names.to_numpy(),
    )

    peak_spar_sm = pd.DataFrame(
        np.mean(input_mod2_sm.X > 0, axis=0), columns=['Sparsity']
    ).assign(
        Smoothing='%sNN'%(num_nn),
        Feature='ATAC',
        ID=input_mod2_sm.var_names.to_numpy(),
    )

    # Collect the frames and concatenate once after the loop rather than
    # re-copying the growing table on every iteration.
    spar_frames.extend([gene_spar_sm, peak_spar_sm])

# Columns are selected by name (not by position) so the resulting order is the
# same no matter how many times the cell has been executed.
spar_df = pd.concat(spar_frames)[['ID', 'Sparsity', 'Smoothing', 'Feature']]

In [8]:
# Display the combined table (unsmoothed rows plus the 5/10/15/20-NN smoothed rows).
spar_df

Unnamed: 0,ID,Sparsity,Smoothing,Feature
0,MIR1302-2HG,0.000000,No,GEX
1,FAM138A,0.000000,No,GEX
2,OR4F5,0.000000,No,GEX
3,AL627309.1,0.007843,No,GEX
4,AL627309.3,0.000000,No,GEX
...,...,...,...,...
106051,chrX-155966975-155967181,0.135372,20NN,ATAC
106052,chrX-155997360-155997882,0.281769,20NN,ATAC
106053,chrX-156030027-156030149,0.020914,20NN,ATAC
106054,chrY-11332988-11334144,0.128893,20NN,ATAC


In [9]:
# Write the sparsity summary to CSV; index=False leaves the row index out of the file.
spar_df.to_csv('sparsity_sum.csv', index=False)

### 2 kb peak-gene correlation

In [2]:
# Load the unsmoothed GEX / ATAC AnnData objects used by the correlation cells below.
input_mod1 = ad.read_h5ad('data/10XMultiome_GEX.PBMC.h5ad')
# Use the 'counts' layer as the GEX expression matrix.
input_mod1.X = input_mod1.layers['counts'].copy()
input_mod2 = ad.read_h5ad('data/10XMultiome_ATAC.PBMC.h5ad')



In [3]:
# AnnData objects whose .var tables are turned into scglue Bed objects below.
rna = ad.read_h5ad("data/rna.h5ad")
atac = ad.read_h5ad("data/atac.h5ad")

# Add a `name` column holding the feature names, then keep only the rows
# flagged in the `d_highly_variable` column.
rna_var = rna.var.assign(name=rna.var_names)
atac_var = atac.var.assign(name=atac.var_names)
genes = scglue.genomics.Bed(rna_var.query("d_highly_variable"))
peaks = scglue.genomics.Bed(atac_var.query("d_highly_variable"))

tss = genes.strand_specific_start_site()
# NOTE(review): presumably 2000 bp upstream / 0 bp downstream of each TSS —
# confirm against the scglue `Bed.expand` signature.
promoters = tss.expand(2000, 0)

# Gene-by-peak matrix of the pcHi-C graph (rows: genes.index, columns:
# peaks.index), densified into a labelled DataFrame.
pchic_graph = nx.read_graphml("data/pchic.graphml.gz")
pchic_adjacency = biadjacency_matrix(
    pchic_graph, genes.index, peaks.index, weight=None, dtype=np.float32
)
pchic = pd.DataFrame(
    pchic_adjacency.toarray(),
    index=genes.index,
    columns=peaks.index,
)


In [4]:
# From here on `genes` is an array of the row labels of `pchic`, replacing the
# scglue Bed object previously bound to that name.
genes = pchic.index.to_numpy()
# Passed as the distance argument of cis_element_score (its log reports
# "Selecting peaks within 2000bp of the genes").
distance = 2000
# Path handed to cis_element_score as the TSS reference file.
ref_tss_fn = 'utils/hg38_ref_TSS.txt'


#### Non-smoothing

In [5]:
# Non-smoothing
score_columns = ['genes','peaks','tss_dist','pRegion','Spearman.cor','Method']
# Empty, labelled frame that the computed scores are concatenated onto.
cis_score_df = pd.DataFrame(np.zeros((0,6)), columns=score_columns)

# With return_pseudo_bulk = True the call yields four values: the score table,
# two pseudo-bulk objects (`rps`, `aps`) and a `label` table.
cis_score_tmp, rps, aps, label = cis_infer_util.cis_element_score(
    input_mod1, input_mod2, genes,
    distance, ref_tss_fn,
    use_rep='SiaNN',
    n_pseudocells=100,
    return_pseudo_bulk = True,
)
cis_score_tmp['Method'] = 'SiaNN'
cis_score_df = pd.concat([cis_score_df,cis_score_tmp])

Add suffix to cell names in RNA and ATAC data...
Clustering pseudocells...
Representation mode engaged, clustering based on combined SiaNN embedding...


  view_to_actual(adata)


Selecting peaks within 2000bp of the genes...
Zeros are kept and spearman correlation calculation will be done.


In [6]:
# Flag every scored (gene, peak) pair that is also linked in the pcHi-C matrix.
#
# The linked pairs are collected once (entries of `pchic` equal to 1, for the
# genes in `genes`) and each score row is then tested by set membership. This
# avoids re-scanning the whole `genes` column of `cis_score_df` twice for every
# gene, and the positional assignment does not rely on the row index of
# `cis_score_df` being unique.
pchic_pairs = {
    (gene, peak)
    for gene in genes
    for peak in pchic.columns[(pchic.loc[gene,:]==1).to_numpy()]
}
cis_score_df['is_pcHiC'] = np.fromiter(
    (pair in pchic_pairs
     for pair in zip(cis_score_df['genes'], cis_score_df['peaks'])),
    dtype=bool,
    count=len(cis_score_df),
)

# Make sure the output folder exists so the export works in a fresh checkout.
os.makedirs('correlation', exist_ok=True)
cis_score_df.to_csv('correlation/spr.cor.pbmc.2k.NoSm.csv',index=False)
    

In [7]:
# Resolve one cell type per pseudocell and attach it to both pseudo-bulk objects.
#
# value_counts() returns the (pseudocell, cell_type) pairs ordered by
# descending frequency; the two reset_index calls move both index levels back
# into columns.
pair_counts = label[['pseudocell','cell_type']].value_counts()
label = pair_counts.to_frame().reset_index(level=0).reset_index(level=0)

# Keep only the most frequent cell type of each pseudocell. Without this, a
# pseudocell that contains several cell types contributes several rows to the
# lookup below and the assignment into `.obs` fails on a length mismatch.
# When every pseudocell already has a single cell type this is a no-op.
label = label.drop_duplicates(subset='pseudocell', keep='first')

# Index by pseudocell id and reorder to match the rows of `rps.obs`, whose
# index is cast to float64 for the lookup.
label = label.set_index('pseudocell', drop=False)
label = label.loc[rps.obs.index.astype('float64'),:]

rps.obs['cell_types'] = label['cell_type'].to_numpy()
# NOTE(review): assumes `aps.obs` rows are in the same order as `rps.obs` — confirm.
aps.obs['cell_types'] = label['cell_type'].to_numpy()

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


In [8]:
# Side-by-side table of the two pseudo-bulk matrices, one column per feature,
# followed by the cell type taken from `label`.
rna_pseudo = pd.DataFrame(rps.X, columns=rps.var_names.to_numpy())
atac_pseudo = pd.DataFrame(aps.X, columns=aps.var_names.to_numpy())

pseudo_data = pd.concat([rna_pseudo, atac_pseudo], axis=1)
pseudo_data['cell_types'] = label['cell_type'].to_numpy()

In [10]:
# Save the unsmoothed pseudo-bulk table (row index included, as no index argument is passed).
pseudo_data.to_csv('correlation/pseudo_data.noSm.csv')

#### Smoothing

In [26]:
# Neighbourhood size that selects which pre-smoothed files are loaded.
num_nn = 20
gex_sm_path = 'data/10XMultiome_GEX.PBMC.smoothed.%sNN.h5ad'%(num_nn)
atac_sm_path = 'data/10XMultiome_ATAC.PBMC.smoothed.%sNN.h5ad'%(num_nn)

# In both modalities the 'smoothed_counts' layer becomes the working matrix.
input_mod1_sm = ad.read_h5ad(gex_sm_path)
input_mod1_sm.X = input_mod1_sm.layers['smoothed_counts'].copy()

input_mod2_sm = ad.read_h5ad(atac_sm_path)
input_mod2_sm.X = input_mod2_sm.layers['smoothed_counts'].copy()


In [27]:
# Smoothing
score_columns = ['genes','peaks','tss_dist','pRegion','Spearman.cor','Method']
# Empty, labelled frame that the computed scores are concatenated onto.
cis_score_df = pd.DataFrame(np.zeros((0,6)), columns=score_columns)

# Same call as for the unsmoothed data, but on the smoothed inputs; the
# 'Method' column records which smoothing level produced the scores.
cis_score_tmp, rps, aps, label = cis_infer_util.cis_element_score(
    input_mod1_sm, input_mod2_sm, genes,
    distance, ref_tss_fn,
    use_rep='SiaNN',
    n_pseudocells=100,
    return_pseudo_bulk = True,
)
cis_score_tmp['Method'] = '%sNN'%(num_nn)
cis_score_df = pd.concat([cis_score_df,cis_score_tmp])

Add suffix to cell names in RNA and ATAC data...
Clustering pseudocells...
Representation mode engaged, clustering based on combined SiaNN embedding...


  view_to_actual(adata)


Selecting peaks within 2000bp of the genes...
Zeros are kept and spearman correlation calculation will be done.


In [28]:
# Flag every scored (gene, peak) pair that is also linked in the pcHi-C matrix.
#
# The linked pairs are collected once (entries of `pchic` equal to 1, for the
# genes in `genes`) and each score row is then tested by set membership. This
# avoids re-scanning the whole `genes` column of `cis_score_df` twice for every
# gene, and the positional assignment does not rely on the row index of
# `cis_score_df` being unique.
pchic_pairs = {
    (gene, peak)
    for gene in genes
    for peak in pchic.columns[(pchic.loc[gene,:]==1).to_numpy()]
}
cis_score_df['is_pcHiC'] = np.fromiter(
    (pair in pchic_pairs
     for pair in zip(cis_score_df['genes'], cis_score_df['peaks'])),
    dtype=bool,
    count=len(cis_score_df),
)

# Make sure the output folder exists so the export works in a fresh checkout.
os.makedirs('correlation', exist_ok=True)
cis_score_df.to_csv('correlation/spr.cor.pbmc.2k.%sNN.csv'%(num_nn),index=False)
    

In [29]:
# Resolve one cell type per pseudocell and attach it to both pseudo-bulk objects.
#
# value_counts() returns the (pseudocell, cell_type) pairs ordered by
# descending frequency; the two reset_index calls move both index levels back
# into columns.
pair_counts = label[['pseudocell','cell_type']].value_counts()
label = pair_counts.to_frame().reset_index(level=0).reset_index(level=0)

# Keep only the most frequent cell type of each pseudocell. Without this, a
# pseudocell that contains several cell types contributes several rows to the
# lookup below and the assignment into `.obs` fails on a length mismatch.
# When every pseudocell already has a single cell type this is a no-op.
label = label.drop_duplicates(subset='pseudocell', keep='first')

# Index by pseudocell id and reorder to match the rows of `rps.obs`, whose
# index is cast to float64 for the lookup.
label = label.set_index('pseudocell', drop=False)
label = label.loc[rps.obs.index.astype('float64'),:]

rps.obs['cell_types'] = label['cell_type'].to_numpy()
# NOTE(review): assumes `aps.obs` rows are in the same order as `rps.obs` — confirm.
aps.obs['cell_types'] = label['cell_type'].to_numpy()

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


In [30]:
# Side-by-side table of the two smoothed pseudo-bulk matrices, one column per
# feature, followed by the cell type taken from `label`.
rna_pseudo = pd.DataFrame(rps.X, columns=rps.var_names.to_numpy())
atac_pseudo = pd.DataFrame(aps.X, columns=aps.var_names.to_numpy())

pseudo_data = pd.concat([rna_pseudo, atac_pseudo], axis=1)
pseudo_data['cell_types'] = label['cell_type'].to_numpy()

# File name carries the smoothing level; the row index is written as well.
pseudo_data.to_csv('correlation/pseudo_data.%sNN.csv'%(num_nn))

#### True Pair

In [20]:
# Preprocess the RNA object ahead of the paired-mode run. The return values of
# these scanpy calls are not captured, so whatever they compute is kept on
# `input_mod1` itself.
sc.pp.normalize_total(input_mod1)
sc.pp.log1p(input_mod1)
sc.pp.pca(input_mod1, n_comps=30)
sc.pp.neighbors(input_mod1, n_neighbors=10)
sc.tl.umap(input_mod1)
# Put the 'counts' layer back into X after the normalisation / log steps above.
# NOTE(review): presumably so cis_element_score works on counts while reusing
# the PCA / neighbour results computed here — confirm against cis_infer_util.
input_mod1.X = input_mod1.layers['counts'].copy()

In [21]:
# True pair
score_columns = ['genes','peaks','tss_dist','pRegion','Spearman.cor','Method']
# Empty, labelled frame that the computed scores are concatenated onto.
cis_score_df = pd.DataFrame(np.zeros((0,6)), columns=score_columns)

# With return_pseudo_bulk = False only the score table is returned.
cis_score_paired = cis_infer_util.cis_element_score(
    input_mod1, input_mod2, genes,
    distance, ref_tss_fn,
    use_rep = 'paired',
    n_pseudocells=100,
    return_pseudo_bulk = False,
)
cis_score_paired['Method'] = 'True_pair'
cis_score_df = pd.concat([cis_score_df,cis_score_paired])

Add suffix to cell names in RNA and ATAC data...
Clustering pseudocells...
Paired mode engaged, clustering based on RNA dataset...


Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
  view_to_actual(adata)


Selecting peaks within 2000bp of the genes...
Zeros are kept and spearman correlation calculation will be done.


In [22]:
# Flag every scored (gene, peak) pair that is also linked in the pcHi-C matrix.
#
# The linked pairs are collected once (entries of `pchic` equal to 1, for the
# genes in `genes`) and each score row is then tested by set membership. This
# avoids re-scanning the whole `genes` column of `cis_score_df` twice for every
# gene, and the positional assignment does not rely on the row index of
# `cis_score_df` being unique.
pchic_pairs = {
    (gene, peak)
    for gene in genes
    for peak in pchic.columns[(pchic.loc[gene,:]==1).to_numpy()]
}
cis_score_df['is_pcHiC'] = np.fromiter(
    (pair in pchic_pairs
     for pair in zip(cis_score_df['genes'], cis_score_df['peaks'])),
    dtype=bool,
    count=len(cis_score_df),
)

# Make sure the output folder exists so the export works in a fresh checkout.
os.makedirs('correlation', exist_ok=True)
cis_score_df.to_csv('correlation/spr.cor.pbmc.2k.Pair.csv',index=False)
    