In [1]:
import anndata
import scanpy as sc
import numpy as np
import pandas as pd

from matplotlib.pyplot import rc_context

In [2]:
adata = sc.read('/lmh_data/data/sclab/syn22092838/snRNA_counts.mtx')
genes = pd.read_csv('/lmh_data/data/sclab/syn22092838/genes.csv', header=None)
scRNA_metadata = pd.read_csv('/lmh_data/data/sclab/syn22092838/snRNA_metadta.csv', index_col=0)

adata.obs_names = genes.values.reshape(-1)
adata.var = scRNA_metadata
adata = adata.T

adata = adata[adata.obs[adata.obs['celltype'].isin(['ASC', 'OPC', 'ODC'])].index,:]
# adata.obs.celltype = adata.obs.celltype.cat.rename_categories({'ASC': 'Astro'})
adata

View of AnnData object with n_obs × n_vars = 44548 × 36114
    obs: 'Sample.ID', 'Batch', 'Sex', 'Age', 'Diagnosis', 'UMAP_1', 'UMAP_2', 'cluster', 'celltype'

In [3]:
train_set = np.load('/lmh_data/data/sclab/sclab/train_dataset.npy', allow_pickle=True)
scRNA_head = train_set[0]['scRNA_head']

In [4]:
gene_names = set(scRNA_head) & set(adata.var_names)
np.save('/lmh_data/data/sclab/sclab/AD/filter_genes.npy', np.array(gene_names))

In [5]:
rna = anndata.read_h5ad("/lmh_data/data/sclab/Human_M1_10x/scRNA.h5ad")
rna.obs.cell_type = rna.obs.cell_type.cat.rename_categories({'Oligo': 'ODC'})
rna = rna[rna.obs['cell_type'].isin(['Astro', 'OPC', 'ODC']),:]
rna

View of AnnData object with n_obs × n_vars = 3793 × 50281
    obs: 'cell_type', 'domain'

In [6]:
def rna_pca(rna):
    rna.X = rna.layers["counts"].copy()
    sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
    sc.pp.normalize_total(rna)
    sc.pp.log1p(rna)
    sc.pp.scale(rna)
    sc.tl.pca(rna, n_comps=100, svd_solver="auto")

    sc.pp.neighbors(rna, metric="cosine")
    sc.tl.umap(rna)
    sc.tl.leiden(rna)

In [7]:
_rna = rna[:, list(gene_names)].copy()
_adata = adata[:, list(gene_names)].copy()
# _adata = _adata[_adata.obs[_adata.obs['Diagnosis'] == 'Control'].index, :]

_adata.obs.loc[:, 'cell_type'] = _adata.obs.celltype
_rna.obs.loc[:, 'Diagnosis'] = 'Control'
_rna.obs.loc[:, 'category'] = 'control'
_adata.obs.loc[:, 'category'] = 'AD'

In [8]:
import scanorama
_combine = scanorama.correct_scanpy([_rna, _adata], return_dimred=True)
_combine

Found 20144 genes among all datasets
[[0.         0.88795149]
 [0.         0.        ]]
Processing datasets (0, 1)


  adata = AnnData(datasets[i])


[AnnData object with n_obs × n_vars = 3793 × 20144
     obs: 'cell_type', 'domain', 'Diagnosis', 'category'
     obsm: 'X_scanorama',
 AnnData object with n_obs × n_vars = 44548 × 20144
     obs: 'Sample.ID', 'Batch', 'Sex', 'Age', 'Diagnosis', 'UMAP_1', 'UMAP_2', 'cluster', 'celltype', 'cell_type', 'category'
     obsm: 'X_scanorama']

In [19]:
_rna.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
_combine[0].X.toarray()

array([[-1.9158385e-04,  1.0845234e-04, -1.5628726e-06, ...,
         5.6502991e-05, -4.5109562e-05,  6.5224263e-04],
       [-1.9265388e-04,  1.1382904e-04, -2.3168375e-06, ...,
         6.4990956e-05, -3.1886459e-05,  2.9698350e-03],
       [-1.6441097e-04,  7.8217083e-05, -1.9157599e-06, ...,
         6.4019288e-05, -2.9461862e-05,  6.2647217e-04],
       ...,
       [-1.9328964e-04,  1.1335802e-04, -8.4863723e-07, ...,
         3.0492058e-03, -5.8678357e-05,  6.5309752e-04],
       [-1.8978630e-04,  9.7540709e-05,  3.5983652e-07, ...,
         6.4111911e-05, -6.1675171e-05,  8.1646937e-04],
       [-1.9666333e-04,  1.1344179e-04, -8.4231391e-07, ...,
         4.0816816e-05, -5.6449182e-05,  7.8291129e-03]], dtype=float32)

In [25]:
_combine[0].obsm['X_scanorama']

array([[ 0.04195746,  0.09063238, -0.0047397 , ...,  0.01147386,
         0.00190099,  0.00195387],
       [-0.13169012,  0.23028693,  0.0530308 , ...,  0.00415152,
         0.01534598, -0.00530411],
       [-0.18655597,  0.02258294,  0.05581256, ..., -0.02383893,
         0.01660524, -0.00374544],
       ...,
       [ 0.22910912, -0.11740401, -0.22209272, ..., -0.01148135,
        -0.00640589,  0.00672929],
       [ 0.20079893, -0.10042473,  0.07463531, ...,  0.01512926,
         0.00322322,  0.01235884],
       [ 0.22203792, -0.068517  , -0.07283686, ..., -0.00058348,
         0.00656775, -0.00127542]])

In [30]:
_test = anndata.concat(_combine).copy()
_test.layers["counts"] = _test.X.copy()

rna_pca(_test)

with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(_test, color=['Diagnosis', 'cell_type', 'category'], add_outline=True, legend_loc='on data',
               legend_fontsize=12, legend_fontoutline=2, frameon=False, palette='Set1')

  x = np.log10(mean[not_const])


ValueError: b'Extrapolation not allowed with blending'

In [28]:
_test

AnnData object with n_obs × n_vars = 48341 × 20144
    obs: 'cell_type', 'Diagnosis', 'category'
    obsm: 'X_scanorama'
    layers: 'counts'

In [None]:
_combine

In [None]:
_combine.layers["counts"] = _combine.X.copy()

rna_pca(_combine)
with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(_combine, color=['leiden', 'cell_type', 'category'], add_outline=True, legend_loc='on data',
               legend_fontsize=12, legend_fontoutline=2, frameon=False, palette='Set1')

In [None]:
_rna = rna[:, list(gene_names)].copy()
_adata = adata[:, list(gene_names)].copy()
_adata = _adata[_adata.obs[_adata.obs['Diagnosis'] == 'Control'].index, :]

_adata.obs.loc[:, 'cell_type'] = _adata.obs.celltype
_rna.obs.loc[:, 'category'] = 'control'
_adata.obs.loc[:, 'category'] = 'AD'

# _proportion = _rna.X.max() / _adata.X.max()
# print(_proportion)
# _adata.X *= _proportion
# _adata.X = np.around(_adata.X.toarray()).astype(np.int32)

_combine = anndata.concat([_rna, _adata])
_combine.layers["counts"] = _combine.X.copy()

rna_pca(_combine)
with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(_combine, color=['leiden', 'cell_type', 'category'], add_outline=True, legend_loc='on data',
               legend_fontsize=12, legend_fontoutline=2, frameon=False, palette='Set1')

In [None]:
_rna = rna[:, list(gene_names)]
_adata = adata[:, list(gene_names)]

_adata.obs.loc[:, 'cell_type'] = _adata.obs.celltype
_rna.obs.loc[:, 'category'] = 'control'
_adata.obs.loc[:, 'category'] = 'AD'

_combine = anndata.concat([_rna, _adata])
_combine.layers["counts"] = _combine.X.copy()

rna_pca(_combine)
with rc_context({'figure.figsize': (5, 5)}):
    sc.pl.umap(_combine, color=['leiden', 'cell_type', 'category'], add_outline=True, legend_loc='on data',
               legend_fontsize=12, legend_fontoutline=2, frameon=False, palette='Set1')

In [None]:
_con = pd.DataFrame(_combine.obsp['connectivities'].toarray(), columns=_combine.obs_names, index=_combine.obs_names)
_con = _con.loc[
    _combine.obs[_combine.obs['category']=='control'].index,
    _combine.obs[_combine.obs['category']=='AD'].index
]

_con = _con[(_con.T != 0).any()].T
_con = _con[(_con.T != 0).any()].T
_con

In [None]:
_dataset = []
for AD_case in _con.columns:
    _dataset.append({
        'scRNA': adata[AD_case, list(gene_names)].X.toarray()[0].astype(np.int32),
        'scRNA_head': adata[AD_case, list(gene_names)].var_names,
        'cell_type': adata[AD_case, list(gene_names)].obs['celltype'].values[0],
    })
np.save('/lmh_data/data/sclab/sclab/AD/eval_dataset.npy', _dataset)