In [1]:
# This is code to compute horizontal integration results
import os, sys
import numpy as np
import scanpy as sc
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
import scipy
from scipy import sparse
import importlib

import anndata as ad
from scipy.io import mmread, mmwrite

import matplotlib.pyplot as plt
import seaborn as sns

import scib
import scib_metrics
from scib_metrics.benchmark import Benchmarker

from typing import Any, Callable, Optional, Union

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [2]:
import os
os.chdir("../")
os.getcwd()

'/home/CMML_mini2_final'

In [3]:
results_path = "run_res/horizontal/GSE156478/" ## path to integration latent results
data_path = "data/GSE156478/" ## path to raw data and metadata
save_path = "bench_res"  ## path to save the metrics results

In [4]:
# original data load ----
# batch1
path = data_path+"/Control"
cell_names = pd.read_csv(path+'/RNA/barcodes.tsv', sep = '\t', header=None, index_col=None)
cell_names.columns =  ['cell_ids']
X = csr_matrix(mmread(path+'/RNA/matrix.mtx').T)
gene_names = pd.read_csv(path+'/RNA/features.tsv', sep = '\t',  header=None, index_col=None)
gene_names.columns =  ['gene_ids']
adata_RNA = ad.AnnData(X, obs=pd.DataFrame(index=cell_names.cell_ids), var=pd.DataFrame(index = gene_names.gene_ids))
adata_RNA.var_names_make_unique()
# batch2
path = data_path+"/Stim"
cell_names = pd.read_csv(path+'/RNA/barcodes.tsv', sep = '\t', header=None, index_col=None)
cell_names.columns =  ['cell_ids']
X = csr_matrix(mmread(path+'/RNA/matrix.mtx').T)
peak_name = pd.read_csv(path+'/RNA/features.tsv',header=None,index_col=None)
peak_name.columns = ['peak_ids']
adata_rna  = ad.AnnData(X, obs=pd.DataFrame(index=cell_names.cell_ids), var=pd.DataFrame(index = peak_name.peak_ids))
adata_rna.var['modality'] = ['Gene Expression']*adata_rna.shape[1]
del X
# # We can now use the organizing method from scvi to concatenate these anndata
sc.pp.filter_cells(adata_RNA, min_genes=1)
sc.pp.filter_genes(adata_RNA, min_cells=20)
sc.pp.filter_cells(adata_rna, min_genes=1)
sc.pp.filter_genes(adata_rna, min_cells=20)

adata_RNA.obs_names = ['Ctrl_' + item for item in adata_RNA.obs_names]
adata_rna.obs_names = ['Stim_' + item for item in adata_rna.obs_names]

adata = sc.concat([adata_RNA, adata_rna],axis=0)
adata.obs['batch'] = adata_RNA.shape[0]*['Ctrl'] + adata_rna.shape[0]*['Stim']
del adata_RNA, adata_rna

In [6]:
# annotation load ----
meta=pd.read_csv(data_path+"metadata.csv")
meta=meta.rename(columns={
    "stim":"cond",
    "predicted.celltype.l2":"cell_type"
})
meta['cell_type'].index = adata.obs_names
adata.obs['cell_type'] = meta['cell_type'].astype('category')

In [18]:
adata.obs['cell_type']

Ctrl_AAACAGCCAAAGCGGC-1    CD4 Naive
Ctrl_AAACAGCCAACAGGAT-1      CD4 TCM
Ctrl_AAACAGCCAAGGTACG-1    CD4 Naive
Ctrl_AAACAGCCAGGATGGC-1          gdT
Ctrl_AAACAGCCATTCAGCA-1      CD4 TCM
                             ...    
Stim_TTTGTGTTCGACAAAG-2    CD8 Naive
Stim_TTTGTTGGTAGCCATA-2    CD8 Naive
Stim_TTTGTTGGTAGCCTAA-2         Treg
Stim_TTTGTTGGTGCAATAT-2    CD4 Naive
Stim_TTTGTTGGTTGGGTTA-2      CD4 TCM
Name: cell_type, Length: 13383, dtype: category
Categories (14, object): ['B intermediate', 'B naive', 'CD4 Naive', 'CD4 TCM', ..., 'MAIT', 'NK', 'Treg', 'gdT']

In [19]:
metrics_list=[]
# get MIRA results
method="MIRA"
latent = pd.read_csv(results_path+"MIRA.csv", header = None)
latent.index = adata.obs_names
adata.obsm[method] = latent
sc.pp.neighbors(adata, use_rep=method)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="cluster")
scib.metrics.cluster_optimal_resolution(adata, cluster_key="cluster", label_key="cell_type")
# compute
ari = scib.metrics.ari(adata, cluster_key="cluster", label_key="cell_type")
iso_asw = scib.metrics.isolated_labels_asw(adata, label_key="cell_type", batch_key='batch', embed=method,  verbose = False)
nmi = scib.metrics.nmi(adata, cluster_key="cluster", label_key="cell_type")
# clisi = scib.metrics.clisi_graph(adata, label_key="cell_type",use_rep=method, type_='embed')
sht = scib.metrics.silhouette(adata, label_key="cell_type", embed=method, metric='euclidean', scale=True)
metrics_list.append([ari, iso_asw, nmi, sht, method])

resolution: 0.1, nmi: 0.0014104090999529533
resolution: 0.2, nmi: 0.0019911793550111993
resolution: 0.3, nmi: 0.0023790660766987584
resolution: 0.4, nmi: 0.002516287539276588
resolution: 0.5, nmi: 0.0030021291043373604
resolution: 0.6, nmi: 0.0031041453968779532
resolution: 0.7, nmi: 0.0034222241397319805
resolution: 0.8, nmi: 0.0036066195013077598
resolution: 0.9, nmi: 0.0036140030753408292
resolution: 1.0, nmi: 0.0038617952395246916
resolution: 1.1, nmi: 0.0037667428653479344
resolution: 1.2, nmi: 0.0038078714819605865
resolution: 1.3, nmi: 0.003960658997609565
resolution: 1.4, nmi: 0.004228056830362721
resolution: 1.5, nmi: 0.00452665713445788
resolution: 1.6, nmi: 0.004598854877541191
resolution: 1.7, nmi: 0.004895194035812542
resolution: 1.8, nmi: 0.005236382598387689
resolution: 1.9, nmi: 0.00496101520047561
resolution: 2.0, nmi: 0.005479034755023446
optimised clustering against cell_type
optimal cluster resolution: 2.0
optimal score: 0.005479034755023446


  batch_per_lab = tmp.groupby(label_key).agg({batch_key: "count"})


In [21]:
# benchmark res
methods = ["Seurat","PCA"]
for method in methods:
    con = mmread(results_path + method + '_connectivities.mtx')
    dis = mmread(results_path + method + '_distance.mtx')
    adata.uns['neighbors'] = {'connectivities_key': 'connectivities', 'distances_key': 'distances', 
                              'params': {'n_neighbors': 20, 'method': 'umap', 'random_state': 0, 
                              'metric': 'euclidean'}}
    adata.uns['neighbors']['distance'] = csr_matrix(dis)
    adata.uns['neighbors']['connectivities'] = csr_matrix(con)
    adata.obsp['distance'] = csr_matrix(dis)
    adata.obsp['connectivities'] = csr_matrix(con)
    # get clusters
    sc.tl.umap(adata, n_components=20)
    scib.metrics.cluster_optimal_resolution(adata, cluster_key="cluster", label_key="cell_type")
    # calculate metrics
    ari = scib.metrics.ari(adata, cluster_key="cluster", label_key="cell_type")
    iso_asw = scib.metrics.isolated_labels_asw(adata, label_key="cell_type", batch_key='batch', embed="X_umap",  verbose = False)
    nmi = scib.metrics.nmi(adata, cluster_key="cluster", label_key="cell_type")
    # clisi = scib.metrics.clisi_graph(adata, label_key="cell_type",use_rep=method, type_='embed')
    sht = scib.metrics.silhouette(adata, label_key="cell_type", embed="X_umap", metric='euclidean', scale=True)
    metrics_list.append([ari, iso_asw, nmi, sht, method])

         Falling back to preprocessing with `sc.pp.pca` and default params.


resolution: 0.1, nmi: 0.613138065665138
resolution: 0.2, nmi: 0.6052908097996086
resolution: 0.3, nmi: 0.5986507069726714
resolution: 0.4, nmi: 0.5807279748049293
resolution: 0.5, nmi: 0.5703397665675113
resolution: 0.6, nmi: 0.5482153565018572
resolution: 0.7, nmi: 0.5472498858005685
resolution: 0.8, nmi: 0.5401900131844285
resolution: 0.9, nmi: 0.5438165248460632
resolution: 1.0, nmi: 0.5320320831225241
resolution: 1.1, nmi: 0.5316158537826198
resolution: 1.2, nmi: 0.5241059632053282
resolution: 1.3, nmi: 0.5206865738758537
resolution: 1.4, nmi: 0.5201020443828489
resolution: 1.5, nmi: 0.5186160996044048
resolution: 1.6, nmi: 0.5139955948752638
resolution: 1.7, nmi: 0.5095346264798339
resolution: 1.8, nmi: 0.4988271770346021
resolution: 1.9, nmi: 0.5004196324357951
resolution: 2.0, nmi: 0.5016007903851485
optimised clustering against cell_type
optimal cluster resolution: 0.1
optimal score: 0.613138065665138


  batch_per_lab = tmp.groupby(label_key).agg({batch_key: "count"})

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(


resolution: 0.1, nmi: 0.6127586115384228
resolution: 0.2, nmi: 0.6316454623704831
resolution: 0.3, nmi: 0.6160760499018771
resolution: 0.4, nmi: 0.5799488067123815
resolution: 0.5, nmi: 0.5872744626516901
resolution: 0.6, nmi: 0.5853982972825982
resolution: 0.7, nmi: 0.5878068756435227
resolution: 0.8, nmi: 0.5620915057214709
resolution: 0.9, nmi: 0.5444789591806941
resolution: 1.0, nmi: 0.5494914142678594
resolution: 1.1, nmi: 0.5408018570506732
resolution: 1.2, nmi: 0.5396725368381445
resolution: 1.3, nmi: 0.533936323384717
resolution: 1.4, nmi: 0.5300330801725958
resolution: 1.5, nmi: 0.5251278913679921
resolution: 1.6, nmi: 0.5204394445729845
resolution: 1.7, nmi: 0.5217677528598919
resolution: 1.8, nmi: 0.5225466501110038
resolution: 1.9, nmi: 0.5188551448345727
resolution: 2.0, nmi: 0.5152961226521711
optimised clustering against cell_type
optimal cluster resolution: 0.2
optimal score: 0.6316454623704831


  batch_per_lab = tmp.groupby(label_key).agg({batch_key: "count"})

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(

This is where adjacency matrices should go now.
  warn(


In [22]:
metrics_list

[[0.00015868857652281935,
  0.4843852641788765,
  0.005479034755023446,
  0.4820467051759265,
  'MIRA'],
 [0.5805248544486977,
  0.5645443862304091,
  0.6131380656651377,
  0.5915808379650116,
  'Seurat'],
 [0.6229998110456757,
  0.565419359165909,
  0.631645462370483,
  0.5824712887406349,
  'PCA']]

In [23]:
# save
df = pd.DataFrame(metrics_list,columns = ['ari', 'iso_asw', 'nmi', 'sht','method'])
df['Dataset'] = "GSE156478"

bench_path="bench_res/horizontal"
df.to_csv(bench_path + "/metrics_result.csv",index = False)