In [1]:
import sys
sys.path.append("..")
import warnings
warnings.filterwarnings('ignore')
import os
import scanpy as sc
import muon as mu
import pandas as pd

from scmamba2 import logger
from scmamba2.utils.metrics import biology_conservation, omics_mixing
from scmamba2.utils.evaluation import ARI

In [6]:
# Load the multi-modal dataset from disk; the bare name on the last line
# makes the notebook render the object's summary.
mdata_path = "../datasets/multiome/cite_BMMC_s1_500.h5mu"
mdata = mu.read(mdata_path)
mdata

In [4]:
# Show the distinct values of the per-cell 'batch' annotation.
batch_column = mdata.obs['batch']
batch_column.unique()

['s1d1', 's1d2', 's1d3', 's2d1', 's2d4', ..., 's3d6', 's3d7', 's4d1', 's4d8', 's4d9']
Length: 12
Categories (12, object): ['s1d1', 's1d2', 's1d3', 's2d1', ..., 's3d7', 's4d1', 's4d8', 's4d9']

In [2]:
# Load the saved embedding (concat.h5ad) produced by the benchmark run and
# display its summary as the cell output.
concat_emb_path = "../results/benckmark/multiome_BMMCbatchsize64projection_dim64/concat.h5ad"
concat_emb = sc.read_h5ad(concat_emb_path)
concat_emb

AnnData object with n_obs × n_vars = 138498 × 64
    obs: 'rna:GEX_pct_counts_mt', 'rna:GEX_n_counts', 'rna:GEX_n_genes', 'rna:GEX_size_factors', 'rna:GEX_phase', 'rna:ATAC_nCount_peaks', 'rna:ATAC_atac_fragments', 'rna:ATAC_reads_in_peaks_frac', 'rna:ATAC_blacklist_fraction', 'rna:ATAC_nucleosome_signal', 'rna:cell_type', 'rna:batch', 'rna:ATAC_pseudotime_order', 'rna:GEX_pseudotime_order', 'rna:Samplename', 'rna:Site', 'rna:DonorNumber', 'rna:Modality', 'rna:VendorLot', 'rna:DonorID', 'rna:DonorAge', 'rna:DonorBMI', 'rna:DonorBloodType', 'rna:DonorRace', 'rna:Ethnicity', 'rna:DonorGender', 'rna:QCMeds', 'rna:DonorSmoker', 'atac:GEX_pct_counts_mt', 'atac:GEX_n_counts', 'atac:GEX_n_genes', 'atac:GEX_size_factors', 'atac:GEX_phase', 'atac:ATAC_nCount_peaks', 'atac:ATAC_atac_fragments', 'atac:ATAC_reads_in_peaks_frac', 'atac:ATAC_blacklist_fraction', 'atac:ATAC_nucleosome_signal', 'atac:cell_type', 'atac:batch', 'atac:ATAC_pseudotime_order', 'atac:GEX_pseudotime_order', 'atac:Samplename'

In [3]:
# Score the embedding with omics_mixing, passing the per-cell 'cell_type'
# and 'modality' annotations as plain arrays.
logger.info("Calculating omics mixing metrics...")
cell_type_labels = concat_emb.obs['cell_type'].values
modality_labels = concat_emb.obs['modality'].values
omics_mixing_metrics = omics_mixing(concat_emb, cell_type_labels, modality_labels)
omics_mixing_metrics

scMamba - INFO - Calculating omics mixing metrics...


  counts = pd.value_counts(c)


{'omics entropy mixing score': 0.9897355481833218,
 'Seurat alignment score (omics)': 0.9829785005174084,
 'Graph connectivity': 0.6124171572338926,
 'ASW_omics': 0.7810567617416382}

In [4]:
# Score the embedding with biology_conservation against the 'cell_type'
# annotation. The call returns a pair: the metrics and a second value that
# is kept as best_res and printed before the metrics are displayed.
logger.info("Calculating biology conservation metrics...")
cell_type_labels = concat_emb.obs['cell_type'].values
conservation_result = biology_conservation(concat_emb, cell_type_labels)
biology_conservation_metrics, best_res = conservation_result
print(best_res)
biology_conservation_metrics

scMamba - INFO - Calculating biology conservation metrics...
1.0


{'ARI': 0.7798572644798041,
 'NMI': 0.7777392690075673,
 'Mean average precision': 0.742387808962666,
 'ASW_celltype': 0.6086784154176712}

In [3]:
# Sweep the Leiden resolution over 0.40, 0.41, ..., 0.59 and keep the value
# whose clustering scores the highest ARI against the 'cell_type' labels.
#
# Each sc.tl.leiden call overwrites concat_emb.obs["leiden"], so after the
# loop that column holds the clustering for the LAST resolution tried, not
# necessarily the best one.
max_res = 0
max_ARI = 0
true_labels = concat_emb.obs["cell_type"].values  # loop-invariant
for res in range(40, 60):
    res = res / 100.0
    sc.tl.leiden(concat_emb, resolution=res, flavor="igraph", n_iterations=2)
    ARI_score = ARI(true_labels, concat_emb.obs["leiden"].values)
    # This comparison must run once per resolution. It previously sat outside
    # the loop, so only the final resolution (0.59) was ever considered.
    if ARI_score > max_ARI:
        max_res = res
        max_ARI = ARI_score
print(max_res)
print(max_ARI)

0.59
0.4896434443335846


In [None]:
# Compute both metric groups for the embedding, merge them into one flat
# record, and append that record as a new row to <out_dir>/metrics.csv
# (creating the file when it does not exist yet).

# NOTE(review): `out_dir` was not assigned anywhere in this notebook, so the
# cell failed with NameError on a fresh kernel. It is set here to the
# directory that concat.h5ad was loaded from -- confirm this is the intended
# destination for metrics.csv.
out_dir = "../results/benckmark/multiome_BMMCbatchsize64projection_dim64"
metrics_path = f'{out_dir}/metrics.csv'

metrics = {}
# Grouped view of the same numbers; built here but not used later in this cell.
metrics_classified = {}

logger.info("Calculating biology conservation metrics...")
biology_conservation_metrics, best_res = biology_conservation(
    concat_emb, concat_emb.obs['cell_type'].values
)
logger.info("Calculating omics mixing metrics...")
omics_mixing_metrics = omics_mixing(
    concat_emb, concat_emb.obs['cell_type'].values, concat_emb.obs['modality'].values
)
metrics_classified['biology conservation'] = biology_conservation_metrics
metrics_classified['omics mixing'] = omics_mixing_metrics

metrics.update(biology_conservation_metrics)
metrics.update(omics_mixing_metrics)

# NOTE(review): the two keys below are misspelled ('epochs', 'best resolution').
# They are kept byte-identical because they become CSV column names; renaming
# them would misalign new rows with rows already stored in metrics.csv.
metrics['epcohs'] = 100
metrics['best resolutioin'] = best_res
# Disabled metric. It references `concate_embeds` and `args`, which are not
# defined in this notebook, so it cannot simply be uncommented.
# metrics['mean F1 silhouette'] = mean_F1_silhouette(
#     concate_embeds.X, 
#     cell_type=concate_embeds.obs['cell_type'].values,
#     omics=concate_embeds.obs['modality'].values,
#     device_id=args.device,
#     chunk_size=100000
# )
print(metrics)

# Append to the existing results table if there is one, otherwise start it.
new_metrics_df = pd.DataFrame([metrics])
if not os.path.exists(metrics_path):
    metrics_df = new_metrics_df
else:
    metrics_df = pd.read_csv(metrics_path)
    metrics_df = pd.concat([metrics_df, new_metrics_df], ignore_index=True)
metrics_df.to_csv(metrics_path, index=False)