In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.family'] = 'Arial'
import scanpy as sc
import yaml
from pathlib import Path
from tqdm.auto import tqdm

In [None]:
# Add ../../mylibs to the module search path so that modules located there
# can be imported by later cells.
import sys
sys.path.append('../../mylibs')

In [None]:
# Directory holding config.yaml; each sample's outputs are looked up beneath it.
results_dir = Path("../../data").absolute()
# Location of the filtered matrix directory, relative to a sample's directory.
solo_out_filtered = "starsolo_outputs/Solo.out/GeneFull/filtered"

# yaml.safe_load returns None for an empty document, and a bare `samples:` key
# also loads as None; fall back to an empty mapping in both cases so later
# code can always iterate over `samples.items()`.
with open(results_dir / "config.yaml", 'r') as f:
    config = yaml.safe_load(f) or {}
samples = config.get('samples') or {}

In [None]:
def plot_stats_bar(data, title, xlabel, ylabel, color, saveto=None):
    """Draw a bar chart with a logarithmic y-axis, one bar per entry of ``data``.

    Parameters
    ----------
    data : mapping
        Keys become the x tick labels and values the bar heights, in the
        mapping's iteration order.
    title, xlabel, ylabel : str
        Axes title and axis labels (the axis labels are rendered in bold).
    color
        Bar colour, passed straight to ``Axes.bar``.
    saveto : path-like or None
        When given, the figure is written there and then closed; when None
        the figure is left open.

    Returns
    -------
    (fig, ax)
        The figure and axes that were created (returned even after closing).
    """
    labels = list(data.keys())
    heights = list(data.values())
    n_bars = len(labels)
    positions = range(n_bars)
    width = 0.6

    fig, ax = plt.subplots(figsize=(12, 5), dpi=300)
    ax.bar(positions, heights, width=width, color=color)
    ax.set_yscale('log')

    ax.set_title(title)
    ax.set_xlabel(xlabel, fontweight='bold')
    ax.set_ylabel(ylabel, fontweight='bold')

    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    # Leave one bar-width of space to the left of the first bar.
    ax.set_xlim(-width, n_bars)
    fig.subplots_adjust(left=0.08, right=0.98, bottom=0.25, top=0.92, wspace=0.2)

    if saveto is None:
        return fig, ax

    fig.savefig(saveto, bbox_inches='tight')
    # Saved figures are closed right away so they do not accumulate in loops.
    plt.close(fig)
    return fig, ax

In [None]:
def stats_umis_genes(adata):
    """Summarise ``adata.X`` per gene biotype.

    For every distinct value of ``adata.var['gene_biotype']`` the columns
    (genes) with that biotype are selected, and two numbers are recorded:
    the sum of all entries of ``X`` for those genes, and how many of those
    genes have a non-zero column sum.

    Returns
    -------
    (dict, dict)
        ``biotype -> summed X`` and ``biotype -> number of non-zero genes``,
        each ordered from largest to smallest value.
    """
    def _largest_first(mapping):
        # sorted() is stable, so biotypes with equal values keep the order in
        # which they first appeared.
        return dict(sorted(mapping.items(), key=lambda item: item[1], reverse=True))

    biotype_of_gene = adata.var['gene_biotype']
    umi_totals, gene_totals = {}, {}
    for biotype in biotype_of_gene.unique():
        subset = adata[:, biotype_of_gene == biotype]
        umi_totals[biotype] = subset.X.sum()
        # Count the genes whose expression summed over all cells is non-zero.
        per_gene_sum = subset.X.sum(axis=0)
        gene_totals[biotype] = (per_gene_sum != 0).sum()

    return _largest_first(umi_totals), _largest_first(gene_totals)

In [None]:
# For every sample in the config: load its matrix.stats.h5ad, compute the
# per-biotype statistics, and draw one UMI plot and one gene-count plot per
# genome assembly present in the sample.
dontsave = False  # True: do not write SVG files (figures are then left open)

# Bar colours per assembly: (UMI plot, gene-count plot).
assembly_colors = {
    "GRCh38": ('#c9deba', '#a8c78d'),
    "GRCm39": ('#edccbd', '#e2aa90'),
}

for sample_name, sample in tqdm(samples.items()):
    sample_path = results_dir / sample_name / solo_out_filtered
    chem = sample["chem"]
    starindex = sample["starindex"]

    # The combined index must be tested first: its name also contains both
    # single-assembly names as substrings.
    species = None
    if "GRCh38_and_GRCm39" in starindex:
        species = "GRCh38_and_GRCm39"
    elif "GRCh38" in starindex:
        species = "GRCh38"
    elif "GRCm39" in starindex:
        species = "GRCm39"
    elif "ChlSab" in starindex:
        species = "ChlSab"
    else:
        raise ValueError(f"Unknown species for {chem} with starindex {starindex}")

    adata = sc.read_h5ad(sample_path / "matrix.stats.h5ad")

    # Decide which cells feed each assembly's plots.
    if species == "GRCh38_and_GRCm39":
        # Mixed sample: split the cells by their 'species' annotation.
        cells_by_assembly = {
            assembly: adata[adata.obs['species'] == assembly]
            for assembly in ("GRCh38", "GRCm39")
        }
    elif species in assembly_colors:
        # Single-assembly sample: use the whole matrix. There is no
        # per-species subset here, so nothing from an earlier sample may be
        # reused.
        cells_by_assembly = {species: adata}
    else:
        # ChlSab: no plots are produced for this index.
        cells_by_assembly = {}

    for assembly, cells in cells_by_assembly.items():
        sorted_counts_umi, sorted_counts_gene = stats_umis_genes(cells)
        umi_color, gene_color = assembly_colors[assembly]
        plot_specs = (
            (sorted_counts_umi, 'UMI', 'Number of UMIs', umi_color),
            (sorted_counts_gene, 'gene', 'Count of genes', gene_color),
        )
        for counts, tag, ylabel, color in plot_specs:
            saveto = None if dontsave else f'results/figures/{sample_name}_{assembly}_{tag}_stats.svg'
            if saveto is not None:
                Path(saveto).parent.mkdir(parents=True, exist_ok=True)
            fig, ax = plot_stats_bar(counts, '', 'Gene biotype', ylabel, color, saveto)