In [None]:
from pathlib import Path
import yaml

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import scanpy as sc

# Font used for every matplotlib figure (alternative considered: 'Times New Roman').
plt.rcParams['font.family'] = 'Arial'

# Uncomment for verbose scanpy logging / a dependency version report.
# sc.settings.verbosity = 3
# sc.logging.print_versions()

# Output directories, relative to the notebook's working directory.
# They are created up front so later save calls cannot fail on a missing folder.
figs_savepath = Path("results/figures")
figs_savepath.mkdir(parents=True, exist_ok=True)
results_savepath = Path("results/data")
results_savepath.mkdir(parents=True, exist_ok=True)

figure_type = 'svg'
# Point scanpy at the same directory object instead of repeating the literal,
# so scanpy's own figures and the hand-made ones cannot end up in different places.
sc.settings.figdir = figs_savepath
sc.settings.set_figure_params(fontsize=12, color_map='RdYlGn', dpi=80, dpi_save=1000)

In [None]:
import sys

# Put the ../../mylibs directory (relative to the working directory) on the
# module search path so modules stored there can be imported by later cells.
sys.path.append('../../mylibs')

In [None]:
# Directory of the single sample inspected interactively in the next cells,
# anchored at the current working directory ('..' segments are left unresolved).
sample_path = Path.cwd() / "../../data/ZT-238"

In [None]:
# Filtered GeneFull output folder inside this sample's STARsolo results.
solo_out = sample_path.joinpath("starsolo_outputs/Solo.out/GeneFull/filtered")

In [None]:
# Species classification table for this sample; lines starting with '#'
# in the CSV are treated as comments and skipped.
gem_classification = pd.read_csv(
    solo_out.joinpath("gem_classification/gem_classification.csv"),
    comment='#',
    sep=',',
)

In [None]:
def stats_species_versus(gem_classification, show=False, sample_name=None,
                         save_dir=None, fig_format="svg"):
    """Summarise and plot the human / mouse / multiplet calls of one sample.

    Prints the count and percentage of rows for each call, draws a scatter of
    mouse versus human UMI counts (in thousands) coloured by call, and saves
    the figure as ``<sample_name>_species_versus.<fig_format>`` in ``save_dir``.

    Parameters
    ----------
    gem_classification : pandas.DataFrame
        Table with a ``call`` column (the values ``'GRCh38'``, ``'GRCm39'``
        and ``'Multiplet'`` are counted) and numeric ``GRCh38`` / ``GRCm39``
        columns holding the per-row counts that are plotted.
    show : bool, default False
        If True the figure is displayed with ``plt.show()``; otherwise it is
        closed after saving so that batch runs do not accumulate open figures.
    sample_name : str, optional
        Prefix of the output file name. Defaults to the name of the
        module-level ``sample_path``, which is what the function always used
        before this parameter existed.
    save_dir : path-like, optional
        Directory the figure is written to. Defaults to the module-level
        ``figs_savepath``.
    fig_format : str, default "svg"
        File extension / format passed on through the output file name.

    Returns
    -------
    None
    """
    # (value in the `call` column, legend name, marker colour)
    species_styles = (
        ('GRCh38', 'Human', '#f8766d'),
        ('GRCm39', 'Mouse', '#5f9dff'),
        ('Multiplet', 'Mixed', '#0bbc3f'),
    )

    # Fall back to the notebook globals only when the caller gave nothing,
    # which keeps existing two-argument calls working unchanged.
    if sample_name is None:
        sample_name = sample_path.name
    if save_dir is None:
        save_dir = figs_savepath

    total = len(gem_classification)
    calls = gem_classification['call']

    # Count each call type and its percentage of all rows.
    masks = {}
    legend_labels = {}
    for call, legend_name, _ in species_styles:
        mask = calls == call
        count = int(mask.sum())
        # An empty table would otherwise give NaN percentages (0 / 0).
        percent = count / total * 100 if total else 0.0
        print(f"{call}: {count} ({percent:.1f}%)")
        masks[call] = mask
        legend_labels[call] = f'{legend_name}: {count} ({percent:.1f}%)'

    fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
    try:
        for call, _, colour in species_styles:
            subset = gem_classification[masks[call]]
            ax.scatter(
                subset['GRCm39'].values / 1000,
                subset['GRCh38'].values / 1000,
                color=colour,
                s=5,
                label=legend_labels[call],
            )

        # Pin the origin at 0 and the upper bound at the largest observed count.
        # With no rows the maximum is NaN (and NaN > 0 is False); matplotlib
        # rejects NaN limits, so in that case only the lower bound is fixed.
        for set_limits, column in ((ax.set_xlim, 'GRCm39'), (ax.set_ylim, 'GRCh38')):
            upper = gem_classification[column].max()
            if upper > 0:
                set_limits(0, upper / 1000)
            else:
                set_limits(0, None)

        ax.set_xlabel('Mouse UMIs (k)', fontweight='bold')
        ax.set_ylabel('Human UMIs (k)', fontweight='bold')
        ax.legend()

        fig.savefig(
            Path(save_dir) / f"{sample_name}_species_versus.{fig_format}",
            dpi=300,
            bbox_inches='tight',
        )
    except Exception:
        # Do not leak the figure when plotting or saving fails mid-batch.
        plt.close(fig)
        raise

    if show:
        plt.show()
    else:
        plt.close(fig)

In [None]:
# Root data directory; the per-sample loop below looks up each sample as a
# sub-directory of it, and the shared config.yaml is read from it here.
results_dir = Path("../../data").absolute()

# yaml.safe_load returns None for an empty file, and a bare `samples:` key
# maps to None; fall back to an empty mapping in both cases so that iterating
# over `samples` later does not fail with an unrelated-looking error.
with open(results_dir / "config.yaml", 'r') as f:
    samples = (yaml.safe_load(f) or {}).get('samples') or {}

In [None]:
# Walk over every configured sample and produce the species-mixing plot for
# those aligned against the combined human + mouse index.
for sample_name, sample in tqdm(samples.items()):
    # These names stay module-level on purpose: stats_species_versus reads
    # `sample_path` to build its output file name.
    sample_path = results_dir / sample_name
    solo_out = sample_path / "starsolo_outputs/Solo.out/GeneFull/filtered"
    chem = sample["chem"]
    starindex = sample["starindex"]

    # First matching tag wins. The combined tag must be tested before the
    # single-species tags because its name contains both of them.
    species = next(
        (
            tag
            for tag in ("GRCh38_and_GRCm39", "GRCh38", "GRCm39", "ChlSab")
            if tag in starindex
        ),
        None,
    )
    if species is None:
        raise ValueError(f"Unknown species for {chem} with starindex {starindex}")

    # Only mixed-species runs have a gem classification table to plot.
    if species != "GRCh38_and_GRCm39":
        continue

    gem_classification = pd.read_csv(
        solo_out / "gem_classification/gem_classification.csv",
        sep=',',
        comment='#',
    )
    stats_species_versus(gem_classification, show=False)