In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import yaml

In [None]:
import matplotlib.pyplot as plt
# Typeface used for every matplotlib figure produced below.
plt.rcParams.update({'font.family': 'Arial'})

import scanpy as sc

# Output folders; created up front so that later cells can write into them.
Path("results/data").mkdir(parents=True, exist_ok=True)
Path("results/figures").mkdir(parents=True, exist_ok=True)

# File extension appended to every saved figure name.
figure_type = 'svg'

# scanpy writes figures requested through `save=` into this directory.
sc.settings.figdir = "results/figures"
sc.settings.set_figure_params(
    fontsize=12,
    color_map='RdYlGn',
    dpi=80,
    dpi_save=1000,
)

In [None]:
import sys

# Append ../../mylibs (relative to the working directory) to the module search path.
sys.path.append('../../mylibs')

In [None]:
# Directory holding config.yaml; per-sample paths in the processing cell are built below it.
results_dir = Path("../../data").absolute()
# Path of the filtered STARsolo GeneFull output, relative to a sample directory.
solo_out_filtered = "starsolo_outputs/Solo.out/GeneFull/filtered"

with open(results_dir / "config.yaml", 'r') as f:
    # safe_load returns None for an empty document; fall back to an empty mapping
    # so the lookup below cannot fail with AttributeError.
    config = yaml.safe_load(f) or {}

# Mapping of sample name -> sample settings. A missing `samples` key and a key
# that is present but empty (loads as None) both yield an empty dict, so
# `samples.items()` is always valid downstream.
samples = config.get('samples') or {}

In [None]:
# QC plots plus per-biotype t-SNE / UMAP embeddings for every sample whose STAR
# index is the combined GRCh38_and_GRCm39 reference.
# Reads `samples`, `results_dir`, `solo_out_filtered` and `figure_type` from earlier cells.

# False: figures are saved through scanpy's `save=` (into sc.settings.figdir) and not shown.
# True:  figures are shown and nothing is saved.
dontsave = False
for sample_name, sample in tqdm(samples.items()):
    sample_path = results_dir / sample_name / solo_out_filtered
    chem = sample["chem"]
    starindex = sample["starindex"]

    # Identify the reference from the STAR index name. The combined label is tested
    # first because the single-species labels are substrings of it.
    species = next(
        (
            label
            for label in ("GRCh38_and_GRCm39", "GRCh38", "GRCm39", "ChlSab")
            if label in starindex
        ),
        None,
    )
    if species is None:
        raise ValueError(f"Unknown species for {chem} with starindex {starindex}")

    # Only samples aligned to the combined reference are processed in this cell.
    if species != "GRCh38_and_GRCm39":
        continue

    adata = sc.read_h5ad(sample_path / "matrix.stats.h5ad")
    adata.X = adata.X.astype('float64')

    #### Drop genes whose UMI count summed over all cells is zero
    count = np.array(adata.X.sum(axis=0)).flatten()
    index = np.where(count > 0)[0]
    adata = adata[:, index].copy()

    #### Quality Control
    # Gene names are lower-cased once; all prefixes below are therefore lower case.
    gene_name_lower = adata.var['gene_name'].str.lower()
    # mitochondrial genes: the delimiter after "mt" is required. A bare
    # "<genome>_mt" prefix would also flag nuclear genes whose symbol merely
    # starts with "MT" (e.g. MTOR, MT1A) and inflate pct_counts_mt.
    adata.var["mt"] = gene_name_lower.str.startswith((
        "grch38_mt-", "grch38_mt_",
        "grcm39_mt-", "grcm39_mt_",
    ))
    # ribosomal genes
    adata.var["ribo"] = gene_name_lower.str.startswith((
        "grch38_rps", "grch38_rpl",
        "grcm39_rps", "grcm39_rpl",
    ))
    # hemoglobin genes
    # NOTE(review): because of the `$` anchor only names that end right after the
    # single letter match (e.g. "grch38_hbb"); numbered symbols such as "hba1" do
    # not. Confirm this is intended.
    adata.var["hb"] = gene_name_lower.str.contains('^(?:grch38|grcm39)_hb[abgdez]$')

    sc.pp.calculate_qc_metrics(adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
    sc.pl.violin(
        adata,
        ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
        jitter=0.4,
        multi_panel=True,
        show=dontsave,
        save=None if dontsave else f"_{sample_name}_qc.{figure_type}"
    )
    sc.pp.filter_cells(adata, min_genes=100)
    sc.pp.filter_genes(adata, min_cells=3)

    #### Doublet Detection
    # Scores are computed here; no cells are removed based on them in this cell.
    sc.pp.scrublet(adata)

    #### Normalization
    adata.layers["counts"] = adata.X.copy()  # Saving count data
    sc.pp.normalize_total(adata)  # Normalizing to median total counts
    sc.pp.log1p(adata)  # Logarithmize the data

    #### Dimensionality Reduction (PCA) and embeddings, once per gene-biotype group
    targets = ['all', 'protein_coding', 'lncRNA', 'others']
    for target in targets:
        if target == 'all':
            adata_select = adata.copy()
        elif target == 'others':
            # Every gene whose biotype is not one of the explicitly named groups.
            named_targets = [name for name in targets if name != 'others']
            adata_select = adata[:, ~adata.var['gene_biotype'].isin(named_targets)].copy()
        else:
            adata_select = adata[:, adata.var['gene_biotype'].isin([target])].copy()

        sc.tl.pca(adata_select)

        # Palette stored under the `<key>_colors` name for the `species` colouring
        # used in the plots below.
        adata_select.uns['species_colors'] = np.array(['#3A7D5E', '#D98758', '#7F7F7F'])
        # tSNE
        sc.tl.tsne(adata_select, n_pcs=30)
        sc.pl.tsne(
            adata_select,
            color='species',
            show=dontsave,
            save=None if dontsave else f"_{sample_name}_tsne_{target}.{figure_type}",
        )
        # UMAP
        sc.pp.neighbors(adata_select, n_pcs=30)
        sc.tl.umap(adata_select)
        sc.pl.umap(
            adata_select,
            color='species',
            show=dontsave,
            save=None if dontsave else f"_{sample_name}_umap_{target}.{figure_type}",
        )