In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import yaml
import csv
from tqdm.auto import tqdm
from pathlib import Path

In [None]:
import sys
# Put the project-local library directory on the import path before importing
# from it. The entry is a relative path, so it is resolved against the
# kernel's current working directory rather than this notebook's location.
sys.path.extend(['../../mylibs'])
import scAnalysis_util
# NOTE(review): presumably the cellranger package is also provided through
# ../../mylibs rather than being installed in the environment -- confirm.
import cellranger.matrix as cr_matrix
from cellranger.analysis import multigenome as cra_multigenome

In [None]:
# Root of the pipeline outputs, and the sub-path (below each sample's
# directory) that holds the filtered STARsolo GeneFull matrix.
results_dir = Path("results")
solo_out = "starsolo_outputs/Solo.out/GeneFull/filtered"

# Load the sample table from the run configuration.
with open(results_dir / "config.yaml", 'r') as f:
    # safe_load returns None for an empty document; treat that as "no config"
    # instead of failing with AttributeError on `.get`.
    config = yaml.safe_load(f) or {}

# A bare `samples:` key parses to None; fall back to an empty mapping so that
# later `samples.items()` calls iterate over nothing instead of raising.
samples = config.get('samples') or {}

In [None]:
# Genome labels looked for in a sample's STAR index path. Order matters: the
# combined human+mouse label must be tested before the single-genome labels,
# because those are substrings of it.
_KNOWN_SPECIES = ("GRCh38_and_GRCm39", "GRCh38", "GRCm39", "ChlSab")


def _infer_species(starindex):
    """Return the first known genome label contained in ``starindex``, or None."""
    return next((label for label in _KNOWN_SPECIES if label in starindex), None)


def _row_sums(matrix):
    """Sum ``matrix`` along axis 1 and return the result as a flat 1-D array.

    ``sum(axis=1)`` on a scipy sparse matrix yields an ``(n, 1)`` ``np.matrix``
    while a dense array yields a 1-D array; flattening gives exactly one value
    per row in both cases, which is what an ``obs`` column expects.
    """
    return np.asarray(matrix.sum(axis=1)).ravel()


def _summarize_counts(adata):
    """Compute per-sample summary statistics from ``adata.X``.

    Returns a dict mapping the human-readable metric name (used as the row
    label in Summary.csv) to a plain Python int/float.
    """
    # Accumulate in float64 so that the total and the mean are not limited by
    # the precision of the matrix dtype (e.g. if X is stored as float32).
    umi_per_cell = _row_sums(adata.X).astype(np.float64)

    # Boolean "gene detected in cell" matrix, computed once and reused.
    detected = adata.X > 0
    genes_per_cell = _row_sums(detected)
    # A gene counts as detected if at least one cell has a non-zero count.
    total_genes_detected = int(detected.sum(axis=0).astype(bool).sum())

    return {
        "Estimated Number of Cells": adata.n_obs,
        "UMIs in Cells": int(umi_per_cell.sum()),
        "Mean UMI per Cell": float(np.mean(umi_per_cell)),
        "Median UMI per Cell": float(np.median(umi_per_cell)),
        "Mean GeneFull per Cell": float(np.mean(genes_per_cell)),
        "Median GeneFull per Cell": float(np.median(genes_per_cell)),
        "Total GeneFull Detected": total_genes_detected,
    }


# Annotate every configured sample and write <sample>/.../matrix.stats.h5ad.
# To reprocess a single sample, iterate over a one-element list instead,
# e.g. tqdm([("ZT-238", samples["ZT-238"])]).
for sample_name, sample in tqdm(samples.items()):
    sample_result_dir = results_dir / sample_name / solo_out
    chem = sample["chem"]
    starindex = sample["starindex"]

    species = _infer_species(starindex)
    if species is None:
        raise ValueError(f"Unknown species for {chem} with starindex {starindex}")

    adata = sc.read(sample_result_dir / "matrix.h5ad")
    adata.var_names_make_unique(join='_')

    # Summary.csv is only generated here for this chemistry.
    # NOTE(review): presumably other chemistries get their summary from an
    # upstream step -- confirm.
    if chem == "CapitalbioSeq-CB4":
        summary_stats = _summarize_counts(adata)

        # Write the stats as "name,value" rows and mirror them into adata.uns
        # under snake_case keys.
        summary_csv_path = sample_result_dir / "Summary.csv"
        with open(summary_csv_path, "w", newline='') as csvfile:
            writer = csv.writer(csvfile)
            for k, v in summary_stats.items():
                adata.uns[k.lower().replace(" ", "_")] = v
                writer.writerow([k, v])

    if species == "GRCh38_and_GRCm39":
        # Mixed-species sample: run the cellranger multi-genome analysis and
        # read back the per-barcode classification it saves.
        cr_mat = cr_matrix.CountMatrix.load_h5_file(sample_result_dir / "matrix.h5")
        mga = cra_multigenome.MultiGenomeAnalysis(cr_mat)
        mga.run_all()
        mga.save_gem_class_csv(str(sample_result_dir / "gem_classification"))
        gem_classification = pd.read_csv(sample_result_dir / "gem_classification" / "gem_classification.csv", sep=',', comment='#')
        gem_classification = gem_classification.set_index('barcode')

        # Per-genome UMI totals per cell, flattened to 1-D so that each obs
        # column holds one scalar per barcode.
        for genome in ("GRCh38", "GRCm39"):
            genome_mask = (adata.var['genome'] == genome).values
            adata.obs[f'n_counts_umi_{genome}'] = _row_sums(adata[:, genome_mask].X)

        # Barcodes absent from the classification table map to NaN.
        adata.obs['species'] = adata.obs_names.map(gem_classification['call'])
        # NOTE: despite the name, this is a fraction in [0, 1], not a percentage.
        adata.uns['multiplet_percent'] = (adata.obs['species'] == 'Multiplet').sum() / adata.n_obs

        with open(sample_result_dir / "Multiplet_stats.csv", "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Multiplet Percent', adata.uns['multiplet_percent']])
    else:
        # Single-genome sample: every cell gets the same species label.
        adata.obs['species'] = species

    adata.write(sample_result_dir / "matrix.stats.h5ad")