In [None]:
import os
import shutil
import pickle
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import polars as pl
from scipy.sparse import csc_matrix

import scanpy as sc
import anndata as ad
from anndata import AnnData

from joblib import Parallel, delayed

import pprint
from tqdm.auto import tqdm

import yaml

In [None]:
import sys

# Put the shared helper-library directory on the module search path so the
# project-local scAnalysis_util module can be imported below.
sys.path.append('../../mylibs')
import scAnalysis_util

In [None]:
# Pickled (patched) GTF annotation for each supported reference assembly,
# keyed by the assembly name.
gtf_pickle = {
    assembly: Path(pickle_file)
    for assembly, pickle_file in (
        ("GRCh38", '../../ref/GTF/GRCh38.patched.gtf.pkl'),
        ("GRCm39", '../../ref/GTF/GRCm39.patched.gtf.pkl'),
        ("GRCh38_and_GRCm39", '../../ref/GTF/GRCh38_and_GRCm39.patched.gtf.pkl'),
        ("ChlSab", '../../ref/GTF/ChlSab.patched.gtf.pkl'),
    )
}

In [None]:
# Root directory holding config.yaml and one sub-directory per sample.
results_dir = Path("results")
# Locations of the STARsolo GeneFull matrices, relative to a STARsolo output directory.
solo_genefull_filtered = "Solo.out/GeneFull/filtered"
solo_genefull_raw = "Solo.out/GeneFull/raw"

with open(results_dir / "config.yaml", 'r') as f:
    # safe_load returns None for an empty document; treat that as an empty config
    # instead of failing with AttributeError on `.get`.
    config = yaml.safe_load(f) or {}

# A `samples:` key that is missing or has no value both yield an empty mapping,
# so the per-sample loop simply has nothing to iterate over.
samples = config.get('samples') or {}

In [None]:
# Reference names recognised inside a STAR index string, checked in this order.
# The combined human+mouse name must come first because it contains both
# single-species names as substrings.
_SPECIES_PRIORITY = ("GRCh38_and_GRCm39", "GRCh38", "GRCm39", "ChlSab")

# For every sample in the config: convert the STARsolo GeneFull matrices to h5ad,
# (for CapitalbioSeq-CB4 only) merge the CB4 and UNKNOWN filtered matrices, and
# export the filtered matrix as a 10X-style h5 file.
for sample_name, sample in tqdm(samples.items()):
    sample_dir = results_dir / sample_name
    chem = sample["chem"]
    starindex = sample["starindex"]

    # The first reference name contained in the STAR index string decides the species.
    species = next((name for name in _SPECIES_PRIORITY if name in starindex), None)
    if species is None:
        # Name the sample so the offending config entry can be located.
        raise ValueError(
            f"Unknown species for sample {sample_name} (chem {chem}) with starindex {starindex}"
        )

    # CB4 samples have two STARsolo output directories (CB4 and UNKNOWN);
    # every other chemistry has a single one.
    is_cb4 = chem == "CapitalbioSeq-CB4"
    solo_dirs = ["starsolo_outputs_CB4", "starsolo_outputs_UNKNOWN"] if is_cb4 else ["starsolo_outputs"]

    # S1 Convert STARsolo outputs to h5ad format
    print(f"Processing {sample_name}: S1 Converting STARsolo outputs to h5ad format")
    # One task per (output directory, matrix kind): the filtered matrix is converted
    # with the species GTF pickle, the raw matrix with None.
    tasks = [
        delayed(scAnalysis_util.convert_solomtx_to_h5ad)(sample_dir / solo_dir / matrix_subdir, gtf)
        for solo_dir in solo_dirs
        for matrix_subdir, gtf in (
            (solo_genefull_filtered, gtf_pickle[species]),
            (solo_genefull_raw, None),
        )
    ]
    workers = Parallel(n_jobs=-1, backend='loky')
    workers(tasks)

    if is_cb4:
        # S2 Merge CB4 and UNKNOWN outputs
        print(f"Processing {sample_name}: S2 Merging CB4 and UNKNOWN outputs")
        h5ad_CB4_filtered = sc.read_h5ad(
            sample_dir / "starsolo_outputs_CB4" / solo_genefull_filtered / "matrix.h5ad"
        )
        h5ad_UNKNOWN_filtered = sc.read_h5ad(
            sample_dir / "starsolo_outputs_UNKNOWN" / solo_genefull_filtered / "matrix.h5ad"
        )
        # NOTE(review): the meaning of "NNNNNNN" and 24 is defined by
        # CB4_UNKNOWN_Merger, which is not visible here -- confirm.
        h5ad_merged_filtered = scAnalysis_util.CB4_UNKNOWN_Merger(
            h5ad_CB4_filtered,
            h5ad_UNKNOWN_filtered,
            "NNNNNNN", 24).run()

        # Start from a clean merged output tree. This removes everything under
        # starsolo_outputs, including any previously written raw merge.
        # NOTE(review): only the filtered matrices are merged in this loop; the raw
        # CB4/UNKNOWN matrices converted in S1 are not merged here.
        merged_root = sample_dir / 'starsolo_outputs'
        shutil.rmtree(merged_root, ignore_errors=True)
        (merged_root / solo_genefull_filtered).mkdir(exist_ok=True, parents=True)
        h5ad_merged_filtered.write_h5ad(
            merged_root / solo_genefull_filtered / 'matrix.h5ad', compression='gzip'
        )

        # S3 Convert merged h5ad to 10X h5 format
        print(f"Processing {sample_name}: S3 Converting merged h5ad to 10X h5 format")
    else:
        # S2 Convert filtered h5ad to 10X h5 format
        print(f"Processing {sample_name}: S2 Converting filtered h5ad to 10X h5 format")
        h5ad_merged_filtered = sc.read_h5ad(
            sample_dir / "starsolo_outputs" / solo_genefull_filtered / "matrix.h5ad"
        )

    # Export step shared by both branches: write matrix.h5 next to the filtered matrix.h5ad,
    # passing an absolute path string to the writer.
    OUTPUT_PATH = sample_dir / "starsolo_outputs" / solo_genefull_filtered
    scAnalysis_util.write_10X_h5(
        h5ad_merged_filtered,
        str((OUTPUT_PATH / "matrix.h5").absolute()),
        chemistry_description=str(chem)
    )

In [None]:
# Merge the raw (unfiltered) CB4 and UNKNOWN matrices of sample ZT-238 and store
# the result under the sample's merged starsolo_outputs tree.
# NOTE(review): these paths are hard-coded for one sample and are relative to the
# working directory rather than results_dir -- confirm this is intended.
raw_matrix_files = (
    "ZT-238/starsolo_outputs_CB4/Solo.out/GeneFull/raw/matrix.h5ad",
    "ZT-238/starsolo_outputs_UNKNOWN/Solo.out/GeneFull/raw/matrix.h5ad",
)
h5ad_CB4_raw, h5ad_UNKNOWN_raw = map(sc.read_h5ad, raw_matrix_files)

# NOTE(review): the last merger argument is 12 here but 24 in the per-sample loop
# for the filtered matrices -- confirm the difference is deliberate.
raw_merger = scAnalysis_util.CB4_UNKNOWN_Merger(h5ad_CB4_raw, h5ad_UNKNOWN_raw, "NNNNNNN", 12)
h5ad_merged_raw = raw_merger.run()

save_path = Path('ZT-238/starsolo_outputs/Solo.out/GeneFull/raw')
save_path.mkdir(parents=True, exist_ok=True)
h5ad_merged_raw.write_h5ad(save_path / 'matrix.h5ad', compression='gzip')