In [1]:
import pandas as pd
import warnings
import os
import subprocess
import shutil
import urllib.request 
import tarfile


In [2]:
def say(quiet, words):
    if not quiet:
        print(words)

In [3]:
os.getcwd()

'/mnt/scratch5/dongze/CODE/python/pyroe/src/pyroe'

In [4]:
my_file = os.path.join('data', 'available_datasets.tsv')
available_datasets = pd.read_csv(my_file, sep="\t")


In [5]:
available_datasets.head()

Unnamed: 0,chemistry,reference,dataset_name,link,data_url,MD5,delete_fastq,feature_barcode,library_csv,quant_link
0,v3,human2020A,"500 Human PBMCs, 3' LT v3.1, Chromium Controller",https://www.10xgenomics.com/resources/datasets...,https://cf.10xgenomics.com/samples/cell-exp/6....,5f080c6082f11ea9fc6448482e6fb590,1,,,https://umd.box.com/shared/static/tg919re5gd4k...
1,v3,human2020A,"500 Human PBMCs, 3' LT v3.1, Chromium X",https://www.10xgenomics.com/resources/datasets...,https://cf.10xgenomics.com/samples/cell-exp/6....,5b36a7bfda36a7093adc8e30c3fa92c8,1,,,https://umd.box.com/shared/static/lrl68q2lz0lt...
2,v3,human2020A,1k PBMCs from a Healthy Donor (v3 chemistry),https://www.10xgenomics.com/resources/datasets...,https://cf.10xgenomics.com/samples/cell-exp/3....,265ebe8f77ad90db350984d9c7a59e52,1,,,https://umd.box.com/shared/static/wrn19wsmkem1...
3,v3,mm10-2020A,10k PBMCs from a Healthy Donor (v3 chemistry),https://www.10xgenomics.com/resources/datasets...,https://s3-us-west-2.amazonaws.com/10x.files/s...,e0021592e209642d71f5dc420cf4c5c0,1,,,https://umd.box.com/shared/static/01j9574g1yd9...
4,v3,human2020A,"10k Human PBMCs, 3' v3.1, Chromium X",https://www.10xgenomics.com/resources/datasets...,https://s3-us-west-2.amazonaws.com/10x.files/s...,43ea77ed6f860597c568e9ff819f0504,1,,,https://umd.box.com/shared/static/jvvzacmo98vx...


In [11]:
dataset_ids = [1, 3, 2]
output_dir = "processed_data"
quiet = False
force = True
delete_tar = True
nonzero = False

In [7]:
say(quiet, "Processing parameters")
# load available dataset sheet
# location = os.path.dirname(os.path.realpath(__file__))
# my_file = os.path.join(location, 'data', 'available_datasets.tsv')
my_file = os.path.join('data', 'available_datasets.tsv')
available_datasets = pd.read_csv(my_file, sep="\t")

nd = len(dataset_ids)
# if no dataset is provided, just return the available dataset dataframe


Processing parameters


In [8]:
output_format = {"counts":["U", "S"]}

In [9]:
if type(list(output_format)) is dict:
    # if a dictionary is given,
    # it should be either one customized format
    # or the format of each fetched datasets
    # so check the name 
    if list(output_format.keys()).sort() != dataset_ids.sort():
        # now it should be one customized format
        output_format = dict(zip(dataset_ids, [output_format]*nd))
    # otherwise, each dataset should get a format, so do nothing
elif (type(output_format) is str):
    # if a str is given, it should be a pre-defined format
    # and it will be used for all datasets
    output_format = dict(zip(dataset_ids, [output_format]*nd))



In [22]:

from matplotlib.style import available
from pyroe_utils import say

def fetch_processed_quant(
    dataset_ids = [],
    fetch_dir = "10x_datasets",
    force = False,
    delete_tar = True,
    quiet = False
):
    """
    Download the quantification result of the preprocessed 10x datasets.

    Required Parameters
    ----------
    dataset_ids : `int` or `list`
        The list of the id of some available datasets.

    Optional Parameters
    ----------
    fetch_dir : `str` (default: `10x_datasets`)
        The path to a directory for storing fetched datasets.
    
    force : `bool` (default: `False`)
        True if existing datasets should be re-downloaded.
        
    delete_tar : `bool` (default: `True`)
        True if intermediate tar files should be deleted.
        If False, they will be stored in the datasets_tar
        folder under the fetch_dir.
        
    quiet : `bool` (default: `True`)
        True if function should be quiet.
        False if messages (including error messages) should be printed out. 

    Returns
    -------
    If an empty dataset_ids list is given, a dataframe 
    containing the information of all available datasets
    will be returned. If one or more dataset ids are provided as dataset_ids, 
    a dictionary of str paths will be returned.   

    Notes
    -----
    10x Genomics provides many publicly available single-cell
    RNA-sequencing experiments on their 
    [website](https://www.10xgenomics.com/resources/datasets).
    To avoid reinventing wheels, we processed these datasets
    using a nextflow-based 
    [alevin-fry workflow](https://github.com/COMBINE-lab/10x-requant) 
    and made the quantification results available for free downloading. 
    Currently, the available datasets include (Notice that dataset id starts form **1**, not zero):
    
    1. [500 Human PBMCs, 3' LT v3.1, Chromium Controller](https://www.10xgenomics.com/resources/datasets/500-human-pbm-cs-3-lt-v-3-1-chromium-controller-3-1-low-6-1-0): [link to the quant result](https://umd.box.com/shared/static/tg919re5gd4klua39z3zemcg9ya422am.tar)
    1. [500 Human PBMCs, 3' LT v3.1, Chromium X](https://www.10xgenomics.com/resources/datasets/500-human-pbm-cs-3-lt-v-3-1-chromium-x-3-1-low-6-1-0): [link to the quant result](https://umd.box.com/shared/static/lrl68q2lz0ltsvs89iazbr302p50wnqj.tar)
    1. [1k PBMCs from a Healthy Donor (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/wrn19wsmkem1jyc9seqpe4pxto5zimwa.tar)
    1. [10k PBMCs from a Healthy Donor (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/10-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/01j9574g1yd93noz2pqlqjfrdhx0m1ff.tar)
    1. [10k Human PBMCs, 3' v3.1, Chromium X](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-ht-v3-1-chromium-x-3-1-high): [link to the quant result](https://umd.box.com/shared/static/jvvzacmo98vxfnoimg4dgi52lifhl2aa.tar)
    1. [10k Human PBMCs, 3' v3.1, Chromium Controller](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-v3-1-chromium-controller-3-1-high): [link to the quant result](https://umd.box.com/shared/static/5dzu2tw8nz9tijt8lgmelll6sbaaomh4.tar)
    1. [10k Peripheral blood mononuclear cells (PBMCs) from a healthy donor, Single Indexed](https://www.10xgenomics.com/resources/datasets/10-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-single-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/iol9bxiv740xq6m29p2fzcoe8volsi7i.tar)
    1. [10k Peripheral blood mononuclear cells (PBMCs) from a healthy donor, Dual Indexed](https://www.10xgenomics.com/resources/datasets/10-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-dual-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/5dzu2tw8nz9tijt8lgmelll6sbaaomh4.tar)
    1. [20k Human PBMCs, 3' HT v3.1, Chromium X](https://www.10xgenomics.com/resources/datasets/20-k-human-pbm-cs-3-ht-v-3-1-chromium-x-3-1-high-6-1-0): [link to the quant result](https://umd.box.com/shared/static/c609sk8w6cbn4w0tcwofz4qcyjp67506.tar)
    1. [PBMCs from EDTA-Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_edta_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/imedrs558dx4tzxy9uhhxvy0dmjlhjsh.tar)
    1. [PBMCs from Heparin-Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_heparin_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/e8gqxali0lwy2nashh5rmmoc6bgj92xm.tar)
    1. [PBMCs from ACD-A Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_acda_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/w1kdz3vifqi4ixtqkuwqgc2mpkkiehky.tar)
    1. [PBMCs from Citrate-Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_citrate_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/cs0s6e2u0j7d8uc36xsdo6922c7dle6y.tar)
    1. [PBMCs from Citrate-Treated Cell Preparation Tubes (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_citrate_cpt-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/2tqrzreghvi6nxe94oob1ei1vi4458br.tar)
    1. [PBMCs from a Healthy Donor: Whole Transcriptome Analysis](https://www.10xgenomics.com/resources/datasets/pbm-cs-from-a-healthy-donor-whole-transcriptome-analysis-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/dk0hmj5mpqjq56afkr5jibavy9e3yil8.tar)
    1. [Whole Blood RBC Lysis for PBMCs and Neutrophils, Granulocytes, 3'](https://www.10xgenomics.com/resources/datasets/whole-blood-rbc-lysis-for-pbmcs-neutrophils-granulocytes-3-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/0gnwx7d9hbdmptyi0ddz6mfa79d1l8be.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Manual (channel 5)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-manual-channel-5-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/tn884ctombnj214abt8rp77p7kih5i02.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Manual (channel 1)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-manual-channel-1-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/0jcgdgy8woj30oarkwhybk8fly7gb7v8.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Chromium Connect (channel 5)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-chromium-connect-channel-5-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/kybks0ncf609xhcwvhv7z743zrmvlg94.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Chromium Connect (channel 1)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-chromium-connect-channel-1-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/vtuexhbqiyvfob7qdpvsxl1nbqlo074f.tar)
    1. [Hodgkin's Lymphoma, Dissociated Tumor: Whole Transcriptome Analysis](https://www.10xgenomics.com/resources/datasets/hodgkins-lymphoma-dissociated-tumor-whole-transcriptome-analysis-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/qis4ovf34wvq12n2uabdiem6w355qry7.tar)
    1. [200 Sorted Cells from Human Glioblastoma Multiforme, 3’ LT v3.1](https://www.10xgenomics.com/resources/datasets/200-sorted-cells-from-human-glioblastoma-multiforme-3-lt-v-3-1-3-1-low-6-0-0): [link to the quant result](https://umd.box.com/shared/static/2xf9xf8m1n5vbvmpo1vshwigs7f7o5jd.tar)
    1. [750 Sorted Cells from Human Invasive Ductal Carcinoma, 3’ LT v3.1](https://www.10xgenomics.com/resources/datasets/750-sorted-cells-from-human-invasive-ductal-carcinoma-3-lt-v-3-1-3-1-low-6-0-0): [link to the quant result](https://umd.box.com/shared/static/3txnreehxoj2plyypfs6fkibnnbo72h4.tar)
    1. [2k Sorted Cells from Human Glioblastoma Multiforme, 3’ v3.1](https://www.10xgenomics.com/resources/datasets/2-k-sorted-cells-from-human-glioblastoma-multiforme-3-v-3-1-3-1-standard-6-0-0): [link to the quant result](https://umd.box.com/shared/static/n0vpgbdwbnnqdw1h9of2ykk7ive9p6pt.tar)
    1. [7.5k Sorted Cells from Human Invasive Ductal Carcinoma, 3’ v3.1](https://www.10xgenomics.com/resources/datasets/7-5-k-sorted-cells-from-human-invasive-ductal-carcinoma-3-v-3-1-3-1-standard-6-0-0): [link to the quant result](https://umd.box.com/shared/static/aly78r6bppqf01npbqfopc3epmp17weu.tar)
    1. [Human Glioblastoma Multiforme: 3’v3 Whole Transcriptome Analysis](https://www.10xgenomics.com/resources/datasets/human-glioblastoma-multiforme-3-v-3-whole-transcriptome-analysis-3-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/suf8pt3avv4rchxfw0bqrshslzieygef.tar)
    1. [1k Brain Cells from an E18 Mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-brain-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/4w5eiq3qafbru5ocler39j5j28bvgz98.tar)
    1. [10k Brain Cells from an E18 Mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/10-k-brain-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/tym9m73frtp13vo15jhit9uwuk3mtfdq.tar)
    1. [1k Heart Cells from an E18 mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-heart-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/d838oy3udjvtzjo7tsdiao7u6sazabeg.tar)
    1. [10k Heart Cells from an E18 mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/10-k-heart-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/efinlf6p8weich13kv3bzrlndsx963v4.tar)
    1. [10k Mouse E18 Combined Cortex, Hippocampus and Subventricular Zone Cells, Single Indexed](https://www.10xgenomics.com/resources/datasets/10-k-mouse-e-18-combined-cortex-hippocampus-and-subventricular-zone-cells-single-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/mr0yolo83rjdcdqgu6om4q133fpime8r.tar)
    1. [10k Mouse E18 Combined Cortex, Hippocampus and Subventricular Zone Cells, Dual Indexed](https://www.10xgenomics.com/resources/datasets/10-k-mouse-e-18-combined-cortex-hippocampus-and-subventricular-zone-cells-dual-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/mr7raea3v5ccn4dchemwhcimpz7t1cwl.tar)
    1. [1k PBMCs from a Healthy Donor (v2 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-2-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/xeya5zr0t0wg0t8c20zu0pdhclxywx3c.tar)
    1. [1k Brain Cells from an E18 Mouse (v2 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-brain-cells-from-an-e-18-mouse-v-2-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/a53twm69uo2xf6778asuvw2aft7wkur5.tar)
    1. [1k Heart Cells from an E18 mouse (v2 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-heart-cells-from-an-e-18-mouse-v-2-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/p4ieuzimfgrjfsr9rzhrn48kved4ha7m.tar)

    To obtain the information of the available datasets as 
    a dataframe, one can run `preprocessed_10x_data()`
    """
    
    import pandas as pd
    import os
    import shutil
    import urllib.request 
    import tarfile

    # load available dataset sheet
    # location = os.path.dirname(os.path.realpath(__file__))
    # my_file = os.path.join(location, 'data', 'available_datasets.tsv')
    my_file = os.path.join('data', 'available_datasets.tsv')
    available_datasets = pd.read_csv(my_file, sep="\t")

    # if no dataset is provided, just return the available dataset dataframe
    if len(dataset_ids) == 0:
        return available_datasets

    # check the validity of dataset_ids
    n_ds = available_datasets.shape[0]
    invalid_ids = []
    for idx, dataset_id in enumerate(dataset_ids):
        if (type(dataset_id) == int):
            if dataset_id > n_ds & dataset_id < 1:
                print(f"Found invalid dataset id '{dataset_id}', ignored.")
                invalid_ids.append(idx)
        else:
            print(f"Found invalid dataset id '{dataset_id}', ignored.")
            invalid_ids.append(idx)
    for i in reversed(invalid_ids):
        del dataset_ids[i]

    # if no id left, return an error
    if not dataset_ids:
        raise ValueError(f"No valid dataset id found, can not proceed")

    # download the quantification tar file for each queried dataset.
    quant_dir_list = []
    # folder for (temporarily) storing tar files.
    tar_dir = os.path.join(fetch_dir, "datasets_tar")
    if not os.path.exists(tar_dir):
        os.makedirs(tar_dir)
    
    # download the quantification tar file for each queried dataset.
    for dataset_id in dataset_ids:
        say(quiet, f"Processing dataset #{dataset_id}")
        dataset_id -= 1
        quant_parent_dir = os.path.join(fetch_dir, 
                                        available_datasets.iloc[dataset_id,5])
        tar_file = os.path.join(tar_dir,
                            "".join([available_datasets.iloc[dataset_id,5], ".tar"])
                                )
        if os.path.exists(quant_parent_dir):
            say(quiet, f"  - output dir exists:\n    {quant_parent_dir}")
            
            if force:
                say(quiet, "  - force re-processing")
                shutil.rmtree(quant_parent_dir)
            else:
                say(quiet, "  - use the existing quant result")
                quant_dir_list.append(os.path.join(quant_parent_dir, next(os.walk(quant_parent_dir))[1][0]))

                continue
        say(quiet, "  - Downloading quant result")
        url = available_datasets.iloc[dataset_id, 9]
        urllib.request.urlretrieve(url, tar_file)
        # decompress the downloaded tar files
        say(quiet, "  - Decompressing quant result")
        tf = tarfile.open(tar_file)
        tf.extractall(quant_parent_dir)
        quant_dir_list.append(os.path.join(quant_parent_dir, next(os.walk(quant_parent_dir))[1][0]))
    
    # delete tar if needed
    if delete_tar:
        say(quiet, "Removing downloaded tar files")
    shutil.rmtree(tar_dir)

    say(quiet, "Done")
    return dict(zip(dataset_ids, quant_dir_list))
    

In [23]:
fetch_processed_quant([1,2,3])

Processing dataset #1
  - output dir exists:
    10x_datasets/5f080c6082f11ea9fc6448482e6fb590
  - use the existing quant result
Processing dataset #2
  - output dir exists:
    10x_datasets/5b36a7bfda36a7093adc8e30c3fa92c8
  - use the existing quant result
Processing dataset #3
  - output dir exists:
    10x_datasets/265ebe8f77ad90db350984d9c7a59e52
  - use the existing quant result
Removing downloaded tar files
Done


{1: '10x_datasets/5f080c6082f11ea9fc6448482e6fb590/5f080c6082f11ea9fc6448482e6fb590_fry_unfilt_quant_usa_cr-like',
 2: '10x_datasets/5b36a7bfda36a7093adc8e30c3fa92c8/5b36a7bfda36a7093adc8e30c3fa92c8_fry_unfilt_quant_usa_cr-like',
 3: '10x_datasets/265ebe8f77ad90db350984d9c7a59e52/265ebe8f77ad90db350984d9c7a59e52_fry_unfilt_quant_usa_cr-like'}

In [28]:
try:
    import scanpy
except ModuleNotFoundError as e:
    print("scanpy must be installed to enable the load_fry() function. Use `conda install -c scanpy ` or `pip install scanpy` to install it.")
    import sys
    sys.exit(1)

import scanpy

def load_fry(frydir, output_format="scRNA", nonzero = False, quiet=False):
    """
    load alevin-fry quantification result into an AnnData object
    
    Required Parameters
    ----------
    frydir : `str`
        The path to a output directory returned by alevin-fry quant command. \\
        The directory containing the alevin-fry quantification (i.e. the the quant.json file & alevin subdirectory).
    
    Optional Parameters
    ----------
    output_format : `str` or `dict`
        A string represents one of the pre-defined output formats, which are "scRNA", "snRNA" and "velocity". \\
        If a customized format of the returned `AnnData` is needed, one can pass a Dictionary.\\
        See Notes section for details.

    nonzero : `bool` (default: `False`)
        True if cells with non-zero expression value across all genes should be filtered in each layer.
        False if unexpressed genes should be kept.

    quiet : `bool` (default: `False`)
        True if function should be quiet.
        False if messages (including error messages) should be printed out. 

    Notes
    ----------
    The `output_format` argument takes either a dictionary that defines the customized format or 
    a string that represents one of the pre-defined format of the returned `AnnData` object.

    Each of the pre-defined formats contains a `X` field and some optional extra `AnnData.layers` 
    obtained from the submatrices representing unspliced (U), spliced (S) and ambiguous (A) counts 
    returned by alevin-fry. 
    
    The following formats are defined:

    * "scRNA": \\
        This format is recommended for single cell RNA-sequencing experiments. 
        It returns a `X` field that contains the S+A count of each gene in each cell without any extra layers.

    * "snRNA": \\
        This format is recommended for single nucleus RNA-sequencing experiments. 
        It returns a `X` field that contains the U+S+A count of each gene in each cell without any extra layers.

    * "raw": \\
        This format uses the S count matrix as the `X` field and put the U, S, and A counts into three 
        separate layers, which are "unspliced", "spliced" and "ambiguous".

    * "velocity": \\
        This format is the same as "scRNA", except it contains two extra layers: the "spliced" layer, 
        which contains the S+A counts, and the "unspliced" layer, which contains the U counts.

    A custom output format can be defined using a Dictionary specifying the desired format of the output `Anndata` object.  
    If the input is not a USA mode quantification directory, this parameter is ignored
    and the count matrix is returned in the `X` field of the returned `AnnData` object.  If the input
    quantification directory contains a USA mode quantification, then there are 3 sub-matrices that can 
    be referenced in the dictionary; 'U', 'S', 'A' containing, respectively, unspliced, spliced and 
    ambiguous counts.  The dictionary should have entries of the form `key` (str) : `value` (list[str]).
    The following constraints apply : there should be one key-value pair with the key `X`, the resulting
    counts will be returned in the `X` field of the AnnData object. There can be an arbitrary number
    of other key-value pairs, but each will be returned as a layer of the resulting AnnData object.
    Within the key-value pairs, the key refers to the layer name that will be given to the combined 
    count matrix upon output, and the value should be a subset of `['U', 'S', 'A']` that defines 
    which sub-matrices should be summed.  For example:
    `{'X' : ['S', 'A'], 'unspliced' : ['U']}`
    will result in a return AnnData object where the X field has a matrix in which each entry 
    corresponds to the summed spliced and ambiguous counts for each gene in each cell, and there
    is an additional "unspliced" layer, whose counts are taken directly from the unspliced sub-matrix.

    Returns:
    ----------
        An AnnData object with X and layers corresponding to the requested `output_format`.
        
    """
    import json
    import os
    import pandas as pd

    # since alevin-fry 0.4.1 the generic "meta_info.json"
    # has been replaced by a more informative name for each
    # sub-command. For quantification, it is "quant.json".
    # we check for both files here, in order.
    meta_info_files = ["quant.json", "meta_info.json"]

    fpath = os.path.sep.join([frydir, meta_info_files[0]])
    # first, check for the new file, if we don't find it, check
    # for the old one.
    if not os.path.exists(fpath):
        if quiet:
            print(f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.")
        fpath = os.path.sep.join([frydir, meta_info_files[1]])
        # if we don't find the old one either, then return None
        if not os.path.exists(fpath):
            raise IOError(f"Found no {meta_info_files[1]} file either; cannot proceed.")

    # if we got here then we had a valid json file, so 
    # use it to get the number of genes, and if we are 
    # in USA mode or not.
    meta_info = json.load(open(fpath))
    ng = meta_info['num_genes']
    usa_mode = meta_info['usa_mode']
    if quiet:
        print(f"USA mode: {usa_mode}")

    # if we are in USA mode
    if usa_mode:
        # preparation
        # each gene has 3 splicing statuses, so the actual number of distinct 
        # genes is ng/3.
        ng = int(ng/3)
        output_assays = process_output_format(output_format, quiet)
    elif quiet:
        print("Processing input in standard mode, the count matrix will be stored in field 'X'.")
        if output_format != "scRNA":
            print("Output_format will be ignored.")

    # read the actual input matrix
    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
    afg = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng]
    # read the gene ids
    afg_df =  pd.DataFrame(afg, columns=["gene_ids"])
    afg_df = afg_df.set_index("gene_ids")
    # and the barcodes
    abc = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines() ]
    abc_df = pd.DataFrame(abc, columns=["barcodes"])
    abc_df.index = abc_df["barcodes"]
    
    x = af_raw.X
    # if we're not in USA mode, just combine this info into 
    # an AnnData object
    if not usa_mode:
        af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df)
        af = af.T
        
    else: # USA mode
        # otherwise, combine the sub-matrices into the output object as 
        # specified by `output_assays`
        rd = {'S' : range(0,ng), 'U' : range(ng, 2*ng), 'A' : range(2*ng,3*ng)}
        xcounts = output_assays['X']
        o = x[:, rd[xcounts[0]]]
        for wc in xcounts[1:]:
            o += x[:, rd[wc]]
        af = scanpy.AnnData(o.T, var=abc_df, obs=afg_df)
        af = af.T

        # now, if there are other layers requested, populate those
        for other_layer in output_assays.keys() - 'X':
            xcounts = output_assays[other_layer]
            o = x[:, rd[xcounts[0]]]
            for wc in xcounts[1:]:
                o += x[:, rd[wc]] 
            af.layers[other_layer] = o
    
    if nonzero:
        import numpy as np

        not_zero_genes = af.X.sum(axis=0).A1 > 0
        if usa_mode:
            for other_layer in output_assays.keys() - 'X':
                not_zero_genes = np.logical_or(not_zero_genes, af.layers[other_layer].sum(axis=0).A1 > 0)

        af = af[:, not_zero_genes]

        if quiet:
            print(f"Filtered {np.sum(~not_zero_genes)} non-expressed genes.")
    
    return af

def process_output_format(output_format, quiet):
    # make sure output_format isn't empty
    if not output_format:
        raise ValueError("output_format cannot be empty")  

    if isinstance(output_format, (str, dict)):
        if isinstance(output_format, str):
            predefined_format = {'scrna': {"X": ["S", "A"]}, 
                "snrna": {"X": ["U", "S", "A"]},
                "velocity": {"X" : ["S", "A"], "spliced": ["S", "A"], "unspliced": ["U"]},
                "raw": {"X" : ["S"], "spliced": ["S"], "unspliced": ["U"],  "ambiguous": ["A"]}
            }

            output_format = output_format.lower()
            if output_format not in predefined_format.keys():
                # invalid output_format string
                if quiet:
                    print("Provided output_format string must be 'scRNA', 'snRNA', 'raw' or 'velocity'.")
                    print("See function help message for details.")
                raise ValueError("Invalid output_format.")
            if quiet:
                print("Using pre-defined output format:", output_format)
                print(f"Will populate output field X with sum of counts frorm {predefined_format[output_format]['X']}.")
                for (k,v) in predefined_format[output_format].items():
                    if k != 'X':
                        print(f'Will combine {v} into output layer {k}.') 

            return predefined_format[output_format]
        else:
            if quiet:
                print("Processing user-defined output format.")
            # make sure the X is there
            if 'X' not in output_format.keys():
                raise ValueError('In USA mode some sub-matrices must be assigned to the \"X\" (default) output.')
            print(f"Will populate output field X with sum of counts frorm {output_format['X']}.")

            valid_counts = ["U", "S", "A"]
            for (k,v) in output_format.items():
                if not v:
                    # empty list
                    raise ValueError(f"The element list of key '{k}' in output_format is empty. Please remove it.")
                
                # v contains Non-USA element
                if len(set(v) - set(['U', 'S', 'A'])) != 0:
                    # invalid value
                    raise ValueError(f"Found non-USA element in output_format element list '{v}' for key '{k}'; cannot proceed.")
                if quiet and (k != 'X'):
                    print(f'Will combine {v} into output layer {k}.') 

            return output_format
    else:
        raise ValueError("Provided invalid output_format. See function help message for details")
        

In [29]:
from pyroe_utils import say
# from fetch_processed_quant import fetch_processed_quant
# from load_fry import load_fry


def load_processed_quant(
    dataset_ids = [],
    fetch_dir = "10x_datasets",
    force = False,
    delete_tar = True,
    output_format="scRNA",
    nonzero = False,
    quiet = False
):
    """
    Download the quantification result of the preprocessed 10x datasets.

    Required Parameters
    ----------
    dataset_ids : `int` or `list`
        The list of the id of some available datasets.

    Optional Parameters
    ----------
    fetch_dir : `str` (default: `10x_datasets`)
        The path to a directory for storing downloaded datasets.

    force : `bool` (default: `False`)
        True if existing datasets should be re-downloaded.
        
    delete_tar : `bool` (default: `True`)
        True if intermediate tar files should be deleted.
        If False, they will be stored in the datasets_tar
        folder under the fetch_dir.
    
    output_format : `str` or `dict`
        Either a str represents one of the pre-defined output 
        formats, which are "scRNA", "snRNA" and "velocity", 
        that will be used for loading all fetched datasets, \\
        or a `dict` represent a customized format that will
        be used for loading all fetched datasets,\\
        or a `dict` of `str` or `dict` which keys are the dataset
        ids to be fetched and values are the output_format that will 
        be used for loading each fetched dataset. 
        See [load_fry](https://github.com/COMBINE-lab/pyroe/blob/main/src/pyroe/load_fry.py) 
        for the details of output_format.

    nonzero : `bool` or `list` (default: `False`)
        True if cells with non-zero expression value 
        across all genes should be filtered in each layer.
        False if unexpressed genes should be kept.
        If a list of `bool` is passed, the booleans
        will be used for loading each fetched dataset in order.

    quiet : `bool` (default: `False`)
        True if function should be quiet.
        False if messages (including error messages) should be printed out. 


    Returns
    -------
    If an empty dataset_ids list is given, a data frame 
    containing the information of all available datasets
    will be returned. If an dataset id is provided as dataset_ids, 
    a str represents path to the downloaded dataset will be 
    returned. If a list of dataset ids is provided as dataset_ids, 
    a dictionary of AnnData objects will be returned. The keys
    are the dataset ids, the values are the corresponding AnnData objects.

    Notes
    -----
    10x Genomics provides many publicly available single-cell
    RNA-sequencing experiments on their 
    [website](https://www.10xgenomics.com/resources/datasets).
    To avoid reinventing wheels, we processed these datasets
    using a nextflow-based 
    [alevin-fry workflow](https://github.com/COMBINE-lab/10x-requant) 
    and made the quantification results available for free downloading. 
    Currently, the available datasets include (Notice that dataset id starts form **1**, not zero):
    
    1. [500 Human PBMCs, 3' LT v3.1, Chromium Controller](https://www.10xgenomics.com/resources/datasets/500-human-pbm-cs-3-lt-v-3-1-chromium-controller-3-1-low-6-1-0): [link to the quant result](https://umd.box.com/shared/static/tg919re5gd4klua39z3zemcg9ya422am.tar)
    1. [500 Human PBMCs, 3' LT v3.1, Chromium X](https://www.10xgenomics.com/resources/datasets/500-human-pbm-cs-3-lt-v-3-1-chromium-x-3-1-low-6-1-0): [link to the quant result](https://umd.box.com/shared/static/lrl68q2lz0ltsvs89iazbr302p50wnqj.tar)
    1. [1k PBMCs from a Healthy Donor (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/wrn19wsmkem1jyc9seqpe4pxto5zimwa.tar)
    1. [10k PBMCs from a Healthy Donor (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/10-k-pbm-cs-from-a-healthy-donor-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/01j9574g1yd93noz2pqlqjfrdhx0m1ff.tar)
    1. [10k Human PBMCs, 3' v3.1, Chromium X](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-ht-v3-1-chromium-x-3-1-high): [link to the quant result](https://umd.box.com/shared/static/jvvzacmo98vxfnoimg4dgi52lifhl2aa.tar)
    1. [10k Human PBMCs, 3' v3.1, Chromium Controller](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-v3-1-chromium-controller-3-1-high): [link to the quant result](https://umd.box.com/shared/static/5dzu2tw8nz9tijt8lgmelll6sbaaomh4.tar)
    1. [10k Peripheral blood mononuclear cells (PBMCs) from a healthy donor, Single Indexed](https://www.10xgenomics.com/resources/datasets/10-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-single-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/iol9bxiv740xq6m29p2fzcoe8volsi7i.tar)
    1. [10k Peripheral blood mononuclear cells (PBMCs) from a healthy donor, Dual Indexed](https://www.10xgenomics.com/resources/datasets/10-k-peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-dual-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/5dzu2tw8nz9tijt8lgmelll6sbaaomh4.tar)
    1. [20k Human PBMCs, 3' HT v3.1, Chromium X](https://www.10xgenomics.com/resources/datasets/20-k-human-pbm-cs-3-ht-v-3-1-chromium-x-3-1-high-6-1-0): [link to the quant result](https://umd.box.com/shared/static/c609sk8w6cbn4w0tcwofz4qcyjp67506.tar)
    1. [PBMCs from EDTA-Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_edta_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/imedrs558dx4tzxy9uhhxvy0dmjlhjsh.tar)
    1. [PBMCs from Heparin-Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_heparin_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/e8gqxali0lwy2nashh5rmmoc6bgj92xm.tar)
    1. [PBMCs from ACD-A Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_acda_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/w1kdz3vifqi4ixtqkuwqgc2mpkkiehky.tar)
    1. [PBMCs from Citrate-Treated Blood Collection Tubes Isolated via SepMate-Ficoll Gradient (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_citrate_sepmate-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/cs0s6e2u0j7d8uc36xsdo6922c7dle6y.tar)
    1. [PBMCs from Citrate-Treated Cell Preparation Tubes (3' v3.1 Chemistry)](https://www.10xgenomics.com/resources/datasets/pbmcs-3p_citrate_cpt-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/2tqrzreghvi6nxe94oob1ei1vi4458br.tar)
    1. [PBMCs from a Healthy Donor: Whole Transcriptome Analysis](https://www.10xgenomics.com/resources/datasets/pbm-cs-from-a-healthy-donor-whole-transcriptome-analysis-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/dk0hmj5mpqjq56afkr5jibavy9e3yil8.tar)
    1. [Whole Blood RBC Lysis for PBMCs and Neutrophils, Granulocytes, 3'](https://www.10xgenomics.com/resources/datasets/whole-blood-rbc-lysis-for-pbmcs-neutrophils-granulocytes-3-3-1-standard): [link to the quant result](https://umd.box.com/shared/static/0gnwx7d9hbdmptyi0ddz6mfa79d1l8be.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Manual (channel 5)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-manual-channel-5-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/tn884ctombnj214abt8rp77p7kih5i02.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Manual (channel 1)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-manual-channel-1-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/0jcgdgy8woj30oarkwhybk8fly7gb7v8.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Chromium Connect (channel 5)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-chromium-connect-channel-5-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/kybks0ncf609xhcwvhv7z743zrmvlg94.tar)
    1. [Peripheral blood mononuclear cells (PBMCs) from a healthy donor - Chromium Connect (channel 1)](https://www.10xgenomics.com/resources/datasets/peripheral-blood-mononuclear-cells-pbm-cs-from-a-healthy-donor-chromium-connect-channel-1-3-1-standard-3-1-0): [link to the quant result](https://umd.box.com/shared/static/vtuexhbqiyvfob7qdpvsxl1nbqlo074f.tar)
    1. [Hodgkin's Lymphoma, Dissociated Tumor: Whole Transcriptome Analysis](https://www.10xgenomics.com/resources/datasets/hodgkins-lymphoma-dissociated-tumor-whole-transcriptome-analysis-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/qis4ovf34wvq12n2uabdiem6w355qry7.tar)
    1. [200 Sorted Cells from Human Glioblastoma Multiforme, 3’ LT v3.1](https://www.10xgenomics.com/resources/datasets/200-sorted-cells-from-human-glioblastoma-multiforme-3-lt-v-3-1-3-1-low-6-0-0): [link to the quant result](https://umd.box.com/shared/static/2xf9xf8m1n5vbvmpo1vshwigs7f7o5jd.tar)
    1. [750 Sorted Cells from Human Invasive Ductal Carcinoma, 3’ LT v3.1](https://www.10xgenomics.com/resources/datasets/750-sorted-cells-from-human-invasive-ductal-carcinoma-3-lt-v-3-1-3-1-low-6-0-0): [link to the quant result](https://umd.box.com/shared/static/3txnreehxoj2plyypfs6fkibnnbo72h4.tar)
    1. [2k Sorted Cells from Human Glioblastoma Multiforme, 3’ v3.1](https://www.10xgenomics.com/resources/datasets/2-k-sorted-cells-from-human-glioblastoma-multiforme-3-v-3-1-3-1-standard-6-0-0): [link to the quant result](https://umd.box.com/shared/static/n0vpgbdwbnnqdw1h9of2ykk7ive9p6pt.tar)
    1. [7.5k Sorted Cells from Human Invasive Ductal Carcinoma, 3’ v3.1](https://www.10xgenomics.com/resources/datasets/7-5-k-sorted-cells-from-human-invasive-ductal-carcinoma-3-v-3-1-3-1-standard-6-0-0): [link to the quant result](https://umd.box.com/shared/static/aly78r6bppqf01npbqfopc3epmp17weu.tar)
    1. [Human Glioblastoma Multiforme: 3’v3 Whole Transcriptome Analysis](https://www.10xgenomics.com/resources/datasets/human-glioblastoma-multiforme-3-v-3-whole-transcriptome-analysis-3-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/suf8pt3avv4rchxfw0bqrshslzieygef.tar)
    1. [1k Brain Cells from an E18 Mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-brain-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/4w5eiq3qafbru5ocler39j5j28bvgz98.tar)
    1. [10k Brain Cells from an E18 Mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/10-k-brain-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/tym9m73frtp13vo15jhit9uwuk3mtfdq.tar)
    1. [1k Heart Cells from an E18 mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-heart-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/d838oy3udjvtzjo7tsdiao7u6sazabeg.tar)
    1. [10k Heart Cells from an E18 mouse (v3 chemistry)](https://www.10xgenomics.com/resources/datasets/10-k-heart-cells-from-an-e-18-mouse-v-3-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/efinlf6p8weich13kv3bzrlndsx963v4.tar)
    1. [10k Mouse E18 Combined Cortex, Hippocampus and Subventricular Zone Cells, Single Indexed](https://www.10xgenomics.com/resources/datasets/10-k-mouse-e-18-combined-cortex-hippocampus-and-subventricular-zone-cells-single-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/mr0yolo83rjdcdqgu6om4q133fpime8r.tar)
    1. [10k Mouse E18 Combined Cortex, Hippocampus and Subventricular Zone Cells, Dual Indexed](https://www.10xgenomics.com/resources/datasets/10-k-mouse-e-18-combined-cortex-hippocampus-and-subventricular-zone-cells-dual-indexed-3-1-standard-4-0-0): [link to the quant result](https://umd.box.com/shared/static/mr7raea3v5ccn4dchemwhcimpz7t1cwl.tar)
    1. [1k PBMCs from a Healthy Donor (v2 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-pbm-cs-from-a-healthy-donor-v-2-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/xeya5zr0t0wg0t8c20zu0pdhclxywx3c.tar)
    1. [1k Brain Cells from an E18 Mouse (v2 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-brain-cells-from-an-e-18-mouse-v-2-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/a53twm69uo2xf6778asuvw2aft7wkur5.tar)
    1. [1k Heart Cells from an E18 mouse (v2 chemistry)](https://www.10xgenomics.com/resources/datasets/1-k-heart-cells-from-an-e-18-mouse-v-2-chemistry-3-standard-3-0-0): [link to the quant result](https://umd.box.com/shared/static/p4ieuzimfgrjfsr9rzhrn48kved4ha7m.tar)

    To obtain the information of the available datasets as 
    a dataframe, one can run `preprocessed_10x_data()`
    """
    
    import pandas as pd
    import os
    import shutil
    import urllib.request 
    import tarfile

    say(quiet, "Processing parameters")
    # load available dataset sheet
    # location = os.path.dirname(os.path.realpath(__file__))
    # my_file = os.path.join(location, 'data', 'available_datasets.tsv')
    my_file = os.path.join('data', 'available_datasets.tsv')
    available_datasets = pd.read_csv(my_file, sep="\t")

    nd = len(dataset_ids)
    # if no dataset is provided, just return the available dataset dataframe
    if nd == 0:
        return available_datasets

    # check whether output_format are valid
    # we just check the length, the validity of
    # each outputFormat will be checked by load_fry
    if type(list(output_format)) is dict:
        # if a dictionary is given,
        # it should be either one customized format
        # or the format of each fetched datasets
        # so check the name 
        if list(output_format.keys()).sort() != dataset_ids.sort():
            # now it should be one customized format
            output_format = dict(zip(dataset_ids, [output_format]*nd))
        # otherwise, each dataset should get a format, so do nothing
    elif (type(output_format) is str):
        # if a str is given, it should be a pre-defined format
        # and it will be used for all datasets
        output_format = dict(zip(dataset_ids, [output_format]*nd))


    if type(nonzero) is bool:
        nonzero = dict(zip(dataset_ids, [nonzero]*nd))

    dataset_paths = fetch_processed_quant(dataset_ids = dataset_ids,
                                        fetch_dir = fetch_dir,
                                        force = force,
                                        delete_tar = delete_tar,
                                        quiet = quiet)
    ann_list = {}
    for dataset_id in dataset_ids:
        nonzero_ds = nonzero[dataset_id]
        output_format_ds = output_format[dataset_id]
        dataset_path_ds = dataset_paths[dataset_id]
        say(quiet, f"Loading dataset {dataset_id}")
        ann_list[dataset_id] = load_fry(frydir = dataset_path_ds,
                                        output_format = output_format_ds,
                                        nonzero = nonzero_ds,
                                        quiet = quiet)

    return ann_list
    

In [30]:
load_processed_quant([1,2,3])

Processing parameters
Processing dataset #1
  - output dir exists:
    10x_datasets/5f080c6082f11ea9fc6448482e6fb590
  - use the existing quant result
Processing dataset #2
  - output dir exists:
    10x_datasets/5b36a7bfda36a7093adc8e30c3fa92c8
  - use the existing quant result
Processing dataset #3
  - output dir exists:
    10x_datasets/265ebe8f77ad90db350984d9c7a59e52
  - use the existing quant result
Removing downloaded tar files
Done
Loading dataset 1
Loading dataset 2
Loading dataset 3


{1: AnnData object with n_obs × n_vars = 10620 × 36601
     obs: 'barcodes',
 2: AnnData object with n_obs × n_vars = 10795 × 36601
     obs: 'barcodes',
 3: AnnData object with n_obs × n_vars = 76122 × 36601
     obs: 'barcodes'}

In [46]:
#!/usr/bin/env python

# from pyroe import make_splici_txome
# from pyroe import fetch_processed_quant

import argparse
import sys

# Create the parser
parser = argparse.ArgumentParser(description='The pyroe package provides useful functions for preparing input files required by alevin-fry.',
                                    prog='pyroe')
subparsers = parser.add_subparsers(title='subcommands', dest='command',
                                    description='valid subcommands',
                                    help='additional help')
parser_makeSplici = subparsers.add_parser('make-splici', help='Make splici reference')
parser_makeSplici.add_argument('genome_path', metavar='genome-path', type=str, help='The path to a genome fasta file.')
parser_makeSplici.add_argument('gtf_path', metavar='gtf-path', type=str, help='The path to a gtf file.')
parser_makeSplici.add_argument('read_length', metavar='read-length', type=int, help='The read length of the single-cell experiment being processed (determines flank size).')
parser_makeSplici.add_argument('output_dir', metavar='output-dir', type=str, help='The output directory where splici reference files will be written.')
parser_makeSplici.add_argument('--filename-prefix', type=str, default="splici", help='The file name prefix of the generated output files.')
parser_makeSplici.add_argument('--flank-trim-length', type=int, default=5, help='Determines the amount subtracted from the read length to get the flank length.')
parser_makeSplici.add_argument('--extra-spliced', type=str, help='The path to an extra spliced sequence fasta file.')
parser_makeSplici.add_argument('--extra-unspliced', type=str, help='The path to an extra unspliced sequence fasta file.')
parser_makeSplici.add_argument('--bt-path', type=str, default="bedtools", help='The path to bedtools v2.30.0 or greater.')
parser_makeSplici.add_argument('--no-bt', action='store_true', help='A flag indicates whether bedtools will be used for generating splici reference files.')
parser_makeSplici.add_argument('--dedup-seqs', action='store_true', help='A flag indicates whether identical sequences will be deduplicated.')
parser_makeSplici.add_argument('--no-flanking-merge', action='store_true', help='A flag indicates whether flank lengths will be considered when merging introns.')

parser_fetchQuant = subparsers.add_parser('fetch-quant', help='Fetch processed quant results')
parser_fetchQuant.add_argument('dataset_ids', metavar='dataset-ids', nargs='+', type=int, help='The ids of the datasets to fetch')
parser_fetchQuant.add_argument('--fetch_dir', type=str, default="processed_quant",  help='The path to a directory for storing fetched datasets.')
parser_fetchQuant.add_argument('--force', action='store_false', help='A flag indicates whether existing datasets will be redownloaded by force.')
parser_fetchQuant.add_argument('--delete_tar', action='store_true', help='A flag indicates whether fetched tar files will be deleted.')
parser_fetchQuant.add_argument('--quiet', action='store_true', help='A flag indicates whether help messaged should not be printed.')

# # Execute the parse_args() method
# args = parser.parse_args()
# if args.command == 'make-splici':
#     make_splici_txome.make_splici_txome(
#     genome_path=args.genome_path,
#     gtf_path=args.gtf_path,
#     read_length=args.read_length,
#     output_dir=args.output_dir,
#     flank_trim_length=args.flank_trim_length,
#     filename_prefix=args.filename_prefix,
#     extra_spliced=args.extra_spliced,
#     extra_unspliced=args.extra_unspliced,
#     dedup_seqs=args.dedup_seqs,
#     no_bt=args.no_bt,
#     bt_path=args.bt_path,
#     no_flanking_merge=args.no_flanking_merge)
# elif args.command == 'fetch-quant':
#     print("Ok")


# else:
#     print(parser.print_help())
#     sys.exit(1)


_StoreTrueAction(option_strings=['--quiet'], dest='quiet', nargs=0, const=True, default=False, type=None, choices=None, help='A flag indicates whether help messaged should not be printed.', metavar=None)

In [52]:
parser.print_help()

usage: pyroe [-h] {make-splici,fetch-quant} ...

The pyroe package provides useful functions for preparing input files required
by alevin-fry.

optional arguments:
  -h, --help            show this help message and exit

subcommands:
  valid subcommands

  {make-splici,fetch-quant}
                        additional help
    make-splici         Make splici reference
    fetch-quant         Fetch processed quant results


In [49]:
a = parser.parse_args(["fetch-quant", '1', '2'])

In [54]:
available_datasets = fetch_processed_quant()
