In [1]:
import csv
from collections import Counter
import scanpy
import pysam
from pathlib import Path
import pandas
import requests
import numpy
import os
import re
import scipy
import shutil
import time
import gzip
import sys
from matplotlib import pyplot
import matplotlib
import upsetplot
import warnings
from urllib.parse import urljoin

In [2]:
from common import scanpy_load_solo_mtx

In [3]:
# Make the local mex_gene_archive checkout importable; a later cell imports
# mex_gene_archive.filter from it. Appending only when absent keeps sys.path
# from growing when this cell is re-run.
MEX = str(Path("~/proj/mex_gene_archive").expanduser())
if sys.path.count(MEX) == 0:
    sys.path.append(MEX)

In [4]:
# Analysis directory handed to every loader in this notebook; the loaders
# defined below read Solo.out/<gene>/<mode>/ underneath it.
star_analysis_dir = Path("adrenal/mouse_adr_2k_a4_f1")
# Base URL of a DGE_unfiltered directory on a remote server.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
erebboah_base = "http://crick.bio.uci.edu/erebboah/postnatal_splitseq/adrenal/A_4_F_1/DGE_unfiltered/"

In [5]:
def load_parse_decoder(filename):
    """Build a barcode -> well lookup from a tab-separated barcode table.

    The table is read with its first column as the index and is then
    re-indexed by its ``well`` column.  For every row, both the ``bc1_dt``
    and the ``bc1_randhex`` value are mapped to that row's well.

    Parameters
    ----------
    filename : path or file-like
        Anything ``pandas.read_csv`` accepts; the table must provide the
        columns ``well``, ``bc1_dt`` and ``bc1_randhex``.

    Returns
    -------
    dict
        Barcode sequence -> well, with the well rendered through ``str()``.
    """
    table = pandas.read_csv(filename, sep="\t", index_col=0).set_index("well")

    decoder = {}
    for well, dt_barcode, randhex_barcode in zip(
        table.index, table["bc1_dt"], table["bc1_randhex"]
    ):
        well_label = str(well)
        # Same insertion order as a row-wise walk: dt first, then randhex.
        decoder[dt_barcode] = well_label
        decoder[randhex_barcode] = well_label
    return decoder

#def group_parse_barcodes(adata, parse_barcodes):
class DecodeParse:
    """Callable that rewrites a three-part barcode into its collapsed form.

    A barcode such as ``"AAA_BBB_CCC"`` becomes ``"AAABBB_<x>"`` where
    ``<x>`` is whatever the lookup table holds for the third fragment.
    """

    def __init__(self, parse_barcodes):
        # Mapping from the third barcode fragment to its replacement suffix.
        self._parse_barcodes = parse_barcodes

    def __call__(self, barcode):
        """Return the collapsed name for ``barcode``.

        Raises ``IndexError`` when the barcode has fewer than three
        ``_``-separated fragments and ``KeyError`` when the third fragment is
        not in the lookup table.
        """
        pieces = barcode.split("_")
        replacement = self._parse_barcodes[pieces[2]]
        return f"{pieces[0]}{pieces[1]}_{replacement}"

def scanpy_load_parse_mtx(analysis_dir, barcodes="bc_dt_randhex.tsv", gene="Gene", mode="filtered", multi="matrix.mtx"):
    """Load a STAR Solo matrix and sum the cells that share a collapsed barcode.

    Every observation name is rewritten with ``DecodeParse``; observations
    that end up with the same collapsed name are summed into a single row.

    Parameters
    ----------
    analysis_dir : path-like
        Directory passed on to ``scanpy_load_solo_mtx``.
    barcodes : path-like
        Barcode table understood by ``load_parse_decoder``.
    gene, mode, multi : str
        Forwarded unchanged to ``scanpy_load_solo_mtx``.

    Returns
    -------
    scanpy.AnnData
        One row per collapsed barcode (``obs`` column ``cell_barcodes``), one
        column per feature (``var`` column ``gene_id``), plus every ``var``
        column of the loaded run.
    """
    decode_parse = DecodeParse(load_parse_decoder(barcodes))

    # Load from the directory the caller asked for.  This used to read the
    # notebook-level ``star_analysis_dir`` global, silently ignoring the
    # ``analysis_dir`` argument.
    star_run = scanpy_load_solo_mtx(analysis_dir, gene=gene, mode=mode, multi=multi)
    star_run.obs["parse_group"] = [decode_parse(x) for x in star_run.obs_names]
    group_of_cell = star_run.obs["parse_group"]

    # Unique collapsed names in first-seen order (dict keys keep insertion order).
    group_names = list(dict.fromkeys(group_of_cell))
    parse_group = pandas.Series(group_names, index=group_names, name="cell_barcodes")

    var_names = star_run.var_names.to_series()
    var_names.name = "gene_id"

    merged_shape = (len(parse_group), star_run.X.shape[1])
    # lil_matrix supports the row-by-row assignment used below.
    merged = scipy.sparse.lil_matrix(merged_shape, dtype=star_run.X.dtype)
    for row, key in enumerate(parse_group):
        merged[row] = star_run[group_of_cell == key].X.sum(axis=0)

    mergedrun = scanpy.AnnData(merged.tocsr(), obs=parse_group.to_frame(), var=var_names.to_frame())
    for key in star_run.var_keys():
        mergedrun.var[key] = star_run.var[key]

    return mergedrun

In [6]:
# Benchmark the scanpy-based loader.  Names assigned inside a %timeit
# statement are not kept, which is why the load is repeated right after it.
%timeit myrun = scanpy_load_parse_mtx(star_analysis_dir, gene="GeneFull_Ex50pAS")

myrun = scanpy_load_parse_mtx(star_analysis_dir, gene="GeneFull_Ex50pAS")
# Total count and dimensions, printed as a reference for the second loader
# defined further down.
print("count sum", myrun.X.sum().sum())
print("shape", myrun.shape)
myrun

1.14 s ± 7.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
count sum 297095.0
shape (194, 81881)


AnnData object with n_obs × n_vars = 194 × 81881
    obs: 'cell_barcodes'
    var: 'gene_id'

# Fast parse matrix loader

In [7]:
# Lookup tables used by compute_parse_map, keyed by a version label ("v1" is
# the only one defined).  Each table maps an 8-base barcode to an index given
# as a string, "0" through "47"; every index is listed with two barcodes, so
# both collapse to the same value.
# NOTE(review): the two barcodes per index presumably correspond to the
# bc1_dt / bc1_randhex columns read by load_parse_decoder — confirm.
parse_mapper = {
    "v1": {
        'ACTCGTAA': '0', 'CTGCTTTG': '0',
        'AAACGATA': '1', 'CATGATCA': '1',
        'TTACCTCG': '2', 'GGGTAGCG': '2',
        'GCCTGCAA': '3', 'CCGAGAAA': '3',
        'TGGTATAC': '4', 'ACGGACTC': '4',
        'CGTTCGAG': '5', 'ACTTACGA': '5',
        'TCTATTAC': '6', 'TATTTAAG': '6',
        'ATAAGCTC': '7', 'ACCGTACG': '7',
        'ATTCATGG': '8', 'TATAGTCG': '8',
        'ATCCGCGA': '9', 'TGGGCATC': '9',
        'ATCGCATA': '10', 'TACCTAGA': '10',
        'CCGTTCTA': '11', 'GCTGCATG': '11',
        'TGGCGCGC': '12', 'GTCATATG': '12',
        'TGTCTGAA': '13', 'ATATTGGC': '13',
        'CTGTCCCG': '14', 'CTAAGGGA': '14',
        'AATTTCTC': '15', 'TCGTTTCG': '15',
        'CGCGACTA': '16', 'GAATAATG': '16',
        'TGAAGCAA': '17', 'ACTGCGCA': '17',
        'TTATTCTG': '18', 'GCTTATAG': '18',
        'GTTCAACA': '19', 'ATCATGCA': '19',
        'ACGCCGGC': '20', 'ACGTTAAC': '20',
        'TTGTCTTA': '21', 'CCATCTTG': '21',
        'TACGGTTA': '22', 'CATAGCTA': '22',
        'TTGGGAGA': '23', 'GAGGTTGA': '23',
        'TGCTTGGG': '24', 'GCACTGAC': '24',
        'TAAATATC': '25', 'TTCATCGC': '25',
        'CACAATTG': '26', 'GAAATTAG': '26',
        'GTGCTAGC': '27', 'AGGATTAA': '27',
        'CGCCCGGA': '28', 'AATAGAAC': '28',
        'GCTCGCGG': '29', 'TCTTAATC': '29',
        'CTTTGGTC': '30', 'TAATACGC': '30',
        'TTCCGATC': '31', 'GTTTGTGA': '31',
        'TTCGCTAC': '32', 'CGAACGTC': '32',
        'AGCGAAAC': '33', 'GGTTCTTC': '33',
        'AAATAGCA': '34', 'GCAAATTC': '34',
        'CGTCTAGG': '35', 'GCTATGCG': '35',
        'GCCGTGTA': '36', 'CTACCCTA': '36',
        'CGCTTAAA': '37', 'GTGGGTTC': '37',
        'GACCTTTC': '38', 'GTCCGTAG': '38',
        'GGTGGAGC': '39', 'TGCGATCG': '39',
        'TACTCGAA': '40', 'TATCCGGG': '40',
        'CATTTGGA': '41', 'AGGTAATA': '41',
        'GAGCACAA': '42', 'CGTGGTTG': '42',
        'GTCGCGCG': '43', 'GACAAAGC': '43',
        'GTTACGTA': '44', 'GGGCGATG': '44',
        'CTATTTCA': '45', 'ATCTATAA': '45',
        'ACTATATA': '46', 'GCCCATGA': '46',
        'TCACTTTA': '47', 'CTGAAAGG': '47'
    }
}

from scipy.io.mmio import MMFile

class SimpleFloatMMWriter(MMFile):
    """MMFile subclass whose real-valued entries are formatted with ``%g``."""

    @staticmethod
    def _field_template(field, precision):
        """Return the printf-style line template for ``field``.

        ``precision`` is the number of digits inserted into the real and
        complex templates.  Unknown fields yield ``None``.
        """
        # Both precision-dependent templates are built up front, so
        # ``precision`` has to be usable with ``%i`` whatever field is asked for.
        real_template = '%%.%ig\n' % precision
        complex_template = '%%.%ie %%.%ie\n' % (precision, precision)

        templates = {
            MMFile.FIELD_REAL: real_template,
            MMFile.FIELD_INTEGER: '%i\n',
            MMFile.FIELD_UNSIGNED: '%u\n',
            MMFile.FIELD_COMPLEX: complex_template,
        }
        return templates.get(field)

def read_parse_cell_barcode_lineno_map(stream):
    """Map the first column of each tab-separated row to its 1-based position.

    ``stream`` is any iterable of text lines (for example an open
    barcodes.tsv).  When a value occurs more than once, the position of its
    last occurrence is kept.
    """
    rows = csv.reader(stream, delimiter="\t")
    return {row[0]: position for position, row in enumerate(rows, start=1)}

def read_feature_lineno_map(stream):
    """Map the first column of each tab-separated row to its 1-based position.

    ``stream`` is any iterable of text lines (for example an open
    features.tsv); only the first column is used as the key.  When a key
    occurs more than once, the position of its last occurrence is kept.
    """
    feature_positions = {}
    for position, fields in enumerate(csv.reader(stream, delimiter="\t"), start=1):
        feature_positions[fields[0]] = position
    return feature_positions

def compute_parse_map(raw_barcodes, mapper):
    """Collapse raw barcodes and record where each raw entry lands.

    Each raw barcode is split on ``_``; its third fragment is replaced by
    ``str(mapper[fragment])`` and the result is named
    ``<first><second>_<replacement>``.  Collapsed names are numbered from 1 in
    the order they are first encountered.

    Parameters
    ----------
    raw_barcodes : dict
        Raw barcode -> index of that barcode.
    mapper : mapping
        Third barcode fragment -> replacement value.

    Returns
    -------
    tuple(dict, dict)
        ``(collapsed name -> collapsed index, raw index -> collapsed index)``.
    """
    collapsed_index_of = {}
    raw_to_collapsed = {}

    for raw_barcode, raw_index in raw_barcodes.items():
        pieces = raw_barcode.split("_")
        replacement = str(mapper[pieces[2]])
        collapsed_name = f"{pieces[0]}{pieces[1]}_{replacement}"

        # First sighting of a collapsed name claims the next 1-based index.
        if collapsed_name not in collapsed_index_of:
            collapsed_index_of[collapsed_name] = len(collapsed_index_of) + 1
        raw_to_collapsed[raw_index] = collapsed_index_of[collapsed_name]

    return (collapsed_index_of, raw_to_collapsed)


def _parse_mmread(matrix_filename, merged_barcodes, merged_mapping):
    header = True
    matrix = None
    with open(matrix_filename, "rt") as instream:
        for line in instream:
            if line.startswith("%"):
                pass
            elif header:
                # After the comment comes the one header line
                total_features, total_cells, total_counts = [
                    int(x) for x in line.rstrip().split()
                ]
                matrix = scipy.sparse.dok_matrix((len(merged_barcodes), total_features), dtype=float)
                header = False
            else:
                # row, column, count
                feature_index, cell_index, count = line.rstrip().split()
                feature_index = int(feature_index)
                cell_index = int(cell_index)
                count = float(count)
                
                new_cell_index = merged_mapping[cell_index]
                matrix[new_cell_index-1, feature_index-1] = matrix.get((new_cell_index-1, feature_index-1), 0) + count

    return matrix

def load_parse_mtx(analysis_dir, *, gene="Gene", mode="filtered", multi="matrix.mtx"):
    """Load one Solo.out matrix with its cells collapsed via ``parse_mapper["v1"]``.

    Reads ``barcodes.tsv``, ``features.tsv`` and the matrix file named by
    ``multi`` from ``<analysis_dir>/Solo.out/<gene>/<mode>/``.

    Parameters
    ----------
    analysis_dir : str or Path
        Directory that contains ``Solo.out``.
    gene : str
        One of ``SJ``, ``Gene``, ``GeneFull_Ex50pAS``, ``GeneFull``.
    mode : str
        ``filtered`` or ``raw``.
    multi : str
        ``matrix.mtx`` or ``UniqueAndMult-EM.mtx``.

    Returns
    -------
    tuple
        ``(matrix, collapsed barcode names, feature ids)`` where the matrix
        has one row per collapsed barcode and one column per feature.

    Raises
    ------
    AssertionError
        If ``gene``, ``mode`` or ``multi`` is not one of the accepted values.
    """
    assert mode in ["filtered", "raw"], "STAR Solo only produces raw or filtered files"
    assert gene in ["SJ", "Gene", "GeneFull_Ex50pAS", "GeneFull"]
    assert multi in ["matrix.mtx", "UniqueAndMult-EM.mtx"]

    solo_dir = Path(analysis_dir) / "Solo.out" / gene / mode

    with open(solo_dir / "barcodes.tsv", "rt") as barcode_stream:
        raw_cell_positions = read_parse_cell_barcode_lineno_map(barcode_stream)
    with open(solo_dir / "features.tsv", "rt") as feature_stream:
        feature_positions = read_feature_lineno_map(feature_stream)

    collapsed_barcodes, raw_to_collapsed = compute_parse_map(
        raw_cell_positions, parse_mapper["v1"]
    )

    matrix = _parse_mmread(solo_dir / multi, collapsed_barcodes, raw_to_collapsed)
    print("Loading matrix shape", matrix.shape)

    # Both dicts keep insertion order, so the lists follow matrix row / column order.
    return matrix, list(collapsed_barcodes), list(feature_positions)

def write_parse_mtx(analysis_dir, output_dir, *, gene="Gene", mode="filtered", multi="matrix.mtx"):
    """Collapse one Solo.out matrix and write it out as a matrix directory.

    Writes three files into ``output_dir``: ``barcodes.tsv`` (one collapsed
    barcode per line), ``features.tsv`` (copied unchanged from the source
    directory) and the matrix itself under the name given by ``multi``.

    Parameters
    ----------
    analysis_dir : str or Path
        Directory that contains ``Solo.out``.
    output_dir : str or Path
        Destination directory; created (with parents) when missing.
    gene, mode, multi : str
        Forwarded to ``load_parse_mtx``; they also select the source
        ``features.tsv`` and the output matrix file name.

    Raises
    ------
    AssertionError
        If the collapsed matrix does not match the barcode / feature lists
        (checked before anything is written), or if ``load_parse_mtx``
        rejects an argument.
    """
    # load_parse_mtx accepts plain strings, so accept them here too: the "/"
    # path joins below raise TypeError on a str.
    analysis_dir = Path(analysis_dir)
    output_dir = Path(output_dir)

    matrix, cells, features = load_parse_mtx(analysis_dir, gene=gene, mode=mode, multi=multi)

    # The loader returns cells x features; the file on disk is features x cells.
    matrix = matrix.T
    assert matrix.shape[1] == len(cells), "The number of columns doesn't match the number of barcodes"
    assert matrix.shape[0] == len(features), "The number of rows doesn't match the number of features"

    output_dir.mkdir(parents=True, exist_ok=True)

    with open(output_dir / "barcodes.tsv", "wt") as outstream:
        for barcode in cells:
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep here would produce "\r\r\n" on Windows.
            outstream.write(barcode + "\n")

    solo_dir = analysis_dir / "Solo.out" / gene / mode
    shutil.copy(solo_dir / "features.tsv", output_dir / "features.tsv")

    matrix_name = output_dir / multi
    print("Writing matrix.shape", matrix.shape, matrix_name)
    SimpleFloatMMWriter().write(matrix_name, matrix, comment='', field=None, precision=None, symmetry=None)
    

def load_parse_anndata(analysis_dir, *, gene="Gene", mode="filtered", multi="matrix.mtx"):
    """Wrap the result of ``load_parse_mtx`` in an AnnData object.

    All arguments are forwarded to ``load_parse_mtx``.  The returned object
    holds the matrix in CSR form, an ``obs`` table with a ``cell_barcodes``
    column and a ``var`` table with a ``gene_id`` column; both tables are
    indexed by those same values.
    """
    counts, cell_names, feature_names = load_parse_mtx(
        analysis_dir, gene=gene, mode=mode, multi=multi
    )

    cell_table = pandas.DataFrame(cell_names, index=cell_names, columns=["cell_barcodes"])
    feature_table = pandas.DataFrame(feature_names, index=feature_names, columns=["gene_id"])

    return scanpy.AnnData(counts.tocsr(), obs=cell_table, var=feature_table)

In [8]:
# Load the same filtered matrix with the line-by-line loader; the following
# cells compare it against `myrun` from the scanpy-based loader.
parse = load_parse_anndata(star_analysis_dir, gene="GeneFull_Ex50pAS")
parse

Loading matrix shape (194, 81881)


AnnData object with n_obs × n_vars = 194 × 81881
    obs: 'cell_barcodes'
    var: 'gene_id'

In [9]:
# Time the line-by-line loader.  The assignment inside %timeit is not kept, so
# `parse` is still the AnnData from the previous cell (load_parse_mtx itself
# returns a tuple).
%timeit parse = load_parse_mtx(star_analysis_dir, gene="GeneFull_Ex50pAS")

Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
Loading matrix shape (194, 81881)
2.89 s ± 13.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Both loaders must produce the same obs table (collapsed barcodes, same order).
numpy.all(myrun.obs == parse.obs)

True

In [11]:
# Both loaders must produce the same var table (feature ids, same order).
numpy.all(myrun.var == parse.var)

True

In [12]:
# Element-wise difference of the two count matrices; a result with 0 stored
# elements means the loaders agree on every count.
(myrun.X - parse.X)

<194x81881 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in Compressed Sparse Row format>

In [13]:
# Load the unfiltered ("raw") matrix and report the wall-clock time in seconds.
t0 = time.monotonic()
raw_parse = load_parse_anndata(star_analysis_dir, gene="GeneFull_Ex50pAS", mode="raw")
# "{:.3}" formats the elapsed time with three significant digits.
print("Took {:.3}".format(time.monotonic() - t0))
raw_parse


Loading matrix shape (442368, 81881)
Took 7.04


AnnData object with n_obs × n_vars = 442368 × 81881
    obs: 'cell_barcodes'
    var: 'gene_id'

In [14]:
# Output directory for the collapsed raw matrix, placed next to STAR's own
# raw/ and filtered/ directories.
raw_collapsed_parse_filtered = star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "raw_collapsed"

# mkdir() without parents=True: the GeneFull_Ex50pAS directory must already exist.
if not raw_collapsed_parse_filtered.exists():
    raw_collapsed_parse_filtered.mkdir()

# Writes barcodes.tsv, features.tsv and matrix.mtx (the default `multi`).
write_parse_mtx(star_analysis_dir, raw_collapsed_parse_filtered, gene="GeneFull_Ex50pAS", mode="raw")

Loading matrix shape (442368, 81881)
Writing matrix.shape (81881, 442368) adrenal/mouse_adr_2k_a4_f1/Solo.out/GeneFull_Ex50pAS/raw_collapsed/matrix.mtx


In [15]:
# Same output directory as the previous cell, recomputed so this cell can run
# on its own.
raw_collapsed_parse_filtered = star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "raw_collapsed"

if not raw_collapsed_parse_filtered.exists():
    raw_collapsed_parse_filtered.mkdir()

# Differs from the previous cell only in multi=: this adds UniqueAndMult-EM.mtx
# and rewrites barcodes.tsv / features.tsv in the same directory.
write_parse_mtx(star_analysis_dir, raw_collapsed_parse_filtered, gene="GeneFull_Ex50pAS", multi="UniqueAndMult-EM.mtx", mode="raw")

Loading matrix shape (442368, 81881)
Writing matrix.shape (81881, 442368) adrenal/mouse_adr_2k_a4_f1/Solo.out/GeneFull_Ex50pAS/raw_collapsed/UniqueAndMult-EM.mtx


In [16]:
# Path of the filtered_collapsed directory, converted to a string with a
# trailing "/".
# NOTE(review): no visible cell reads this variable — the STAR command below
# spells the same path out by hand; confirm whether it is still needed.
filtered_collapsed_parse_filtered = star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "filtered_collapsed"
filtered_collapsed_parse_filtered = str(filtered_collapsed_parse_filtered) + "/"

In [17]:
# Run STAR's soloCellFiltering mode with the EmptyDrops_CR filter, giving it
# the raw_collapsed/ directory written above and filtered_collapsed/ as the
# second path.  IPython substitutes $star_analysis_dir before the shell sees
# the command.
# NOTE(review): the STAR binary is taken from a checkout under ~/proj/STAR —
# confirm that path on other machines.
!~/proj/STAR/bin/Linux_x86_64_static/STAR --runMode soloCellFiltering \
  $star_analysis_dir/Solo.out/GeneFull_Ex50pAS/raw_collapsed/  \
  $star_analysis_dir/Solo.out/GeneFull_Ex50pAS/filtered_collapsed/ \
  --soloCellFilter EmptyDrops_CR

	/woldlab/loxcyc/home/diane/proj/STAR/bin/Linux_x86_64_static/STAR --runMode soloCellFiltering adrenal/mouse_adr_2k_a4_f1/Solo.out/GeneFull_Ex50pAS/raw_collapsed/ adrenal/mouse_adr_2k_a4_f1/Solo.out/GeneFull_Ex50pAS/filtered_collapsed/ --soloCellFilter EmptyDrops_CR
	STAR version: dev_EoI_2.7.9a_2021-12-22   compiled: 2021-12-22T08:42:40-05:00 :/home/dobin/data/STAR/STARcode/STAR.master/source
Jan 07 16:16:51 ..... started STAR run
Jan 07 16:16:51 ..... starting SoloCellFiltering
Jan 07 16:16:52 ..... finished successfully


In [18]:
from mex_gene_archive.filter import write_filtered_mtx

In [19]:
# Arguments, in order: raw_collapsed barcodes, raw_collapsed EM matrix,
# filtered_collapsed barcodes, filtered_collapsed EM matrix.
# NOTE(review): presumably this writes the last path by restricting the raw EM
# matrix to the barcodes STAR kept in filtered_collapsed/barcodes.tsv —
# confirm against mex_gene_archive.filter.write_filtered_mtx.
write_filtered_mtx(
    star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "raw_collapsed" / "barcodes.tsv",
    star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "raw_collapsed" / "UniqueAndMult-EM.mtx",
    star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "filtered_collapsed" / "barcodes.tsv",
    star_analysis_dir / "Solo.out" / "GeneFull_Ex50pAS" / "filtered_collapsed" / "UniqueAndMult-EM.mtx",
)