Let's try to do the conversion in a really lightweight way.

In [1]:
from pathlib import Path
import csv


In [2]:
def read_barcode_lineno_map(filename):
    """Map each barcode in a tab-separated file to its 1-based row position.

    Only the first field of every row is used as the barcode. When a barcode
    appears on more than one row, the position of its last occurrence is kept.
    """
    with open(filename, "rt") as stream:
        rows = csv.reader(stream, delimiter='\t')
        return {fields[0]: position for position, fields in enumerate(rows, start=1)}

In [3]:
# Base directory for this analysis, and the "Solo.out" directory inside it.
analysis_dir = Path("fullsolo_multi_eoi_container/")
solo_dir = analysis_dir.joinpath("Solo.out")

In [4]:
def compute_raw_to_filtered_map(filtered_barcodes, raw_barcodes):
    """Build a lookup from a barcode's raw index to its filtered index.

    Both arguments map barcode -> index. Every barcode in ``filtered_barcodes``
    must also be a key of ``raw_barcodes``; a missing one raises ``KeyError``.
    """
    return {
        raw_barcodes[barcode]: filtered_position
        for barcode, filtered_position in filtered_barcodes.items()
    }

In [5]:
def filter_mtx(raw_barcode_filename, raw_matrix_filename, filtered_barcode_filename):
    """Yield the lines of a filtered matrix file built from the raw matrix.

    Reads the raw barcodes, the raw matrix and the filtered barcodes, keeps
    only the matrix entries whose column belongs to a filtered barcode, and
    renumbers those columns to the barcode's 1-based position in the filtered
    barcode file.

    Parameters:
        raw_barcode_filename: tab-separated file of raw barcodes; row order
            defines the column numbers of the raw matrix.
        raw_matrix_filename: matrix file with leading "%" comment lines, one
            "rows columns entries" size line, then "row column count" lines.
            All three fields of every line are parsed as integers.
        filtered_barcode_filename: tab-separated file of the barcodes to keep.

    Yields:
        str: every "%" comment line as it is read, then the new size line,
        then the kept entries sorted by (column, row). Each line ends in "\n".

    Raises:
        AssertionError: the column count in the size line differs from the
            number of raw barcodes.
        KeyError: a filtered barcode is not present in the raw barcode file.
        ValueError: the matrix has no size line, or a line cannot be parsed
            as three integers.
    """
    raw_barcodes = read_barcode_lineno_map(raw_barcode_filename)
    filtered_barcodes = read_barcode_lineno_map(filtered_barcode_filename)
    raw_to_filtered_mapping = compute_raw_to_filtered_map(filtered_barcodes, raw_barcodes)

    total_rows = None  # stays None until the size line has been read
    results = []
    # Open the file named by the parameter; relying on a module-level variable
    # here would ignore the caller's argument.
    with open(raw_matrix_filename, "rt") as instream:
        for line in instream:
            if line.startswith("%"):
                # copy comments
                yield line
                continue

            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on the unpack below.
                continue

            if total_rows is None:
                # After the comments comes the one size line; its entry count
                # is recomputed below, so only rows and columns are used.
                total_rows, total_columns, _ = [int(x) for x in fields]
                assert total_columns == len(raw_barcodes), (
                    "matrix has {} columns but {} raw barcodes were read".format(
                        total_columns, len(raw_barcodes)
                    )
                )
            else:
                # row, column, count
                row, column, count = [int(x) for x in fields]
                if column in raw_to_filtered_mapping:
                    results.append((row, raw_to_filtered_mapping[column], count))

    if total_rows is None:
        raise ValueError(
            "no size line found in matrix file {}".format(raw_matrix_filename)
        )

    # Order by the new column number first, then by row.
    results.sort(key=lambda entry: (entry[1], entry[0]))
    yield "{} {} {}\n".format(total_rows, len(filtered_barcodes), len(results))
    for row, column, count in results:
        yield "{} {} {}\n".format(row, column, count)


In [7]:
# Inputs come from the GeneFull_Ex50pAS raw/filtered directories; the result
# is written next to them in the analysis directory.
raw_barcode_tsv = solo_dir.joinpath("GeneFull_Ex50pAS", "raw", "barcodes.tsv")
raw_matrix_mtx = solo_dir.joinpath("GeneFull_Ex50pAS", "raw", "matrix.mtx")
filtered_barcode_tsv = solo_dir.joinpath("GeneFull_Ex50pAS", "filtered", "barcodes.tsv")
test_mtx = analysis_dir.joinpath("test-lite.mtx")

# Write each line produced by filter_mtx to the output file.
with open(test_mtx, "wt") as outstream:
    outstream.writelines(
        filter_mtx(raw_barcode_tsv, raw_matrix_mtx, filtered_barcode_tsv)
    )

In [8]:
pwd

'/woldlab/loxcyc/home/diane/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/adrenal/ENCSR724KET_16f_nuc'