In [1]:
from scipy.io import mmread, mmwrite
from pathlib import Path
import scanpy
import pandas
import numpy



In [2]:
def scanpy_load_solo278a_mtx(analysis_dir, quantification="Gene", mode="filtered"):
    """Load one STAR Solo count matrix through scanpy.

    Reads ``matrix.mtx``, ``features.tsv`` and ``barcodes.tsv`` from
    ``<analysis_dir>/Solo.out/<quantification>/<mode>``.

    Parameters
    ----------
    analysis_dir : str or path-like
        Directory that contains the ``Solo.out`` folder.
    quantification : str
        One of "Gene", "GeneFull", "GeneFull_Ex50pAS" or "SJ".
    mode : str
        Either "filtered" or "raw".

    Returns
    -------
    The transposed result of ``scanpy.read_mtx`` with ``obs_names`` taken from
    the first column of ``barcodes.tsv`` and ``var_names`` taken from the first
    column of ``features.tsv``.

    Raises
    ------
    AssertionError
        If ``mode`` or ``quantification`` is not one of the accepted values.
    """
    assert mode in ("filtered", "raw"), "STAR Solo only produces raw or filtered files"
    assert quantification in ("Gene", "GeneFull", "GeneFull_Ex50pAS", "SJ")

    def first_column(tsv_path):
        # Headerless, tab-separated table -> values of its first column.
        table = pandas.read_csv(tsv_path, header=None, sep="\t")
        return table.values.T[0]

    matrix_dir = Path(analysis_dir) / "Solo.out" / quantification / mode

    # The matrix read from disk is transposed before the labels are attached.
    adata = scanpy.read_mtx(matrix_dir / "matrix.mtx").T
    feature_ids = first_column(matrix_dir / "features.tsv")
    barcodes = first_column(matrix_dir / "barcodes.tsv")

    adata.obs_names = barcodes
    adata.var_names = feature_ids

    # Per-observation summaries (total counts, detected genes) are not computed here.
    return adata

def mmread_solo(analysis_dir, quantification="Gene", mode="filtered"):
    """Load one STAR Solo count matrix with ``scipy.io.mmread``.

    Reads ``matrix.mtx``, ``features.tsv`` and ``barcodes.tsv`` from
    ``<analysis_dir>/Solo.out/<quantification>/<mode>``.

    Parameters
    ----------
    analysis_dir : str or path-like
        Directory that contains the ``Solo.out`` folder.
    quantification : str
        One of "Gene", "GeneFull", "GeneFull_Ex50pAS" or "SJ".
    mode : str
        Either "filtered" or "raw".

    Returns
    -------
    tuple
        ``(matrix, barcodes, features)`` where ``matrix`` is the sparse matrix
        from ``matrix.mtx`` transposed, ``barcodes`` is the first column of
        ``barcodes.tsv`` and ``features`` is the first column of
        ``features.tsv`` (both as numpy arrays).

    Raises
    ------
    AssertionError
        If ``mode`` or ``quantification`` is not one of the accepted values.
    """
    assert mode in ["filtered", "raw"], "STAR Solo only produces raw or filtered files"
    assert quantification in ["Gene", "GeneFull", "GeneFull_Ex50pAS", "SJ"]

    analysis_dir = Path(analysis_dir)
    feature_name = "features.tsv"

    solo_dir = analysis_dir / "Solo.out" / quantification / mode
    # Transposed so the matrix rows line up with barcodes.tsv and the columns
    # with features.tsv, the same orientation the scanpy loader uses.
    solo = mmread(solo_dir / "matrix.mtx").T
    solo_vars = pandas.read_csv(
        solo_dir / feature_name, header=None, sep="\t"
    ).values.T
    solo_obs = pandas.read_csv(
        solo_dir / "barcodes.tsv", header=None, sep="\t"
    ).values.T

    # Previously the function ended here and returned None, discarding
    # everything it had just loaded. Row 0 of each transposed table is the
    # first column of the file (barcode / feature id).
    return solo, solo_obs[0], solo_vars[0]



In [3]:
#adata = scanpy_load_solo278a_mtx("fullsolo_multi_eoi_container", "GeneFull_Ex50pAS", "raw")

In [4]:
# Root of the STAR Solo output tree; the cells below read matrices and
# label tables from sub-directories of this path.
solo_dir = Path("fullsolo_multi_eoi_container/Solo.out")

In [5]:
# Load the raw GeneFull_Ex50pAS matrix exactly as stored on disk (no transpose).
solo_raw = mmread(solo_dir / "GeneFull_Ex50pAS" / "raw" / "matrix.mtx")
solo_raw.shape

(59526, 736320)

In [6]:
# Feature table that accompanies the raw matrix. The file has no header row,
# so pandas labels the columns with integers starting at 0.
vars_raw = pandas.read_csv(
    solo_dir / "GeneFull_Ex50pAS" / "raw" / "features.tsv",
    header=None,
    sep="\t",
)
vars_raw.shape

(59526, 3)

In [7]:
# Barcode list that accompanies the raw matrix (headerless; barcodes end up
# in column 0, and the default integer index is the position in the file).
obs_raw = pandas.read_csv(
    solo_dir / "GeneFull_Ex50pAS" / "raw" / "barcodes.tsv",
    header=None,
    sep="\t",
)
obs_raw.shape

(736320, 1)

In [8]:
# Show the sparse-matrix repr: shape, dtype, stored-element count and format.
solo_raw

<59526x736320 sparse matrix of type '<class 'numpy.int64'>'
	with 44857010 stored elements in COOrdinate format>

In [9]:
# Load the filtered GeneFull_Ex50pAS matrix exactly as stored on disk (no transpose).
solo_filt = mmread(solo_dir / "GeneFull_Ex50pAS" / "filtered" /"matrix.mtx")
solo_filt.shape

(59526, 15708)

In [10]:
# Feature table that accompanies the filtered matrix (headerless TSV).
vars_filt = pandas.read_csv(
    solo_dir / "GeneFull_Ex50pAS" / "filtered" / "features.tsv",
    header=None,
    sep="\t",
)
vars_filt.shape

(59526, 3)

In [11]:
# Barcode list that accompanies the filtered matrix (headerless; barcodes in column 0).
obs_filt = pandas.read_csv(
    solo_dir / "GeneFull_Ex50pAS" / "filtered" / "barcodes.tsv",
    header=None,
    sep="\t",
)
obs_filt.shape

(15708, 1)

In [12]:
# Show the sparse-matrix repr of the filtered matrix.
solo_filt

<59526x15708 sparse matrix of type '<class 'numpy.int64'>'
	with 36913803 stored elements in COOrdinate format>

In [13]:
# Preview the filtered barcode table.
obs_filt

Unnamed: 0,0
0,AAACAGCCAAATGCCC
1,AAACAGCCACGTGCTG
2,AAACAGCCACTTAGGC
3,AAACAGCCAGGGAGCT
4,AAACAGCCATGTTTGG
...,...
15703,TTTGTTGGTGAGCAAG
15704,TTTGTTGGTGCTCACC
15705,TTTGTTGGTGCTTACT
15706,TTTGTTGGTGCTTAGA


In [14]:
# Preview the raw barcode table.
obs_raw

Unnamed: 0,0
0,AAACAGCCAAACAACA
1,AAACAGCCAAACATAG
2,AAACAGCCAAACCCTA
3,AAACAGCCAAACCTAT
4,AAACAGCCAAACCTTG
...,...
736315,TTTGTTGGTTTGGGTA
736316,TTTGTTGGTTTGGTTC
736317,TTTGTTGGTTTGTCTA
736318,TTTGTTGGTTTGTGGA


In [15]:
# Keep only the raw barcode rows whose barcode also appears in the filtered
# list. The boolean mask preserves the original index, so the index of the
# result holds each kept barcode's position in the raw table.
obs_raw_filt = obs_raw[obs_raw[0].isin(obs_filt[0])]
obs_raw_filt

Unnamed: 0,0
20,AAACAGCCAAATGCCC
167,AAACAGCCACGTGCTG
184,AAACAGCCACTTAGGC
254,AAACAGCCAGGGAGCT
353,AAACAGCCATGTTTGG
...,...
736107,TTTGTTGGTGAGCAAG
736134,TTTGTTGGTGCTCACC
736139,TTTGTTGGTGCTTACT
736140,TTTGTTGGTGCTTAGA


In [16]:
# Check that the raw-derived subset lists the same barcodes as the filtered
# table, element for element and in the same order.
obs_raw_filt[0].tolist() == obs_filt[0].tolist()

True

In [17]:
# Boolean mask over the raw barcode table marking where this one barcode occurs.
obs_raw[0] == "AAACAGCCAAATGCCC"

0         False
1         False
2         False
3         False
4         False
          ...  
736315    False
736316    False
736317    False
736318    False
736319    False
Name: 0, Length: 736320, dtype: bool

In [18]:
# COO matrices cannot be indexed or sliced; convert to CSR so that columns
# can be selected by position.
solo_raw_csr = solo_raw.tocsr()
solo_raw_csr.shape

(59526, 736320)

In [19]:
# Select the raw-matrix columns at the raw-table positions of the barcodes
# that also appear in the filtered list.
solo_raw_csr_filt = solo_raw_csr[:,obs_raw_filt.index]
solo_raw_csr_filt.shape

(59526, 15708)

In [20]:
# Check that the raw-derived subset equals the STAR Solo filtered matrix.
# Element-wise `!=` on sparse operands yields a sparse boolean matrix whose
# stored entries are the positions that differ, so zero stored entries means
# the matrices are identical. This avoids calling .todense() on both sides,
# which would allocate two full 59526 x 15708 dense arrays just for the check.
(solo_raw_csr_filt != solo_filt).nnz == 0

True

In [21]:
# Convert the column subset back to COO, the format mmread returned for the
# matrices loaded from disk.
solo_raw_filt_coo = solo_raw_csr_filt.tocoo()
solo_raw_filt_coo.shape

(59526, 15708)

In [22]:
# Show the sparse-matrix repr of the COO subset.
solo_raw_filt_coo

<59526x15708 sparse matrix of type '<class 'numpy.int64'>'
	with 36913803 stored elements in COOrdinate format>

In [23]:
# Write the subset derived from the raw matrix to a Matrix Market file.
mmwrite("fullsolo_multi_eoi_container/my-filtered.mtx", solo_raw_csr_filt)

In [24]:
# Write the filtered matrix, as loaded by mmread, back out to a new file.
# NOTE(review): presumably so it can be compared with my-filtered.mtx on disk - confirm.
mmwrite("fullsolo_multi_eoi_container/filtered-round.mtx", solo_filt)

In [25]:
# Inspect the instance attributes of the CSR subset (indices / indptr / data).
vars(solo_raw_csr_filt)

{'_shape': (59526, 15708),
 'maxprint': 50,
 'indices': array([ 161,  227,  381, ...,  916, 9934, 8513], dtype=int32),
 'indptr': array([       0,        0,       65, ..., 36913803, 36913803, 36913803],
       dtype=int32),
 'data': array([1, 1, 1, ..., 1, 1, 1])}

In [26]:
# Inspect the instance attributes of the COO filtered matrix (row / col / data).
vars(solo_filt)

{'_shape': (59526, 15708),
 'maxprint': 50,
 'row': array([  103,   273,   326, ..., 58685, 58687, 58732], dtype=int32),
 'col': array([    0,     0,     0, ..., 15707, 15707, 15707], dtype=int32),
 'data': array([3, 2, 1, ..., 1, 4, 1]),
 'has_canonical_format': False}

In [27]:
# Spot check: stored value at row 273, column 0 of the filtered matrix.
solo_filt.getrow(273).getcol(0).data

array([2])

In [28]:
# Spot check: stored value at row 103, column 0 of the raw-derived subset.
solo_raw_csr_filt.getrow(103).getcol(0).data[0]

3

In [29]:
# Re-key the raw barcode table by barcode string; the former positional index
# is kept as the "index" column so a barcode can be mapped to its position.
obs_raw_seq = obs_raw.reset_index().set_index(0)

In [30]:
# Look up the raw-table position of one barcode.
obs_raw_seq.loc["TTTGTTGGTGCTCACC"].values[0]

736134

In [31]:
# Look up the raw-table position of another barcode.
obs_raw_seq.loc["AAACAGCCAAATGCCC"].values[0]

20

In [32]:
# Re-key the raw feature table by its column 0, keeping the former positional
# index as the "index" column.
vars_raw_seq = vars_raw.reset_index().set_index(0)

In [33]:
# Dictionary-of-keys copy of the raw matrix, which supports single-element
# [row, col] lookups.
solo_raw_dok = solo_raw.todok()

In [34]:
# Value at row 0, column 20 of the raw matrix (20 is the raw-table position
# looked up for barcode AAACAGCCAAATGCCC).
solo_raw_dok[0,20]

0

In [35]:
#assert 0
#data = []
#for barcode in obs_filt[0]:
#    obs = obs_raw_seq.loc[barcode].values[0]
#    for feature in vars_filt[0]:
#        col = vars_raw_seq.loc[feature].values[0]
#        data.append(solo_raw.getrow(obs).getcol(col).data[0])
#print(len(data))

AssertionError: 