## Mathew: load unprocessed Cell Ranger and salmon data

In [1]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
from pyroe import load_fry
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib_venn import venn2
from venny4py.venny4py import *
import scvelo as scv
import re
import louvain
from skmisc.loess import loess
from skbio.diversity.alpha import simpson, gini_index
import warnings
import sys
import ensembl_rest
import time
from Ensembl_converter import EnsemblConverter

## Cell Ranger

### load expression matrices

In [2]:
# Read the per-sample metadata table. The first CSV column becomes the index,
# so rows can be looked up by sample folder name via meta.loc[...].
meta = pd.read_csv(
    "../data/metadata.csv",
    index_col=0,
)
meta

Unnamed: 0_level_0,assay,chemistry,mouse_nr,infection,day_post_infection,organ,organ_day
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SampleID_1_27feb19,rna,v2,M7,naive,D0,spleen,spleen0
SampleID_2_29apr19,rna,v2,M8,naive,D0,spleen,spleen0
SampleID_51_24feb20,rna,v2,M9,infected,D7,spleen,spleen7
SampleID_54_24feb20,rna,v2,M10,infected,D7,spleen,spleen7
SampleID_2_27feb19,rna,v2,M1,infected,D14,spleen,spleen14
SampleID_4_2apr19,rna,v2,M2,infected,D14,spleen,spleen14
SampleID_4_26apr19,rna,v2,M3,infected,D14,spleen,spleen14
SampleID_3_27feb19,rna,v2,M4,infected,D28,spleen,spleen28
SampleID_4_1apr19,rna,v2,M5,infected,D28,spleen,spleen28
SampleID_52_24feb20,rna,v2,M9,infected,D7,mln,mln7


In [3]:
adatas = []
path = "../cellranger"

# Only inspect the immediate children of `path`. os.walk() would also descend
# into every sample directory, and any nested folder whose name starts with
# "Sample" would then be resolved against `path` instead of its real parent.
# Sorting makes the sample order (and therefore the cell order after
# concatenation) reproducible instead of filesystem-dependent.
for folder in sorted(os.listdir(path)):
    sample_dir = path + "/" + folder
    if not (folder.startswith("Sample") and os.path.isdir(sample_dir)):
        continue

    # get adata
    adata = sc.read_10x_mtx(sample_dir)

    # add metadata: every field of this sample's metadata row becomes a
    # constant-valued obs column (raises KeyError if the folder is not in `meta`)
    for col_name, col_data in meta.loc[folder].items():
        adata.obs[col_name] = col_data

    # add ID
    adata.obs["id"] = folder

    # rename obs names: drop the last two characters of each barcode
    # (presumably the "-1" suffix written by Cell Ranger -- TODO confirm)
    # and append the sample ID so barcodes stay unique across samples
    adata.obs_names = [bc[:-2] + "_" + folder for bc in adata.obs_names]

    # append to list
    adatas.append(adata)

In [4]:
# Stack the per-sample objects along the cell axis.
# join="inner" is the ad.concat default: only genes shared by all samples are kept.
cellranger = ad.concat(adatas, join="inner")
cellranger

AnnData object with n_obs × n_vars = 58486 × 32285
    obs: 'assay', 'chemistry', 'mouse_nr', 'infection', 'day_post_infection', 'organ', 'organ_day', 'id'

In [5]:
# write the merged, unprocessed Cell Ranger object to disk
cellranger.write_h5ad(filename="../data/mathew_cellranger_raw.h5ad")

## Salmon

### load expression matrices

In [6]:
adatas = []
path = "../salmon"

# Only inspect the immediate children of `path`. os.walk() would also descend
# into every sample directory (e.g. its "_quant_res" subfolder), and any nested
# folder whose name starts with "SampleID" would then be resolved against
# `path` instead of its real parent. Sorting makes the sample order (and
# therefore the cell order after concatenation) reproducible.
for folder in sorted(os.listdir(path)):
    sample_dir = path + "/" + folder
    if not (folder.startswith("SampleID") and os.path.isdir(sample_dir)):
        continue

    # get adata; with output_format="all", X holds the summed U + S + A counts
    adata = load_fry(sample_dir + "/_quant_res", output_format = "all")

    # add metadata: every field of this sample's metadata row becomes a
    # constant-valued obs column (raises KeyError if the folder is not in `meta`)
    for col_name, col_data in meta.loc[folder].items():
        adata.obs[col_name] = col_data

    # add ID
    adata.obs["id"] = folder

    # rename obs names: append the sample ID so barcodes stay unique across samples
    adata.obs_names = [bc + "_" + folder for bc in adata.obs_names]

    # append to list
    adatas.append(adata)

USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
Using pre-defined output format: all
Will populate output field X with sum of counts frorm ['U', 'S', 'A'].
USA mode: True
U

In [7]:
# Stack the per-sample objects along the cell axis.
# join="inner" is the ad.concat default: only genes shared by all samples are kept.
salmon = ad.concat(adatas, join="inner")
salmon

AnnData object with n_obs × n_vars = 52908 × 32285
    obs: 'barcodes', 'assay', 'chemistry', 'mouse_nr', 'infection', 'day_post_infection', 'organ', 'organ_day', 'id'

In [8]:
# rename genes from Ensembl ID to gene name
e2n_path = "../../splici_references/mm10-2020-A_geneid_to_name.txt"

# Parse the whitespace-separated two-column table (Ensembl ID, gene name).
# The context manager closes the file handle; blank lines are skipped because
# they would otherwise break the dict construction.
with open(e2n_path) as handle:
    e2n = dict(line.rstrip().split() for line in handle if line.strip())

salmon.var_names = [e2n[e] for e in salmon.var_names]

# If several Ensembl IDs map to the same gene name, var_names now contain
# duplicates. usa.var_names are de-duplicated with var_names_make_unique()
# before usa is reordered by salmon.var_names; apply the same de-duplication
# here so that the label-based lookup resolves every gene to its own usa
# column instead of matching a repeated name to the same column twice.
salmon.var_names_make_unique()

### add spliced, unspliced and ambiguous counts

In [9]:
# load alevin-fry output files using output_format = "raw"
# (X holds spliced counts; spliced / unspliced / ambiguous are kept as layers)
adatas_usa = []

# Restate the quantification directory: `path` is also assigned
# "../cellranger" in the Cell Ranger section, so relying on the leftover global
# would load the wrong directory if cells are re-run out of order.
path = "../salmon"

# Only inspect the immediate children of `path`. os.walk() would also descend
# into every sample directory, and any nested folder whose name starts with
# "SampleID" would then be resolved against `path` instead of its real parent.
# Sorting makes the sample order reproducible.
for folder in sorted(os.listdir(path)):
    sample_dir = path + "/" + folder
    if not (folder.startswith("SampleID") and os.path.isdir(sample_dir)):
        continue

    # get adata
    adata = load_fry(sample_dir + "/_quant_res", output_format = "raw")

    # add metadata: every field of this sample's metadata row becomes a
    # constant-valued obs column (raises KeyError if the folder is not in `meta`)
    for col_name, col_data in meta.loc[folder].items():
        adata.obs[col_name] = col_data

    # add ID
    adata.obs["id"] = folder

    # rename obs names: append the sample ID so barcodes stay unique across samples
    adata.obs_names = [bc + "_" + folder for bc in adata.obs_names]

    # append to list
    adatas_usa.append(adata)

USA mode: True
Using pre-defined output format: raw
Will populate output field X with sum of counts frorm ['S'].
Will combine ['S'] into output layer spliced.
Will combine ['U'] into output layer unspliced.
Will combine ['A'] into output layer ambiguous.
USA mode: True
Using pre-defined output format: raw
Will populate output field X with sum of counts frorm ['S'].
Will combine ['S'] into output layer spliced.
Will combine ['U'] into output layer unspliced.
Will combine ['A'] into output layer ambiguous.
USA mode: True
Using pre-defined output format: raw
Will populate output field X with sum of counts frorm ['S'].
Will combine ['S'] into output layer spliced.
Will combine ['U'] into output layer unspliced.
Will combine ['A'] into output layer ambiguous.
USA mode: True
Using pre-defined output format: raw
Will populate output field X with sum of counts frorm ['S'].
Will combine ['S'] into output layer spliced.
Will combine ['U'] into output layer unspliced.
Will combine ['A'] into outp

In [10]:
# Stack the per-sample spliced/unspliced/ambiguous objects along the cell axis.
# join="inner" is the ad.concat default: only genes shared by all samples are kept.
usa = ad.concat(adatas_usa, join="inner")
usa

AnnData object with n_obs × n_vars = 52908 × 32285
    obs: 'barcodes', 'assay', 'chemistry', 'mouse_nr', 'infection', 'day_post_infection', 'organ', 'organ_day', 'id'
    layers: 'ambiguous', 'spliced', 'unspliced'

In [11]:
# Translate Ensembl IDs to gene names via the e2n lookup (KeyError on an
# unmapped ID), then disambiguate any gene names that occur more than once.
usa_gene_names = [e2n[ensembl_id] for ensembl_id in usa.var_names]
usa.var_names = usa_gene_names
usa.var_names_make_unique()

In [12]:
# Bring usa into the same cell (barcode) and gene order as the salmon object.
# NOTE(review): selection is by label, so it assumes salmon.var_names and
# salmon.obs_names are unique -- a repeated label would select the same usa
# row/column more than once.
cell_order = salmon.obs_names
gene_order = salmon.var_names
usa = usa[cell_order, gene_order]

In [13]:
# copy the spliced, unspliced and ambiguous count layers from usa into salmon
for layer_name in ("spliced", "unspliced", "ambiguous"):
    salmon.layers[layer_name] = usa.layers[layer_name]

In [14]:
# write the merged, unprocessed salmon object (including the U/S/A layers) to disk
salmon.write_h5ad(filename="../data/mathew_salmon_raw.h5ad")