## Mathew: load unprocessed Cell Ranger and salmon data

In [None]:
import numpy as np
import pandas as pd
import os
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
from pyroe import load_fry
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib_venn import venn2
from venny4py.venny4py import *
import scvelo as scv
import re
import louvain
from skmisc.loess import loess
from skbio.diversity.alpha import simpson, gini_index
import warnings
import sys
import ensembl_rest
import time
from Ensembl_converter import EnsemblConverter

## Cell Ranger

### load expression matrices

In [None]:
# Read the per-sample metadata table. The first CSV column becomes the index,
# which is later looked up by sample folder name (meta.loc[folder]).
# Earlier location, kept for reference:
#   ../../Mathew/preprocessing_scripts/metadata.csv
metadata_csv = "~/data/Mathew/data/metadata.csv"
meta = pd.read_csv(metadata_csv, index_col=0)
meta

In [None]:
# Load one AnnData object per Cell Ranger sample folder and annotate it with
# the sample's metadata row and folder name.
adatas = []
path = "../../cellranger/mathew"

# Only the immediate children of `path` are sample folders. The previous
# os.walk() loop also descended into every subdirectory and would have built
# a wrong path (path + "/" + nested_name) for any nested folder whose name
# starts with "Sample". Sorting makes the sample order, and therefore the row
# order after concatenation, reproducible.
for folder in sorted(os.listdir(path)):
    sample_dir = os.path.join(path, folder)
    if not (folder.startswith("Sample") and os.path.isdir(sample_dir)):
        continue

    # get adata
    # NOTE(review): scanpy documents read_10x_mtx / read_10x_h5; confirm that
    # `sc.read_10x` is available in the installed version.
    adata = sc.read_10x(sample_dir)

    # add metadata: each value of the sample's metadata row is assigned to
    # every cell of this sample (raises KeyError if the folder is not in meta)
    for col_name, col_data in meta.loc[folder].items():
        adata.obs[col_name] = col_data

    # add ID
    adata.obs["id"] = folder

    # NOTE(review): barcodes are not made sample-specific here, so obs_names
    # may repeat across samples after concatenation -- confirm this is intended.

    # append to list
    adatas.append(adata)

In [None]:
# Stack the per-sample objects along the cell (obs) axis. The arguments spell
# out anndata's defaults: with an inner join only variables present in every
# sample are kept.
adata = ad.concat(adatas, axis=0, join="inner")
adata

In [None]:
# Write the merged, unprocessed Cell Ranger object to disk.
cellranger_h5ad = "../../data/Mathew/mathew_cellranger_raw.h5ad"
adata.write_h5ad(cellranger_h5ad)

## Salmon

### load expression matrices

In [None]:
# Load one AnnData object per salmon/alevin-fry sample folder
# (output_format="all") and annotate it with the sample's metadata row and
# folder name.
adatas = []
path = "../salmon/mathew"

# Only the immediate children of `path` are sample folders. The previous
# os.walk() loop also descended into every subdirectory and would have built
# a wrong path (path + "/" + nested_name + "/_quant_res") for any nested
# folder whose name starts with "SampleID". Sorting makes the sample order
# reproducible.
for folder in sorted(os.listdir(path)):
    sample_dir = os.path.join(path, folder)
    if not (folder.startswith("SampleID") and os.path.isdir(sample_dir)):
        continue

    # get adata
    adata = load_fry(os.path.join(sample_dir, "_quant_res"), output_format="all")

    # add metadata: each value of the sample's metadata row is assigned to
    # every cell of this sample (raises KeyError if the folder is not in meta)
    for col_name, col_data in meta.loc[folder].items():
        adata.obs[col_name] = col_data

    # add ID
    adata.obs["id"] = folder

    # rename obs names: suffix each barcode with the sample folder so that
    # barcodes stay distinguishable after concatenation
    adata.obs_names = [bc + "_" + folder for bc in adata.obs_names]

    # append to list
    adatas.append(adata)

In [None]:
# Stack the per-sample objects along the cell (obs) axis. The arguments spell
# out anndata's defaults: with an inner join only variables present in every
# sample are kept.
adata = ad.concat(adatas, axis=0, join="inner")
adata

In [None]:
# rename genes from Ensembl ID to gene name
e2n_path = "../../splici_references/mm10-2020-A_geneid_to_name.txt"

# Each non-blank line is expected to hold two whitespace-separated fields:
# <ensembl_id> <gene_name>. The context manager closes the file handle, which
# the previous open(...).readlines() call never did; blank lines are skipped
# so a trailing empty line cannot break the dict construction.
with open(e2n_path) as e2n_file:
    e2n = dict(line.split() for line in e2n_file if line.strip())

# raises KeyError if a var name is missing from the mapping file
adata.var_names = [e2n[e] for e in adata.var_names]

# Distinct Ensembl IDs may map to the same gene name. Make the names unique so
# that selecting genes by name is unambiguous; the spliced/unspliced/ambiguous
# object gets the same treatment before it is aligned to `adata` by name, and
# without this step duplicated names would select the same column repeatedly.
adata.var_names_make_unique()

### add spliced, unspliced and ambiguous counts

In [None]:
# load alevin-fry output files using output_format = "raw", one object per
# sample folder under `path` (set when the salmon matrices were loaded above)
adatas_usa = []

# Only the immediate children of `path` are sample folders. The previous
# os.walk() loop also descended into every subdirectory and would have built
# a wrong path for any nested folder whose name starts with "SampleID".
for folder in sorted(os.listdir(path)):
    sample_dir = os.path.join(path, folder)
    if not (folder.startswith("SampleID") and os.path.isdir(sample_dir)):
        continue

    # get adata
    # Deliberately NOT bound to the name `adata`: that name holds the merged,
    # gene-renamed salmon object, which the following cells use to reorder
    # `usa`, receive the layers and get saved. Rebinding it here replaced the
    # merged object with the last raw sample.
    sample_usa = load_fry(
        os.path.join(sample_dir, "_quant_res"), output_format="raw"
    )

    # add metadata: each value of the sample's metadata row is assigned to
    # every cell of this sample (raises KeyError if the folder is not in meta)
    for col_name, col_data in meta.loc[folder].items():
        sample_usa.obs[col_name] = col_data

    # add ID
    sample_usa.obs["id"] = folder

    # rename obs names: same "<barcode>_<folder>" scheme as the merged object
    # so both can be aligned by obs name later
    sample_usa.obs_names = [bc + "_" + folder for bc in sample_usa.obs_names]

    # append to list
    adatas_usa.append(sample_usa)

In [None]:
# Stack the per-sample raw objects along the cell (obs) axis. The arguments
# spell out anndata's defaults: with an inner join only variables present in
# every sample are kept.
usa = ad.concat(adatas_usa, axis=0, join="inner")
usa

In [None]:
# Translate the Ensembl IDs in usa.var_names to gene names through the e2n
# lookup (KeyError on an unmapped ID), then de-duplicate the resulting names.
usa_gene_names = [e2n[ensembl_id] for ensembl_id in usa.var_names]
usa.var_names = usa_gene_names
usa.var_names_make_unique()

In [None]:
# Select cells and genes of `usa` by name, in the order used by `adata`, so
# that both objects are aligned row-by-row and column-by-column.
cell_order = adata.obs_names
gene_order = adata.var_names
usa = usa[cell_order, gene_order]

In [None]:
# copy the spliced, unspliced and ambiguous count layers from `usa` into `adata`
for layer_name in ("spliced", "unspliced", "ambiguous"):
    adata.layers[layer_name] = usa.layers[layer_name]

In [None]:
# Write the merged, unprocessed salmon object (including the added layers)
# to disk.
salmon_h5ad = "../../data/Mathew/mathew_salmon_raw.h5ad"
adata.write_h5ad(salmon_h5ad)