In [None]:
# Import dependencies
import os
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv

# Print date and time of this run (useful provenance when outputs are kept).
import datetime
e = datetime.datetime.now()
print(f"Current date and time = {e}")

# Set the working directory: all sample folders below are addressed relative to it.
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/files/CellRanger/"
os.chdir(wdir)

# Folder that receives the merged per-sample .h5ad files written at the end.
RESULTS_FOLDERNAME = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/foetal/results/QC"

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)

Example cellbender remove-background commands used (run in bash .sh files on the CCB cluster):

In [None]:
# Each command is one invocation; the trailing backslashes continue it across
# lines so that every option is passed to the same cellbender call.
cellbender remove-background \
    --input CellRanger_DEV16136_Ach_Jan2023/outs/raw_feature_bc_matrix.h5 \
    --output CellRanger_DEV16136_Ach_Jan2023/cellbenderout.h5 \
    --expected-cells 800 \
    --total-droplets-included 15000 \
    --fpr 0.01 \
    --epochs 150 \
    --cuda \
    --low-count-threshold 5

cellbender remove-background \
    --input CellRanger_DEV16127_Ach_Jan2023/outs/raw_feature_bc_matrix.h5 \
    --output CellRanger_DEV16127_Ach_Jan2023/cellbenderout.h5 \
    --expected-cells 8000 \
    --total-droplets-included 20000 \
    --fpr 0.01 \
    --epochs 150 \
    --cuda

In [None]:
import tables
import scipy.sparse as sp
import anndata
from typing import Dict, Optional


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Load an output h5 file into an AnnData object for downstream work.

    Args:
        file: The h5 file
        analyzed_barcodes_only: False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.

    Returns:
        adata: The anndata object, populated with inferred latent variables
            and metadata. Rows are barcodes (index "barcode"), columns are
            genes (index "gene_name").

    """

    d = dict_from_h5(file)
    # The h5 stores the matrix in CSC layout; transpose so that rows are barcodes.
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    # Check whether barcode index annotations exist and whether the file is
    # filtered: if the largest stored barcode index does not fit into X, the
    # indices cannot refer to rows of X, so X has already been subset.
    barcode_key = [k for k in d.keys() if (('barcode' in k) and ('ind' in k))]
    if len(barcode_key) > 0:
        max_barcode_ind = d[barcode_key[0]].max()
        filtered_file = (max_barcode_ind >= X.shape[0])
    else:
        filtered_file = True

    if analyzed_barcodes_only:
        if filtered_file:
            # filtered file being read, so we don't need to subset
            print('Assuming we are loading a "filtered" file that contains only cells.')
        elif 'barcode_indices_for_latents' in d.keys():
            X = X[d['barcode_indices_for_latents'], :]
            d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']]
        elif 'barcodes_analyzed_inds' in d.keys():
            X = X[d['barcodes_analyzed_inds'], :]
            d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the anndata object.
    adata = anndata.AnnData(X=X,
                            obs={'barcode': d.pop('barcodes').astype(str)},
                            var={'gene_name': (d.pop('gene_names') if 'gene_names' in d.keys()
                                               else d.pop('name')).astype(str)},
                            dtype=X.dtype)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this
    if 'genes' in d.keys():
        d['id'] = d.pop('genes')

    # For purely aesthetic purposes, rename "id" to "gene_id"
    if 'id' in d.keys():
        d['gene_id'] = d.pop('id')

    # If genomes are empty, try to guess them based on gene_id.
    # Guard against a missing or empty "gene_id" array before indexing into it.
    if 'genome' in d.keys():
        genome_is_blank = np.array([s.decode() == '' for s in d['genome']]).all()
        has_gene_ids = ('gene_id' in d) and (len(d['gene_id']) > 0)
        if genome_is_blank and has_gene_ids and ('_' in d['gene_id'][0].decode()):
            print('Genome field blank, so attempting to guess genomes based on gene_id prefixes')
            d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    # Add a special additional field to .var if it exists.
    # np.isin does the membership test in one vectorised pass instead of one
    # array scan per gene.
    if 'features_analyzed_inds' in adata.uns.keys():
        adata.var['cellbender_analyzed'] = np.isin(np.arange(adata.shape[1]),
                                                   adata.uns['features_analyzed_inds'])

    if analyzed_barcodes_only:
        # The barcode index bookkeeping columns are meaningless once the matrix
        # has been subset, so drop them (best effort, as before).
        index_cols = [col for col in adata.obs.columns
                      if str(col).startswith(('barcodes_analyzed', 'barcode_indices'))]
        adata.obs.drop(columns=index_cols, inplace=True, errors='ignore')
    else:
        # Add a special additional field to .obs if all barcodes are included.
        if 'barcodes_analyzed_inds' in adata.uns.keys():
            adata.obs['cellbender_analyzed'] = np.isin(np.arange(adata.shape[0]),
                                                       adata.uns['barcodes_analyzed_inds'])

    return adata

def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Return every Array node of an h5 file as a ``{node name: contents}`` dict.

    The whole hierarchy below "/" is walked; nodes are keyed by their bare
    name, so if two nodes share a name the one visited last is kept.
    """
    with tables.open_file(file) as h5:
        return {node.name: node.read() for node in h5.walk_nodes("/", "Array")}


def _fill_adata_slots_automatically(adata, d):
    """Add other information to the adata object in the appropriate slot."""

    for key, value in d.items():
        try:
            if value is None:
                continue
            value = np.asarray(value)
            if len(value.shape) == 0:
                adata.uns[key] = value
            elif value.shape[0] == adata.shape[0]:
                if (len(value.shape) < 2) or (value.shape[1] < 2):
                    adata.obs[key] = value
                else:
                    adata.obsm[key] = value
            elif value.shape[0] == adata.shape[1]:
                if value.dtype.name.startswith('bytes'):
                    adata.var[key] = value.astype(str)
                else:
                    adata.var[key] = value
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))

In [None]:
# CellRanger output folders (relative to the working directory), one per sample.
# Folder names follow the pattern CellRanger_<sample>_<type>_<date>.
sample_names = [
    'CellRanger_DEV15983_Ach_Sep2022',
    'CellRanger_DEV15984_Ach_Sep2022',
    'CellRanger_DEV15984_Quad_Sep2022',
    'CellRanger_DEV15985_Ach_Sep2022',
    'CellRanger_DEV15985_Quad_Sep2022',
    'CellRanger_DEV16127_Ach_Jan2023',
    'CellRanger_DEV16127_Quad_Jan2023',
    'CellRanger_DEV16134_Ach_Jan2023',
    'CellRanger_DEV16134_Quad_Jan2023',
    'CellRanger_DEV16135DEV16171_Ach_Jan2023',
    'CellRanger_DEV16135DEV16171_Quad_Jan2023',
    'CellRanger_DEV16136_Ach_Jan2023',
    'CellRanger_DEV16136_Quad_Jan2023',
    'CellRanger_DEV16569_Ach_Jan2023',
    'CellRanger_DEV16569_Quad_Jan2023',
    'CellRanger_OMB0785_Ach_Sep2022',
    'CellRanger_OMB1250_Quad_Sep2022',
    'CellRanger_OMB1266_Quad_Sep2022',
    'CellRanger_OMB1556_Ach_Sep2022',
]

In [None]:
# NOTE(review): this assignment replaces the `sample_names` list defined in the
# previous cell when the notebook is run top to bottom. Several later cells
# index 'CellRanger_OMB1556_Ach_Sep2022' / 'CellRanger_DEV16136_Ach_Jan2023',
# which are not in this list - run only the list that matches the batch being
# processed.
sample_names = [
    'CellRanger_OMB1248_Quad_Oct2021',
    'CellRanger_OMB0792_Quad_Oct2021',
    'CellRanger_OMB1691_Ach_Sep2023',
    'CellRanger_OMB1687_Ach_Sep2023',
]

In [None]:
# Read each sample's filtered CellBender output (<sample>/cellbenderout_filtered.h5)
# into an AnnData object, keyed by the sample folder name.
adata_dict = {
    sample_name: anndata_from_h5(
        file=os.path.join(sample_name, 'cellbenderout_filtered.h5'),
        analyzed_barcodes_only=True,
    )
    for sample_name in sample_names
}

adata_dict

In [None]:
# Report how many barcodes (cells) each loaded sample contains.
for key, sample_adata in adata_dict.items():
    print(f'{key}: {sample_adata.n_obs} cells')

In [None]:
# 10Xloaded_cells = {
# "DEV15983_Achilles tendon": 10000,
# "DEV15984_Achilles tendon": 10000,
# "DEV15984_Quadriceps tendon": 10000,
# "DEV15985_Achilles tendon": 10000,
# "DEV15985_Quadriceps tendon": 10000, - 11327 by cellranger estimate
# "DEV16127_Achilles tendon": 10000,
# "DEV16127_Quadriceps tendon":10000, - 12640 by cellranger estimate
# "DEV16134_Achilles tendon": 5000,
# "DEV16134_Quadriceps tendon":10000, 
# "DEV1613516171_Achilles tendon": 10000,
# "DEV1613516171_Quadriceps tendon": 10000,- 11519 by cellranger estimate
# "DEV16136_Achilles tendon":970,
# "DEV16136_Quadriceps tendon": 9000,
# "DEV16569_Achilles tendon":10000,
# "DEV16569_Quadriceps tendon": 10000,
# "OMB0785_Achilles tendon": 10000 - 1837 by cellranger estimate
# "OMB1250_Quadriceps tendon": 10000, - 7713 by cellranger estimate
# "OMB1266_Quadriceps tendon": 10000, - 2490 by cellranger estimate
# "OMB1556_Achilles tendon": 10000, - 13358 by cellranger estimate
# }

In [None]:
# Save every sample as .h5ad next to its CellBender output, inside the sample folder.
for key in adata_dict:
    h5ad_path = os.path.join(key, 'cellbenderout_filtered_adata.h5ad')
    adata_dict[key].write(h5ad_path)
    print(f'{key} saved')

# CHECK

In [None]:
# Read the freshly written .h5ad files back in (one AnnData per sample folder)
# so they can be compared against the objects built from the .h5 files.
h5ad_dict = {
    sample_name: sc.read_h5ad(os.path.join(sample_name, 'cellbenderout_filtered_adata.h5ad'))
    for sample_name in sample_names
}

h5ad_dict

In [None]:
# Show the cell count of the in-memory object and of the reloaded file side by side.
for key, reloaded in h5ad_dict.items():
    print(f'{key}: {adata_dict[key].n_obs} .h5 file cells')
    print(f'{key}: {reloaded.n_obs} .h5ad file cells')

In [None]:
# One line per sample: 'Yass' when the .h5ad round trip kept the cell count, 'Oh no' otherwise.
for key in h5ad_dict:
    counts_match = h5ad_dict[key].n_obs == adata_dict[key].n_obs
    print('Yass' if counts_match else 'Oh no')

In [None]:
# Drop the objects built from the .h5 files; the following cells work on h5ad_dict.
# NOTE: the comparison cells above reference adata_dict and cannot be re-run after this.
del adata_dict

# MERGING WITH VELOCITY DATA

In [None]:
# Inspect the per-cell metadata (.obs) of one sample.
# NOTE(review): this key only exists if the first `sample_names` list was the one loaded.
h5ad_dict['CellRanger_OMB1556_Ach_Sep2022'].obs 

In [None]:
# Donor-level metadata keyed by sample ID (the second "_"-separated field of the
# CellRanger folder name). Values are (age, agegroup, libbatch).
SAMPLE_METADATA = {
    'DEV16135DEV16171': ('12w', '12w', 'Jan2023'),
    'DEV16134': ('12w', '12w', 'Jan2023'),
    'DEV16136': ('12w', '12w', 'Jan2023'),
    'DEV16569': ('17w', '17w', 'Jan2023'),
    'DEV16127': ('17w', '17w', 'Jan2023'),
    'DEV15983': ('20w', '20w', 'April2022'),
    'DEV15984': ('20w', '20w', 'April2022'),
    'DEV15985': ('20w', '20w', 'Dec2021'),
    'OMB0785': ('74yr', 'Adult', 'Oct2021'),
    'OMB1250': ('45yr', 'Adult', 'Oct2021'),
    'OMB1266': ('25yr', 'Adult', 'Oct2021'),
    'OMB1556': ('51yr', 'Adult', 'July2022'),
}

for sample, adata in h5ad_dict.items():
    # Folder names look like CellRanger_<sample>_<type>_<date>.
    name_parts = sample.split('_')
    sample_id, tissue = name_parts[1], name_parts[2]
    sampletype = f'{sample_id}_{tissue}'

    adata.obs['full_sample_name'] = sample
    adata.obs['sample'] = sample_id
    adata.obs['type'] = tissue
    adata.obs['sampletype'] = sampletype

    # Keep the original barcode in a column. Only capture it while the index
    # still holds barcodes, so re-running this cell does not overwrite it with
    # the already rewritten CellID values.
    if adata.obs.index.name != 'CellID':
        adata.obs['barcode'] = adata.obs.index

    # New cell ID: barcode without the "-<suffix>" part, plus ".<sample>_<type>",
    # which makes IDs unique across samples.
    adata.obs.index = adata.obs['barcode'].str.split('-').str[0] + '.' + sampletype
    adata.obs.index.name = 'CellID'
    adata.obs['sequencing'] = '3v3, NovaSeq'

    if sample_id in SAMPLE_METADATA:
        age, agegroup, libbatch = SAMPLE_METADATA[sample_id]
        adata.obs['age'] = age
        adata.obs['agegroup'] = agegroup
        adata.obs['libbatch'] = libbatch
    else:
        # Previously such samples were skipped silently, leaving their .obs
        # without these columns; make that visible.
        print(f"Warning: no age/agegroup/libbatch metadata defined for sample "
              f"'{sample_id}' ({sample}); columns not added.")


h5ad_dict['CellRanger_OMB1556_Ach_Sep2022'].obs

In [None]:
# Inspect the .obs table of a second sample to confirm the added metadata columns.
h5ad_dict['CellRanger_DEV16136_Ach_Jan2023'].obs

In [None]:
def load_loom_files(adata_dict, data_dir):
    """
    Load the velocyto loom file (spliced/unspliced counts) of every sample in `adata_dict`.

    For each key `<sample>` of `adata_dict`, the file
    `data_dir/<sample>/velocyto/<sample>.loom` is read, i.e. the expected layout is:
    data_dir/
        CellRanger_DEV16127_Ach_Jan2023/
            velocyto/
                CellRanger_DEV16127_Ach_Jan2023.loom
        CellRanger_OMB1250_Quad_Sep2022/
            velocyto/
                CellRanger_OMB1250_Quad_Sep2022.loom
        ...

    Parameters
    ----------
    adata_dict : dict
        Dictionary of AnnData objects keyed by sample directory name. Only the
        keys are used.
    data_dir : str
        Path to the directory containing one sub-directory per sample.

    Returns
    -------
    ldata_dict : dict
        Loaded loom data keyed by sample directory name, in the iteration order
        of `adata_dict`. Samples whose loom file does not exist are reported
        with a printed message and left out of the result.
    """
    ldata_dict = {}

    for sample_dirname in adata_dict:
        loom_filepath = os.path.join(data_dir, sample_dirname, 'velocyto', f'{sample_dirname}.loom')

        # Missing loom file: report it and carry on with the next sample.
        if not os.path.exists(loom_filepath):
            print(f"Loom file not found for {loom_filepath}, skipping...")
            continue

        ldata_dict[sample_dirname] = scv.read(loom_filepath, cache=True)

    return ldata_dict

In [None]:
# Load the velocyto loom file of every sample in h5ad_dict from under the working directory.
ldata_dict = load_loom_files(h5ad_dict, wdir)
ldata_dict

In [None]:
# Compare per-sample cell counts between the CellBender object and the velocyto loom.
for key in h5ad_dict.keys():
    print(f'{key}: {h5ad_dict[key].n_obs} cellbender cells')
    if key in ldata_dict:
        print(f'{key}: {ldata_dict[key].n_obs} velocyto cells')
    else:
        # load_loom_files() leaves out samples whose loom file was not found.
        print(f'{key}: no velocyto loom loaded')

In [None]:
# Inspect the loom object's .obs for one sample to see its cell naming.
ldata_dict['CellRanger_OMB1556_Ach_Sep2022'].obs

In [None]:
# Rewrite the loom cell names to "<barcode>.<sample>_<type>" so they can be
# matched against the CellBender objects' CellID index.
# Presumably the loom names look like "<prefix>:<barcode>x" - the code keeps the
# part after ':' and removes every 'x' character from it.
for sample_type, ldata in ldata_dict.items():
    name_parts = sample_type.split('_')
    samplename = f'{name_parts[1]}_{name_parts[2]}'

    barcode = ldata.obs.index.str.split(':').str[1].str.replace('x', '')

    ldata.obs['barcode'] = barcode
    ldata.obs['sampletype'] = samplename
    ldata.obs.index = barcode + '.' + samplename

ldata_dict['CellRanger_OMB1556_Ach_Sep2022'].obs

In [None]:
# Count, per sample, how many cell IDs are present in both the CellBender
# object and the velocyto loom.
for key in h5ad_dict.keys():
    if key not in ldata_dict:
        # load_loom_files() leaves out samples whose loom file was not found.
        print(f"No velocyto loom loaded for {key}, skipping overlap check")
        continue
    cellbender_barcodes = set(h5ad_dict[key].obs_names)
    velocyto_barcodes = set(ldata_dict[key].obs_names)
    # Find the overlapping barcodes
    overlapping_barcodes = cellbender_barcodes & velocyto_barcodes
    print(f"Number of velocyto and cellbender overlapping barcodes for {key}: {len(overlapping_barcodes)}")

In [None]:
# Inspect the gene annotation table (.var) of one CellBender object.
h5ad_dict['CellRanger_OMB1556_Ach_Sep2022'].var

In [None]:
# Index genes by accession / gene ID in both objects, keeping the previous
# index (gene names) as a column.
# NOTE: not idempotent - a second run would copy the IDs into 'Gene'/'gene_name'.
for sample_key, ldata in ldata_dict.items():
    adata = h5ad_dict[sample_key]

    # loom object: old index -> 'Gene', new index <- 'Accession'
    ldata.var['Gene'] = ldata.var.index
    ldata.var.index = ldata.var['Accession']

    # CellBender object: old index -> 'gene_name', new index <- 'gene_id'
    adata.var['gene_name'] = adata.var.index
    adata.var.index = adata.var['gene_id']

    ldata.var_names_make_unique()
    adata.var_names_make_unique()

h5ad_dict['CellRanger_OMB1556_Ach_Sep2022'].var

In [None]:
# Inspect the loom object's .var after re-indexing by accession.
ldata_dict['CellRanger_OMB1556_Ach_Sep2022'].var

In [None]:
# Merge each sample's loom data (spliced/unspliced matrices) into its CellBender
# AnnData object; only samples with a loaded loom file are merged.
merged_dict = {
    key: scv.utils.merge(h5ad_dict[key], ldata)
    for key, ldata in ldata_dict.items()
}

merged_dict

In [None]:
def savefiles(merged_dict, results_dir=None):
    """Write every object in `merged_dict` to `<results_dir>/<sampletype>_unfiltered.h5ad`.

    Parameters
    ----------
    merged_dict : dict
        Objects to save, keyed by sample name. Each must have an
        `.obs['sampletype']` column and a `.write(path)` method.
    results_dir : str, optional
        Output folder. Defaults to the notebook-level `RESULTS_FOLDERNAME`.

    Notes
    -----
    The file name is taken from the first cell's `sampletype` value. Objects
    without any cells are skipped with a printed message.
    """
    if results_dir is None:
        results_dir = RESULTS_FOLDERNAME

    for key, mdata in merged_dict.items():
        sampletypes = mdata.obs['sampletype']
        if len(sampletypes) == 0:
            print(f"Skipping {key}: object contains no cells.")
            continue

        # .iloc[0] is an explicit positional lookup; plain [0] on a Series
        # indexed by cell IDs is deprecated label/position guessing in pandas.
        sample_name = sampletypes.iloc[0]
        filename = f"{sample_name}_unfiltered.h5ad"
        filepath = os.path.join(results_dir, filename)
        mdata.write(filepath)
        print(f"Saved file {filename} to {results_dir}.")

In [None]:
# Write every merged object to RESULTS_FOLDERNAME as "<sampletype>_unfiltered.h5ad".
savefiles(merged_dict)