### 2023/1/23 Loading samples with RG

loading samples after background removal with CellBender

Cell Ranger v6.1.2

In [28]:
import os, sys, glob, re, math, pickle
import scprep, magic, phate
import numpy as np
import pandas as pd
from scipy import sparse as sp
import time,random,datetime
import scanpy as sc
import anndata
from typing import Dict, Optional
import tables
from bbknn import bbknn
%matplotlib inline
%load_ext memory_profiler



The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [2]:
# fps
dfp = '/home/cl2292/project/SBMA/data/'
pfp = '/home/cl2292/project/SBMA/result/'
pdfp = '/home/cl2292/project/SBMA/data/processed/'
sc.settings.figdir = pfp

In [3]:
# loader
#                 - 26wk WT = 3981, 3984, 3985
#                 - 26wk AR = 3980, 3982, 3983
data_folders = ['/home/cl2292/project/SBMA/26wk/AR3980',
                '/home/cl2292/project/SBMA/26wk/AR3982',
                '/home/cl2292/project/SBMA/26wk/AR3983',
                '/home/cl2292/project/SBMA/26wk/WT3981',
                '/home/cl2292/project/SBMA/26wk/WT3984',
                '/home/cl2292/project/SBMA/26wk/WT3985',
#                 - 52wk WT = 3820, 3825, 3829
#                 - 52wk AR = 3819, 3821, 3832               
                '/home/cl2292/project/SBMA/52wk/AR3819',
                '/home/cl2292/project/SBMA/52wk/AR3821',
                '/home/cl2292/project/SBMA/52wk/AR3832',
                '/home/cl2292/project/SBMA/52wk/WT3820',
                '/home/cl2292/project/SBMA/52wk/WT3825',
                '/home/cl2292/project/SBMA/52wk/WT3829',
               ]

files_not_found = []
for i in data_folders :
    if not os.path.exists(i) :
        files_not_found.append(i)
    if not files_not_found == [] :
        print('Folders not found...')
        for j in files_not_found :
            print(j)
        raise IOError('Change path to data')

total = time.time()


In [4]:
def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Load an output h5 file into an AnnData object for downstream work.

    Args:
        file: The h5 file
        analyzed_barcodes_only: False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.

    Returns:
        adata: The anndata object, populated with inferred latent variables
            and metadata.

    """

    d = dict_from_h5(file)
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    # check and see if we have barcode index annotations, and if the file is filtered
    barcode_key = [k for k in d.keys() if (('barcode' in k) and ('ind' in k))]
    if len(barcode_key) > 0:
        max_barcode_ind = d[barcode_key[0]].max()
        filtered_file = (max_barcode_ind >= X.shape[0])
    else:
        filtered_file = True

    if analyzed_barcodes_only:
        if filtered_file:
            # filtered file being read, so we don't need to subset
            print('Assuming we are loading a "filtered" file that contains only cells.')
            pass
        elif 'barcode_indices_for_latents' in d.keys():
            X = X[d['barcode_indices_for_latents'], :]
            d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']]
        elif 'barcodes_analyzed_inds' in d.keys():
            X = X[d['barcodes_analyzed_inds'], :]
            d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the anndata object.
    adata = anndata.AnnData(X=X,
                            obs={'barcode': d.pop('barcodes').astype(str)},
                            var={'gene_name': (d.pop('gene_names') if 'gene_names' in d.keys()
                                               else d.pop('name')).astype(str)},
                            dtype=X.dtype)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this
    if 'genes' in d.keys():
        d['id'] = d.pop('genes')

    # For purely aesthetic purposes, rename "id" to "gene_id"
    if 'id' in d.keys():
        d['gene_id'] = d.pop('id')

    # If genomes are empty, try to guess them based on gene_id
    if 'genome' in d.keys():
        if np.array([s.decode() == '' for s in d['genome']]).all():
            if '_' in d['gene_id'][0].decode():
                print('Genome field blank, so attempting to guess genomes based on gene_id prefixes')
                d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    # Add a special additional field to .var if it exists.
    if 'features_analyzed_inds' in adata.uns.keys():
        adata.var['cellbender_analyzed'] = [True if (i in adata.uns['features_analyzed_inds'])
                                            else False for i in range(adata.shape[1])]

    if analyzed_barcodes_only:
        for col in adata.obs.columns[adata.obs.columns.str.startswith('barcodes_analyzed')
                                     | adata.obs.columns.str.startswith('barcode_indices')]:
            try:
                del adata.obs[col]
            except Exception:
                pass
    else:
        # Add a special additional field to .obs if all barcodes are included.
        if 'barcodes_analyzed_inds' in adata.uns.keys():
            adata.obs['cellbender_analyzed'] = [True if (i in adata.uns['barcodes_analyzed_inds'])
                                                else False for i in range(adata.shape[0])]

    return adata


def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Read in everything from an h5 file and put into a dictionary."""
    d = {}
    with tables.open_file(file) as f:
        # read in everything
        for array in f.walk_nodes("/", "Array"):
            d[array.name] = array.read()
    return d


def _fill_adata_slots_automatically(adata, d):
    """Add other information to the adata object in the appropriate slot."""

    for key, value in d.items():
        try:
            if value is None:
                continue
            value = np.asarray(value)
            if len(value.shape) == 0:
                adata.uns[key] = value
            elif value.shape[0] == adata.shape[0]:
                if (len(value.shape) < 2) or (value.shape[1] < 2):
                    adata.obs[key] = value
                else:
                    adata.obsm[key] = value
            elif value.shape[0] == adata.shape[1]:
                if value.dtype.name.startswith('bytes'):
                    adata.var[key] = value.astype(str)
                else:
                    adata.var[key] = value
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))

In [5]:
# Load background-removed samples (CellBender)

running_cellcount=0
start = time.time()
adatas = {}
for i,folder in enumerate(data_folders) :
    sample_id = os.path.split(folder)[1][2:]
    print(sample_id)
    print('... storing %s into dict (%d/%d)' % (sample_id,i+1,len(data_folders)))
    adatas[sample_id] = anndata_from_h5(folder+'/cellbender_filtered.h5')
    running_cellcount+=adatas[sample_id].shape[0]
    print('...     read {} cells; total: {} in {:.2f}-s'.format(adatas[sample_id].shape[0],running_cellcount,time.time()-start))
batch_names = list(adatas.keys())

3980
... storing 3980 into dict (1/12)
Assuming we are loading a "filtered" file that contains only cells.
...     read 4145 cells; total: 4145 in 0.40-s
3982
... storing 3982 into dict (2/12)
Assuming we are loading a "filtered" file that contains only cells.
...     read 3956 cells; total: 8101 in 0.71-s
3983
... storing 3983 into dict (3/12)
Assuming we are loading a "filtered" file that contains only cells.
...     read 4610 cells; total: 12711 in 1.06-s
3981
... storing 3981 into dict (4/12)
Assuming we are loading a "filtered" file that contains only cells.
...     read 2680 cells; total: 15391 in 1.44-s
3984
... storing 3984 into dict (5/12)
Assuming we are loading a "filtered" file that contains only cells.
...     read 4905 cells; total: 20296 in 1.72-s
3985
... storing 3985 into dict (6/12)
Assuming we are loading a "filtered" file that contains only cells.
...     read 4566 cells; total: 24862 in 2.32-s
3819
... storing 3819 into dict (7/12)
Assuming we are loading a "filter

In [6]:
## Scrublet

for i in batch_names:
    sc.external.pp.scrublet(adatas[i])
    adatas[i].var_names_make_unique()
    adatas[i].obs_names_make_unique()
    
#merge dataset
print('\n... concatenating of {}-samples'.format(len(data_folders)))
adata = adatas[batch_names[0]].concatenate(adatas[batch_names[1]],adatas[batch_names[2]],
                                               adatas[batch_names[3]],adatas[batch_names[4]],
                                               adatas[batch_names[5]],adatas[batch_names[6]],
                                               adatas[batch_names[7]],adatas[batch_names[8]],
                                               adatas[batch_names[9]],adatas[batch_names[10]],
                                               adatas[batch_names[11]],
                                               batch_categories = batch_names)
print('Ncells=%d are doublets' % np.sum(adata.obs.predicted_doublet != False))
adata = adata[adata.obs.predicted_doublet == False, :]
adata

  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.50
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 7.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.3%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.50
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 4.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.6%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.53
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 4.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.8%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.30
Detected doublet rate = 1.0%
Estimated detectable doublet fraction = 15.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.2%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.53
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 4.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.52
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 3.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.9%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.62
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 1.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.9%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.63
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 1.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.4%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.64
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.8%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.62
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 5.8%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.63
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 1.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.8%


  utils.warn_names_duplicates("var")
  adata_sim = AnnData(scrub._E_sim)
  view_to_actual(adata)


Automatically set threshold at doublet score = 0.61
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 2.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.7%

... concatenating of 12-samples


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


Ncells=378 are doublets


View of AnnData object with n_obs × n_vars = 83093 × 32285
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'doublet_score', 'predicted_doublet', 'batch'
    var: 'feature_type', 'genome', 'gene_id', 'ambient_expression-3819', 'ambient_expression-3820', 'ambient_expression-3821', 'ambient_expression-3825', 'ambient_expression-3829', 'ambient_expression-3832', 'ambient_expression-3980', 'ambient_expression-3981', 'ambient_expression-3982', 'ambient_expression-3983', 'ambient_expression-3984', 'ambient_expression-3985'
    obsm: 'latent_gene_encoding'

In [7]:
# annotate metadata
WT = ['3981', '3984', '3985', #26wk
     '3820', '3825', '3829' #52wk
     ]
AR = ['3980', '3982', '3983', #26wk
     '3819', '3821', '3832' #52wk
     ]

wk26 = ['3981', '3984', '3985','3980', '3982', '3983']
wk52 = ['3820', '3825', '3829','3819', '3821', '3832']

genotype = []
for i in adata.obs['batch'] : # verbose loop for quality-assurance
    if i in WT :
        genotype.append('WT')
    elif i in AR :
        genotype.append('AR')   
    else :
        raise ValueError('Encountered unclassifiable genotype for sample {}'.format(i))
adata.obs['genotype']=genotype

timepoint = []
for i in adata.obs['batch'] :
    if i in wk26 :
        timepoint.append('26wk')
    elif i in wk52 :
        timepoint.append('52wk')
    else :
        raise ValueError('Encountered unclassifiable timepoint for animal {}'.format(i))
adata.obs['timepoint']=timepoint

print(adata)

  adata.obs['genotype']=genotype
  next(self.gen)


AnnData object with n_obs × n_vars = 83093 × 32285
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'doublet_score', 'predicted_doublet', 'batch', 'genotype', 'timepoint'
    var: 'feature_type', 'genome', 'gene_id', 'ambient_expression-3819', 'ambient_expression-3820', 'ambient_expression-3821', 'ambient_expression-3825', 'ambient_expression-3829', 'ambient_expression-3832', 'ambient_expression-3980', 'ambient_expression-3981', 'ambient_expression-3982', 'ambient_expression-3983', 'ambient_expression-3984', 'ambient_expression-3985'
    obsm: 'latent_gene_encoding'


In [9]:
# filter cells/genes, transform
adata.var['mt'] = adata.var_names.str.startswith('mt-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
print('Ncells=%d have >5percent mt expression' % np.sum(adata.obs['pct_counts_mt']>0.05))
print('Ncells=%d have <700 genes expressed' % np.sum(adata.obs['n_genes_by_counts']<700))
sc.pp.filter_cells(adata, min_genes=700)
sc.pp.filter_genes(adata, min_cells=3) # filtering cells gets rid of some genes of interest
adata = adata[adata.obs.pct_counts_mt <= 0.05, :]

Ncells=20249 have >5percent mt expression
Ncells=15040 have <700 genes expressed


In [27]:
sc.pl.dotplot(adata, 'Ar', groupby='louvain',)

  dot_ax.scatter(x, y, **kwds)


In [11]:
# normalization
sc.pp.normalize_total(adata)
sc.pp.sqrt(adata,chunked=True,chunk_size=10000)
adata.raw = adata

  view_to_actual(adata)


In [12]:
# calc embeddings for batch corrected
start = time.time()
print('starting embeddings...')
sc.tl.pca(adata,n_comps=100)
#sc.external.pp.bbknn(adata,batch_key='batch')
bbknn(adata,batch_key='batch') #pip install bbknn
#     sc.pp.neighbors(adata, n_neighbors=100, n_pcs=100)
sc.tl.louvain(adata,resolution=3) #pip install python-igraph #pip install louvain
sc.tl.umap(adata)

starting embeddings...


In [17]:
sc.pl.umap(adata, color = ['batch','genotype','timepoint'], size =1)

In [14]:
print(adata.obs['batch'].value_counts())
print(adata.obs['timepoint'].value_counts())

3821    8549
3825    7890
3819    7563
3832    7415
3820    7030
3829    6870
3983    2572
3980    2530
3981    2370
3984    2309
3985    2032
3982    1593
Name: batch, dtype: int64
52wk    45317
26wk    13406
Name: timepoint, dtype: int64


In [15]:
# save data objects
adata.write(os.path.join(pdfp,'230128_26&52wk.h5ad'))
print('saved @'+datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))

TypeError: Can't implicitly convert non-string objects to strings

Above error raised while writing key 'predicted_doublet' of <class 'h5py._hl.group.Group'> to /

In [21]:
sc.pl.umap(adata)

  cax = scatter(
