# CellBender

In [None]:
# Import dependencies
import datetime
import os

# Print the date and time at which this cell was run.
e = datetime.datetime.now()
print("Current date and time = %s" % e)

# Set the working directory; relative paths used later in the notebook
# resolve against it.
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/files/Teichmann Group/"
os.chdir(wdir)

# Make sure the results folder exists. exist_ok=True makes this a no-op when
# the folder is already there, without a separate (race-prone) existence check.
RESULTS_FOLDERNAME = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/foetal/results/SingleCellQC"
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)

In [None]:
ls

In [None]:
import os
import subprocess
import pandas as pd

def _read_expected_cells(metrics_summary_path):
    """Return the estimated cell count from a metrics summary CSV, or None.

    Reads the first row of the 'Estimated Number of Cells' column and converts
    it to an int. The value is normalised through ``str`` first, so it works
    both when pandas parsed the column as text with thousands separators
    (e.g. "5,432") and when it parsed it as a number.

    Returns None when the file is missing or unreadable, the column or its
    first row is absent, or the value is not a finite number.
    """
    try:
        metrics_df = pd.read_csv(metrics_summary_path)
        raw_value = metrics_df['Estimated Number of Cells'].iloc[0]
        return int(float(str(raw_value).replace(',', '')))
    except (OSError, KeyError, IndexError, ValueError, OverflowError):
        return None


def run_cellbender(working_dir):
    """Run ``cellbender remove-background`` for every sample folder.

    A sample folder is any directory directly under `working_dir` whose name
    starts with 'cellranger700'. For each one the command reads
    'raw_feature_bc_matrix.h5', writes 'cellbenderout.h5', and its stdout is
    redirected to 'cellbender_running.log' inside the same folder. The
    expected number of cells is taken from the folder's 'metrics_summary.csv'.

    Folders are skipped (with a printed message) when they already contain a
    file whose name starts with 'cellbender', or when no estimated cell count
    can be read from the metrics summary.

    Args:
        working_dir: Directory containing the per-sample folders.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    for folder in os.listdir(working_dir):
        if not folder.startswith('cellranger700'):
            continue

        folder_path = os.path.join(working_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray file with a matching prefix is not a sample folder.
            continue

        print(f'PROCESSING FOLDER: {folder}')

        # Check if output files already exist in the folder.
        # NOTE: the log file written below also starts with 'cellbender', so a
        # folder holding only the log of an earlier failed run is skipped too.
        if any(name.startswith('cellbender') for name in os.listdir(folder_path)):
            print(f"Output files already exist in folder {folder}. Skipping...")
            continue

        input_path = os.path.join(folder_path, 'raw_feature_bc_matrix.h5')
        output_path = os.path.join(folder_path, 'cellbenderout.h5')
        metrics_summary_path = os.path.join(folder_path, 'metrics_summary.csv')
        log_file = os.path.join(folder_path, 'cellbender_running.log')

        # Extract the estimated number of cells from metrics_summary.csv
        expected_cells = _read_expected_cells(metrics_summary_path)
        if expected_cells is None:
            print(f"No estimated number of cells found for {folder}. Skipping...")
            continue

        command = [
            'cellbender', 'remove-background',
            '--input', input_path,
            '--output', output_path,
            '--expected-cells', str(expected_cells),
            '--total-droplets-included', '30000',  # ran with 15000 for all but one sample
            '--fpr', '0.01',
            '--epochs', '150',
            '--cuda'
        ]
        print(f'Running {command}')

        # stdout goes to the log file; stderr is captured so it can be printed
        # if the command exits with a non-zero status.
        with open(log_file, 'w') as log:
            try:
                subprocess.run(command, check=True, stdout=log,
                               stderr=subprocess.PIPE, encoding='utf-8')
            except subprocess.CalledProcessError as e:
                print(f'Error occurred: {e.returncode}\n{e.stderr}')
            else:
                print('Command executed successfully')


In [None]:
run_cellbender(wdir)

# Change to Pyscenic_env

In [None]:
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
#import scvelo as scv

# Import dependencies
import datetime
import os

# Print the date and time at which this cell was run.
e = datetime.datetime.now()
print("Current date and time = %s" % e)

# Set the working directory; relative paths used later in the notebook
# resolve against it.
wdir = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/files/Teichmann Group/"
os.chdir(wdir)

# Make sure the results folder exists. exist_ok=True makes this a no-op when
# the folder is already there, without a separate (race-prone) existence check.
RESULTS_FOLDERNAME = "/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/notebooks/foetal/results/SingleCellQC"
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)

In [None]:
import tables
import scipy.sparse as sp
import anndata
from typing import Dict, Optional


def anndata_from_h5(file: str,
                    analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Load an output h5 file into an AnnData object for downstream work.

    Args:
        file: The h5 file
        analyzed_barcodes_only: False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count matrix.
            True to load a limited set of barcodes: only those analyzed by the
            algorithm. This allows relevant latent variables to be loaded
            properly into adata.obs and adata.obsm, rather than adata.uns.

    Returns:
        adata: The anndata object, populated with inferred latent variables
            and metadata.

    """

    d = dict_from_h5(file)

    # The matrix is rebuilt from its CSC components and transposed, so that
    # the rows of X correspond to barcodes and the columns to genes.
    X = sp.csc_matrix((d.pop('data'), d.pop('indices'), d.pop('indptr')),
                      shape=d.pop('shape')).transpose().tocsr()

    # check and see if we have barcode index annotations, and if the file is filtered
    barcode_key = [k for k in d.keys() if (('barcode' in k) and ('ind' in k))]
    if len(barcode_key) > 0:
        # An index that does not fit into X means the indices refer to a larger
        # matrix than the one stored here, i.e. this file was already subset.
        max_barcode_ind = d[barcode_key[0]].max()
        filtered_file = (max_barcode_ind >= X.shape[0])
    else:
        filtered_file = True

    if analyzed_barcodes_only:
        if filtered_file:
            # filtered file being read, so we don't need to subset
            print('Assuming we are loading a "filtered" file that contains only cells.')
        elif 'barcode_indices_for_latents' in d.keys():
            X = X[d['barcode_indices_for_latents'], :]
            d['barcodes'] = d['barcodes'][d['barcode_indices_for_latents']]
        elif 'barcodes_analyzed_inds' in d.keys():
            X = X[d['barcodes_analyzed_inds'], :]
            d['barcodes'] = d['barcodes'][d['barcodes_analyzed_inds']]
        else:
            print('Warning: analyzed_barcodes_only=True, but the key '
                  '"barcodes_analyzed_inds" or "barcode_indices_for_latents" '
                  'is missing from the h5 file. '
                  'Will output all barcodes, and proceed as if '
                  'analyzed_barcodes_only=False')

    # Construct the anndata object.
    adata = anndata.AnnData(X=X,
                            obs={'barcode': d.pop('barcodes').astype(str)},
                            var={'gene_name': (d.pop('gene_names') if 'gene_names' in d.keys()
                                               else d.pop('name')).astype(str)},
                            dtype=X.dtype)
    adata.obs.set_index('barcode', inplace=True)
    adata.var.set_index('gene_name', inplace=True)

    # For CellRanger v2 legacy format, "gene_ids" was called "genes"... rename this
    if 'genes' in d.keys():
        d['id'] = d.pop('genes')

    # For purely aesthetic purposes, rename "id" to "gene_id"
    if 'id' in d.keys():
        d['gene_id'] = d.pop('id')

    # If genomes are empty, try to guess them based on gene_id
    if 'genome' in d.keys():
        if np.array([s.decode() == '' for s in d['genome']]).all():
            if '_' in d['gene_id'][0].decode():
                print('Genome field blank, so attempting to guess genomes based on gene_id prefixes')
                d['genome'] = np.array([s.decode().split('_')[0] for s in d['gene_id']], dtype=str)

    # Add other information to the anndata object in the appropriate slot.
    _fill_adata_slots_automatically(adata, d)

    # Add a special additional field to .var if it exists.
    # np.isin tests all column positions in one vectorized call instead of
    # scanning the index array once per gene.
    if 'features_analyzed_inds' in adata.uns.keys():
        adata.var['cellbender_analyzed'] = np.isin(np.arange(adata.shape[1]),
                                                   adata.uns['features_analyzed_inds'])

    if analyzed_barcodes_only:
        # The barcode index columns are positions into the original matrix and
        # are dropped from .obs once the subset has been taken.
        index_columns = adata.obs.columns[adata.obs.columns.str.startswith('barcodes_analyzed')
                                          | adata.obs.columns.str.startswith('barcode_indices')]
        for col in index_columns:
            try:
                del adata.obs[col]
            except KeyError:
                # Column already gone (e.g. a duplicated column name).
                pass
    else:
        # Add a special additional field to .obs if all barcodes are included.
        if 'barcodes_analyzed_inds' in adata.uns.keys():
            adata.obs['cellbender_analyzed'] = np.isin(np.arange(adata.shape[0]),
                                                       adata.uns['barcodes_analyzed_inds'])

    return adata

def dict_from_h5(file: str) -> Dict[str, np.ndarray]:
    """Return the contents of every Array node in an h5 file, keyed by node name.

    The whole tree is walked from the root; if two nodes share a name, the one
    visited last wins.
    """
    with tables.open_file(file) as h5:
        return {node.name: node.read() for node in h5.walk_nodes("/", "Array")}


def _fill_adata_slots_automatically(adata, d):
    """Add other information to the adata object in the appropriate slot."""

    for key, value in d.items():
        try:
            if value is None:
                continue
            value = np.asarray(value)
            if len(value.shape) == 0:
                adata.uns[key] = value
            elif value.shape[0] == adata.shape[0]:
                if (len(value.shape) < 2) or (value.shape[1] < 2):
                    adata.obs[key] = value
                else:
                    adata.obsm[key] = value
            elif value.shape[0] == adata.shape[1]:
                if value.dtype.name.startswith('bytes'):
                    adata.var[key] = value.astype(str)
                else:
                    adata.var[key] = value
            else:
                adata.uns[key] = value
        except Exception:
            print('Unable to load data into AnnData: ', key, value, type(value))

In [None]:
# Load the filtered CellBender output of every sample folder into an AnnData
# object, keyed by folder name.
adata_dict = {}
for folder in os.listdir(wdir):
    if not folder.startswith('cellranger700'):
        continue
    print(f'PROCESSING FOLDER: {folder}')
    # Relative path: it resolves against the current working directory.
    h5_filepath = os.path.join(folder, 'cellbenderout_filtered.h5')
    adata_dict[folder] = anndata_from_h5(file=h5_filepath, analyzed_barcodes_only=True)

# Display the loaded objects.
adata_dict

In [None]:
# Report the number of observations held by each loaded sample.
for key, sample_adata in adata_dict.items():
    print(f'{key}: {sample_adata.n_obs} cells')

In [None]:
# Save each sample next to its CellBender output (the key is the folder name,
# so the path is relative to the current working directory).
for key, sample_adata in adata_dict.items():
    h5ad_path = os.path.join(key, 'cellbenderout_filtered_adata.h5ad')
    sample_adata.write(h5ad_path)
    print(f'{key} saved')

# CHECK

In [None]:
# Read the saved .h5ad files back in, keyed by folder name, so they can be
# compared with the objects they were written from.
h5ad_dict = {}
for folder in os.listdir(wdir):
    if not folder.startswith('cellranger700'):
        continue
    # sample-specific path to the .h5ad file (relative to the working directory)
    h5ad_filepath = os.path.join(folder, 'cellbenderout_filtered_adata.h5ad')
    h5ad_dict[folder] = sc.read_h5ad(h5ad_filepath)
h5ad_dict

In [None]:
# Print the observation counts of the in-memory and the re-read object side by side.
for key, reloaded in h5ad_dict.items():
    print(f'{key}: {adata_dict[key].n_obs} .h5 file cells')
    print(f'{key}: {reloaded.n_obs} .h5ad file cells')

In [None]:
# One line per sample: 'Yass' when the re-read object has the same number of
# observations as the in-memory one, 'Oh no' otherwise.
for key, reloaded in h5ad_dict.items():
    counts_match = reloaded.n_obs == adata_dict[key].n_obs
    print('Yass' if counts_match else 'Oh no')

In [None]:
del adata_dict

# Transferring files

In [None]:
# Folder holding the downloaded velocyto output (one sub-folder per sample).
velocyto_path = '/home/akurjan/Downloads/Velocyto-20230612T135134Z-001/Velocyto'
# Folder holding the per-sample analysis folders the velocyto files are moved into.
main_path = '/mnt/da8aa2c4-0136-465b-87a2-d12a59afec55/akurjan/analysis/files/Teichmann Group'

In [None]:
import re

# Folder names of the form cellranger700_count_<digits>_<name>_<suffix> are
# renamed to just <name>; entries that do not match are left alone.
sample_folder_pattern = re.compile(r"cellranger700_count_\d+_(.*)_[A-Za-z0-9-]+")

for folder in os.listdir(main_path):
    match = sample_folder_pattern.match(folder)
    if not match:
        continue
    new_name = match.group(1)
    os.rename(os.path.join(main_path, folder), os.path.join(main_path, new_name))

In [None]:
# List the entries of main_path, one per line, to inspect the folder names.
for folder in os.listdir(main_path):
    print(folder)

In [None]:
# Drop the "_velocyto" suffix (or its misspelling "_velocyo") from folder names
# so that each folder is named after its sample ID.
# The suffix is sliced off explicitly: str.rstrip() takes a *set of characters*
# and would also eat trailing characters of the sample ID itself.
for folder in os.listdir(velocyto_path):
    for suffix in ("_velocyto", "_velocyo"):
        if not folder.endswith(suffix):
            continue
        sample_id = folder[:-len(suffix)]
        if sample_id:
            # An empty sample ID would make the target the parent folder itself.
            os.rename(os.path.join(velocyto_path, folder), os.path.join(velocyto_path, sample_id))
        break

In [None]:
# List the entries of velocyto_path, one per line, to inspect the folder names.
for folder in os.listdir(velocyto_path):
    print(folder)

In [None]:
# Names of the sub-directories (plain files are ignored) directly under each root.
velocyto_folders = [
    entry for entry in os.listdir(velocyto_path)
    if os.path.isdir(os.path.join(velocyto_path, entry))
]
main_folders = [
    entry for entry in os.listdir(main_path)
    if os.path.isdir(os.path.join(main_path, entry))
]

# One-directional check: velocyto folders without a same-named folder in
# main_path. Folders that exist only in main_path are not reported.
discrepancies = set(velocyto_folders).difference(main_folders)

if not discrepancies:
    print("All folder names in velocyto_path have a matching folder in main_path.")
else:
    print("Folder names in velocyto_path that do not have a matching folder in main_path:")
    for folder in discrepancies:
        print(folder)

In [None]:
# Give every sample folder in main_path a 'velocyto' subfolder.
for folder in os.listdir(main_path):
    folder_path = os.path.join(main_path, folder)
    if not os.path.isdir(folder_path):
        # Plain files cannot contain a subfolder; os.makedirs would raise here.
        continue

    velocyto_folder_path = os.path.join(folder_path, 'velocyto')

    # Create the 'velocyto' subfolder if it doesn't exist
    os.makedirs(velocyto_folder_path, exist_ok=True)

    print(f"Created 'velocyto' subfolder in '{folder}'")

In [None]:
import os
import shutil

def transfer_files(velocyto_path, main_path):
    """Move the contents of each velocyto folder into the matching main folder.

    For every entry ``name`` of `velocyto_path`, everything inside
    ``velocyto_path/name`` is moved to ``main_path/name/velocyto``. Entries for
    which ``main_path/name`` or its 'velocyto' subfolder is not an existing
    directory are reported and left untouched. Each move is printed.
    """
    for sample in os.listdir(velocyto_path):
        velocyto_folder = os.path.join(velocyto_path, sample)
        main_folder = os.path.join(main_path, sample)
        velocyto_subfolder = os.path.join(main_folder, 'velocyto')

        target_ready = os.path.isdir(main_folder) and os.path.isdir(velocyto_subfolder)
        if not target_ready:
            print(f"Matching folder '{main_folder}' or 'velocyto' subfolder does not exist")
            continue

        # The listing is taken once, before anything is moved out of the folder.
        for file in os.listdir(velocyto_folder):
            shutil.move(os.path.join(velocyto_folder, file),
                        os.path.join(velocyto_subfolder, file))
            print(f"Moved '{file}' from '{velocyto_folder}' to '{velocyto_subfolder}'")


In [None]:
transfer_files(velocyto_path, main_path)

# Merging CellBender and Velocyto outputs

In [None]:
# Import dependencies
import datetime
import os

# Print the date and time at which this cell was run.
e = datetime.datetime.now()
print("Current date and time = %s" % e)

# Set the working directory (the folder that holds the per-sample folders).
wdir = "/ceph/project/tendonhca/akurjan/files/Teichmann Group/"
os.chdir(wdir)

# Make sure the results folder exists. exist_ok=True makes this a no-op when
# the folder is already there, without a separate (race-prone) existence check.
RESULTS_FOLDERNAME = "/ceph/project/tendonhca/akurjan/analysis/foetal/results/SingleCellQC/"
os.makedirs(RESULTS_FOLDERNAME, exist_ok=True)

In [None]:
import scanpy as sc

# Read the CellBender .h5ad file of every directory under wdir, keyed by
# directory name. Plain files in wdir are ignored.
h5ad_dict = {}
for folder in os.listdir(wdir):
    folder_path = os.path.join(wdir, folder)
    if not os.path.isdir(folder_path):
        continue
    # sample-specific path to the .h5ad file
    h5ad_filepath = os.path.join(wdir, folder, 'cellbenderout_filtered_adata.h5ad')
    h5ad_dict[folder] = sc.read_h5ad(h5ad_filepath)
h5ad_dict

In [None]:
def load_loom_files(adata_dict, data_dir):
    """
    Load the velocyto loom file of every sample in `adata_dict`.

    For each key of `adata_dict` the loom file is looked up at
    ``<data_dir>/<key>/velocyto/<key>.loom``, i.e. the loom file must carry the
    same name as its sample directory:

    data_dir/
        <sample_a>/
            velocyto/
                <sample_a>.loom
        <sample_b>/
            velocyto/
                <sample_b>.loom
        ...

    Samples whose loom file does not exist are reported via ``print`` and left
    out of the result. The function uses the module-level name `scv`, which
    must have been imported before the function is called.

    Parameters
    ----------
    adata_dict : dict
        Dictionary keyed by sample directory name. Only the keys are used.
    data_dir : str
        Path to the directory containing one sub-directory per sample.

    Returns
    -------
    ldata_dict : dict
        Dictionary mapping each sample directory name to the object returned
        by ``scv.read`` for its loom file, in the iteration order of
        `adata_dict` (samples without a loom file are omitted).
    """

    # create a dictionary to store the loaded loom files
    ldata_dict = {}

    for sample_dirname in adata_dict:
        # construct the path to the loom file
        loom_filepath = os.path.join(data_dir, sample_dirname, 'velocyto', f'{sample_dirname}.loom')

        # check if the loom file exists before attempting to load it
        if not os.path.exists(loom_filepath):
            print(f"Loom file not found for {loom_filepath}, skipping...")
            continue

        # load the loom file and store it under the sample's directory name
        ldata_dict[sample_dirname] = scv.read(loom_filepath, cache=True)

    return ldata_dict

In [None]:
import scvelo as scv 

ldata_dict = load_loom_files(h5ad_dict, wdir)
ldata_dict

In [None]:
# Every CellBender sample must have a loaded loom counterpart (and vice versa)
# before the two are combined; load_loom_files omits samples without a loom file.
assert h5ad_dict.keys() == ldata_dict.keys()

In [None]:
# Print the observation counts of the CellBender and velocyto objects per sample.
for key, cellbender_adata in h5ad_dict.items():
    print(f'{key}: {cellbender_adata.n_obs} cellbender cells')
    print(f'{key}: {ldata_dict[key].n_obs} velocyto cells')

In [None]:
ldata_dict['WSSS_THYst9384958'].obs

In [None]:
# Re-key the velocyto observations as "<barcode>.<sample>".
for sample_type, ldata in ldata_dict.items():
    # The barcode is the second ':'-separated field of the observation name,
    # with every 'x' character removed.
    barcode = ldata.obs.index.str.split(':').str[1].str.replace('x', '')
    ldata.obs['barcode'] = barcode
    ldata.obs['samplename'] = sample_type
    ldata.obs.index = barcode + '.' + sample_type

# Inspect one sample to check the new index.
ldata_dict['WSSS_THYst9384958'].obs

In [None]:
h5ad_dict['WSSS_THYst9384958'].obs

In [None]:
# Re-key the CellBender observations as "<barcode>.<sample>".
for sample_type, cellbender_adata in h5ad_dict.items():
    # The barcode is everything before the first '-' of the observation name.
    barcode = cellbender_adata.obs.index.str.split('-').str[0]
    cellbender_adata.obs['barcode'] = barcode
    cellbender_adata.obs['samplename'] = sample_type
    cellbender_adata.obs.index = barcode + '.' + sample_type

# Inspect one sample to check the new index.
h5ad_dict['WSSS_THYst9384958'].obs

In [None]:
# Count, per sample, the observation names present in both objects.
for key in h5ad_dict:
    cellbender_names = set(h5ad_dict[key].obs_names)
    velocyto_names = set(ldata_dict[key].obs_names)
    overlapping_barcodes = cellbender_names & velocyto_names
    print(f"Number of velocyto and cellbender overlapping barcodes for {key}: {len(overlapping_barcodes)}")

In [None]:
ldata_dict['WSSS_THYst9384958'].var

In [None]:
h5ad_dict['WSSS_THYst9384958'].var

In [None]:
# Re-index the variables of both objects by identifier instead of by name, so
# that the two objects of a sample share the same kind of var index.
for keys, ldata in ldata_dict.items():
    cellbender_adata = h5ad_dict[keys]

    # velocyto object: keep the old index in 'Gene', index by 'Accession'
    ldata.var['Gene'] = ldata.var.index
    ldata.var.index = ldata.var['Accession']

    # CellBender object: keep the old index in 'gene_name', index by 'gene_id'
    cellbender_adata.var['gene_name'] = cellbender_adata.var.index
    cellbender_adata.var.index = cellbender_adata.var['gene_id']

    ldata.var_names_make_unique()
    cellbender_adata.var_names_make_unique()

# Inspect one sample to check the new index.
h5ad_dict['WSSS_THYst9384958'].var

In [None]:
# Combine each sample's CellBender object with its velocyto object.
merged_dict = {}
for key, ldata in ldata_dict.items():
    mdata = scv.utils.merge(h5ad_dict[key], ldata)
    merged_dict[key] = mdata

# Display the merged objects.
merged_dict

# Metadata addition

In [None]:
del h5ad_dict, ldata_dict

In [None]:
import pandas as pd

# Read the sample metadata table from the working directory; the string 'NaN'
# is treated as a missing value.
metadata = pd.read_csv(os.path.join(wdir, '201015limb_samples_meta.csv'), na_values='NaN')
# Preview the first rows.
metadata.head()

In [None]:
# Keep only the first 27 rows of the table.
# NOTE(review): presumably the remaining rows are not sample records - confirm against the CSV.
metadata = metadata.iloc[:27]

In [None]:
metadata[['Sample ID', 'Sample stage', 'Norm. Sample Stage  ', 'Hospital ID', 'Gender', 'kit', 'Sequencing protocol ']]

In [None]:
# (.obs column to create, metadata column it is copied from). The metadata
# column names are kept verbatim, including their trailing spaces.
obs_metadata_columns = (
    ('sample_stage', 'Sample stage'),
    ('norm_sample_stage', 'Norm. Sample Stage  '),
    ('hospital_id', 'Hospital ID'),
    ('sex', 'Gender'),
    ('kit', 'kit'),
    ('seq_protocol', 'Sequencing protocol '),
)

for sample_name, anndata_obj in merged_dict.items():
    # Match the sample name against the 'Sample ID' column
    metadata_subset = metadata[metadata['Sample ID'] == sample_name]
    if metadata_subset.empty:
        # Report the gap instead of silently leaving this sample without the
        # metadata columns the other samples get.
        print(f"No metadata row found for sample '{sample_name}'; .obs left without sample metadata")
        continue

    # Use the first matching row; every observation of the sample receives
    # the same value in each new column.
    source_columns = [source for _, source in obs_metadata_columns]
    metadata_values = metadata_subset[source_columns].values[0]
    for (obs_column, _), value in zip(obs_metadata_columns, metadata_values):
        anndata_obj.obs[obs_column] = value

In [None]:
merged_dict['WSSS_THYst8796442'].obs.head()

# Saving Files

In [None]:
def savefiles(merged_dict):
    """Write every object in `merged_dict` to RESULTS_FOLDERNAME.

    Each value is written with its own ``write`` method to
    ``<RESULTS_FOLDERNAME>/<key>_unfiltered.h5ad``, and a confirmation line is
    printed per file.
    """
    for key, merged in merged_dict.items():
        filename = f"{key}_unfiltered.h5ad"
        merged.write(os.path.join(RESULTS_FOLDERNAME, filename))
        print(f"Saved file {filename} to {RESULTS_FOLDERNAME}.")

In [None]:
savefiles(merged_dict)