# 0 Import Libraries

In [1]:
import os
import glob
import importlib
import pandas as pd
import numpy as np
import shutil
import matplotlib.pyplot as plt
import anndata as ad
import scanpy as sc
import doubletdetection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the path to the data directories
source_dir = "../../../data/pAML/scRNA-qc-and-filtering/source"
output_dir = "../../../data/pAML/scRNA-qc-and-filtering/out"

adata_dir = "../../../data/pAML/scRNA-qc-and-filtering/out/adata"
clean_adata_dir = "../../../data/pAML/scRNA-qc-and-filtering/out/clean_adata"

filtered_adata_dir = "../../../data/pAML/scRNA-qc-and-filtering/out/filtered_adata"

metadata_dir = "../../../data/pAML/scRNA-qc-and-filtering/metadata"

# Define projetc name
patient = "all" # write all if you want to process all patients

# 1 Data Preparation and Loading

## 1.1 Data correction

In [None]:
# I believe I don't need to do this anymore, but I will keep it here for now

# # Step 1: Duplicate the column in all *_genes.tsv files
# for processed_file in glob.glob(os.path.join(source_dir, '*_genes.tsv')):
#     df_processed = pd.read_csv(processed_file, sep='\t', header=None)
#     df_processed = pd.concat([df_processed, df_processed], axis=1)  # Duplicate column
#     df_processed.to_csv(processed_file, sep='\t', index=False, header=False)
#     print(f"Duplicated columns in: {processed_file}")

# # Step 2: Replace the first column with Gene IDs from the corresponding raw files
# for processed_file in glob.glob(os.path.join(source_dir, '*_genes.tsv')):
#     # Get the corresponding raw file name
#     raw_file = processed_file.replace('_processed_genes.tsv', '_raw_genes.tsv').replace('processed', 'raw')

#     # Read processed and raw files
#     df_processed = pd.read_csv(processed_file, sep='\t', header=None)
#     df_raw = pd.read_csv(raw_file, sep='\t', header=None)  # raw: [Gene_ID, Gene_Symbol]

#     # Map Gene Symbols to Gene IDs
#     gene_symbol_to_id = dict(zip(df_raw[1], df_raw[0]))

#     # Replace the first column in processed with the corresponding Gene IDs
#     df_processed[0] = df_processed[1].map(gene_symbol_to_id)

#     # Save the updated processed file
#     df_processed.to_csv(processed_file, sep='\t', index=False, header=False)
#     print(f"Updated with Gene IDs: {processed_file}")


Duplicated columns in: ../../../data/scRNA-analysis/source/GSM7494274_AML8_DX_processed_genes.tsv
Duplicated columns in: ../../../data/scRNA-analysis/source/GSM7494274_AML8_DX_raw_genes.tsv
Duplicated columns in: ../../../data/scRNA-analysis/source/GSM7494275_AML8_REL_raw_genes.tsv
Duplicated columns in: ../../../data/scRNA-analysis/source/GSM7494276_AML8_REM_raw_genes.tsv
Duplicated columns in: ../../../data/scRNA-analysis/source/GSM7494275_AML8_REL_processed_genes.tsv
Duplicated columns in: ../../../data/scRNA-analysis/source/GSM7494276_AML8_REM_processed_genes.tsv
Updated with Gene IDs: ../../../data/scRNA-analysis/source/GSM7494274_AML8_DX_processed_genes.tsv
Updated with Gene IDs: ../../../data/scRNA-analysis/source/GSM7494274_AML8_DX_raw_genes.tsv
Updated with Gene IDs: ../../../data/scRNA-analysis/source/GSM7494275_AML8_REL_raw_genes.tsv
Updated with Gene IDs: ../../../data/scRNA-analysis/source/GSM7494276_AML8_REM_raw_genes.tsv
Updated with Gene IDs: ../../../data/scRNA-analysi

## 1.2 Create AnnData files

In [4]:
# Create AnnData files

files = [f for f in os.listdir(source_dir) if f.endswith('_raw_matrix.mtx')]


for prefix in set([f.split('_raw')[0] + '_raw_' for f in files]):
    adata = sc.read_10x_mtx(source_dir, prefix=prefix)
    adata.write_h5ad(os.path.join(adata_dir, prefix + '.h5ad'))

KeyboardInterrupt: 

## 1.3 Cellbender - Remove ambient RNA

Cellbender needs to be run in the terminal... 

After that we can get the clean_adatas from the respective folder

In [3]:
if patient == "all":
    adatas = [f for f in os.listdir(clean_adata_dir) if f.endswith('filtered.h5')]
else:
    adatas = [f for f in os.listdir(clean_adata_dir) if f.endswith('filtered.h5') and patient in f]

In [4]:
adatas

['GSM7494276_AML8_REM_raw__denoised_filtered.h5',
 'GSM7494290_AML9_REL_raw__denoised_filtered.h5',
 'GSM7494293_AML10_REL_raw__denoised_filtered.h5',
 'GSM7494331_AML13_REM_raw__denoised_filtered.h5',
 'GSM7494263_AML2_DX_raw__denoised_filtered.h5',
 'GSM7494316_AML25_DX_raw__denoised_filtered.h5',
 'GSM7494278_AML20_REM_raw__denoised_filtered.h5',
 'GSM7494275_AML8_REL_raw__denoised_filtered.h5',
 'GSM7494294_AML10_REM_raw__denoised_filtered.h5',
 'GSM7494288_AML27_REM_raw__denoised_filtered.h5',
 'GSM7494312_AML28_REL_raw__denoised_filtered.h5',
 'GSM7494325_AML18_REL_raw__denoised_filtered.h5',
 'GSM7494303_AML21_DX_raw__denoised_filtered.h5',
 'GSM7494306_AML24_DX_raw__denoised_filtered.h5',
 'GSM7494297_AML11_REM_raw__denoised_filtered.h5',
 'GSM7494271_AML7_DX_raw__denoised_filtered.h5',
 'GSM7494268_AML15_REM_raw__denoised_filtered.h5',
 'GSM7494265_AML2_REM_raw__denoised_filtered.h5',
 'GSM7494319_AML26_DX_raw__denoised_filtered.h5',
 'GSM7494279_AML5_DX_raw__denoised_filtered

In [5]:
def load_it(filename):
    """ Load a single adata file and add metadata 
    Args:
        filename (str): Name of the adata file
        adata_dir (str): Directory containing adata files
        metadata_dir (str): Directory containing metadata files
    Returns:
        adata (AnnData): AnnData object
    """
    split = filename.split('_')
    patient = split[1]
    dx = split[2]
   

    adata = sc.read_10x_h5(os.path.join(clean_adata_dir, filename))
    adata.obs['patient'] = patient
    adata.obs['dx'] = dx
    adata.obs['sample'] = split[0] + '_' + patient + '_' + dx
    adata.obs['batch'] = patient + dx
    adata.obs.index = adata.obs.index + '___' + patient + '_' + dx

    return adata

In [6]:
adatas = [load_it(f) for f in adatas]

In [8]:
adatas[1]

AnnData object with n_obs × n_vars = 9275 × 33538
    obs: 'patient', 'dx', 'sample', 'batch'
    var: 'gene_ids', 'feature_types', 'genome'

## 1.4 Only use cells filtered by paper - QC from authors

In [9]:
def filter_by_metadata(adata):
    """
    Filters an AnnData object based on a metadata file that contains the list of valid cells.

    Steps:
    1. Identifies the correct metadata file based on the sample name.
    2. Loads the metadata.
    3. Modifies the metadata index to match the format in `adata.obs.index`.
    4. Filters `adata` to retain only cells present in the metadata.
    5. Adds metadata columns to `adata.obs`.
    6. Prints the number of removed cells.

    Returns:
        AnnData: Filtered AnnData object with metadata added.
    """

    # Get sample name from AnnData
    sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample

    # Find the corresponding metadata file
    metadata_file = [f for f in os.listdir(metadata_dir) if sample in f]
    if not metadata_file:
        raise FileNotFoundError(f"No metadata file found for sample: {sample}")
    
    metadata_file = os.path.join(metadata_dir, metadata_file[0])
    
    # Load metadata file (assuming it's CSV or TSV)
    if metadata_file.endswith(".csv"):
        metadata = pd.read_csv(metadata_file, index_col=0)
    elif metadata_file.endswith(".tsv"):
        metadata = pd.read_csv(metadata_file, sep="\t", index_col=0)
    else:
        raise ValueError("Metadata file must be .csv or .tsv format")
    
    # Print number of cells considered by the authors (metadata file rows)
    num_cells_in_metadata = metadata.shape[0]
    print(f"Number of cells considered by the authors: {num_cells_in_metadata}")

    # Ensure metadata index (cell barcodes) matches the format in adata.obs.index
    patient = adata.obs['patient'][0]
    dx = adata.obs['dx'][0]
    metadata.index = metadata.index.astype(str) + '___' + patient + '_' + dx

    # Get cell barcodes in the AnnData object
    adata_cells = adata.obs.index.astype(str)

    # Find intersection (cells that exist in both)
    valid_cells = metadata.index.intersection(adata_cells)

    # Print the number of removed cells
    num_removed = len(adata_cells) - len(valid_cells)
    print(f"Number of removed cells: {num_removed}")

    # Subset AnnData to only keep valid cells
    adata = adata[valid_cells].copy()

    # Merge metadata into `adata.obs`
    adata.obs = adata.obs.merge(metadata, left_index=True, right_index=True, how="left")

    return adata


In [10]:
adatas = [filter_by_metadata(a) for a in adatas]

  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 3030
Number of removed cells: 6147
Number of cells considered by the authors: 4400
Number of removed cells: 4875
Number of cells considered by the authors: 2580
Number of removed cells: 6200
Number of cells considered by the authors: 6577
Number of removed cells: 25983


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample


Number of cells considered by the authors: 4933
Number of removed cells: 4120
Number of cells considered by the authors: 6227
Number of removed cells: 3692
Number of cells considered by the authors: 1990
Number of removed cells: 9304
Number of cells considered by the authors: 5157
Number of removed cells: 4682


  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 2695
Number of removed cells: 45990
Number of cells considered by the authors: 6673
Number of removed cells: 2631
Number of cells considered by the authors: 6585
Number of removed cells: 2112


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 2815
Number of removed cells: 20136
Number of cells considered by the authors: 3983
Number of removed cells: 7224
Number of cells considered by the authors: 6992
Number of removed cells: 3005


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 5525
Number of removed cells: 4669
Number of cells considered by the authors: 36
Number of removed cells: 13052
Number of cells considered by the authors: 1808
Number of removed cells: 8561
Number of cells considered by the authors: 1840
Number of removed cells: 7660
Number of cells considered by the authors: 7915
Number of removed cells: 3246


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 5278
Number of removed cells: 5085
Number of cells considered by the authors: 9167
Number of removed cells: 2405


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 1181
Number of removed cells: 2229
Number of cells considered by the authors: 4410
Number of removed cells: 5629
Number of cells considered by the authors: 2757
Number of removed cells: 4539
Number of cells considered by the authors: 2532
Number of removed cells: 3809
Number of cells considered by the authors: 7836
Number of removed cells: 2903


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 4830
Number of removed cells: 9087
Number of cells considered by the authors: 8524
Number of removed cells: 1773


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 3010
Number of removed cells: 10222
Number of cells considered by the authors: 6895
Number of removed cells: 4467
Number of cells considered by the authors: 4425
Number of removed cells: 4648


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 7191
Number of removed cells: 4340


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 7239
Number of removed cells: 2304
Number of cells considered by the authors: 6838
Number of removed cells: 3024


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 2389
Number of removed cells: 4167
Number of cells considered by the authors: 5179
Number of removed cells: 775
Number of cells considered by the authors: 6340
Number of removed cells: 2381


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 5120
Number of removed cells: 2877
Number of cells considered by the authors: 2669
Number of removed cells: 6788
Number of cells considered by the authors: 1736
Number of removed cells: 1990


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 8365
Number of removed cells: 2438
Number of cells considered by the authors: 5358
Number of removed cells: 3929


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 6757
Number of removed cells: 6032
Number of cells considered by the authors: 5259
Number of removed cells: 2980


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 3706
Number of removed cells: 10156
Number of cells considered by the authors: 2217
Number of removed cells: 5122
Number of cells considered by the authors: 3591
Number of removed cells: 7403
Number of cells considered by the authors: 3278
Number of removed cells: 13362


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 1521
Number of removed cells: 10166
Number of cells considered by the authors: 3702
Number of removed cells: 16447
Number of cells considered by the authors: 3029
Number of removed cells: 5555
Number of cells considered by the authors: 8058
Number of removed cells: 5036


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 6388
Number of removed cells: 5080
Number of cells considered by the authors: 2381
Number of removed cells: 7213
Number of cells considered by the authors: 5564
Number of removed cells: 4684


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample


Number of cells considered by the authors: 5588
Number of removed cells: 10494
Number of cells considered by the authors: 5153
Number of removed cells: 3876


  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 4476
Number of removed cells: 8899
Number of cells considered by the authors: 7335
Number of removed cells: 2078


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 1413
Number of removed cells: 15616
Number of cells considered by the authors: 4376
Number of removed cells: 3883
Number of cells considered by the authors: 5279
Number of removed cells: 2903


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 4731
Number of removed cells: 15294


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 4008
Number of removed cells: 5464
Number of cells considered by the authors: 7615
Number of removed cells: 2544


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 4213
Number of removed cells: 11730
Number of cells considered by the authors: 7887
Number of removed cells: 1321


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 6327
Number of removed cells: 4144


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 4686
Number of removed cells: 3717


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 3936
Number of removed cells: 4767
Number of cells considered by the authors: 7368
Number of removed cells: 1347


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 6037
Number of removed cells: 992
Number of cells considered by the authors: 3972
Number of removed cells: 5767


  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]
  sample = adata.obs['sample'][0]  # Assumes all cells in adata belong to the same sample
  patient = adata.obs['patient'][0]
  dx = adata.obs['dx'][0]


Number of cells considered by the authors: 2567
Number of removed cells: 20173
Number of cells considered by the authors: 500
Number of removed cells: 7098


In [13]:
adatas[2]

AnnData object with n_obs × n_vars = 2580 × 33538
    obs: 'patient', 'dx', 'sample', 'batch', 'GEO_ID', 'Lambo_et_al_ID', 'Patient_Sample', 'Library_ID', 'Counts', 'Features', 'Mitochondria_percent', 'Classified_Celltype', 'Seurat_Cluster', 'Malignant', 'Patient_ID', 'Biopsy_Origin', 'Age_Months', 'Disease_free_days', 'Clinical_Blast_Percent', 'Expected_Driving_Aberration', 'Subgroup', 'Color_Subgroup', 'Known_CNVs', 'Treatment_Outcome', 'nCount_RNA', 'nFeature_RNA'
    var: 'gene_ids', 'feature_types', 'genome'

## 1.5 Remove doublets

In [14]:
# Remove doublets
clf = doubletdetection.BoostClassifier(
    n_iters=10,
    clustering_algorithm="louvain",
    standard_scaling=True,
    pseudocount=0.1,
    n_jobs=-1)

In [15]:
def remove_doublets(adata):
    doublets = clf.fit(adata.X).predict(p_thresh=1e-3, voter_thresh=0.5)
    doublet_score = clf.doublet_score()

    adata.obs["doublet"] = doublets
    adata.obs["doublet_score"] = doublet_score

    adata.uns['doublets_removed'] = adata.obs.doublet.sum()
    adata = adata[adata.obs.doublet == 0]

    return adata


In [16]:
adatas = [remove_doublets(adata) for adata in adatas]

100%|██████████| 10/10 [00:14<00:00,  1.48s/it]
100%|██████████| 10/10 [00:12<00:00,  1.27s/it]
100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
100%|██████████| 10/10 [00:51<00:00,  5.12s/it]
100%|██████████| 10/10 [00:17<00:00,  1.74s/it]
100%|██████████| 10/10 [00:02<00:00,  3.94it/s]
100%|██████████| 10/10 [00:06<00:00,  1.44it/s]
100%|██████████| 10/10 [00:15<00:00,  1.60s/it]
100%|██████████| 10/10 [00:07<00:00,  1.30it/s]
100%|██████████| 10/10 [00:31<00:00,  3.20s/it]
100%|██████████| 10/10 [00:33<00:00,  3.35s/it]
100%|██████████| 10/10 [00:08<00:00,  1.21it/s]
100%|██████████| 10/10 [00:11<00:00,  1.15s/it]
100%|██████████| 10/10 [00:34<00:00,  3.42s/it]
100%|██████████| 10/10 [00:16<00:00,  1.60s/it]
100%|██████████| 10/10 [00:01<00:00,  7.46it/s]
100%|██████████| 10/10 [00:05<00:00,  1.68it/s]
100%|██████████| 10/10 [00:05<00:00,  1.75it/s]
100%|██████████| 10/10 [00:38<00:00,  3.89s/it]
100%|██████████| 10/10 [00:16<00:00,  1.64s/it]
100%|██████████| 10/10 [00:42<00:00,  4.

In [17]:
for adata in adatas:
    adata.write_h5ad(os.path.join(filtered_adata_dir, adata.obs['sample'][0] + '_filtered.h5ad'))

  adata.write_h5ad(os.path.join(filtered_adata_dir, adata.obs['sample'][0] + '_filtered.h5ad'))
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  adata.write_h5ad(os.path.join(filtered_adata_dir, adata.obs['sample'][0] + '_filtered.h5ad'))
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  adata.write_h5ad(os.path.join(filtered_adata_dir, adata.obs['sample'][0] + '_filtered.h5ad'))
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = 

In [18]:
len(adatas)

75

## 1.6 Cocatenate AnnData files and store in disk

In [56]:
# Concatenate all AnnData objects
adata = ad.concat(adatas, join='outer')



In [57]:
adata.raw = adata

In [58]:
adata.write(os.path.join(output_dir, patient + "_analysis_data.h5ad"))  # Save as H5AD file

In [60]:
adata.obs_names

Index(['AAACCCACAGCTGTGC-1___AML8_REM', 'AAACCCATCTAAGAAG-1___AML8_REM',
       'AAACGAAGTACTGGGA-1___AML8_REM', 'AAACGCTAGCCTTCTC-1___AML8_REM',
       'AAACGCTAGGACTTCT-1___AML8_REM', 'AAACGCTAGTTCTACG-1___AML8_REM',
       'AAACGCTAGTTTCGGT-1___AML8_REM', 'AAACGCTCACAGCTTA-1___AML8_REM',
       'AAACGCTCACATTCGA-1___AML8_REM', 'AAACGCTCAGCTTTGA-1___AML8_REM',
       ...
       'TTTGGAGAGGTAAACT-1___AML8_DX', 'TTTGGTTCAAACAGGC-1___AML8_DX',
       'TTTGGTTCAGCATTGT-1___AML8_DX', 'TTTGGTTTCACCTTAT-1___AML8_DX',
       'TTTGTTGAGAGGGCGA-1___AML8_DX', 'TTTGTTGCACAGCCTG-1___AML8_DX',
       'TTTGTTGCACCAGCTG-1___AML8_DX', 'TTTGTTGCATCGCTAA-1___AML8_DX',
       'TTTGTTGCATGTTTGG-1___AML8_DX', 'TTTGTTGCATTCATCT-1___AML8_DX'],
      dtype='object', length=11583)

In [None]:
""" def load_it(filename, adata_dir, metadata_dir):
    """ Load a single adata file and add metadata 
    Args:
        filename (str): Name of the adata file
        adata_dir (str): Directory containing adata files
        metadata_dir (str): Directory containing metadata files
    Returns:
        adata (AnnData): AnnData object
    """
    split = filename.split('_')
    patient = split[1]
    dx = split[2]
   

    adata = sc.read_h5ad(os.path.join(adata_dir, filename))
    adata.obs['patient'] = patient
    adata.obs['dx'] = dx
    adata.obs['sample'] = split[0] + '_' + patient + '_' + dx
    adata.obs['batch'] = patient + dx

    # Only consider genes with more than 1 count
    sc.pp.filter_genes(adata, min_counts=1)

    #Add metadata
    metadata_file = split[0] + '_' + patient + '_' + dx + '_' + 'processed' + '_' + 'metadata' + '.tsv'
    metadata = pd.read_csv(metadata_dir + '/' + metadata_file, sep="\t")

    # Set the cell barcode as the index in metadata
    metadata = metadata.set_index("Cell_Barcode")

    # Keep only barcodes that exist in adata
    metadata = metadata.reindex(adata.obs.index)

    # Add metadata columns to adata.obs
    adata.obs['counts'] = metadata['Counts']
    adata.obs['features'] = metadata['Features']
    adata.obs['percent_mito'] = metadata['Mitochondria_percent']
    adata.obs['cell_type'] = metadata['Classified_Celltype']
    adata.obs['malignant'] = metadata['Malignant']
    adata.obs['subgroup'] = metadata['Subgroup']
    
    adata.obs['cell_barcode'] = adata.obs.index
    adata.obs.index = adata.obs.index + '_' + patient + '_' + dx

    return adata """