In [3]:
# Description:
"""
This script processes a single-nucleus RNA-seq dataset from the AFCA. It removes cells whose sex is
neither female nor male, as well as cells whose AFCA annotation is "unannotated", and then filters out
cell types with fewer than 200 total cells or fewer than 100 cells in any age group. For each remaining
cell type, it filters out lowly expressed genes, retaining only genes that are expressed in at least
3 cells or that are included in the predefined gene list.
Each processed cell type is saved as a separate .h5ad file.
"""

# Import the libraries
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import anndata as ad

In [4]:
# Read the full AnnData object from disk
dataset_path = "/hpc/shared/onco_janssen/dhaynessimmons/projects/ageing_flies/data/adata_body_S_v1.0.h5ad"
adata = ad.read_h5ad(dataset_path)

In [5]:
# Set the functions
def split_by_batch_prefix(name):
    """
    Return the part of `name` that starts at the first 'AFCA' or 'FCA' marker.

    Everything in front of the marker is discarded; only the suffix is returned.

    Parameters
    ----------
    name : str
        Observation name, e.g. 'AAACCCAAGGATACAT-1_AFCA_female_body_30_S1'.

    Returns
    -------
    str
        The suffix beginning with 'AFCA' or 'FCA',
        e.g. 'AFCA_female_body_30_S1'.

    Raises
    ------
    ValueError
        If `name` contains neither 'AFCA' nor 'FCA'.
    """
    match = re.search(r'(AFCA|FCA)', name)
    if match is None:
        # Fail with an explicit message instead of an AttributeError on None
        raise ValueError(
            f"Cannot find an 'AFCA' or 'FCA' marker in observation name: {name!r}"
        )
    return name[match.start():]

In [6]:
# Overview of the loaded dataset: metadata columns and overall shape
print(list(adata.obs.columns))
print("shape of full data: ", adata.shape)
# Value counts for the important metadata columns
value_count_reports = (
    ("\nAge value counts: ", "age"),
    ("\nSex value counts: ", "sex"),
    ("\nDataset value counts: ", "dataset"),
)
for heading, column in value_count_reports:
    print(heading, adata.obs[column].value_counts())

['tissue', 'sex', 'age', 'sex_age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts_mt', 'dataset', 'fca_annotation', 'afca_annotation', 'afca_annotation_broad']
shape of full data:  (276273, 15992)

Age value counts:  age
5     96594
30    84496
70    49963
50    45220
Name: count, dtype: int64

Sex value counts:  sex
female    148049
male      123879
mix         4345
Name: count, dtype: int64

Dataset value counts:  dataset
AFCA    179679
FCA      96594
Name: count, dtype: int64


In [7]:
# Print every distinct AFCA cell-type annotation, one per line, in sorted order
afca_cell_types = adata.obs['afca_annotation'].unique().tolist()
afca_cell_types.sort()
for annotation in afca_cell_types:
    print(annotation)

16-cell germline cyst in germarium region 2a and 2b
CNS surface associated glial cell
adult Malpighian tubule principal cell
adult Malpighian tubule principal cell of initial segment
adult Malpighian tubule principal cell of lower segment
adult Malpighian tubule principal cell of lower ureter
adult Malpighian tubule stellate cell of main segment
adult alary muscle
adult differentiating enterocyte
adult fat body_body
adult glial cell
adult heart ventral longitudinal muscle
adult hindgut
adult midgut enterocyte
adult midgut-hindgut hybrid zone
adult oenocyte
adult peripheral nervous system
adult reticular neuropil associated glial cell_body
adult salivary gland
adult tracheal cell
adult ventral nervous system
anterior ejaculatory duct
antimicrobial peptide-producing cell
cardia (1)
cardia (2)
cardiomyocyte, working adult heart muscle (non-ostia)
cell body glial cell
copper cell
crop
cyst cell
ejaculatory bulb
ejaculatory bulb epithelium
enteroblast
enterocyte of anterior adult midgut epi

In [8]:
# Flag cells whose sex is neither female nor male (e.g. "mix")
is_mixed_sex = (adata.obs["sex"] != "female") & (adata.obs["sex"] != "male")
mix_adata = adata[is_mixed_sex].copy()
print(mix_adata.obs.shape)
# Keep only female/male cells, and additionally drop cells whose AFCA
# annotation is "unannotated". The boolean masks are applied directly instead
# of matching obs names against mix_adata, which avoids a name lookup over all
# cells and stays correct even if obs names are not unique.
is_annotated = adata.obs["afca_annotation"] != "unannotated"
mf_adata = adata[~is_mixed_sex & is_annotated].copy()
# Store, per cell, the part of the obs name that starts at 'AFCA'/'FCA'
mf_adata.obs["indiv"] = mf_adata.obs.index.map(split_by_batch_prefix)
print(mf_adata.obs.shape)

(4345, 15)
(265979, 16)


In [None]:
# Get the observation dataframe as a pandas dataframe
mf_adata_obs = mf_adata.obs.copy()
print(type(mf_adata_obs))
cell_list = []

# QC thresholds
MIN_CELLS_PER_TYPE = 200  # a cell type needs at least this many cells in total
MIN_CELLS_PER_AGE = 100   # ... and at least this many cells in every age group
MIN_CELLS_PER_GENE = 3    # a gene is kept if it is expressed (X > 0) in at least this many cells

# Set the save path
save_path = "/hpc/shared/onco_janssen/dhaynessimmons/projects/ageing_flies/data/ct_specific/"
os.makedirs(save_path, exist_ok=True)

# Gene list of interest (always retained, regardless of expression)
gene_list = [
    "Su(var)205", "Su(var)3-9", "G9a", "HP1b", "HP1c", "HP4",
    "HP5", "HP6", "ADD1", "Su(var)2-HP2", "Su(var)3-7", "Lam",
    "LamC", "LBR", "Kdm4A", "Kdm4B", "His2Av", "His3.3A", "His3.3B"
]

# Every per-cell-type object is a row subset of mf_adata, so the genes are the
# same for all of them: build the gene-of-interest mask once, outside the loop.
genes_in_list = pd.Index(mf_adata.var_names).isin(gene_list)

# Report genes of interest that are not in the dataset instead of ignoring them silently
missing_genes = sorted(set(gene_list) - set(mf_adata.var_names))
if missing_genes:
    print("Warning: genes of interest not found in the dataset: ", missing_genes)

# Age groups present anywhere in the filtered dataset; each cell type must cover all of them
all_ages = mf_adata_obs['age'].dropna().unique().tolist()

# Loop through the afca cell types and create a new adata object for each cell type
for cell_type in mf_adata_obs['afca_annotation'].unique():
    print("\n\nWorking on cell type: ", cell_type, "\n")

    # Editing the cell type name to make it a valid file name
    cell_name = cell_type.replace(" ", "_").replace("/", "_")

    # Boolean mask of the cells that match the current cell type.
    # mf_adata_obs is a copy of mf_adata.obs, so the mask is aligned with mf_adata rows.
    is_cell_type = (mf_adata_obs['afca_annotation'] == cell_type).to_numpy()
    cell_type_df = mf_adata_obs[is_cell_type]

    # Evaluate the QC of the current cell type
    cell_cnt = cell_type_df.shape[0]
    print("number of cells: ", cell_cnt)
    print("Min number of genes expressed : ", cell_type_df.n_genes_by_counts.min())

    if cell_cnt < MIN_CELLS_PER_TYPE:
        print("Not enough cells to proceed with analysis")
        continue
    print("Sufficient cells to proceed with analysis")

    # Check that each age group has at least MIN_CELLS_PER_AGE cells.
    # groupby(observed=False) only lists empty groups when 'age' is categorical,
    # so an age group that is absent from the counts is treated as having 0 cells.
    age_grouped = cell_type_df.groupby('age', observed=False).size()
    if any(age not in age_grouped.index for age in all_ages):
        min_value = 0
    else:
        min_value = age_grouped.min()
    del cell_type_df, age_grouped
    print("Minimum number of cells in an age group: ", min_value)
    if min_value < MIN_CELLS_PER_AGE:
        print("Not enough cells in an age group to proceed with analysis")
        continue
    print("Sufficient cells in each age group to proceed with analysis")

    # Create a new adata object for this cell type
    cell_type_adata = mf_adata[is_cell_type].copy()
    print("\nshape of cell type data: ", cell_type_adata.shape)


    # ----------- Custom gene filtering starts here ------------ #

    # Compute how many cells express each gene
    gene_expression_counts = np.array((cell_type_adata.X > 0).sum(axis=0)).flatten()

    # Genes expressed in >= MIN_CELLS_PER_GENE cells
    genes_expressed_enough = gene_expression_counts >= MIN_CELLS_PER_GENE

    # Keep genes that are either expressed in enough cells or present in the gene_list
    genes_to_keep = genes_expressed_enough | genes_in_list

    # Filter genes
    cell_type_adata = cell_type_adata[:, genes_to_keep].copy()

    print("\nshape of cell type data after filtering: ", cell_type_adata.shape)
    print("Columns in the observation dataframe: ", cell_type_adata.obs.columns)
    print(cell_type_adata.obs.iloc[:, -3:])


    # ----------- End of custom filtering ------------ #

    # Save the new adata object
    output_file = os.path.join(save_path, f"{cell_name}.h5ad")
    cell_type_adata.write_h5ad(output_file)
    print("\n\tSaved the new adata object to: ", f"{output_file}\n")

    # Record the cell type only once its file has actually been written
    cell_list.append(cell_type)

# Print list of successfully processed cell types
print(cell_list)
del mf_adata_obs

<class 'pandas.core.frame.DataFrame'>


Working on cell type:  follicle cell 

number of cells:  25251
Min number of genes expressed :  226
Sufficient cells to proceed with analysis
Minimum number of cells in an age group:  5826
Sufficient cells in each age group to proceed with analysis

shape of cell type data:  (25251, 15992)

shape of cell type data after filtering:  (25251, 12468)
Columns in the observation dataframe:  Index(['tissue', 'sex', 'age', 'sex_age', 'n_genes_by_counts', 'total_counts',
       'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts',
       'log1p_total_counts', 'log1p_total_counts_mt', 'dataset',
       'fca_annotation', 'afca_annotation', 'afca_annotation_broad', 'indiv'],
      dtype='object')
                                                   afca_annotation  \
AAACCCAAGGATACAT-1_AFCA_female_body_30_S1            follicle cell   
AAACGAAAGGAGTCTG-1_AFCA_female_body_30_S1            follicle cell   
AAAGAACAGCTGAGCA-1_AFCA_female_body_30_S1      