In [None]:
### Description
"""
This notebook loads the AFCA single-nucleus RNA-seq body dataset, removes cells with unspecified sex
(neither female nor male) and cells labelled 'unannotated', and then filters cell types on two criteria:
at least 200 cells in total and at least 100 cells in every age group.
Cell types that meet both thresholds are saved as separate .h5ad files for downstream analysis.
"""

# Import the libraries
import os
import sys
import random
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import anndata as ad

In [None]:
# Read the full body dataset (expression matrix plus per-cell metadata in
# `adata.obs`) from the shared HPC storage into memory.
adata = ad.read_h5ad(
    "/hpc/shared/onco_janssen/dhaynessimmons/data/ageing_flies/afca_data/base_body_h5ad_data/adata_body_S_v1.0.h5ad"
)

In [None]:
# Overview of the loaded object: available metadata columns and overall shape
print(list(adata.obs.columns))
print("shape of full data: ", adata.shape)

# Value counts for the metadata columns that drive the filtering below
for label, column in (("Age", "age"), ("Sex", "sex"), ("Dataset", "dataset")):
    print(f"\n{label} value counts: ", adata.obs[column].value_counts())

['tissue', 'sex', 'age', 'sex_age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts_mt', 'dataset', 'fca_annotation', 'afca_annotation', 'afca_annotation_broad']
shape of full data:  (276273, 15992)

Age value counts:  age
5     96594
30    84496
70    49963
50    45220
Name: count, dtype: int64

Sex value counts:  sex
female    148049
male      123879
mix         4345
Name: count, dtype: int64

Dataset value counts:  dataset
AFCA    179679
FCA      96594
Name: count, dtype: int64


In [None]:
# List every distinct AFCA annotation label, one per line, in sorted order
annotation_labels = adata.obs['afca_annotation'].unique().tolist()
annotation_labels.sort()
for label in annotation_labels:
    print(label)

16-cell germline cyst in germarium region 2a and 2b
CNS surface associated glial cell
adult Malpighian tubule principal cell
adult Malpighian tubule principal cell of initial segment
adult Malpighian tubule principal cell of lower segment
adult Malpighian tubule principal cell of lower ureter
adult Malpighian tubule stellate cell of main segment
adult alary muscle
adult differentiating enterocyte
adult fat body_body
adult glial cell
adult heart ventral longitudinal muscle
adult hindgut
adult midgut enterocyte
adult midgut-hindgut hybrid zone
adult oenocyte
adult peripheral nervous system
adult reticular neuropil associated glial cell_body
adult salivary gland
adult tracheal cell
adult ventral nervous system
anterior ejaculatory duct
antimicrobial peptide-producing cell
cardia (1)
cardia (2)
cardiomyocyte, working adult heart muscle (non-ostia)
cell body glial cell
copper cell
crop
cyst cell
ejaculatory bulb
ejaculatory bulb epithelium
enteroblast
enterocyte of anterior adult midgut epi

: 

In [None]:
# Cells whose sex label is neither "female" nor "male" (e.g. "mix")
sex_labels = adata.obs.sex
unspecified_sex = (sex_labels != "female") & (sex_labels != "male")
mix_adata = adata[unspecified_sex].copy()
print(mix_adata.obs.shape)

# Keep only cells that are not in the mixed-sex set and carry a real annotation
in_mix_set = adata.obs.index.isin(mix_adata.obs.index)
is_annotated = adata.obs['afca_annotation'] != "unannotated"
mf_adata = adata[~in_mix_set & is_annotated].copy()
print(mf_adata.obs.shape)


(4345, 15)
(265979, 15)


In [None]:
# Minimum sizes a cell type must reach to be kept for downstream analysis
MIN_CELLS_TOTAL = 200
MIN_CELLS_PER_AGE = 100

# Work on a copy of the per-cell metadata table of the sex-filtered data
mf_adata_obs = mf_adata.obs.copy()
print(type(mf_adata_obs))

# Collects the annotation labels that pass both thresholds. The labels are
# stored exactly as they appear in obs['afca_annotation'] so they can be used
# for row selection; they may contain "/" and must be sanitised before being
# used as part of a file name.
cell_list = []

# Screen every AFCA cell type against the two size thresholds
for cell_type in mf_adata_obs['afca_annotation'].unique():
    print("\n\nWorking on cell type: ", cell_type, "\n")

    # Only the directory name is sanitised ("/" would otherwise be read as a
    # path separator). The label itself is left untouched: overwriting it
    # would make the row selection below match nothing for any label that
    # contains "/", silently dropping that cell type.
    safe_name = cell_type.replace("/", "-")
    fig_path = f"/hpc/shared/onco_janssen/dhaynessimmons/figures/ageing_flies/{safe_name}"
    os.makedirs(fig_path, exist_ok=True)

    # Metadata rows belonging to the current cell type
    cell_type_df = mf_adata_obs[mf_adata_obs['afca_annotation'] == cell_type]

    # Basic QC summary for this cell type
    cell_cnt = cell_type_df.shape[0]
    print("number of cells: ", cell_cnt)
    print("Min number of genes expressed : ", cell_type_df.n_genes_by_counts.min())

    # Criterion 1: enough cells overall
    if cell_cnt < MIN_CELLS_TOTAL:
        print("Not enough cells to proceed with analysis")
        continue
    print("Sufficient cells to proceed with analysis")

    # Criterion 2: enough cells in every age group. With observed=False a
    # categorical 'age' column also reports age groups that have no cells for
    # this cell type (size 0), so a missing age group fails the check.
    min_value = cell_type_df.groupby('age', observed=False).size().min()
    print("Minimum number of cells in an age group: ", min_value)
    if min_value < MIN_CELLS_PER_AGE:
        print("Not enough cells in an age group to proceed with analysis")
        continue
    print("Sufficient cells in each age group to proceed with analysis")

    # Both thresholds passed: remember this cell type
    cell_list.append(cell_type)

print(cell_list)
# The metadata copy is no longer needed; free it
del mf_adata_obs


<class 'pandas.core.frame.DataFrame'>


Working on cell type:  follicle cell 

number of cells:  25251
Min number of genes expressed :  226
Sufficient cells to proceed with analysis
Minimum number of cells in an age group:  5826
Sufficient cells in each age group to proceed with analysis


Working on cell type:  adult fat body_body 

number of cells:  31311
Min number of genes expressed :  224
Sufficient cells to proceed with analysis
Minimum number of cells in an age group:  2703
Sufficient cells in each age group to proceed with analysis


Working on cell type:  adult hindgut 

number of cells:  683
Min number of genes expressed :  286
Sufficient cells to proceed with analysis
Minimum number of cells in an age group:  146
Sufficient cells in each age group to proceed with analysis


Working on cell type:  adult oenocyte 

number of cells:  5925
Min number of genes expressed :  233
Sufficient cells to proceed with analysis
Minimum number of cells in an age group:  1081
Sufficient cell

In [None]:
# Build the filtered dataset from the sex-filtered object (mf_adata), not from
# the raw `adata`: the thresholds in cell_list were computed on mf_adata, and
# subsetting the raw object would bring the cells with unspecified sex back in.
new_adata = mf_adata[mf_adata.obs['afca_annotation'].isin(cell_list)].copy()
# Save the new adata object
# new_adata.write_h5ad("/hpc/shared/onco_janssen/dhaynessimmons/data/ageing_flies/afca_data/base_body_h5ad_data/adata_body_filtered_220425.h5ad")


In [None]:
# Write one .h5ad file per retained cell type
save_path = "/hpc/shared/onco_janssen/dhaynessimmons/data/ageing_flies/afca_data/base_body_h5ad_data/ct_specific/"
# Make sure the output directory exists before the first write
os.makedirs(save_path, exist_ok=True)

for cell_type in cell_list:
    print("\n\nWorking on cell type: ", cell_type, "\n")
    # Select the cells of this type using the label as given in cell_list
    ct_subset = new_adata[new_adata.obs['afca_annotation'] == cell_type].copy()

    # "/" inside a label would be interpreted as a directory separator, so it
    # is replaced for the file name only
    file_name = cell_type.replace("/", "-") + ".h5ad"
    ct_subset.write_h5ad(os.path.join(save_path, file_name))