In [1]:
import os
import scanpy as sc
import numpy as np
import pandas as pd
import anndata
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def make_kfold(anndata_give,name_column,n_fold,override = False): 
    '''
    Is a function that takes an Anndata as input, and creates a new column containing a group number. Equally distributed within each cell type.

    Keyword arguments:
    anndata_give -- Choose the anndata.
    name_column -- Choose a name for the column with the fold group name
    n_fold -- Choose the nuber of fold group.
    override -- Autorise or not to overwrite a column existing and replace it by the new column (default False)
    '''

    # Check if the anndata provided is a real anndata.
    assert type(anndata_give) == anndata._core.anndata.AnnData, "Error : Type of '" + anndata_give + "' is not an anndata." 
    #Check if the name_column is a string
    assert type(name_column) == str, "Error : Column name is not a string."
    # Check whether the n_fold supplied is a int.
    assert type(n_fold) == int, "Error : Type of n_fold is incorrect. Change it to a int."
    # Autorise or not to overwrite a column already existing
    assert override == True and name_column not in anndata_give.obs.columns, "Error : Column name exists. Activate override mode to overwrite column"
        
    # Create a new column, which only have "None" inside
    anndata_give.obs[name_column] = None 

    # Loop over the cell types to create fold groups in each of them
    for cell_type in anndata_give.obs.celltype.unique():
        # Retrieve the list of cells of the current cell type
        cell_name_list = list(anndata_give.obs.index[anndata_give.obs.celltype == cell_type])
        # Monitors whether the number of cells within the cell type is sufficient
        if len(cell_name_list) < 20 :
            print("Error : Not enought cell in :", cell_type, "\nIt won't be taken into account for training and testing.")
            continue
        # Shuffle the list of cells to change their order
        random.shuffle(cell_name_list)
        # Split the list of cells into the required number of groups
        array_fold = np.array_split(np.array(cell_name_list),n_fold)
        # Assign to each fold group the index of their group
        for fold_index in range(len(array_fold)):
            anndata_give.obs.loc[array_fold[fold_index] , name_column] = fold_index

In [3]:
def run_finetuning(anndata_give, name_column, n_fold, list_type_cell=None):
    '''
    run_finetuning is a function that takes as input 1 anndata , a column and a fold number or a list of fold numbers, with a list of cell types to be excluded for the training phase.
    On output, the function provides 2 anndata, one for the test with the fold numbers chosen beforehand. And an anndata for training with the anndata not selected for the test phase and without the cell type list, if this has been supplied.
    
    Keyword arguments:
    anndata_give -- Choose the anndata.
    name_column -- Name of column with folds (group) name
    n_fold -- Provide (by a value, or a list) the folds for the test anndata
    list_type_cell -- List of cell types to be excluded from the training anndata. (default None)
    '''

    # Check if the anndata provided is a real anndata.
    assert type(anndata_give) == anndata._core.anndata.AnnData, "Error : Type of '" + anndata_give + "' is not an anndata." 
    #Check if the name_column is a string
    assert type(name_column) == str, "Error : Column name is not a string."
    # Check that the given column name exists in the anndata
    assert name_column in anndata_give.obs.columns , "Error : Column name don't exists."
    # Check whether the n_fold supplied can be read by the function.
    assert type(n_fold) == int or type(n_fold) == list , "Error : Type of n_fold is incorrect. Change it to a int or a list."
    # Check if "list_type_cell" is not none and if it's a list
    assert list_type_cell is None or type(list_type_cell) == list , "Error : " + list_type_cell + " is not a list."
 
    # Anndata creation block in case of a list.
    if type(n_fold) == list :
        # Go through the values in the list, and if one of them is not present in the column provided, display the error.
        for index in n_fold :
            assert anndata_give.obs[name_column].isin([index]).any() == True, "Error: The fold number in your list is not in " + name_column + " column."   
        # The values present in the list are scanned, separating the cells containing or not containing the fold number supplied as input. Either in the test or in the training.
        anndata_test = anndata_give[[k in n_fold for k in anndata_give.obs[name_column]]]
        # If no cell type list has been provided, then anndata_training returns only the groups that have not been given as input.
        if list_type_cell is None :
            anndata_training = anndata_give[[index not in n_fold for index in anndata_give.obs[name_column]]]
        # If a list of cell types has been provided, then anndata_training returns only the groups that have not been given as input and without the cell types present in the list.
        else :
            anndata_training = anndata_give[[index not in list_type_cell for index in anndata_give.obs.celltype]] and anndata_give[[index not in n_fold for index in anndata_give.obs[name_column]]]
            
   
    
    # Anndata creation block in case of a int.
    if type(n_fold) == int :
        # Check if the n_fold are not in colum provided, display the error.
        assert anndata_give.obs[name_column].isin([n_fold]).any() == True , "Error : The fold number is not present in " + name_column + " column."   
        # Separate the cells contain in name_column the n_fold in a new anndata name 'anndata_test' and the other in a anndata_training.
        anndata_test = anndata_give[anndata_give.obs[name_column] == n_fold]
        # If no cell type list has been provided, then anndata_training returns only the groups that have not been given as input.
        if list_type_cell is None :
            anndata_training = anndata_give[anndata_give.obs[name_column] != n_fold]
        # If a list of cell types has been provided, then anndata_training returns only the groups that have not been given as input and without the cell types present in the list.    
        else :
            anndata_training = anndata_give[[index not in list_type_cell for index in anndata_give.obs.celltype]] and anndata_give[anndata_give.obs[name_column] != n_fold] 
            
    return anndata_test, anndata_training

In [4]:
# Input path of the reference/raw data
# Input path of the reference/raw data
PATH_PROJECT = "/mnt/DOSI/PLATEFORMES/BIOINFORMATIQUE/04_PROJECT/scLLM"
PATH_EXPERIMENT = os.path.join( PATH_PROJECT, "Human_Thymus_Development_Atlas")
PATH_EXPERIMENT_REFERENCE = os.path.join( PATH_EXPERIMENT, "01_Reference")
PATH_EXPERIMENT_REFERENCE_EXTRA = os.path.join( PATH_EXPERIMENT_REFERENCE, "00_Dataset")
PATH_EXPERIMENT_OUTPUT = os.path.join( PATH_EXPERIMENT, "05_Output")

PATH_INPUT_FILE = os.path.join( PATH_EXPERIMENT_REFERENCE_EXTRA, "Human_Thymus_Development_Atlas.h5ad")


# Output path of the pre processed dataset
PATH_EXPERIMENT_OUTPUT = os.path.join( PATH_EXPERIMENT, "05_Output")
ANALYSIS_NAME = "01_Datapreprocessing"
EXTRA_ANALYSIS_NAME_ANNDATA = "Preprocess_Anndata_File_scBERT"
PATH_ANALYSIS_OUTPUT_ANNDATA = os.path.join( PATH_EXPERIMENT_OUTPUT, ANALYSIS_NAME, EXTRA_ANALYSIS_NAME_ANNDATA)

PATH_OUTPUT_FILE_ANNDATA_TEST = os.path.join( PATH_ANALYSIS_OUTPUT_ANNDATA, "Human_Thymus_Development_Atlas_test_Preprocess.h5ad")
PATH_OUTPUT_FILE_ANNDATA_TRAINING = os.path.join( PATH_ANALYSIS_OUTPUT_ANNDATA, "Human_Thymus_Development_Atlas_training_Preprocess.h5ad")
os.makedirs(os.path.dirname(PATH_OUTPUT_FILE_ANNDATA_TEST), exist_ok = True)

MINIMAL_NUMBER_CELL_BY_TYPE = 40

In [5]:
# Read the reference/raw data
dataset_anndata = sc.read_h5ad(PATH_INPUT_FILE)

In [6]:
# Look to see if a column exists containing the name of the cell type. If this column exists, it must be written correctly. It must be "celltype".
dataset_anndata.obs.columns

Index(['assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'sex_ontology_term_id',
       'tissue_ontology_term_id', 'Sample', 'n_counts', 'n_genes', 'donor_id',
       'sort', 'method', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'suspension_type', 'tissue_type', 'cell_type',
       'assay', 'disease', 'organism', 'sex', 'tissue',
       'self_reported_ethnicity', 'development_stage', 'observation_joinid'],
      dtype='object')

In [7]:
# Rename the column cell_type by celltype (In this case : rename cell_type by celltype)
dataset_anndata.obs.rename(columns={"cell_type" : "celltype"}, inplace=True)

In [8]:
# We check whether a column in "var" exists and contains the names of the genes ans if it' in index
dataset_anndata.var

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length
feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,False,TSPAN6,NCBITaxon:9606,gene,4536
ENSG00000000005,False,TNMD,NCBITaxon:9606,gene,1476
ENSG00000000419,False,DPM1,NCBITaxon:9606,gene,9276
ENSG00000000457,False,SCYL3,NCBITaxon:9606,gene,6883
ENSG00000000460,False,C1orf112,NCBITaxon:9606,gene,5970
...,...,...,...,...,...
ENSG00000283096,False,RP11-157J13.1,NCBITaxon:9606,gene,1259
ENSG00000283103,False,LLNLR-245B6.1,NCBITaxon:9606,gene,4467
ENSG00000283117,False,MGC4859,NCBITaxon:9606,gene,3118
ENSG00000283118,False,RP11-107E5.4,NCBITaxon:9606,gene,644


In [9]:
# If this is not the case, we copy the previous index and put it in a column to avoid losing information.
dataset_anndata.var["index_column"] = dataset_anndata.var.index

#On vient remplacer le nom de la var
dataset_anndata.var.rename(columns={"feature_name" : "gene_name"}, inplace=True)

# Move the "gene_name" column to index. (This overwrites the previous column)
dataset_anndata.var = dataset_anndata.var.set_index("gene_name")

#On recopie encore, car il a besoins en colonne et pas uniquement en index, avec le même non
dataset_anndata.var["gene_name"] = dataset_anndata.var.index

AnnData expects .var.index to contain strings, but got values like:
    ['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [10]:
dataset_anndata.var

Unnamed: 0_level_0,feature_is_filtered,feature_reference,feature_biotype,feature_length,index_column,gene_name
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TSPAN6,False,NCBITaxon:9606,gene,4536,ENSG00000000003,TSPAN6
TNMD,False,NCBITaxon:9606,gene,1476,ENSG00000000005,TNMD
DPM1,False,NCBITaxon:9606,gene,9276,ENSG00000000419,DPM1
SCYL3,False,NCBITaxon:9606,gene,6883,ENSG00000000457,SCYL3
C1orf112,False,NCBITaxon:9606,gene,5970,ENSG00000000460,C1orf112
...,...,...,...,...,...,...
RP11-157J13.1,False,NCBITaxon:9606,gene,1259,ENSG00000283096,RP11-157J13.1
LLNLR-245B6.1,False,NCBITaxon:9606,gene,4467,ENSG00000283103,LLNLR-245B6.1
MGC4859,False,NCBITaxon:9606,gene,3118,ENSG00000283117,MGC4859
RP11-107E5.4,False,NCBITaxon:9606,gene,644,ENSG00000283118,RP11-107E5.4


In [11]:
#We keep only 8000 cell from the dataset and 3000 gene (dataset_anndata = dataset_anndata[0:8000, 0:3000])
#dataset_anndata = dataset_anndata[0:3000, 0:5000]

In [12]:
# Loop over the cell types to create fold groups in each of them
for celltype in dataset_anndata.obs.celltype.unique():
    # Retrieve the list of cells of the current cell type
    cell_name_list = list(dataset_anndata.obs.index[dataset_anndata.obs.celltype == celltype])
    # Monitors whether the number of cells within the cell type is sufficient
    if len(cell_name_list) < MINIMAL_NUMBER_CELL_BY_TYPE :
        # Remove the all the cell from the cell type
        print("Warning: The cell type ", celltype, " is removed from the dataset because it does not contain enough cells (", MINIMAL_NUMBER_CELL_BY_TYPE, ").", sep='')
        dataset_anndata = dataset_anndata[~dataset_anndata.obs.celltype.str.contains(celltype)]



In [13]:
make_kfold(dataset_anndata, "K_Fold", 5, override = True)

  anndata_give.obs[name_column] = None


In [14]:
liste = [3,4]
dataset_anndata_test, dataset_anndata_training = run_finetuning(dataset_anndata, "K_Fold", liste)

In [17]:
dataset_anndata_training

View of AnnData object with n_obs × n_vars = 153539 × 32922
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'Sample', 'n_counts', 'n_genes', 'donor_id', 'sort', 'method', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'suspension_type', 'tissue_type', 'celltype', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'K_Fold'
    var: 'feature_is_filtered', 'feature_reference', 'feature_biotype', 'feature_length', 'index_column', 'gene_name'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap'

In [15]:
# We write a new file containing the pre-processed data.
dataset_anndata_test.write_h5ad(PATH_OUTPUT_FILE_ANNDATA_TEST)

TypeError: Can't implicitly convert non-string objects to strings

Above error raised while writing key 'K_Fold' of <class 'h5py._hl.group.Group'> to /

In [None]:
# We write a new file containing the pre-processed data. 
dataset_anndata_training.write_h5ad(PATH_OUTPUT_FILE_ANNDATA_TRAINING)