#### Pre-processing data to optimize the use of scGPT

In [None]:
import os
import scanpy as sc
import numpy as np
import pandas as pd
import anndata
import random

In [3]:
# Input path of the reference/raw data
PATH_PROJECT = "/mnt/DOSI/PLATEFORMES/BIOINFORMATIQUE/04_PROJECT/scLLM"
PATH_EXPERIMENT = os.path.join( PATH_PROJECT, "Human_Thymus_Development_Atlas")
PATH_EXPERIMENT_REFERENCE = os.path.join( PATH_EXPERIMENT, "01_Reference")
PATH_EXPERIMENT_REFERENCE_EXTRA = os.path.join( PATH_EXPERIMENT_REFERENCE, "00_Dataset")
PATH_EXPERIMENT_OUTPUT = os.path.join( PATH_EXPERIMENT, "05_Output")

PATH_INPUT_FILE = os.path.join( PATH_EXPERIMENT_REFERENCE_EXTRA, "Human_Thymus_Development_Atlas.h5ad")


# Output path of the pre processed dataset
ANALYSIS_NAME = "01_Datapreprocessing"
EXTRA_ANALYSIS_NAME_ANNDATA = "Preprocess_Anndata_File_scGPT"
PATH_ANALYSIS_OUTPUT_ANNDATA = os.path.join( PATH_EXPERIMENT_OUTPUT, ANALYSIS_NAME, EXTRA_ANALYSIS_NAME_ANNDATA)

PATH_OUTPUT_FILE_ANNDATA = os.path.join( PATH_ANALYSIS_OUTPUT_ANNDATA, "Human_Thymus_Development_Atlas_Preprocess.h5ad")
os.makedirs(os.path.dirname(PATH_OUTPUT_FILE_ANNDATA), exist_ok = True)

# Constant to filter the minimum number of cells per cell type for annotation
MINIMAL_NUMBER_CELL_BY_TYPE = 40

In [4]:
# Read the reference/raw data
dataset_anndata = sc.read_h5ad(PATH_INPUT_FILE)

In [5]:
# Look to see if a column exists containing the name of the cell type. If this column exists, it must be written correctly. It must be "celltype".
dataset_anndata.obs.columns

Index(['assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'sex_ontology_term_id',
       'tissue_ontology_term_id', 'Sample', 'n_counts', 'n_genes', 'donor_id',
       'sort', 'method', 'file', 'mito', 'doublet_scores',
       'predicted_doublets', 'suspension_type', 'tissue_type', 'cell_type',
       'assay', 'disease', 'organism', 'sex', 'tissue',
       'self_reported_ethnicity', 'development_stage', 'observation_joinid'],
      dtype='object')

In [6]:
# Rename the column cell_type by celltype (In this case : rename cell_type by celltype)
dataset_anndata.obs.rename(columns={"cell_type" : "celltype"}, inplace=True)

##### We check whether a column in "var" exists and contains the names of the genes ans if it's in index

In [7]:
dataset_anndata.var

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype,feature_length
feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,False,TSPAN6,NCBITaxon:9606,gene,4536
ENSG00000000005,False,TNMD,NCBITaxon:9606,gene,1476
ENSG00000000419,False,DPM1,NCBITaxon:9606,gene,9276
ENSG00000000457,False,SCYL3,NCBITaxon:9606,gene,6883
ENSG00000000460,False,C1orf112,NCBITaxon:9606,gene,5970
...,...,...,...,...,...
ENSG00000283096,False,RP11-157J13.1,NCBITaxon:9606,gene,1259
ENSG00000283103,False,LLNLR-245B6.1,NCBITaxon:9606,gene,4467
ENSG00000283117,False,MGC4859,NCBITaxon:9606,gene,3118
ENSG00000283118,False,RP11-107E5.4,NCBITaxon:9606,gene,644


In [8]:
# If this is not the case, we copy the previous index and put it in a column to avoid losing information.
dataset_anndata.var["index_column"] = dataset_anndata.var.index

# We repalce the column name by gene_name to be understood by scGPT
dataset_anndata.var.rename(columns={"feature_name" : "gene_name"}, inplace=True)

# Move the "gene_name" column to index. (This overwrites the previous column)
dataset_anndata.var = dataset_anndata.var.set_index("gene_name")

# We also need the column gene_name inside the column and not only index
dataset_anndata.var["gene_name"] = dataset_anndata.var.index

AnnData expects .var.index to contain strings, but got values like:
    ['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [11]:
# Loop over the cell types to create fold groups in each of them
for celltype in dataset_anndata.obs.celltype.unique():
    # Retrieve the list of cells of the current cell type
    cell_name_list = list(dataset_anndata.obs.index[dataset_anndata.obs.celltype == celltype])
    # Monitors whether the number of cells within the cell type is sufficient
    if len(cell_name_list) < MINIMAL_NUMBER_CELL_BY_TYPE :
        # Remove the all the cell from the cell type
        print("Warning: The cell type ", celltype, " is removed from the dataset because it does not contain enough cells (", MINIMAL_NUMBER_CELL_BY_TYPE, ").", sep='')
        dataset_anndata = dataset_anndata[~dataset_anndata.obs.celltype.str.contains(celltype)]



In [None]:
# We write a new file containing the pre-processed data with a clean Anndata. 
dataset_anndata.write_h5ad(PATH_OUTPUT_FILE_ANNDATA)

## R MATRIX : For selection variable gene

In [15]:
from scipy import io

EXTRA_ANALYSIS_NAME_MATRIX = "Matrix_Files"
PATH_OUTPUT_FILE_MATRIX = os.path.join(PATH_EXPERIMENT_OUTPUT, ANALYSIS_NAME, EXTRA_ANALYSIS_NAME_MATRIX)

# Output path of the pre processed dataset
os.makedirs(PATH_OUTPUT_FILE_MATRIX, exist_ok = True)

In [17]:
with open(os.path.join(PATH_OUTPUT_FILE_MATRIX) + '/barcodes.tsv', 'w') as f:
    for item in dataset_anndata.obs_names:
        f.write(item + '\n')

In [18]:
with open(os.path.join(PATH_OUTPUT_FILE_MATRIX) + '/features.tsv', 'w') as f:
    for item in ['\t'.join([x,x,'Gene Expression']) for x in dataset_anndata.var_names]:
        f.write(item + '\n')

In [19]:
io.mmwrite(os.path.join(PATH_OUTPUT_FILE_MATRIX) + '/matrix.mtx', dataset_anndata.X.T)

#### Need to do a gzip of every file currently in the reperotry (before enter the cell above) | (gzip root_file/Matrix_Files/*)

In [16]:
dataset_anndata.obs.to_csv(os.path.join(PATH_OUTPUT_FILE_MATRIX) + '/metadata.csv') 