# Import

In [1]:
import pandas as pd
import scanpy as sc

import sys

sys.path.append("../../")
from tools.curation_tools import CuratedDataset
from tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from unified_metadata_schema.unified_metadata_schema import Experiment

# Pre-process the data

In [None]:
import GEOparse

data_source_link = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE228nnn/GSE228115/suppl/GSE228115%5FRNA%5Fcounts.txt.gz"

data_df = pd.read_table(
    data_source_link,
    index_col=0,
    header=0,
    compression="gzip",
)
metadata_df = GEOparse.get_GEO(
    'GSE228115', destdir=".", silent=True)

metadata_df = metadata_df.phenotype_data

display(data_df.head())
display(metadata_df.head())

Unnamed: 0,CRISPRi_target_chr17-45309260_rep1,CRISPRi_target_chr17-45907842_rep1,CRISPRi_target_chr17-45219542_rep1,CRISPRi_target_chr17-46264873_rep1,CRISPRi_target_chr17-45840618_rep1,CRISPRi_target_chr17-45894000_rep1,CRISPRi_target_chr17-45583947_rep1,CRISPRi_target_chr17-45784528_rep1,CRISPRi_target_chr17-46264873_rep2,CRISPRi_target_chr17-45903110_rep1,...,CRISPRi_target_chr17-45223036_rep5,CRISPRi_target_chr17-45429323_rep3,CRISPRi_target_chr17-45429323_rep4,CRISPRi_target_chr17-45432051_rep3,CRISPRi_target_chr17-45432051_rep4,CRISPRi_target_chr17-45849095_rep3,CRISPRi_target_chr17-45849095_rep4,KOLF2.1J_NGN2_rep1,KOLF2.1J_NGN2_rep2,KOLF2.1J_NGN2_rep3
A1BG,8,5,2,3,134,4,7,2,10,14,...,11,23,21,17,14,20,20,88,106,97
A1BG-AS1,4,4,1,4,115,5,4,3,4,13,...,10,13,16,15,22,19,25,84,77,88
A1CF,0,0,0,0,6,1,0,3,0,0,...,0,1,0,1,0,0,0,8,8,3
A2M,121,26,187,48,633,78,50,98,190,687,...,134,430,612,461,354,555,351,3,4,9
A2M-AS1,2,1,1,1,38,1,2,1,2,4,...,1,3,4,3,2,4,2,51,48,62


Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,taxid_ch1,...,contact_zip/postal_code,contact_country,instrument_model,library_selection,library_source,library_strategy,relation,supplementary_file_1,series_id,data_row_count
GSM7114268,CRISPRi_target_chr17-45309260_rep1,GSM7114268,Public on Apr 11 2023,Mar 23 2023,Apr 11 2023,SRA,1,BC1,Homo sapiens,9606,...,35806,USA,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,NONE,"GSE228115,GSE228121",0
GSM7114269,CRISPRi_target_chr17-45907842_rep1,GSM7114269,Public on Apr 11 2023,Mar 23 2023,Apr 11 2023,SRA,1,BC1,Homo sapiens,9606,...,35806,USA,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,NONE,"GSE228115,GSE228121",0
GSM7114270,CRISPRi_target_chr17-45219542_rep1,GSM7114270,Public on Apr 11 2023,Mar 23 2023,Apr 11 2023,SRA,1,BC1,Homo sapiens,9606,...,35806,USA,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,NONE,"GSE228115,GSE228121",0
GSM7114271,CRISPRi_target_chr17-46264873_rep1,GSM7114271,Public on Apr 11 2023,Mar 23 2023,Apr 11 2023,SRA,1,BC1,Homo sapiens,9606,...,35806,USA,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,NONE,"GSE228115,GSE228121",0
GSM7114272,CRISPRi_target_chr17-45840618_rep1,GSM7114272,Public on Apr 11 2023,Mar 23 2023,Apr 11 2023,SRA,1,BC1,Homo sapiens,9606,...,35806,USA,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,BioSample: https://www.ncbi.nlm.nih.gov/biosam...,NONE,"GSE228115,GSE228121",0


In [3]:
# assign index to metadata_df
metadata_df.index = metadata_df['title']
# clean up column names
metadata_df.columns = metadata_df.columns.str.replace('_ch1', '')
# drop unnecessary columns
metadata_df = metadata_df[['title', 'organism', 'characteristics.0.cell line', 'characteristics.1.cell type']]

display(metadata_df.head())

Unnamed: 0_level_0,title,organism,characteristics.0.cell line,characteristics.1.cell type
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CRISPRi_target_chr17-45309260_rep1,CRISPRi_target_chr17-45309260_rep1,Homo sapiens,BC1,Neurons
CRISPRi_target_chr17-45907842_rep1,CRISPRi_target_chr17-45907842_rep1,Homo sapiens,BC1,Neurons
CRISPRi_target_chr17-45219542_rep1,CRISPRi_target_chr17-45219542_rep1,Homo sapiens,BC1,Neurons
CRISPRi_target_chr17-46264873_rep1,CRISPRi_target_chr17-46264873_rep1,Homo sapiens,BC1,Neurons
CRISPRi_target_chr17-45840618_rep1,CRISPRi_target_chr17-45840618_rep1,Homo sapiens,BC1,Neurons


In [4]:
# check if sample names match
assert set(data_df.columns) == set(metadata_df['title'])
assert len(data_df.columns) == len(metadata_df)

In [5]:
# convert to AnnData
adata = sc.AnnData(
    X=data_df.T, obs=metadata_df, var=data_df.index.to_frame().rename(columns={0: 'gene_symbol'})
)

In [6]:
# save the non-curated AnnData object
adata.write_h5ad("../non_curated/h5ad/rogers_2024.h5ad")

# Initialise the dataset object

In [7]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,

    noncurated_path = "../non_curated/h5ad/rogers_2024.h5ad"
)

# Load the dataset

In [8]:
# cur_data.download_data()
cur_data.load_data()
# show the data
cur_data.show_obs()

Loading data from ../non_curated/h5ad/rogers_2024.h5ad
Observation data:
DataFrame shape: (384, 4)
--------------------------------------------------
                                                                 title  \
title                                                                    
CRISPRi_target_chr17-45309260_rep1  CRISPRi_target_chr17-45309260_rep1   
CRISPRi_target_chr17-45907842_rep1  CRISPRi_target_chr17-45907842_rep1   
CRISPRi_target_chr17-45219542_rep1  CRISPRi_target_chr17-45219542_rep1   
CRISPRi_target_chr17-46264873_rep1  CRISPRi_target_chr17-46264873_rep1   
CRISPRi_target_chr17-45840618_rep1  CRISPRi_target_chr17-45840618_rep1   
...                                                                ...   
CRISPRi_target_chr17-45849095_rep3  CRISPRi_target_chr17-45849095_rep3   
CRISPRi_target_chr17-45849095_rep4  CRISPRi_target_chr17-45849095_rep4   
KOLF2.1J_NGN2_rep1                                  KOLF2.1J_NGN2_rep1   
KOLF2.1J_NGN2_rep2                  

In [9]:
cur_data.show_var()

Variable data:
DataFrame shape: (36591, 1)
--------------------------------------------------
         gene_symbol
A1BG            A1BG
A1BG-AS1    A1BG-AS1
A1CF            A1CF
A2M              A2M
A2M-AS1      A2M-AS1
...              ...
ZXDC            ZXDC
ZYG11A        ZYG11A
ZYG11B        ZYG11B
ZYX              ZYX
ZZEF1          ZZEF1

[36591 rows x 1 columns]
--------------------------------------------------


# OBS slot curation

#### Add `perturbation_name` column based on the `title`

In [10]:
cur_data.create_columns(
    slot = "obs",
    col_dict={
        "perturbation_name": cur_data.adata.obs['title']
    },
    overwrite=True
)

cur_data.replace_entries(
    slot="obs",
    column="perturbation_name",
    to_replace='CRISPRi_target[_|-]',
    replace_value=''
)

Column perturbation_name added to adata.obs
Replaced entries CRISPRi_target[_|-] ->  in column perturbation_name of adata.obs


#### Add `perturbed_target_coord` column based on the `perturbation_name`

In [11]:
cur_data.create_columns(
    slot = "obs",
    col_dict={
        "perturbed_target_coord": cur_data.adata.obs['perturbation_name']
    },
    overwrite=True
)

Column perturbed_target_coord added to adata.obs


#### Clean up `perturbation_target_symbol` column

In [13]:
# Remove the _repX suffix from perturbed_target_coord
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_coord",
    to_replace=r'_rep\d{1,}',
    replace_value=''
)

# Replace the - with : in perturbed_target_coord
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_coord",
    to_replace=r'-',
    replace_value=r':'
)

# Replace the 'Safe Harbor' and 'GFP' with 'control_nontargeting' in perturbed_target_coord
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_coord",
    to_replace=r"Safe Harbor",
    replace_value="control_nontargeting"
)

cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_coord",
    to_replace=r"GFP",
    replace_value="control_nontargeting"
)

# Replace the 'KOLF2.1J_NGN2' and 'iCellGluta' with 'control_untreated' in perturbed_target_coord
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_coord",
    to_replace=r"KOLF2.1J_NGN2",
    replace_value="control_untreated"
)

cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_coord",
    to_replace=r"iCellGluta",
    replace_value="control_untreated"
)


Replaced entries _rep\d{1,} ->  in column perturbed_target_coord of adata.obs
Replaced entries - -> : in column perturbed_target_coord of adata.obs
Replaced entries Safe Harbor -> control_nontargeting in column perturbed_target_coord of adata.obs
Replaced entries GFP -> control_nontargeting in column perturbed_target_coord of adata.obs
Replaced entries KOLF2.1J_NGN2 -> control_untreated in column perturbed_target_coord of adata.obs
Replaced entries iCellGluta -> control_untreated in column perturbed_target_coord of adata.obs


### Show unique perturbations

In [14]:
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_coord')

Unique values in adata.obs.perturbed_target_coord: 94
--------------------------------------------------
{'chr17:44901094',
 'chr17:44947795',
 'chr17:44960270',
 'chr17:44970338',
 'chr17:44976410',
 'chr17:44979760',
 'chr17:44987687',
 'chr17:44988418',
 'chr17:45021085',
 'chr17:45043372',
 'chr17:45093169',
 'chr17:45111031',
 'chr17:45132706',
 'chr17:45147554',
 'chr17:45190950',
 'chr17:45219542',
 'chr17:45223036',
 'chr17:45228413',
 'chr17:45241662',
 'chr17:45247571',
 'chr17:45261951',
 'chr17:45288879',
 'chr17:45291439',
 'chr17:45293628',
 'chr17:45302897',
 'chr17:45307692',
 'chr17:45309260',
 'chr17:45309544',
 'chr17:45315000',
 'chr17:45321056',
 'chr17:45326805',
 'chr17:45371250',
 'chr17:45385639',
 'chr17:45429139',
 'chr17:45429323',
 'chr17:45432051',
 'chr17:45432088',
 'chr17:45433464',
 'chr17:45583947',
 'chr17:45613065',
 'chr17:45640752',
 'chr17:45661411',
 'chr17:45771150',
 'chr17:45784528',
 'chr17:45787210',
 'chr17:45813425',
 'chr17:45840618',
 '

### Add `perturbed_target_number` column

In [15]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_coord',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_coord of adata.obs and stored in perturbed_target_number


### Add `perturbed_target_biotype`

In [16]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'perturbed_target_biotype': ['regulatory' if e.startswith('chr') else None for e in cur_data.adata.obs['perturbed_target_coord']]
    },
    overwrite=True
)     

Column perturbed_target_biotype added to adata.obs


### Add empty `perturbed_target_ensg`, `perturbed_target_symbol`, `perturbed_target_biotype` columns

In [17]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'perturbed_target_ensg': None, 
        'perturbed_target_symbol': None
    }
)

Column perturbed_target_ensg added to adata.obs
Column perturbed_target_symbol added to adata.obs


In [18]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_coord', 'perturbed_target_number'])

Observation data:
DataFrame shape: (384, 3)
--------------------------------------------------
                                      perturbation_name  \
title                                                     
CRISPRi_target_chr17-45309260_rep1  chr17-45309260_rep1   
CRISPRi_target_chr17-45907842_rep1  chr17-45907842_rep1   
CRISPRi_target_chr17-45219542_rep1  chr17-45219542_rep1   
CRISPRi_target_chr17-46264873_rep1  chr17-46264873_rep1   
CRISPRi_target_chr17-45840618_rep1  chr17-45840618_rep1   
...                                                 ...   
CRISPRi_target_chr17-45849095_rep3  chr17-45849095_rep3   
CRISPRi_target_chr17-45849095_rep4  chr17-45849095_rep4   
KOLF2.1J_NGN2_rep1                   KOLF2.1J_NGN2_rep1   
KOLF2.1J_NGN2_rep2                   KOLF2.1J_NGN2_rep2   
KOLF2.1J_NGN2_rep3                   KOLF2.1J_NGN2_rep3   

                                   perturbed_target_coord  \
title                                                       
CRISPRi_target_

### Add treatment information

Add treatment information with the dataset

In [19]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "treatment_label": None, 
        "treatment_id": None
    }
)

Column treatment_label added to adata.obs
Column treatment_id added to adata.obs


### Add perturbation information

In [20]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None
    }
)

Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs


### Add timepoint information

In [21]:
cur_data.create_columns(
    slot="obs",
    col_dict={"timepoint": "P0DT0H0M0S"},
)

Column timepoint added to adata.obs


### Add model system information

In [22]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "model_system_label": "stem cell-derived cells", 
        "model_system_id": None
    }
)

Column model_system_label added to adata.obs
Column model_system_id added to adata.obs


### Add tissue information

In [23]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'tissue': 'brain'
    }
)

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Column tissue added to adata.obs
Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        brain              brain      brain  UBERON:0000955
--------------------------------------------------


### Add cell type information

In [24]:
cur_data.show_unique(
    slot='obs',
    column='characteristics.1.cell type'
)

Unique values in adata.obs.characteristics.1.cell type: 2
--------------------------------------------------
{'Gluta Neurons', 'Neurons'}
--------------------------------------------------


In [25]:
# KOLF2.1J NGN2 are incorrectly labeled as neurons, but they are actually glutamatergic neurons
# so we will map them to the correct cell type
cur_data.map_values_from_column(
    ref_col='characteristics.0.cell line',
    target_col='characteristics.1.cell type',
    map_dict={
        'KOLF2.1J NGN2': 'glutamatergic neurons'
    }
)

Mapped value KOLF2.1J NGN2 in column characteristics.0.cell line to glutamatergic neurons in column characteristics.1.cell type of adata.obs


In [26]:
cur_data.replace_entries(
    slot='obs',
    column='characteristics.1.cell type',
    to_replace=r'Gluta Neurons',
    replace_value=r'glutamatergic neurons'
)

cur_data.standardize_ontology(
    input_column='characteristics.1.cell type',
    column_type='term_name',
    ontology_type='cell_type',
    overwrite=True
)

Replaced entries Gluta Neurons -> glutamatergic neurons in column characteristics.1.cell type of adata.obs
Mapped 2 cell_type ontology terms from `characteristics.1.cell type` column to ontology terms
DataFrame shape: (2, 4)
--------------------------------------------------
            input_column     input_column_lower             name_lower  \
0                Neurons                neurons                neurons   
1  glutamatergic neurons  glutamatergic neurons  glutamatergic neurons   

  ontology_id  
0  CL:0000540  
1  CL:0000679  
--------------------------------------------------


### Add cell line information

In [27]:
# correct cell line name from cellosaurus (https://www.cellosaurus.org/CVCL_D1J6)
cur_data.replace_entries(
    slot='obs',
    column='characteristics.0.cell line',
    to_replace=r'KOLF2.1J NGN2',
    replace_value=r'KOLF2.1J AAVS1-TREG3-NGN2'
)

# correct the Fuji cell line name (https://www.fujifilmcdi.com/icell-glutaneurons-01279-ggln01279)
cur_data.replace_entries(
    slot='obs',
    column='characteristics.0.cell line',
    to_replace=r'iCellGluta',
    replace_value=r'iCell GlutaNeurons'
)

Replaced entries KOLF2.1J NGN2 -> KOLF2.1J AAVS1-TREG3-NGN2 in column characteristics.0.cell line of adata.obs
Replaced entries iCellGluta -> iCell GlutaNeurons in column characteristics.0.cell line of adata.obs


In [29]:
# standardization won't work because these cells are not in the ontology
cur_data.standardize_ontology(
    input_column='characteristics.0.cell line',
    column_type='term_name',
    ontology_type='cell_line'
)

# proceed with manually creating the columns
cur_data.create_columns(
    slot='obs',
    col_dict={
        "cell_line_label": cur_data.adata.obs['characteristics.0.cell line'],
        "cell_line_id": None
    }
)

Column cell_line_label added to adata.obs
Column cell_line_id added to adata.obs


### Add disease information

In [None]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        "disease_label": None,
        "disease_id": None
    }
)

Column disease_label added to adata.obs
Column disease_id added to adata.obs


### Add species information

In [31]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "species": "Homo sapiens"
    }
)

Column species added to adata.obs


### Add sex information

In [32]:
cur_data.adata.obs

Unnamed: 0,title,organism,characteristics.0.cell line,characteristics.1.cell type,perturbation_name,perturbed_target_coord,perturbed_target_number,perturbed_target_biotype,perturbed_target_ensg,perturbed_target_symbol,...,tissue,tissue_label,tissue_id,cell_type_label,cell_type_id,cell_line_label,cell_line_id,disease_label,disease_id,species
0,CRISPRi_target_chr17-45309260_rep1,Homo sapiens,BC1,Neurons,chr17-45309260_rep1,chr17:45309260,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,BC1,,,,Homo sapiens
1,CRISPRi_target_chr17-45907842_rep1,Homo sapiens,BC1,Neurons,chr17-45907842_rep1,chr17:45907842,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,BC1,,,,Homo sapiens
2,CRISPRi_target_chr17-45219542_rep1,Homo sapiens,BC1,Neurons,chr17-45219542_rep1,chr17:45219542,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,BC1,,,,Homo sapiens
3,CRISPRi_target_chr17-46264873_rep1,Homo sapiens,BC1,Neurons,chr17-46264873_rep1,chr17:46264873,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,BC1,,,,Homo sapiens
4,CRISPRi_target_chr17-45840618_rep1,Homo sapiens,BC1,Neurons,chr17-45840618_rep1,chr17:45840618,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,BC1,,,,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,CRISPRi_target_chr17-45849095_rep3,Homo sapiens,XCL4,Neurons,chr17-45849095_rep3,chr17:45849095,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,XCL4,,,,Homo sapiens
380,CRISPRi_target_chr17-45849095_rep4,Homo sapiens,XCL4,Neurons,chr17-45849095_rep4,chr17:45849095,1,regulatory,,,...,brain,brain,UBERON:0000955,neuron,CL:0000540,XCL4,,,,Homo sapiens
381,KOLF2.1J_NGN2_rep1,Homo sapiens,KOLF2.1J AAVS1-TREG3-NGN2,glutamatergic neurons,KOLF2.1J_NGN2_rep1,control_untreated,0,,,,...,brain,brain,UBERON:0000955,glutamatergic neuron,CL:0000679,KOLF2.1J AAVS1-TREG3-NGN2,,,,Homo sapiens
382,KOLF2.1J_NGN2_rep2,Homo sapiens,KOLF2.1J AAVS1-TREG3-NGN2,glutamatergic neurons,KOLF2.1J_NGN2_rep2,control_untreated,0,,,,...,brain,brain,UBERON:0000955,glutamatergic neuron,CL:0000679,KOLF2.1J AAVS1-TREG3-NGN2,,,,Homo sapiens


In [33]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "sex_label": None, 
        "sex_id": None
    }
)

cur_data.map_values_from_column(
    ref_col='cell_line_label',
    target_col='sex_label',
    map_dict={
        'BC1': 'female', # https://www.cellosaurus.org/CVCL_RX99
        'XCL4': 'female', # https://cdn.stemcell.com/media/files/pis/DX21378-PIS_1_1_0.pdf
        'iCell GlutaNeurons': 'male', # https://www.fujifilmcdi.com/icell-glutaneurons-01279-ggln01279
        'KOLF2.1J AAVS1-TREG3-NGN2': 'male' # https://www.cellosaurus.org/CVCL_D1J6
    }
)

Column sex_label added to adata.obs
Column sex_id added to adata.obs
Mapped value BC1 in column cell_line_label to female in column sex_label of adata.obs
Mapped value XCL4 in column cell_line_label to female in column sex_label of adata.obs
Mapped value iCell GlutaNeurons in column cell_line_label to male in column sex_label of adata.obs
Mapped value KOLF2.1J AAVS1-TREG3-NGN2 in column cell_line_label to male in column sex_label of adata.obs


### Add developmental stage information

In [34]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None
    }
)

Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs


### Add guides

In [35]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'guide_sequence': None
    }
)

Column guide_sequence added to adata.obs


### Match schema column order

In [36]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [37]:
cur_data.validate_data(slot='obs')

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,perturbation_name,perturbed_target_coord,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,guide_sequence,perturbation_type_label,perturbation_type_id,timepoint,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_label,disease_id
0,chr17-45309260_rep1,chr17:45309260,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,BC1,,female,,adult,,,
1,chr17-45907842_rep1,chr17:45907842,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,BC1,,female,,adult,,,
2,chr17-45219542_rep1,chr17:45219542,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,BC1,,female,,adult,,,
3,chr17-46264873_rep1,chr17:46264873,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,BC1,,female,,adult,,,
4,chr17-45840618_rep1,chr17:45840618,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,BC1,,female,,adult,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,chr17-45849095_rep3,chr17:45849095,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,XCL4,,female,,adult,,,
380,chr17-45849095_rep4,chr17:45849095,1,,,regulatory,,CRISPRi,,P0DT0H0M0S,...,neuron,CL:0000540,XCL4,,female,,adult,,,
381,KOLF2.1J_NGN2_rep1,control_untreated,0,,,,,CRISPRi,,P0DT0H0M0S,...,glutamatergic neuron,CL:0000679,KOLF2.1J AAVS1-TREG3-NGN2,,male,,adult,,,
382,KOLF2.1J_NGN2_rep2,control_untreated,0,,,,,CRISPRi,,P0DT0H0M0S,...,glutamatergic neuron,CL:0000679,KOLF2.1J AAVS1-TREG3-NGN2,,male,,adult,,,


# VAR slot curation

### Standardise genes

In [38]:
cur_data.show_var()

Variable data:
DataFrame shape: (36591, 1)
--------------------------------------------------
         gene_symbol
A1BG            A1BG
A1BG-AS1    A1BG-AS1
A1CF            A1CF
A2M              A2M
A2M-AS1      A2M-AS1
...              ...
ZXDC            ZXDC
ZYG11A        ZYG11A
ZYG11B        ZYG11B
ZYX              ZYX
ZZEF1          ZZEF1

[36591 rows x 1 columns]
--------------------------------------------------


In [39]:
cur_data.standardize_genes(
    slot="var", input_column="gene_symbol", input_column_type="gene_symbol"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in gene_symbol of the provided dataframe to gene symbols
Converted 23978/36591 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [40]:
cur_data.validate_data(slot='var')

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,ENSG00000121410,A1BG
A1BG-AS1,ENSG00000268895,A1BG-AS1
A1CF,ENSG00000148584,A1CF
A2M,ENSG00000175899,A2M
A2M-AS1,ENSG00000245105,A2M-AS1
...,...,...
ZXDC,ENSG00000070476,ZXDC
ZYG11A,ENSG00000203995,ZYG11A
ZYG11B,ENSG00000162378,ZYG11B
ZYX,ENSG00000159840,ZYX


# Metadata curation

### Auto-populate available metadata

In [41]:
cur_data.populate_exp_metadata()

Experiment metadata populated with available fields from adata.obs:
--------------------------------------------------
{'associated_diseases': [{'term_id': None, 'term_label': None}],
 'experiment': {'number_of_perturbed_entities': 384,
                'number_of_perturbed_targets': 94,
                'perturbation_type': [{'term_id': None,
                                       'term_label': 'CRISPRi'}],
                'perturbed_target_biotype': ['regulatory'],
                'perturbed_targets': None,
                'timepoints': ['P0DT0H0M0S'],
                'treatments': [{'term_id': None, 'term_label': None}]},
 'model_system': {'cell_line': [{'term_id': None, 'term_label': 'BC1'},
                                {'term_id': None, 'term_label': 'XCL4'},
                                {'term_id': None,
                                 'term_label': 'iCell GlutaNeurons'},
                                {'term_id': None,
                                 'term_label': 'KOLF2.

### Manually curate metadata

Study details

In [42]:
cur_data.add_exp_metadata(
    metadata_slot="study",
    metadata={
        "title": "Neuronal MAPT expression is mediated by long-range interactions with cis-regulatory elements",
        "study_uri": "https://doi.org/10.1016/j.ajhg.2023.12.015",
        "year": 2024,
        "first_author": {"first_name": "Brianne", "last_name": "Rogers"},
        "last_author": {"first_name": "Jesse", "last_name": "Cochran"},
    }
)

Metadata for 'study' successfully validated:
--------------------------------------------------
{'first_author': {'first_name': 'Brianne', 'last_name': 'Rogers'},
 'last_author': {'first_name': 'Jesse', 'last_name': 'Cochran'},
 'study_uri': 'https://doi.org/10.1016/j.ajhg.2023.12.015',
 'title': 'Neuronal MAPT expression is mediated by long-range interactions '
          'with cis-regulatory elements',
 'year': 2024}
--------------------------------------------------


Experiment details

In [43]:
cur_data.exp_metadata

{'study': {'title': 'Neuronal MAPT expression is mediated by long-range interactions with cis-regulatory elements',
  'study_uri': 'https://doi.org/10.1016/j.ajhg.2023.12.015',
  'year': 2024,
  'first_author': {'first_name': 'Brianne', 'last_name': 'Rogers'},
  'last_author': {'first_name': 'Jesse', 'last_name': 'Cochran'}},
 'experiment': {'treatments': [{'term_id': None, 'term_label': None}],
  'timepoints': ['P0DT0H0M0S'],
  'perturbation_type': [{'term_id': None, 'term_label': 'CRISPRi'}],
  'perturbed_target_biotype': ['regulatory'],
  'number_of_perturbed_targets': 94,
  'perturbed_targets': None,
  'number_of_perturbed_entities': 384},
 'perturbation': {},
 'assay': {},
 'model_system': {'model_system': [{'term_id': None,
    'term_label': 'stem cell-derived cells'}],
  'tissue': [{'term_id': 'UBERON:0000955', 'term_label': 'brain'}],
  'cell_type': [{'term_id': 'CL:0000540', 'term_label': 'neuron'},
   {'term_id': 'CL:0000679', 'term_label': 'glutamatergic neuron'}],
  'cell_l

In [44]:
cur_data.add_exp_metadata(
    metadata_slot='experiment',
    metadata={
        "title": "CRISPRi to profile potential cis-regulatory elements of MAPT",
        "summary": "A total of 384 samples, including non-targeting controls (AAVS1 GSH and GFP) and untrgeted controls (KOLF2.1J NGN2 and iCell GlutaNeurons), were perturbed using dCas-KRAB and sgRNAs targeting potential cis-regulatory elements of MAPT. Perturbations were performed on two different neuronal progenitor cell lines (XLC4, BC1) differentiated into neurons.",
        "replicates": "Multiple replicates",
        "number_of_samples": 384
    }
)

Metadata for 'experiment' successfully validated:
--------------------------------------------------
{'number_of_perturbed_entities': 384,
 'number_of_perturbed_targets': 94,
 'number_of_samples': 384,
 'perturbation_type': [{'term_id': None, 'term_label': 'CRISPRi'}],
 'perturbed_target_biotype': ['regulatory'],
 'perturbed_targets': None,
 'replicates': 'Multiple replicates',
 'summary': 'A total of 384 samples, including non-targeting controls (AAVS1 '
            'GSH and GFP) and untrgeted controls (KOLF2.1J NGN2 and iCell '
            'GlutaNeurons), were perturbed using dCas-KRAB and sgRNAs '
            'targeting potential cis-regulatory elements of MAPT. '
            'Perturbations were performed on two different neuronal progenitor '
            'cell lines (XLC4, BC1) differentiated into neurons.',
 'timepoints': ['P0DT0H0M0S'],
 'title': 'CRISPRi to profile potential cis-regulatory elements of MAPT',
 'treatments': [{'term_id': None, 'term_label': None}]}
---------------

Perturbation details

In [45]:
cur_data.add_exp_metadata(
    metadata_slot='perturbation',
    metadata={
         "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022895",
            "term_label": "dCas9-KRAB",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "arrayed",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "inhibition",
                },
            ],
            "manufacturer": "Cochran",
            "lentiviral_generation": "3",
            "grnas_per_target": "1+",
            "total_grnas": "96",
            "total_variants": None
        }
    }
)

Metadata for 'perturbation' successfully validated:
--------------------------------------------------
{'enzyme_delivery_method': {'term_id': None,
                            'term_label': 'lentiviral transduction'},
 'enzyme_expression_control': {'term_id': None,
                               'term_label': 'constitutive expression'},
 'enzyme_integration_state': {'term_id': None,
                              'term_label': 'random locus integration'},
 'library': {'accession': None,
             'grnas_per_target': '1+',
             'lentiviral_generation': '3',
             'library_format': {'term_id': None, 'term_label': 'arrayed'},
             'library_name': 'custom',
             'library_perturbation_type': [{'term_id': None,
                                            'term_label': 'inhibition'}],
             'library_scope': {'term_id': None, 'term_label': 'focused'},
             'manufacturer': 'Cochran',
             'total_grnas': '96',
             'total_variants':

Assay details

In [46]:
cur_data.add_exp_metadata(
    metadata_slot='assay',
    metadata={
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "CRISPR screen",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "QuantSeq 3′ mRNA-Seq Library Prep Kit FWD",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina NovaSeq 6000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "htseq-count"},
        "software_analysis": {"term_id": None, "term_label": "custom"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh38",
        }
    }
)

Metadata for 'assay' successfully validated:
--------------------------------------------------
{'method_name': {'term_id': None, 'term_label': 'CRISPR screen'},
 'method_uri': None,
 'readout_dimensionality': {'term_id': None,
                            'term_label': 'high-dimensional assay'},
 'readout_technology': {'term_id': None, 'term_label': 'rna-seq'},
 'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
 'reference_genome': {'term_id': None, 'term_label': 'GRCh38'},
 'sequencing_library_kit': {'term_id': None,
                            'term_label': 'QuantSeq 3′ mRNA-Seq Library Prep '
                                          'Kit FWD'},
 'sequencing_platform': {'term_id': None,
                         'term_label': 'Illumina NovaSeq 6000'},
 'sequencing_strategy': {'term_id': None, 'term_label': 'barcode sequencing'},
 'software_analysis': {'term_id': None, 'term_label': 'custom'},
 'software_counts': {'term_id': None, 'term_label': 'htseq-count'}}
-------

Model system details

In [47]:
cur_data.add_exp_metadata(
    metadata_slot='model_system',
    metadata={
        "species": "Homo sapiens",
        "passage_number": None,
        }
)

Metadata for 'model_system' successfully validated:
--------------------------------------------------
{'cell_line': [{'term_id': None, 'term_label': 'BC1'},
               {'term_id': None, 'term_label': 'XCL4'},
               {'term_id': None, 'term_label': 'iCell GlutaNeurons'},
               {'term_id': None, 'term_label': 'KOLF2.1J AAVS1-TREG3-NGN2'}],
 'cell_type': [{'term_id': 'CL:0000540', 'term_label': 'neuron'},
               {'term_id': 'CL:0000679', 'term_label': 'glutamatergic neuron'}],
 'developmental_stage': [{'term_id': None, 'term_label': 'adult'}],
 'model_system': [{'term_id': None, 'term_label': 'stem cell-derived cells'}],
 'passage_number': None,
 'sex': [{'term_id': None, 'term_label': 'female'},
         {'term_id': None, 'term_label': 'male'}],
 'species': 'Homo sapiens',
 'tissue': [{'term_id': 'UBERON:0000955', 'term_label': 'brain'}]}
--------------------------------------------------


Associated dataset details

In [48]:
cur_data.add_exp_metadata(
    metadata_slot='associated_datasets',
    metadata=[
        {
            "dataset_accession": "GSE228115",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE228115",
            "dataset_description": "Raw counts",
            "dataset_file_name": "GSE228115_RNA_counts.txt.gz",
        }
    ]
)

Metadata for 'associated_datasets' successfully validated:
--------------------------------------------------
[{'dataset_accession': 'GSE228115',
  'dataset_description': 'Raw counts',
  'dataset_file_name': 'GSE228115_RNA_counts.txt.gz',
  'dataset_uri': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE228115'}]
--------------------------------------------------


### Validate metadata

In [49]:
cur_data.validate_exp_metadata()

Experiment metadata successfully validated:
--------------------------------------------------
{'assay': {'method_name': {'term_id': None, 'term_label': 'CRISPR screen'},
           'method_uri': None,
           'readout_dimensionality': {'term_id': None,
                                      'term_label': 'high-dimensional assay'},
           'readout_technology': {'term_id': None, 'term_label': 'rna-seq'},
           'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
           'reference_genome': {'term_id': None, 'term_label': 'GRCh38'},
           'sequencing_library_kit': {'term_id': None,
                                      'term_label': 'QuantSeq 3′ mRNA-Seq '
                                                    'Library Prep Kit FWD'},
           'sequencing_platform': {'term_id': None,
                                   'term_label': 'Illumina NovaSeq 6000'},
           'sequencing_strategy': {'term_id': None,
                                   'term_label':

# Save the dataset

In [51]:
cur_data.save_curated_data()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/rogers_2024_curated.h5ad


In [50]:
cur_data.curated_path

'../curated/h5ad/rogers_2024_curated.h5ad'