# Import

In [1]:
import pandas as pd
import json

from curation_tools.curation_tools import (
    CuratedDataset,
    ObsSchema,
    VarSchema,
    Experiment,
    download_file,
    upload_parquet_to_bq
)

import logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("curation.log"),
        logging.StreamHandler(),  # keep console output too
    ],
    force=True,
)

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



# Download data

In [2]:
noncurated_path = "../non_curated/h5ad/nadig_2025_jurkat.h5ad"
download_file(
    url="https://ftp.ncbi.nlm.nih.gov/geo/series/GSE264nnn/GSE264667/suppl/GSE264667%5Fjurkat%5Fraw%5Fsinglecell%5F01.h5ad",
    dest_path=noncurated_path
)

2025-12-09 15:29:48,012 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): ftp.ncbi.nlm.nih.gov:443
2025-12-09 15:29:48,425 DEBUG urllib3.connectionpool: https://ftp.ncbi.nlm.nih.gov:443 "GET /geo/series/GSE264nnn/GSE264667/suppl/GSE264667_jurkat_raw_singlecell_01.h5ad HTTP/1.1" 200 9366490264


Downloaded https://ftp.ncbi.nlm.nih.gov/geo/series/GSE264nnn/GSE264667/suppl/GSE264667%5Fjurkat%5Fraw%5Fsinglecell%5F01.h5ad to ../non_curated/h5ad/nadig_2025_jurkat.h5ad


# Initialise the dataset object

In [3]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    noncurated_path=noncurated_path
)

cur_data.load_data()

Loading data from ../non_curated/h5ad/nadig_2025_jurkat.h5ad


2025-12-09 15:37:30,168 DEBUG h5py._conv: Creating converter from 3 to 5


In [4]:
cur_data.adata.obs

Unnamed: 0_level_0,gem_group,gene,gene_id,transcript,gene_transcript,sgID_AB,mitopercent,UMI_count,z_gemgroup_UMI
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAACCCAAGAAACTGT-27,27,NELFE,ENSG00000204356,P1P2,5601_NELFE_P1P2_ENSG00000204356,NELFE_+_31926720.23-P1P2|NELFE_-_31926676.23-P1P2,0.063665,13194.0,0.106271
AAACCCAAGAAATCCA-12,12,EMC7,ENSG00000134153,P1P2,2616_EMC7_P1P2_ENSG00000134153,EMC7_+_34394068.23-P1P2|EMC7_-_34393868.23-P1P2,0.049182,9719.0,-0.054858
AAACCCAAGAAATTCG-56,56,TAF1D,ENSG00000166012,P2,8659_TAF1D_P2_ENSG00000166012,TAF1D_-_93471390.23-P2|TAF1D_+_93471338.23-P2,0.055632,11576.0,-0.138458
AAACCCAAGAAGCCAC-26,26,EIF2B2,ENSG00000119718,P1P2,2536_EIF2B2_P1P2_ENSG00000119718,EIF2B2_-_75469671.23-P1P2|EIF2B2_-_75469856.23...,0.044284,12849.0,-0.422243
AAACCCAAGACAACTA-5,5,RPP30,ENSG00000148688,P1P2,7491_RPP30_P1P2_ENSG00000148688,RPP30_+_92631924.23-P1P2|RPP30_-_92631746.23-P1P2,0.072090,11555.0,-1.806991
...,...,...,...,...,...,...,...,...,...
TTTGTTGTCTGTAAGC-21,21,UHRF1,ENSG00000276043,P1P2,9583_UHRF1_P1P2_ENSG00000276043,UHRF1_+_4910126.23-P1P2|UHRF1_+_4909924.23-P1P2,0.065960,5003.0,-1.069798
TTTGTTGTCTTAGCTT-9,9,EIF1AX,ENSG00000173674,P1P2,2527_EIF1AX_P1P2_ENSG00000173674,EIF1AX_+_20159985.23-P1P2|EIF1AX_-_20159725.23...,0.063649,12726.0,0.727394
TTTGTTGTCTTCCTAA-56,56,MCM3,ENSG00000112118,P1P2,4900_MCM3_P1P2_ENSG00000112118,MCM3_-_52149354.23-P1P2|MCM3_-_52149307.23-P1P2,0.030737,9630.0,-0.521912
TTTGTTGTCTTCGTAT-29,29,MED20,ENSG00000124641,P1P2,4943_MED20_P1P2_ENSG00000124641,MED20_+_41888837.23-P1P2|MED20_+_41888805.23-P1P2,0.056258,5777.0,-1.199783


# OBS slot curation

### Show unique perturbations

In [5]:
cur_data.show_unique(slot = 'obs', column = 'sgID_AB')

Unique values in adata.obs.sgID_AB: 2679
--------------------------------------------------
{'AAAS_-_53715438.23-P1P2|AAAS_+_53715355.23-P1P2',
 'AAMP_+_219134851.23-P1P2|AAMP_+_219134841.23-P1P2',
 'AAR2_-_34824434.23-P1P2|AAR2_+_34824488.23-P1P2',
 'AARS2_+_44281027.23-P1P2|AARS2_+_44281044.23-P1P2',
 'AARS_+_70323362.23-P1P2|AARS_-_70323332.23-P1P2',
 'AASDHPPT_+_105948405.23-P1P2|AASDHPPT_+_105948450.23-P1P2',
 'AATF_-_35306286.23-P1P2|AATF_-_35306346.23-P1P2',
 'ABCB10_+_229694285.23-P1P2|ABCB10_-_229694297.23-P1P2',
 'ABCB7_-_74376019.23-P1P2|ABCB7_+_74375885.23-P1P2',
 'ABCE1_-_146019502.23-P1P2|ABCE1_-_146019516.23-P1P2',
 'ABCF1_+_30539238.23-P1|ABCF1_+_30539469.23-P1',
 'ABCF1_-_30546354.23-P2|ABCF1_-_30546344.23-P2',
 'ABCG1_-_43639282.23-P1P2|ABCG1_+_43639503.23-P1P2',
 'ABHD11_-_73153094.23-P1P2|ABHD11_+_73152963.23-P1P2',
 'ABHD17A_-_1885483.23-P1P2|ABHD17A_+_1885470.23-P1P2',
 'ABT1_-_26597263.23-P1P2|ABT1_+_26597412.23-P1P2',
 'ACD_-_67694143.23-P1P2|ACD_+_67694124.23-P

### Rename `sgID_AB` to `perturbation_name`

In [6]:
cur_data.rename_columns(slot = 'obs', name_dict = {'sgID_AB': 'perturbation_name'})

Renamed columns in adata.obs: {'sgID_AB': 'perturbation_name'}


### Add guide RNA information

In [7]:
# download the guide RNA spreadsheet
download_file(
    url="https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-025-02169-3/MediaObjects/41588_2025_2169_MOESM4_ESM.xlsx",
    dest_path="../supplementary/nadig_2025_guide_info.xlsx"
)

# read in the guide RNA spreadsheet
# guides for the essential library are in "ST20"
guide_info_df = pd.read_excel("../supplementary/nadig_2025_guide_info.xlsx", sheet_name="ST20")

# create perturbation_name column in guide_info_df
guide_info_df['perturbation_name'] = guide_info_df['sgID_A'] + '|' + guide_info_df['sgID_B']
# replace commas with hyphens in perturbation_name
guide_info_df['perturbation_name'] = guide_info_df['perturbation_name'].str.replace(',', '-')
# check that all perturbation names in cur_data are in guide_info_df
print(f"All perturbation names in cur_data are in guide_info_df: {cur_data.adata.obs['perturbation_name'].isin(guide_info_df['perturbation_name']).all()}")
# create guide_sequence column in guide_info_df
guide_info_df['guide_sequence'] = guide_info_df['targeting sequence A'] + '|' + guide_info_df['targeting sequence B']
# subset for necessary columns
guide_info_df = guide_info_df[['perturbation_name', 'guide_sequence']]
# merge cur_data.adata.obs with guide_info_df on perturbation_name
cur_data.adata.obs = cur_data.adata.obs.merge(guide_info_df, on='perturbation_name', how='left')
# check that there are no missing guide sequences
print(f"Number of missing guide sequences: {cur_data.adata.obs['guide_sequence'].isna().sum()}")


File ../supplementary/nadig_2025_guide_info.xlsx already exists. Skipping download.
All perturbation names in cur_data are in guide_info_df: True
Number of missing guide sequences: 0


  return dispatch(args[0].__class__)(*args, **kw)


### Standardise perturbation targets

In [8]:
cur_data.standardize_genes(
    slot='obs',
    input_column='gene',
    input_column_type='gene_symbol',
    multiple_entries=False,
    # remove_version=True,
    # version_sep='.'
)

Mapping gene symbols: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2394/2394 [00:00<00:00, 10716.16it/s]


--------------------------------------------------
Successfully mapped 2392 out of 2394 gene symbols.
--------------------------------------------------
Couldn't map gene symbols: ['AC118549.1', 'MTRNR2L1']
--------------------------------------------------


In [9]:
cur_data.adata.obs.head()

Unnamed: 0_level_0,UMI_count,guide_sequence,mitopercent,gene_id,gem_group,transcript,gene,perturbation_name,gene_transcript,z_gemgroup_UMI,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,perturbed_target_coord,perturbed_target_chromosome
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,13194.0,GGTTGTGAGCCCGCTATCAG|GCAGAGGGCCTTACCCGAGG,0.063665,ENSG00000204356,27,P1P2,NELFE,NELFE_+_31926720.23-P1P2|NELFE_-_31926676.23-P1P2,5601_NELFE_P1P2_ENSG00000204356,0.106271,ENSG00000204356,NELFE,protein_coding,chr6:31952087-31959038;-1,6
1,9719.0,GGAGACTGGGAGGGCTTCCG|GCCGCCGAGATCCAGTCCTG,0.049182,ENSG00000134153,12,P1P2,EMC7,EMC7_+_34394068.23-P1P2|EMC7_-_34393868.23-P1P2,2616_EMC7_P1P2_ENSG00000134153,-0.054858,ENSG00000134153,EMC7,protein_coding,chr15:34083980-34101910;-1,15
2,11576.0,GTATCTATTAGTGAGTATAT|GAATTTAGAAGCAGAGGATC,0.055632,ENSG00000166012,56,P2,TAF1D,TAF1D_-_93471390.23-P2|TAF1D_+_93471338.23-P2,8659_TAF1D_P2_ENSG00000166012,-0.138458,ENSG00000166012,TAF1D,protein_coding,chr11:93729948-93784391;-1,11
3,12849.0,GGTGTGGATTCCGCCGGTGA|GACCACCGCTGGAGCAACGC,0.044284,ENSG00000119718,26,P1P2,EIF2B2,EIF2B2_-_75469671.23-P1P2|EIF2B2_-_75469856.23...,2536_EIF2B2_P1P2_ENSG00000119718,-0.422243,ENSG00000119718,EIF2B2,protein_coding,chr14:75002894-75012366;1,14
4,11555.0,GGGGACATCCCAGAGACTCT|GCGGTCATGGGACTTCAGCA,0.07209,ENSG00000148688,5,P1P2,RPP30,RPP30_+_92631924.23-P1P2|RPP30_-_92631746.23-P1P2,7491_RPP30_P1P2_ENSG00000148688,-1.806991,ENSG00000148688,RPP30,protein_coding,chr10:90871487-90908553;1,10


### Add `perturbed_target_number` column

In [10]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [11]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [12]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (262956, 2)
--------------------------------------------------
                                        perturbation_name  \
index                                                       
0       NELFE_+_31926720.23-P1P2|NELFE_-_31926676.23-P1P2   
1         EMC7_+_34394068.23-P1P2|EMC7_-_34393868.23-P1P2   
2           TAF1D_-_93471390.23-P2|TAF1D_+_93471338.23-P2   
3       EIF2B2_-_75469671.23-P1P2|EIF2B2_-_75469856.23...   
4       RPP30_+_92631924.23-P1P2|RPP30_-_92631746.23-P1P2   
...                                                   ...   
262951    UHRF1_+_4910126.23-P1P2|UHRF1_+_4909924.23-P1P2   
262952  EIF1AX_+_20159985.23-P1P2|EIF1AX_-_20159725.23...   
262953    MCM3_-_52149354.23-P1P2|MCM3_-_52149307.23-P1P2   
262954  MED20_+_41888837.23-P1P2|MED20_+_41888805.23-P1P2   
262955    RPF1_-_84945270.23-P1P2|RPF1_+_84945350.23-P1P2   

        perturbed_target_chromosome_encoding  
index                                         
0             

### Add metadata

In [13]:
cur_data.create_columns(
    overwrite=True,
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # perturbation type
        "perturbation_type_label": "CRISPRi",
        "perturbation_type_id": None,
        "data_modality": "CRISPR screen",
        "significant": None,
        "significance_criteria": None,
        "score_interpretation": None,

        # treatment
        "treatment_label": None,
        "treatment_id": None,
        # replicates
        "technical_replicate": None,
        "biological_replicate": None,
        # model system
        "model_system_label": "cell_line",
        "model_system_id": None,
        "tissue": "blood",
        "cell_line_label": "JURKAT cell",
        "cell_type_label": "T cell",
        "disease_label": "T-cell childhood acute lymphocytic leukemia",

        "timepoint": "P7DT0H0M0S",
        "species": "Homo sapiens",
        "sex_label": "male",
        "sex_id": None,
        "developmental_stage_label": "adolescent",
        "developmental_stage_id": None,

        "study_title": "Transcriptome-wide analysis of differential expression in perturbation atlases",
        "study_uri": "https://doi.org/10.1038/s41588-025-02169-3",
        "study_year": 2025,
        "first_author": "Ajay Nadig",
        "last_author": "Luke J. Oâ€™Connor",

        "experiment_title": "Jurkat day 7 essential gene set Perturb-seq experiment",
        "experiment_summary": """
            Jurkat cells were transduced with a sgRNA library targeting a set of 2,393 common essential genes and sampled at day 7 after lentiviral transduction.
            Multiplexed CRISPRi library containing two distinct guides targeting the same gene were used.
            """,

        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],

        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": None,
        "library_generation_method_label": "dCas9-KRAB-Zim3",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "lentivirus transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentivirus transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive transgene expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive transgene expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",

        "library_manufacturer": "Weissman Lab",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "2",
        "library_total_grnas": str(cur_data.adata.obs['guide_sequence'].str.split('|').explode().nunique()),
        "library_total_variants": None,

        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime v3",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina NovaSeq 6000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "TRADE",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh38",

        "license_label": "free to use license",
        "license_id": "SWO:1000061",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSE264667",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE264667&format=file&file=GSE264667%5Fjurkat%5Fraw%5Fsinglecell%5F01%2Eh5ad",
                "dataset_description": "Raw data for Jurkat Perturb-seq experiment",
                "dataset_file_name": "GSE264667_jurkat_raw_singlecell_01.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column data_modality added to adata.obs
Column significant added to adata.obs
Column significance_criteria added to adata.obs
Column score_interpretation added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column technical_replicate added to adata.obs
Column biological_replicate added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column tissue added to adata.obs
Column cell_line_label added to adata.obs
Column cell_type_label added to adata.obs
Column disease_label added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs

### Curate replicate information

In [14]:
cur_data.adata.obs['technical_replicate'] = cur_data.adata.obs['gem_group'].astype(str)

### Curate tissue information


In [15]:
cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell type information

In [16]:
cur_data.standardize_ontology(
    input_column='cell_type_label',
    column_type='term_name',
    ontology_type='cell_type',
    overwrite=True
)

Mapped 1 cell_type ontology terms from `cell_type_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower ontology_id
0       T cell             t cell     t cell  CL:0000084
--------------------------------------------------
Overwriting column cell_type_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell line information

In [17]:
cur_data.standardize_ontology(
    input_column='cell_line_label',
    column_type='term_name',
    ontology_type='cell_line',
    overwrite=True
)

Mapped 1 cell_line ontology terms from `cell_line_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower   name_lower  ontology_id
0  JURKAT cell        jurkat cell  jurkat cell  CLO:0007043
--------------------------------------------------
Overwriting column cell_line_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Curate disease information

In [18]:
cur_data.standardize_ontology(
    input_column='disease_label',
    column_type='term_name',
    ontology_type='disease',
    overwrite=True
)

Mapped 1 disease ontology terms from `disease_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                                  input_column  \
0  T-cell childhood acute lymphocytic leukemia   

                            input_column_lower  \
0  t-cell childhood acute lymphocytic leukemia   

                                    name_lower    ontology_id  
0  t-cell childhood acute lymphocytic leukemia  MONDO:0000871  
--------------------------------------------------
Overwriting column disease_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Match schema column order

In [19]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [20]:
cur_data.validate_data(slot='obs', verbose=True)

2025-12-09 15:38:26,942 DEBUG curation_tools.curation_tools: Applying dtype casting on adata.obs for columns: ['dataset_id', 'sample_id', 'data_modality', 'significant', 'significance_criteria', 'perturbation_name', 'perturbed_target_coord', 'perturbed_target_chromosome', 'perturbed_target_chromosome_encoding', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_biotype', 'guide_sequence', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'technical_replicate', 'biological_replicate', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_label', 'disease_id', 'study_title', 'study_uri', 'study_year', 'first_author', 'last_author', 'experiment_title', 'experiment_summary', 'number_of_perturbed_targets', 'number_of

Unnamed: 0,dataset_id,sample_id,data_modality,significant,significance_criteria,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,...,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,score_interpretation,reference_genome_id,reference_genome_label,associated_datasets,license_label,license_id
0,nadig_2025_jurkat,1,CRISPR screen,,,NELFE_+_31926720.23-P1P2|NELFE_-_31926676.23-P1P2,chr6:31952087-31959038;-1,6,6,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
1,nadig_2025_jurkat,2,CRISPR screen,,,EMC7_+_34394068.23-P1P2|EMC7_-_34393868.23-P1P2,chr15:34083980-34101910;-1,15,15,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
2,nadig_2025_jurkat,3,CRISPR screen,,,TAF1D_-_93471390.23-P2|TAF1D_+_93471338.23-P2,chr11:93729948-93784391;-1,11,11,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
3,nadig_2025_jurkat,4,CRISPR screen,,,EIF2B2_-_75469671.23-P1P2|EIF2B2_-_75469856.23...,chr14:75002894-75012366;1,14,14,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
4,nadig_2025_jurkat,5,CRISPR screen,,,RPP30_+_92631924.23-P1P2|RPP30_-_92631746.23-P1P2,chr10:90871487-90908553;1,10,10,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262951,nadig_2025_jurkat,262952,CRISPR screen,,,UHRF1_+_4910126.23-P1P2|UHRF1_+_4909924.23-P1P2,chr19:4903080-4962154;1,19,19,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
262952,nadig_2025_jurkat,262953,CRISPR screen,,,EIF1AX_+_20159985.23-P1P2|EIF1AX_-_20159725.23...,chrX:20124525-20141982;-1,X,23,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
262953,nadig_2025_jurkat,262954,CRISPR screen,,,MCM3_-_52149354.23-P1P2|MCM3_-_52149307.23-P1P2,chr6:52264011-52284881;-1,6,6,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
262954,nadig_2025_jurkat,262955,CRISPR screen,,,MED20_+_41888837.23-P1P2|MED20_+_41888805.23-P1P2,chr6:41905354-41921139;-1,6,6,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061


# VAR slot curation

### Standardise genes

In [23]:
cur_data.show_var()

Variable data:
DataFrame shape: (8882, 13)
--------------------------------------------------
                  gene_name         chr    start      end           class  \
gene_id                                                                     
ENSG00000237491   LINC01409        chr1   778747   810065  gene_version10   
ENSG00000228794   LINC01128        chr1   825138   868202   gene_version9   
ENSG00000188976       NOC2L        chr1   944203   959309  gene_version11   
ENSG00000188290        HES4        chr1   998962  1000172  gene_version10   
ENSG00000187608       ISG15        chr1  1001138  1014540  gene_version10   
...                     ...         ...      ...      ...             ...   
ENSG00000198786      MT-ND5        chrM    12337    14148   gene_version2   
ENSG00000198695      MT-ND6        chrM    14149    14673   gene_version2   
ENSG00000198727      MT-CYB        chrM    14747    15887   gene_version2   
ENSG00000271254  AC240274.1  KI270711.1     4612    29626  

In [22]:
cur_data.create_columns(
    slot = 'var',
    col_dict={'gene_ensembl_id': cur_data.adata.var.index},
    overwrite=True
)

Column gene_ensembl_id added to adata.var


In [24]:
cur_data.standardize_genes(
    slot="var",
    input_column="gene_ensembl_id",
    input_column_type="ensembl_gene_id",
    remove_version=False,
    multiple_entries=False
)

2025-12-09 15:40:02,112 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Missing Ensembl IDs: ['ENSG00000272316', 'ENSG00000176659', 'ENSG00000270019', 'ENSG00000259972', 'ENSG00000272872', 'ENSG00000271254', 'ENSG00000283440', 'ENSG00000215271', 'ENSG00000287263', 'ENSG00000272583', 'ENSG00000239665', 'ENSG00000264920', 'ENSG00000269028', 'ENSG00000215067', 'ENSG00000260793', 'ENSG00000270195', 'ENSG00000277203', 'ENSG00000237356', 'ENSG00000276345', 'ENSG00000228434', 'ENSG00000112096', 'ENSG00000283633', 'ENSG00000130723', 'ENSG00000148362']; attempting to fetch latest IDs...


2025-12-09 15:40:03,504 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 5054


Fetched latest Ensembl IDs: {'ENSG00000272316': 'ENSG00000290555', 'ENSG00000176659': 'ENSG00000228340', 'ENSG00000270019': 'ENSG00000228857', 'ENSG00000259972': 'ENSG00000290890', 'ENSG00000272872': 'ENSG00000206195', 'ENSG00000283440': 'ENSG00000244558', 'ENSG00000215271': 'ENSG00000290292', 'ENSG00000287263': 'ENSG00000177738', 'ENSG00000272583': 'ENSG00000228826', 'ENSG00000239665': 'ENSG00000165630', 'ENSG00000264920': 'ENSG00000234494', 'ENSG00000269028': nan, 'ENSG00000215067': 'ENSG00000267532', 'ENSG00000260793': 'ENSG00000267394', 'ENSG00000270195': nan, 'ENSG00000277203': 'ENSG00000288709', 'ENSG00000237356': 'ENSG00000258731', 'ENSG00000228434': 'ENSG00000290758', 'ENSG00000112096': 'ENSG00000291237', 'ENSG00000283633': 'ENSG00000290418', 'ENSG00000130723': nan, 'ENSG00000148362': 'ENSG00000310560'}
--------------------------------------------------
Successfully mapped 8858 out of 8858 Ensembl IDs.
--------------------------------------------------


### Validate var metadata

In [25]:
cur_data.validate_data(slot='var')

2025-12-09 15:40:12,048 INFO curation_tools.curation_tools: adata.var is valid according to the var_schema.
2025-12-09 15:40:12,049 DEBUG curation_tools.curation_tools: Validated adata.var preview (shape=(8882, 2)):
                 ensembl_gene_id gene_symbol
index                                       
ENSG00000237491  ENSG00000237491   LINC01409
ENSG00000228794  ENSG00000228794   LINC01128
ENSG00000188976  ENSG00000188976       NOC2L
ENSG00000188290  ENSG00000188290        HES4
ENSG00000187608  ENSG00000187608       ISG15


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000237491,ENSG00000237491,LINC01409
ENSG00000228794,ENSG00000228794,LINC01128
ENSG00000188976,ENSG00000188976,NOC2L
ENSG00000188290,ENSG00000188290,HES4
ENSG00000187608,ENSG00000187608,ISG15
...,...,...
ENSG00000198786,ENSG00000198786,MT-ND5
ENSG00000198695,ENSG00000198695,MT-ND6
ENSG00000198727,ENSG00000198727,MT-CYB
ENSG00000271254,,


# Save the dataset

In [26]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)
... storing 'dataset_id' as categorical
... storing 'data_modality' as categorical
... storing 'significance_criteria' as categorical
... storing 'perturbation_name' as categorical
... storing 'perturbed_target_coord' as categorical
... storing 'perturbed_target_chromosome' as categorical
... storing 'perturbed_target_ensg' as categorical
... storing 'perturbed_target_symbol' as categorical
... storing 'perturbed_target_biotype' as categorical
... storing 'guide_sequence' as categorical
... storing 'perturbation_type_label' as categorical
... storing 'perturbation_type_id' as categorical
... storing 'timepoint' as categorical
... storing 'treatment_label' as categorical
... storing 'treatment_id' as categorical
... storing 'technical_replicate' as categorical
... storing 'biological_replicate' as categorical
... storing 'model_system_label' as categorical
... storing 'model_system_id' as categorical
... storing 'species' as categorical
... s

âœ… Curated h5ad data saved to ../curated/h5ad/nadig_2025_jurkat_curated.h5ad


In [27]:
cur_data.save_curated_data_parquet(split_metadata=True, save_metadata_only=True)

âœ… Metadata saved to ../curated/parquet/nadig_2025_jurkat_curated_metadata.parquet


# Upload to BigQuery

In [28]:
upload_parquet_to_bq(
    parquet_path='../curated/parquet/nadig_2025_jurkat_curated_metadata.parquet',
    bq_dataset_id='prj-ext-dev-pertcat-437314.perturb_seq',
    bq_table_name='metadata',
    key_columns=['dataset_id', 'sample_id'],
    verbose=True
)

2025-12-09 15:43:15,679 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-09 15:43:15,681 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-09 15:43:16,462 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-09 15:43:16,463 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-09 15:43:16,832 DEBUG google.cloud.bigquery.opentelemetry_tracing: This service is instrumented using OpenTelemetry. OpenTelemetry or one of its components could not be imported; please add compatible versions of opentelemetry-api and opentelemetry-instrumentation packages in order to get BigQuery Tracing data.
2025-12-09 15:43:16,833 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-09 15:43:16,834 DEBUG google.auth.transport.requests: Making reques

Staging table: loading `.parquet` file ../curated/parquet/nadig_2025_jurkat_curated_metadata.parquet to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging...


2025-12-09 15:43:17,839 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable HTTP/1.1" 200 0
2025-12-09 15:43:18,501 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "PUT /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable&upload_id=AHVrFxPK01FloSb92_5O-rgMJOq_B0dM5GSqA5epNibmBBN0kpH1LiKQX7ORs7m-FtmL2VoAlvbQ4XlYlxoyNmSg479ztC3V6eA7q14vgA42oQ HTTP/1.1" 200 14002
2025-12-09 15:43:18,608 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/d5fc481f-c3d6-4a74-a428-19ff4b1afa89?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-09 15:43:18,611 DEBUG google.api_core.retry: Retrying due to , sleeping 0.9s ...
2025-12-09 15:43:19,588 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-4

Staging table: loaded 262956 rows to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-09 15:43:23,230 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-09 15:43:23,230 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-09 15:43:23,598 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-09 15:43:23,599 DEBUG google.auth.transport.requests: Making request: POST https://oauth2.googleapis.com/token
2025-12-09 15:43:23,600 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): oauth2.googleapis.com:443
2025-12-09 15:43:23,678 DEBUG urllib3.connectionpool: https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
2025-12-09 15:43:23,680 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): bigquery.googleapis.com:443
2025-12-09 15:43:24,026 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?pre

Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-09 15:43:30,970 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/datasets/perturb_seq/tables/metadata?prettyPrint=false HTTP/1.1" 200 None
2025-12-09 15:43:31,462 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?prettyPrint=false HTTP/1.1" 200 None
2025-12-09 15:43:31,540 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/186f3f53-69de-44b8-a451-5c80873d6033?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-09 15:43:36,469 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/queries/186f3f53-69de-44b8-a451-5c80873d6033?maxResults=0&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-09 15:43:36,552 DEBUG urllib3.connectionpool: https://bigquery.go

Merge completed: staging â†’ prj-ext-dev-pertcat-437314.perturb_seq.metadata with type-safe casting.
Staging table: deleted prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


# Upload to GC Storage

In [29]:
!gcloud storage cp ../curated/h5ad/nadig_2025_jurkat_curated.h5ad gs://perturbation-catalogue-lake/perturbseq/curated/

uploading large objects. If you would like to opt-out and instead
perform a normal upload, run:
`gcloud config set storage/parallel_composite_upload_enabled False`
`gcloud config set storage/parallel_composite_upload_enabled True`
Note that with parallel composite uploads, your object might be
uploaded as a composite object
(https://cloud.google.com/storage/docs/composite-objects), which means
that any user who downloads your object will need to use crc32c
checksums to verify data integrity. gcloud storage is capable of
computing crc32c checksums, but this might pose a problem for other
clients.

Copying file://../curated/h5ad/nadig_2025_jurkat_curated.h5ad to gs://perturbation-catalogue-lake/perturbseq/curated/nadig_2025_jurkat_curated.h5ad
  Completed files 32/1 | 8.8GiB/8.8GiB | 46.9MiB/s                             

Average throughput: 39.3MiB/s
