# Import

In [1]:
import pandas as pd
import json

from curation_tools.curation_tools import (
    CuratedDataset,
    ObsSchema,
    VarSchema,
    Experiment,
    download_file,
    upload_parquet_to_bq
)

import logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("curation.log"),
        logging.StreamHandler(),  # keep console output too
    ],
    force=True,
)

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



# Download data

In [2]:
noncurated_path = "../non_curated/h5ad/nadig_2025_hepg2.h5ad"
download_file(
    url="https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE264667&format=file&file=GSE264667%5Fhepg2%5Fraw%5Fsinglecell%5F01%2Eh5ad",
    dest_path=noncurated_path
)

File ../non_curated/h5ad/nadig_2025_hepg2.h5ad already exists. Skipping download.


# Initialise the dataset object

In [3]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    noncurated_path=noncurated_path
)

cur_data.load_data()

Loading data from ../non_curated/h5ad/nadig_2025_hepg2.h5ad


2025-12-08 15:37:10,522 DEBUG h5py._conv: Creating converter from 3 to 5


In [4]:
cur_data.adata.obs

Unnamed: 0_level_0,gem_group,gene,gene_id,transcript,gene_transcript,sgID_AB,mitopercent,UMI_count,z_gemgroup_UMI
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAACCCAAGAATAGTC-3,3,KIAA1143,ENSG00000163807,P1P2,4360_KIAA1143_P1P2_ENSG00000163807,KIAA1143_+_44803075.23-P1P2|KIAA1143_+_4480308...,0.114029,11234.0,-0.611091
AAACCCAAGACAGCTG-12,12,FEN1,ENSG00000168496,P1P2,3057_FEN1_P1P2_ENSG00000168496,FEN1_-_61560380.23-P1P2|FEN1_+_61560617.23-P1P2,0.095229,11068.0,-0.070171
AAACCCAAGACCCTTA-53,53,RNPS1,ENSG00000205937,P1P2,7407_RNPS1_P1P2_ENSG00000205937,RNPS1_+_2318108.23-P1P2|RNPS1_+_2318045.23-P1P2,0.086603,16743.0,0.208552
AAACCCAAGACGCCCT-39,39,PHF10,ENSG00000130024,P1P2,6279_PHF10_P1P2_ENSG00000130024,PHF10_-_170124315.23-P1P2|PHF10_+_170124573.23...,0.084000,21488.0,0.091101
AAACCCAAGAGGCCAT-24,24,HSF1,ENSG00000185122,P1P2,3959_HSF1_P1P2_ENSG00000185122,HSF1_+_145515304.23-P1P2|HSF1_-_145515300.23-P1P2,0.099000,19293.0,0.041390
...,...,...,...,...,...,...,...,...,...
TTTGTTGTCTTACGGA-22,22,ADAT3,ENSG00000213638,P1,141_ADAT3_P1_ENSG00000213638,ADAT3_-_1905438.23-P1|ADAT3_+_1905424.23-P1,0.093179,16259.0,-0.275738
TTTGTTGTCTTCGATT-35,35,DHX16,ENSG00000204560,P1P2,2204_DHX16_P1P2_ENSG00000204560,DHX16_+_30640731.23-P1P2|DHX16_-_30640796.23-P1P2,0.026757,35692.0,-0.256314
TTTGTTGTCTTGATTC-31,31,EBNA1BP2,ENSG00000117395,P1P2,2456_EBNA1BP2_P1P2_ENSG00000117395,EBNA1BP2_-_43637779.23-P1P2|EBNA1BP2_+_4363789...,0.066386,25457.0,0.024672
TTTGTTGTCTTGGGCG-39,39,RPL8,ENSG00000161016,P1P2,7480_RPL8_P1P2_ENSG00000161016,RPL8_+_146017745.23-P1P2|RPL8_-_146017783.23-P1P2,0.074004,30012.0,0.854286


# OBS slot curation

### Show unique perturbations

In [5]:
cur_data.show_unique(slot = 'obs', column = 'sgID_AB')

Unique values in adata.obs.sgID_AB: 2679
--------------------------------------------------
{'AAAS_-_53715438.23-P1P2|AAAS_+_53715355.23-P1P2',
 'AAMP_+_219134851.23-P1P2|AAMP_+_219134841.23-P1P2',
 'AAR2_-_34824434.23-P1P2|AAR2_+_34824488.23-P1P2',
 'AARS2_+_44281027.23-P1P2|AARS2_+_44281044.23-P1P2',
 'AARS_+_70323362.23-P1P2|AARS_-_70323332.23-P1P2',
 'AASDHPPT_+_105948405.23-P1P2|AASDHPPT_+_105948450.23-P1P2',
 'AATF_-_35306286.23-P1P2|AATF_-_35306346.23-P1P2',
 'ABCB10_+_229694285.23-P1P2|ABCB10_-_229694297.23-P1P2',
 'ABCB7_-_74376019.23-P1P2|ABCB7_+_74375885.23-P1P2',
 'ABCE1_-_146019502.23-P1P2|ABCE1_-_146019516.23-P1P2',
 'ABCF1_+_30539238.23-P1|ABCF1_+_30539469.23-P1',
 'ABCF1_-_30546354.23-P2|ABCF1_-_30546344.23-P2',
 'ABCG1_-_43639282.23-P1P2|ABCG1_+_43639503.23-P1P2',
 'ABHD11_-_73153094.23-P1P2|ABHD11_+_73152963.23-P1P2',
 'ABHD17A_-_1885483.23-P1P2|ABHD17A_+_1885470.23-P1P2',
 'ABT1_-_26597263.23-P1P2|ABT1_+_26597412.23-P1P2',
 'ACD_-_67694143.23-P1P2|ACD_+_67694124.23-P

### Rename `sgID_AB` to `perturbation_name`

In [6]:
cur_data.rename_columns(slot = 'obs', name_dict = {'sgID_AB': 'perturbation_name'})

Renamed columns in adata.obs: {'sgID_AB': 'perturbation_name'}


### Add guide RNA information

In [7]:
# download the guide RNA spreadsheet
download_file(
    url="https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-025-02169-3/MediaObjects/41588_2025_2169_MOESM4_ESM.xlsx",
    dest_path="../supplementary/nadig_2025_guide_info.xlsx"
)

# read in the guide RNA spreadsheet
# guides for the essential library are in "ST20" sheet
guide_info_df = pd.read_excel("../supplementary/nadig_2025_guide_info.xlsx", sheet_name="ST20")

# create perturbation_name column in guide_info_df
guide_info_df['perturbation_name'] = guide_info_df['sgID_A'] + '|' + guide_info_df['sgID_B']
# replace commas with hyphens in perturbation_name
guide_info_df['perturbation_name'] = guide_info_df['perturbation_name'].str.replace(',', '-')
# check that all perturbation names in cur_data are in guide_info_df
print(f"All perturbation names in cur_data are in guide_info_df: {cur_data.adata.obs['perturbation_name'].isin(guide_info_df['perturbation_name']).all()}")
# create guide_sequence column in guide_info_df
guide_info_df['guide_sequence'] = guide_info_df['targeting sequence A'] + '|' + guide_info_df['targeting sequence B']
# subset for necessary columns
guide_info_df = guide_info_df[['perturbation_name', 'guide_sequence']]
# merge cur_data.adata.obs with guide_info_df on perturbation_name
cur_data.adata.obs = cur_data.adata.obs.merge(guide_info_df, on='perturbation_name', how='left')
# check that there are no missing guide sequences
print(f"Number of missing guide sequences: {cur_data.adata.obs['guide_sequence'].isna().sum()}")


File ../supplementary/nadig_2025_guide_info.xlsx already exists. Skipping download.
All perturbation names in cur_data are in guide_info_df: True
Number of missing guide sequences: 0


  return dispatch(args[0].__class__)(*args, **kw)


### Standardise perturbation targets

In [8]:
cur_data.standardize_genes(
    slot='obs',
    input_column='gene',
    input_column_type='gene_symbol',
    multiple_entries=False,
    # remove_version=True,
    # version_sep='.'
)

Mapping gene symbols: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2394/2394 [00:00<00:00, 44948.23it/s]


--------------------------------------------------
Successfully mapped 2392 out of 2394 gene symbols.
--------------------------------------------------
Couldn't map gene symbols: ['AC118549.1', 'MTRNR2L1']
--------------------------------------------------


In [9]:
cur_data.adata.obs.head()

Unnamed: 0_level_0,z_gemgroup_UMI,guide_sequence,gene_id,mitopercent,UMI_count,gene,gem_group,perturbation_name,transcript,gene_transcript,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,perturbed_target_coord,perturbed_target_chromosome
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,-0.611091,GACCATGAGCAAGCGGAACC|GCAGAGCTACCATGAGCAAG,ENSG00000163807,0.114029,11234.0,KIAA1143,3,KIAA1143_+_44803075.23-P1P2|KIAA1143_+_4480308...,P1P2,4360_KIAA1143_P1P2_ENSG00000163807,ENSG00000163807,KIAA1143,protein_coding,chr3:44737661-44762033;-1,3
1,-0.070171,GCGGGAGCGCGGGCTTTGGA|GACTGGCCCAAGGCTCACAG,ENSG00000168496,0.095229,11068.0,FEN1,12,FEN1_-_61560380.23-P1P2|FEN1_+_61560617.23-P1P2,P1P2,3057_FEN1_P1P2_ENSG00000168496,ENSG00000168496,FEN1,protein_coding,chr11:61792725-61797238;1,11
2,0.208552,GCTTGACTCTGACGTCAGAG|GCGGCGGGAAGATGTAAGTT,ENSG00000205937,0.086603,16743.0,RNPS1,53,RNPS1_+_2318108.23-P1P2|RNPS1_+_2318045.23-P1P2,P1P2,7407_RNPS1_P1P2_ENSG00000205937,ENSG00000205937,RNPS1,protein_coding,chr16:2253116-2268397;-1,16
3,0.091101,GAATGGAGGAGGCCCAGCGG|GAGTACAACGCCGGGCCGTG,ENSG00000130024,0.084,21488.0,PHF10,39,PHF10_-_170124315.23-P1P2|PHF10_+_170124573.23...,P1P2,6279_PHF10_P1P2_ENSG00000130024,ENSG00000130024,PHF10,protein_coding,chr6:169703902-169725566;-1,6
4,0.04139,GACAGCCCCGGGGCCCAGCA|GCGCGCCCGTTGCAAGATGG,ENSG00000185122,0.099,19293.0,HSF1,24,HSF1_+_145515304.23-P1P2|HSF1_-_145515300.23-P1P2,P1P2,3959_HSF1_P1P2_ENSG00000185122,ENSG00000185122,HSF1,protein_coding,chr8:144291529-144314727;1,8


### Add `perturbed_target_number` column

In [10]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [11]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [12]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (145473, 2)
--------------------------------------------------
                                        perturbation_name  \
index                                                       
0       KIAA1143_+_44803075.23-P1P2|KIAA1143_+_4480308...   
1         FEN1_-_61560380.23-P1P2|FEN1_+_61560617.23-P1P2   
2         RNPS1_+_2318108.23-P1P2|RNPS1_+_2318045.23-P1P2   
3       PHF10_-_170124315.23-P1P2|PHF10_+_170124573.23...   
4       HSF1_+_145515304.23-P1P2|HSF1_-_145515300.23-P1P2   
...                                                   ...   
145468        ADAT3_-_1905438.23-P1|ADAT3_+_1905424.23-P1   
145469  DHX16_+_30640731.23-P1P2|DHX16_-_30640796.23-P1P2   
145470  EBNA1BP2_-_43637779.23-P1P2|EBNA1BP2_+_4363789...   
145471  RPL8_+_146017745.23-P1P2|RPL8_-_146017783.23-P1P2   
145472    MBIP_+_36789486.23-P1P2|MBIP_+_36789822.23-P1P2   

        perturbed_target_chromosome_encoding  
index                                         
0             

### Add metadata

In [13]:
cur_data.create_columns(
    overwrite=True,
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # perturbation type
        "perturbation_type_label": "CRISPRi",
        "perturbation_type_id": None,
        "data_modality": "CRISPR screen",
        "significant": None,
        "significance_criteria": None,
        "score_interpretation": None,

        # treatment
        "treatment_label": None,
        "treatment_id": None,
        # replicates
        "technical_replicate": None,
        "biological_replicate": None,
        # model system
        "model_system_label": "cell_line",
        "model_system_id": None,
        "tissue": "liver",
        "cell_line_label": "Hep G2 cell",
        "cell_type_label": "hepatocyte",
        "disease_label": "hepatoblastoma",

        "timepoint": "P7DT0H0M0S",
        "species": "Homo sapiens",
        "sex_label": "male",
        "sex_id": None,
        "developmental_stage_label": "adolescent",
        "developmental_stage_id": None,

        "study_title": "Transcriptome-wide analysis of differential expression in perturbation atlases",
        "study_uri": "https://doi.org/10.1038/s41588-025-02169-3",
        "study_year": 2025,
        "first_author": "Ajay Nadig",
        "last_author": "Luke J. Oâ€™Connor",

        "experiment_title": "HepG2 day 7 essential gene set Perturb-seq experiment",
        "experiment_summary": """
            HepG2 liver hepatoblastoma cells were transduced with a sgRNA library targeting a set of 2,393 common essential genes and sampled at day 7 after lentiviral transduction.
            Multiplexed CRISPRi library containing two distinct guides targeting the same gene were used.
            """,

        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],

        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": None,
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "lentivirus transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentivirus transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive transgene expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive transgene expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",

        "library_manufacturer": "Weissman Lab",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "2",
        "library_total_grnas": str(cur_data.adata.obs['guide_sequence'].str.split('|').explode().nunique()),
        "library_total_variants": None,

        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime v3",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina NovaSeq 6000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "TRADE",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh38",
        
        "license_label": "free to use license",
        "license_id": "SWO:1000061",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSE264667",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE264667&format=file&file=GSE264667%5Fhepg2%5Fraw%5Fsinglecell%5F01%2Eh5ad",
                "dataset_description": "Raw data for HepG2 Perturb-seq experiment",
                "dataset_file_name": "GSE264667_hepg2_raw_singlecell_01.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column data_modality added to adata.obs
Column significant added to adata.obs
Column significance_criteria added to adata.obs
Column score_interpretation added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column technical_replicate added to adata.obs
Column biological_replicate added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column tissue added to adata.obs
Column cell_line_label added to adata.obs
Column cell_type_label added to adata.obs
Column disease_label added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs

### Curate replicate information

In [14]:
cur_data.adata.obs['technical_replicate'] = cur_data.adata.obs['gem_group'].astype(str)

### Curate tissue information


In [15]:
cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        liver              liver      liver  UBERON:0002107
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell type information

In [16]:
cur_data.standardize_ontology(
    input_column='cell_type_label',
    column_type='term_name',
    ontology_type='cell_type',
    overwrite=True
)

Mapped 1 cell_type ontology terms from `cell_type_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower  name_lower ontology_id
0   hepatocyte         hepatocyte  hepatocyte  CL:0000182
--------------------------------------------------
Overwriting column cell_type_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell line information

In [17]:
cur_data.standardize_ontology(
    input_column='cell_line_label',
    column_type='term_name',
    ontology_type='cell_line',
    overwrite=True
)

Mapped 1 cell_line ontology terms from `cell_line_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower   name_lower  ontology_id
0  Hep G2 cell        hep g2 cell  hep g2 cell  CLO:0003704
--------------------------------------------------
Overwriting column cell_line_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Curate disease information

In [18]:
cur_data.standardize_ontology(
    input_column='disease_label',
    column_type='term_name',
    ontology_type='disease',
    overwrite=True
)

Mapped 1 disease ontology terms from `disease_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
     input_column input_column_lower      name_lower    ontology_id
0  hepatoblastoma     hepatoblastoma  hepatoblastoma  MONDO:0018666
--------------------------------------------------
Overwriting column disease_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Match schema column order

In [19]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [20]:
cur_data.validate_data(slot='obs', verbose=True)

2025-12-08 15:40:17,160 DEBUG curation_tools.curation_tools: Applying dtype casting on adata.obs for columns: ['dataset_id', 'sample_id', 'data_modality', 'significant', 'significance_criteria', 'perturbation_name', 'perturbed_target_coord', 'perturbed_target_chromosome', 'perturbed_target_chromosome_encoding', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_biotype', 'guide_sequence', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'technical_replicate', 'biological_replicate', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_label', 'disease_id', 'study_title', 'study_uri', 'study_year', 'first_author', 'last_author', 'experiment_title', 'experiment_summary', 'number_of_perturbed_targets', 'number_of

Unnamed: 0,dataset_id,sample_id,data_modality,significant,significance_criteria,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,...,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,score_interpretation,reference_genome_id,reference_genome_label,associated_datasets,license_label,license_id
0,nadig_2025_hepg2,1,CRISPR screen,,,KIAA1143_+_44803075.23-P1P2|KIAA1143_+_4480308...,chr3:44737661-44762033;-1,3,3,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
1,nadig_2025_hepg2,2,CRISPR screen,,,FEN1_-_61560380.23-P1P2|FEN1_+_61560617.23-P1P2,chr11:61792725-61797238;1,11,11,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
2,nadig_2025_hepg2,3,CRISPR screen,,,RNPS1_+_2318108.23-P1P2|RNPS1_+_2318045.23-P1P2,chr16:2253116-2268397;-1,16,16,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
3,nadig_2025_hepg2,4,CRISPR screen,,,PHF10_-_170124315.23-P1P2|PHF10_+_170124573.23...,chr6:169703902-169725566;-1,6,6,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
4,nadig_2025_hepg2,5,CRISPR screen,,,HSF1_+_145515304.23-P1P2|HSF1_-_145515300.23-P1P2,chr8:144291529-144314727;1,8,8,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145468,nadig_2025_hepg2,145469,CRISPR screen,,,ADAT3_-_1905438.23-P1|ADAT3_+_1905424.23-P1,chr19:1905399-1913447;1,19,19,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
145469,nadig_2025_hepg2,145470,CRISPR screen,,,DHX16_+_30640731.23-P1P2|DHX16_-_30640796.23-P1P2,chr6:30653119-30673013;-1,6,6,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
145470,nadig_2025_hepg2,145471,CRISPR screen,,,EBNA1BP2_-_43637779.23-P1P2|EBNA1BP2_+_4363789...,chr1:43164175-43270936;-1,1,1,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061
145471,nadig_2025_hepg2,145472,CRISPR screen,,,RPL8_+_146017745.23-P1P2|RPL8_-_146017783.23-P1P2,chr8:144789765-144792587;-1,8,8,1,...,,CellRanger,,TRADE,,,GRCh38,"[{""dataset_accession"": ""GSE264667"", ""dataset_u...",free to use license,SWO:1000061


# VAR slot curation

### Standardise genes

In [26]:
cur_data.show_var()

Variable data:
DataFrame shape: (9624, 13)
--------------------------------------------------
                  gene_name         chr    start      end           class  \
gene_id                                                                     
ENSG00000228794   LINC01128        chr1   825138   868202   gene_version9   
ENSG00000188976       NOC2L        chr1   944203   959309  gene_version11   
ENSG00000187583     PLEKHN1        chr1   966482   975865  gene_version11   
ENSG00000188290        HES4        chr1   998962  1000172  gene_version10   
ENSG00000187608       ISG15        chr1  1001138  1014540  gene_version10   
...                     ...         ...      ...      ...             ...   
ENSG00000198786      MT-ND5        chrM    12337    14148   gene_version2   
ENSG00000198695      MT-ND6        chrM    14149    14673   gene_version2   
ENSG00000198727      MT-CYB        chrM    14747    15887   gene_version2   
ENSG00000276256  AC011043.1  GL000195.1    42939    49164  

In [23]:
cur_data.create_columns(
    slot = 'var',
    col_dict={'gene_ensembl_id': cur_data.adata.var.index},
    overwrite=True
)

Column gene_ensembl_id added to adata.var


In [28]:
cur_data.standardize_genes(
    slot="var",
    input_column="gene_ensembl_id",
    input_column_type="ensembl_gene_id",
    remove_version=False,
    multiple_entries=False
)

2025-12-08 16:56:44,504 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Missing Ensembl IDs: ['ENSG00000130723', 'ENSG00000215067', 'ENSG00000212978', 'ENSG00000287569', 'ENSG00000271254', 'ENSG00000148362', 'ENSG00000271895', 'ENSG00000264920', 'ENSG00000225489', 'ENSG00000112096', 'ENSG00000233937', 'ENSG00000272009', 'ENSG00000273319', 'ENSG00000276256', 'ENSG00000277203']; attempting to fetch latest IDs...


2025-12-08 16:56:45,306 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 3099


Fetched latest Ensembl IDs: {'ENSG00000130723': nan, 'ENSG00000215067': 'ENSG00000267532', 'ENSG00000212978': 'ENSG00000271889', 'ENSG00000287569': 'ENSG00000125804', 'ENSG00000148362': 'ENSG00000310560', 'ENSG00000271895': nan, 'ENSG00000264920': 'ENSG00000234494', 'ENSG00000225489': 'ENSG00000236924', 'ENSG00000112096': 'ENSG00000291237', 'ENSG00000233937': nan, 'ENSG00000272009': 'ENSG00000269293', 'ENSG00000273319': 'ENSG00000233559', 'ENSG00000277203': 'ENSG00000288709'}
--------------------------------------------------
Successfully mapped 9609 out of 9609 Ensembl IDs.
--------------------------------------------------


### Validate var metadata

In [29]:
cur_data.validate_data(slot='var')

2025-12-08 16:57:41,270 INFO curation_tools.curation_tools: adata.var is valid according to the var_schema.
2025-12-08 16:57:41,271 DEBUG curation_tools.curation_tools: Validated adata.var preview (shape=(9624, 2)):
                 ensembl_gene_id gene_symbol
index                                       
ENSG00000228794  ENSG00000228794   LINC01128
ENSG00000188976  ENSG00000188976       NOC2L
ENSG00000187583  ENSG00000187583     PLEKHN1
ENSG00000188290  ENSG00000188290        HES4
ENSG00000187608  ENSG00000187608       ISG15


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000228794,ENSG00000228794,LINC01128
ENSG00000188976,ENSG00000188976,NOC2L
ENSG00000187583,ENSG00000187583,PLEKHN1
ENSG00000188290,ENSG00000188290,HES4
ENSG00000187608,ENSG00000187608,ISG15
...,...,...
ENSG00000198786,ENSG00000198786,MT-ND5
ENSG00000198695,ENSG00000198695,MT-ND6
ENSG00000198727,ENSG00000198727,MT-CYB
ENSG00000276256,,


# Save the dataset

In [30]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)
... storing 'dataset_id' as categorical
... storing 'data_modality' as categorical
... storing 'significance_criteria' as categorical
... storing 'perturbation_name' as categorical
... storing 'perturbed_target_coord' as categorical
... storing 'perturbed_target_chromosome' as categorical
... storing 'perturbed_target_ensg' as categorical
... storing 'perturbed_target_symbol' as categorical
... storing 'perturbed_target_biotype' as categorical
... storing 'guide_sequence' as categorical
... storing 'perturbation_type_label' as categorical
... storing 'perturbation_type_id' as categorical
... storing 'timepoint' as categorical
... storing 'treatment_label' as categorical
... storing 'treatment_id' as categorical
... storing 'technical_replicate' as categorical
... storing 'biological_replicate' as categorical
... storing 'model_system_label' as categorical
... storing 'model_system_id' as categorical
... storing 'species' as categorical
... s

âœ… Curated h5ad data saved to ../curated/h5ad/nadig_2025_hepg2_curated.h5ad


In [34]:
cur_data.save_curated_data_parquet(split_metadata=True, save_metadata_only=True)

âœ… Metadata saved to ../curated/parquet/nadig_2025_hepg2_curated_metadata.parquet


# Upload to BigQuery

In [None]:
upload_parquet_to_bq(
    parquet_path='../curated/parquet/nadig_2025_hepg2_curated_metadata.parquet',
    bq_dataset_id='prj-ext-dev-pertcat-437314.perturb_seq',
    bq_table_name='metadata',
    key_columns=['dataset_id', 'sample_id'],
    verbose=True
)

# Upload to GC Storage

In [None]:
!gcloud storage cp ../curated/h5ad/nadig_2025_hepg2_curated.h5ad gs://perturbation-catalogue-lake/perturbseq/curated/