# Import

In [2]:
import pandas as pd
import json

from curation_tools.curation_tools import (
    CuratedDataset, 
    ObsSchema, 
    VarSchema, 
    Experiment,
    download_file,
    upload_parquet_to_bq
)

import logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("curation.log"),
        logging.StreamHandler(),  # keep console output too
    ],
    force=True,
)

# Download data

In [3]:
noncurated_path = "../non_curated/h5ad/replogle_2022_k562_essential_normalized.h5ad"
download_file(
    url="https://plus.figshare.com/ndownloader/files/35773075",
    dest_path=noncurated_path
)

File ../non_curated/h5ad/replogle_2022_k562_essential_normalized.h5ad already exists. Skipping download.


# Initialise the dataset object

In [4]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    noncurated_path=noncurated_path
)

cur_data.load_data()

Loading data from ../non_curated/h5ad/replogle_2022_k562_essential_normalized.h5ad


2025-11-14 15:40:28,827 DEBUG h5py._conv: Creating converter from 3 to 5


In [5]:
cur_data.adata.obs

Unnamed: 0_level_0,gem_group,gene,gene_id,transcript,gene_transcript,sgID_AB,mitopercent,UMI_count,z_gemgroup_UMI,core_scale_factor,core_adjusted_UMI_count
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAACCCAAGAAATCCA-27,27,NAF1,ENSG00000145414,P1P2,5449_NAF1_P1P2_ENSG00000145414,NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2,0.112083,11438.0,0.013047,0.813253,14064.512695
AAACCCAAGAACTTCC-31,31,BUB1,ENSG00000169679,P1P2,935_BUB1_P1P2_ENSG00000169679,BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2,0.179895,5342.0,-1.522247,0.844107,6328.584473
AAACCCAAGAAGCCAC-34,34,UBL5,ENSG00000198258,P1P2,9534_UBL5_P1P2_ENSG00000198258,UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2,0.105287,17305.0,0.384157,1.091537,15853.792969
AAACCCAAGAATAGTC-43,43,C9orf16,ENSG00000171159,P1P2,1131_C9orf16_P1P2_ENSG00000171159,C9orf16_+_130922603.23-P1P2|C9orf16_+_13092264...,0.099359,30244.0,3.721912,0.948277,31893.619141
AAACCCAAGACAGCGT-28,28,TIMM9,ENSG00000100575,P1P2,8927_TIMM9_P1P2_ENSG00000100575,TIMM9_-_58893843.23-P1P2|TIMM9_-_58893848.23-P1P2,0.137623,8407.0,-0.975371,0.868942,9674.979492
...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCTGTCGTC-45,45,ATP6V1D,ENSG00000100554,P1P2,682_ATP6V1D_P1P2_ENSG00000100554,ATP6V1D_+_67826485.23-P1P2|ATP6V1D_+_67826497....,0.100272,18350.0,0.428227,1.115052,16456.638672
TTTGTTGTCTGTCTCG-27,27,CNOT3,ENSG00000088038,P1P2,1718_CNOT3_P1P2_ENSG00000088038,CNOT3_+_54641532.23-P1P2|CNOT3_-_54641691.23-P1P2,0.093876,8671.0,-0.633593,0.813253,10662.125000
TTTGTTGTCTGTGCGG-44,44,METTL3,ENSG00000165819,P1P2,5004_METTL3_P1P2_ENSG00000165819,METTL3_+_21979431.23-P1P2|METTL3_-_21979084.23...,0.107983,20568.0,1.054624,0.973352,21131.095703
TTTGTTGTCTTGCAGA-14,14,RPL5,ENSG00000122406,P1P2,7475_RPL5_P1P2_ENSG00000122406,RPL5_+_93297664.23-P1P2|RPL5_-_93297968.23-P1P2,0.128225,23568.0,1.676254,1.050055,22444.542969


# OBS slot curation

### Show unique perturbations

In [6]:
cur_data.show_unique(slot = 'obs', column = 'sgID_AB')

Unique values in adata.obs.sgID_AB: 2273
--------------------------------------------------
{'AAAS_-_53715438.23-P1P2|AAAS_+_53715355.23-P1P2',
 'AAMP_+_219134851.23-P1P2|AAMP_+_219134841.23-P1P2',
 'AARS2_+_44281027.23-P1P2|AARS2_+_44281044.23-P1P2',
 'AARS_+_70323362.23-P1P2|AARS_-_70323332.23-P1P2',
 'AASDHPPT_+_105948405.23-P1P2|AASDHPPT_+_105948450.23-P1P2',
 'AATF_-_35306286.23-P1P2|AATF_-_35306346.23-P1P2',
 'ABCB10_+_229694285.23-P1P2|ABCB10_-_229694297.23-P1P2',
 'ABCB7_-_74376019.23-P1P2|ABCB7_+_74375885.23-P1P2',
 'ABCE1_-_146019502.23-P1P2|ABCE1_-_146019516.23-P1P2',
 'ABCF1_+_30539238.23-P1|ABCF1_+_30539469.23-P1',
 'ABCF1_-_30546354.23-P2|ABCF1_-_30546344.23-P2',
 'ABCG1_-_43639282.23-P1P2|ABCG1_+_43639503.23-P1P2',
 'ABHD11_-_73153094.23-P1P2|ABHD11_+_73152963.23-P1P2',
 'ABHD17A_-_1885483.23-P1P2|ABHD17A_+_1885470.23-P1P2',
 'ABT1_-_26597263.23-P1P2|ABT1_+_26597412.23-P1P2',
 'ACD_-_67694143.23-P1P2|ACD_+_67694124.23-P1P2',
 'ACIN1_-_23540292.23-P2|ACIN1_-_23540730.23-P

### Rename `sgID_AB` to `perturbation_name`

In [7]:
cur_data.rename_columns(slot = 'obs', name_dict = {'sgID_AB': 'perturbation_name'})

Renamed columns in adata.obs: {'sgID_AB': 'perturbation_name'}


### Add guide RNA information

In [8]:
# download the guide RNA spreadsheet
download_file(
    url="https://ars.els-cdn.com/content/image/1-s2.0-S0092867422005979-mmc1.xlsx",
    dest_path="../supplementary/replogle_2022_guide_info.xlsx"
)

# read in the guide RNA spreadsheet
# guides for the K562 essential day 6 library are in "TabB_K562_day6_library"
guide_info_df = pd.read_excel("../supplementary/replogle_2022_guide_info.xlsx", sheet_name="TabB_K562_day6_library")

# create perturbation_name column in guide_info_df
guide_info_df['perturbation_name'] = guide_info_df['sgID_A'] + '|' + guide_info_df['sgID_B']
# replace commas with hyphens in perturbation_name
guide_info_df['perturbation_name'] = guide_info_df['perturbation_name'].str.replace(',', '-')
# check that all perturbation names in cur_data are in guide_info_df
print(f"All perturbation names in cur_data are in guide_info_df: {cur_data.adata.obs['perturbation_name'].isin(guide_info_df['perturbation_name']).all()}")
# create guide_sequence column in guide_info_df
guide_info_df['guide_sequence'] = guide_info_df['targeting sequence A'] + '|' + guide_info_df['targeting sequence B']
# subset for necessary columns
guide_info_df = guide_info_df[['perturbation_name', 'guide_sequence']]
# merge cur_data.adata.obs with guide_info_df on perturbation_name
cur_data.adata.obs = cur_data.adata.obs.merge(guide_info_df, on='perturbation_name', how='left')
# check that there are no missing guide sequences
print(f"Number of missing guide sequences: {cur_data.adata.obs['guide_sequence'].isna().sum()}")


File ../supplementary/replogle_2022_guide_info.xlsx already exists. Skipping download.
All perturbation names in cur_data are in guide_info_df: True
Number of missing guide sequences: 0


  return dispatch(args[0].__class__)(*args, **kw)


### Standardise perturbation targets

In [9]:
cur_data.standardize_genes(
    slot='obs',
    input_column='gene',
    input_column_type='gene_symbol',
    multiple_entries=False
)

Mapping gene symbols: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2058/2058 [00:00<00:00, 21399.62it/s]


--------------------------------------------------
Successfully mapped 2057 out of 2058 gene symbols.
--------------------------------------------------
Couldn't map gene symbols: ['MTRNR2L1']
--------------------------------------------------


### Add `perturbed_target_number` column

In [10]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [11]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [12]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (310385, 2)
--------------------------------------------------
                                        perturbation_name  \
index                                                       
0       NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2   
1       BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2   
2           UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2   
3       C9orf16_+_130922603.23-P1P2|C9orf16_+_13092264...   
4       TIMM9_-_58893843.23-P1P2|TIMM9_-_58893848.23-P1P2   
...                                                   ...   
310380  ATP6V1D_+_67826485.23-P1P2|ATP6V1D_+_67826497....   
310381  CNOT3_+_54641532.23-P1P2|CNOT3_-_54641691.23-P1P2   
310382  METTL3_+_21979431.23-P1P2|METTL3_-_21979084.23...   
310383    RPL5_+_93297664.23-P1P2|RPL5_-_93297968.23-P1P2   
310384  SEC61B_+_101984577.23-P1P2|SEC61B_-_101984591....   

        perturbed_target_chromosome_encoding  
index                                         
0             

### Add metadata

In [13]:
cur_data.create_columns(
    overwrite=True,
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # perturbation type
        "perturbation_type_label": "CRISPRi",
        "perturbation_type_id": None,
        "data_modality": "CRISPR screen",
        "significant": None,
        "significance_criteria": None,
        "score_interpretation": None,

        # treatment
        "treatment_label": None,
        "treatment_id": None,
        # model system
        "model_system_label": "cell_line",
        "model_system_id": None,
        "tissue": "blood",
        "cell_line_label": "K 562 cell",
        "cell_type_label": "lymphoblast",
        "disease_label": "chronic myelogenous leukemia, BCR-ABL1 positive",

        "timepoint": "P6DT0H0M0S",
        "species": "Homo sapiens",
        "sex_label": "female",
        "sex_id": None,
        "developmental_stage_label": "adult",
        "developmental_stage_id": None,

        "study_title": "Mapping information-rich genotype-phenotype landscapes with genome-scale Perturb-seq",
        "study_uri": "https://doi.org/10.1016/j.cell.2022.05.013",
        "study_year": 2022,
        "first_author": "Joseph M Replogle",
        "last_author": "Jonathan S Weissman",

        "experiment_title": "K562 day 6 essential-scale Perturb-seq experiment",
        "experiment_summary": """
            K562 chronic myeloid leukemia cells were transduced with a sgRNA library targeting a set of 2,057 common essential genes and sampled at day 6 after lentiviral transduction.
            Multiplexed CRISPRi library containing two distinct guides targeting the same gene were used. 
            """,

        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],

        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "lentivirus transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentivirus transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive transgene expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive transgene expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",

        "library_manufacturer": "Weissman Lab",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "2",
        "library_total_grnas": str(cur_data.adata.obs['guide_sequence'].str.split('|').explode().nunique()),
        "library_total_variants": None,

        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime v3",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina NovaSeq 6000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "custom",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh38",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "K562_essential_raw_bulk_01",
                "dataset_uri": "https://plus.figshare.com/ndownloader/files/35773070",
                "dataset_description": "Raw, pseudo-bulk expression data for genes expressed at >0.01 UMI per cell",
                "dataset_file_name": "K562_essential_raw_bulk_01.h5ad",
            },
            {
                "dataset_accession": "K562_essential_normalized_bulk_01",
                "dataset_uri": "https://plus.figshare.com/ndownloader/files/35780870",
                "dataset_description": "Gemgroup Z-normalized pseudo-bulk expression data for genes expressed at >0.01 UMI per cell",
                "dataset_file_name": "K562_essential_normalized_bulk_01.h5ad",
            },
            {
                "dataset_accession": "K562_essential_raw_singlecell_01",
                "dataset_uri": "https://plus.figshare.com/ndownloader/files/35773219",
                "dataset_description": "Raw, single-cell expression data for genes expressed at >0.01 UMI per cell",
                "dataset_file_name": "K562_essential_raw_singlecell_01.h5ad",
            },
            {
                "dataset_accession": "K562_essential_normalized_singlecell_01",
                "dataset_uri": "https://plus.figshare.com/ndownloader/files/35773075",
                "dataset_description": "Gemgroup Z-normalized single-cell expression data for genes expressed at >0.01 UMI per cell",
                "dataset_file_name": "K562_essential_normalized_singlecell_01.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column data_modality added to adata.obs
Column significant added to adata.obs
Column significance_criteria added to adata.obs
Column score_interpretation added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column tissue added to adata.obs
Column cell_line_label added to adata.obs
Column cell_type_label added to adata.obs
Column disease_label added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year 

In [14]:
cur_data.adata.obs

Unnamed: 0_level_0,gem_group,gene_transcript,UMI_count,gene_id,z_gemgroup_UMI,perturbation_name,gene,core_scale_factor,core_adjusted_UMI_count,transcript,...,sequencing_platform_label,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,associated_datasets
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27,5449_NAF1_P1P2_ENSG00000145414,11438.0,ENSG00000145414,0.013047,NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2,NAF1,0.813253,14064.512695,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
1,31,935_BUB1_P1P2_ENSG00000169679,5342.0,ENSG00000169679,-1.522247,BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2,BUB1,0.844107,6328.584473,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
2,34,9534_UBL5_P1P2_ENSG00000198258,17305.0,ENSG00000198258,0.384157,UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2,UBL5,1.091537,15853.792969,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
3,43,1131_C9orf16_P1P2_ENSG00000171159,30244.0,ENSG00000171159,3.721912,C9orf16_+_130922603.23-P1P2|C9orf16_+_13092264...,C9orf16,0.948277,31893.619141,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
4,28,8927_TIMM9_P1P2_ENSG00000100575,8407.0,ENSG00000100575,-0.975371,TIMM9_-_58893843.23-P1P2|TIMM9_-_58893848.23-P1P2,TIMM9,0.868942,9674.979492,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310380,45,682_ATP6V1D_P1P2_ENSG00000100554,18350.0,ENSG00000100554,0.428227,ATP6V1D_+_67826485.23-P1P2|ATP6V1D_+_67826497....,ATP6V1D,1.115052,16456.638672,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
310381,27,1718_CNOT3_P1P2_ENSG00000088038,8671.0,ENSG00000088038,-0.633593,CNOT3_+_54641532.23-P1P2|CNOT3_-_54641691.23-P1P2,CNOT3,0.813253,10662.125000,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
310382,44,5004_METTL3_P1P2_ENSG00000165819,20568.0,ENSG00000165819,1.054624,METTL3_+_21979431.23-P1P2|METTL3_-_21979084.23...,METTL3,0.973352,21131.095703,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
310383,14,7475_RPL5_P1P2_ENSG00000122406,23568.0,ENSG00000122406,1.676254,RPL5_+_93297664.23-P1P2|RPL5_-_93297968.23-P1P2,RPL5,1.050055,22444.542969,P1P2,...,Illumina NovaSeq 6000,,barcode sequencing,,CellRanger,,custom,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."


### Curate tissue information


In [15]:
cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell type information

In [16]:
cur_data.standardize_ontology(
    input_column='cell_type_label',
    column_type='term_name',
    ontology_type='cell_type',
    overwrite=True
)

Mapped 1 cell_type ontology terms from `cell_type_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower   name_lower ontology_id
0  lymphoblast        lymphoblast  lymphoblast  CL:0017005
--------------------------------------------------
Overwriting column cell_type_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell line information

In [17]:
cur_data.standardize_ontology(
    input_column='cell_line_label',
    column_type='term_name',
    ontology_type='cell_line',
    overwrite=True
)

Mapped 1 cell_line ontology terms from `cell_line_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower  name_lower  ontology_id
0   K 562 cell         k 562 cell  k 562 cell  CLO:0007050
--------------------------------------------------
Overwriting column cell_line_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Curate disease information

In [18]:
cur_data.standardize_ontology(
    input_column='disease_label',
    column_type='term_name',
    ontology_type='disease',
    overwrite=True
)

Mapped 1 disease ontology terms from `disease_label` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                                      input_column  \
0  chronic myelogenous leukemia, BCR-ABL1 positive   

                                input_column_lower  \
0  chronic myelogenous leukemia, bcr-abl1 positive   

                                        name_lower    ontology_id  
0  chronic myelogenous leukemia, bcr-abl1 positive  MONDO:0011996  
--------------------------------------------------
Overwriting column disease_label in adata.obs


  return dispatch(args[0].__class__)(*args, **kw)


### Match schema column order

In [19]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [20]:
cur_data.validate_data(slot='obs', verbose=True)

2025-11-14 15:40:50,447 DEBUG curation_tools.curation_tools: Applying dtype casting on adata.obs for columns: ['dataset_id', 'sample_id', 'data_modality', 'significant', 'significance_criteria', 'perturbation_name', 'perturbed_target_coord', 'perturbed_target_chromosome', 'perturbed_target_chromosome_encoding', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_biotype', 'guide_sequence', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_label', 'disease_id', 'study_title', 'study_uri', 'study_year', 'first_author', 'last_author', 'experiment_title', 'experiment_summary', 'number_of_perturbed_targets', 'number_of_perturbed_samples', 'library_generation_type_i

Unnamed: 0,dataset_id,sample_id,data_modality,significant,significance_criteria,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,...,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,score_interpretation,reference_genome_id,reference_genome_label,associated_datasets
0,replogle_2022_k562_essential_normalized,1,CRISPR screen,,,NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2,chr4:163110073-163166913;-1,4,4,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
1,replogle_2022_k562_essential_normalized,2,CRISPR screen,,,BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2,chr2:110636333-110678078;-1,2,2,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
2,replogle_2022_k562_essential_normalized,3,CRISPR screen,,,UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2,chr19:9827892-9830137;1,19,19,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
3,replogle_2022_k562_essential_normalized,4,CRISPR screen,,,C9orf16_+_130922603.23-P1P2|C9orf16_+_13092264...,chr9:128160265-128163924;1,9,9,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
4,replogle_2022_k562_essential_normalized,5,CRISPR screen,,,TIMM9_-_58893843.23-P1P2|TIMM9_-_58893848.23-P1P2,chr14:58407461-58427655;-1,14,14,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310380,replogle_2022_k562_essential_normalized,310381,CRISPR screen,,,ATP6V1D_+_67826485.23-P1P2|ATP6V1D_+_67826497....,chr14:67294371-67360265;-1,14,14,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
310381,replogle_2022_k562_essential_normalized,310382,CRISPR screen,,,CNOT3_+_54641532.23-P1P2|CNOT3_-_54641691.23-P1P2,chr19:54137710-54155708;1,19,19,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
310382,replogle_2022_k562_essential_normalized,310383,CRISPR screen,,,METTL3_+_21979431.23-P1P2|METTL3_-_21979084.23...,chr14:21498128-21511362;-1,14,14,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."
310383,replogle_2022_k562_essential_normalized,310384,CRISPR screen,,,RPL5_+_93297664.23-P1P2|RPL5_-_93297968.23-P1P2,chr1:92831990-92841924;1,1,1,1,...,,barcode sequencing,,CellRanger,,custom,,,GRCh38,"[{""dataset_accession"": ""K562_essential_raw_bul..."


In [21]:
cur_data.adata.obs[['library_total_grnas', 'library_total_variants']]

Unnamed: 0,library_total_grnas,library_total_variants
0,4537,
1,4537,
2,4537,
3,4537,
4,4537,
...,...,...
310380,4537,
310381,4537,
310382,4537,
310383,4537,


In [22]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_symbol', 'perturbed_target_ensg', 'perturbed_target_coord'])

Observation data:
DataFrame shape: (310385, 4)
--------------------------------------------------
                                        perturbation_name  \
0       NAF1_+_164087918.23-P1P2|NAF1_-_164087674.23-P1P2   
1       BUB1_-_111435363.23-P1P2|BUB1_-_111435372.23-P1P2   
2           UBL5_-_9938639.23-P1P2|UBL5_+_9938801.23-P1P2   
3       C9orf16_+_130922603.23-P1P2|C9orf16_+_13092264...   
4       TIMM9_-_58893843.23-P1P2|TIMM9_-_58893848.23-P1P2   
...                                                   ...   
310380  ATP6V1D_+_67826485.23-P1P2|ATP6V1D_+_67826497....   
310381  CNOT3_+_54641532.23-P1P2|CNOT3_-_54641691.23-P1P2   
310382  METTL3_+_21979431.23-P1P2|METTL3_-_21979084.23...   
310383    RPL5_+_93297664.23-P1P2|RPL5_-_93297968.23-P1P2   
310384  SEC61B_+_101984577.23-P1P2|SEC61B_-_101984591....   

       perturbed_target_symbol perturbed_target_ensg  \
0                         NAF1       ENSG00000145414   
1                         BUB1       ENSG00000169679   
2

# VAR slot curation

### Standardise genes

In [23]:
cur_data.gene_ont[cur_data.gene_ont['synonym'].str.contains('AC00455', na=False)]

Unnamed: 0,ensembl_gene_id,gene_symbol,chromosome_name,gene_coord,biotype,description,synonym_type,synonym
359173,ENSG00000223731,SUPT20HL1,X,chrX:24360639-24367839;1,protein_coding,SUPT20H like 1 [Source:HGNC Symbol;Acc:HGNC:30...,EMBL,AC004552
359779,ENSG00000089127,OAS1,12,chr12:112905856-112933219;1,protein_coding,2'-5'-oligoadenylate synthetase 1 [Source:HGNC...,EMBL,AC004551
361277,ENSG00000111331,OAS3,12,chr12:112938051-112976460;1,protein_coding,2'-5'-oligoadenylate synthetase 3 [Source:HGNC...,EMBL,AC004551
384257,ENSG00000226372,DCAF8L1,X,chrX:27977992-27981449;-1,protein_coding,DDB1 and CUL4 associated factor 8 like 1 [Sour...,EMBL,AC004553
387477,ENSG00000005302,MSL3,X,chrX:11758159-11775772;1,protein_coding,MSL complex subunit 3 [Source:HGNC Symbol;Acc:...,EMBL,AC004554
389339,ENSG00000169933,FRMPD4,X,chrX:11822423-12724523;1,protein_coding,FERM and PDZ domain containing 4 [Source:HGNC ...,EMBL,AC004554
394757,ENSG00000047648,ARHGAP6,X,chrX:11117651-11665920;-1,protein_coding,Rho GTPase activating protein 6 [Source:HGNC S...,EMBL,AC004554
395931,ENSG00000089169,RPH3A,12,chr12:112570380-112898881;1,protein_coding,rabphilin 3A [Source:HGNC Symbol;Acc:HGNC:17056],EMBL,AC004551
421343,ENSG00000111335,OAS2,12,chr12:112978395-113011723;1,protein_coding,2'-5'-oligoadenylate synthetase 2 [Source:HGNC...,EMBL,AC004551
425581,ENSG00000079385,CEACAM1,19,chr19:42507304-42561234;-1,protein_coding,CEA cell adhesion molecule 1 [Source:HGNC Symb...,EMBL,AC004559


In [24]:
cur_data.show_var()

Variable data:
DataFrame shape: (8563, 12)
--------------------------------------------------
                  gene_name         chr   start      end           class  \
gene_id                                                                    
ENSG00000237491   LINC01409        chr1  778747   810065  gene_version10   
ENSG00000228794   LINC01128        chr1  825138   868202   gene_version9   
ENSG00000188976       NOC2L        chr1  944203   959309  gene_version11   
ENSG00000187961      KLHL17        chr1  960584   965719  gene_version14   
ENSG00000188290        HES4        chr1  998962  1000172  gene_version10   
...                     ...         ...     ...      ...             ...   
ENSG00000278704  BX004987.1  GL000009.2   56140    58376   gene_version1   
ENSG00000274847       MAFIP  GL000194.1   53594   115055   gene_version1   
ENSG00000278384  AL354822.1  GL000218.1   51867    54893   gene_version1   
ENSG00000271254  AC240274.1  KI270711.1    4612    29626   gene_versio

In [25]:
cur_data.create_columns(
    slot = 'var', 
    col_dict={'gene_ensembl_id': cur_data.adata.var.index},
    overwrite=True
)

Column gene_ensembl_id added to adata.var


In [26]:
cur_data.standardize_genes(
    slot="var",
    input_column="gene_ensembl_id",
    input_column_type="ensembl_gene_id",
    remove_version=False,
    multiple_entries=False
)

2025-11-14 15:40:54,805 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Missing Ensembl IDs: ['ENSG00000148362', 'ENSG00000215271', 'ENSG00000239467', 'ENSG00000273373', 'ENSG00000283633', 'ENSG00000271870', 'ENSG00000271109', 'ENSG00000228434', 'ENSG00000215067', 'ENSG00000233937', 'ENSG00000223797', 'ENSG00000130723', 'ENSG00000235111', 'ENSG00000255823', 'ENSG00000112096', 'ENSG00000264920', 'ENSG00000287263', 'ENSG00000272316', 'ENSG00000269028', 'ENSG00000272196', 'ENSG00000258081']; attempting to fetch latest IDs...


2025-11-14 15:40:55,902 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 4355


Fetched latest Ensembl IDs: {'ENSG00000148362': 'ENSG00000310560', 'ENSG00000215271': 'ENSG00000290292', 'ENSG00000239467': 'ENSG00000204334', 'ENSG00000273373': 'ENSG00000224699', 'ENSG00000283633': 'ENSG00000290418', 'ENSG00000271870': nan, 'ENSG00000271109': 'ENSG00000267767', 'ENSG00000228434': 'ENSG00000290758', 'ENSG00000215067': 'ENSG00000267532', 'ENSG00000233937': nan, 'ENSG00000223797': 'ENSG00000280739', 'ENSG00000130723': nan, 'ENSG00000235111': 'ENSG00000188511', 'ENSG00000255823': nan, 'ENSG00000112096': 'ENSG00000291237', 'ENSG00000264920': 'ENSG00000234494', 'ENSG00000287263': 'ENSG00000177738', 'ENSG00000272316': 'ENSG00000290555', 'ENSG00000269028': nan, 'ENSG00000272196': 'ENSG00000288859', 'ENSG00000258081': nan}
--------------------------------------------------
Successfully mapped 8557 out of 8563 Ensembl IDs.
--------------------------------------------------


### Validate var metadata

In [27]:
cur_data.validate_data(slot='var')

2025-11-14 15:40:55,984 INFO curation_tools.curation_tools: adata.var is valid according to the var_schema.
2025-11-14 15:40:55,985 DEBUG curation_tools.curation_tools: Validated adata.var preview (shape=(8563, 2)):
                 ensembl_gene_id gene_symbol
index                                       
ENSG00000237491  ENSG00000237491   LINC01409
ENSG00000228794  ENSG00000228794   LINC01128
ENSG00000188976  ENSG00000188976       NOC2L
ENSG00000187961  ENSG00000187961      KLHL17
ENSG00000188290  ENSG00000188290        HES4


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000237491,ENSG00000237491,LINC01409
ENSG00000228794,ENSG00000228794,LINC01128
ENSG00000188976,ENSG00000188976,NOC2L
ENSG00000187961,ENSG00000187961,KLHL17
ENSG00000188290,ENSG00000188290,HES4
...,...,...
ENSG00000278704,ENSG00000278704,
ENSG00000274847,ENSG00000274847,MAFIP
ENSG00000278384,ENSG00000278384,
ENSG00000271254,ENSG00000271254,


# Save the dataset

In [27]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)
... storing 'dataset_id' as categorical
... storing 'data_modality' as categorical
... storing 'significance_criteria' as categorical
... storing 'perturbation_name' as categorical
... storing 'perturbed_target_coord' as categorical
... storing 'perturbed_target_chromosome' as categorical
... storing 'perturbed_target_ensg' as categorical
... storing 'perturbed_target_symbol' as categorical
... storing 'perturbed_target_biotype' as categorical
... storing 'guide_sequence' as categorical
... storing 'perturbation_type_label' as categorical
... storing 'perturbation_type_id' as categorical
... storing 'timepoint' as categorical
... storing 'treatment_label' as categorical
... storing 'treatment_id' as categorical
... storing 'model_system_label' as categorical
... storing 'model_system_id' as categorical
... storing 'species' as categorical
... storing 'tissue_label' as categorical
... storing 'tissue_id' as categorical
... storing 'cell_type_

âœ… Curated h5ad data saved to ../curated/h5ad/replogle_2022_k562_essential_normalized_curated.h5ad


In [28]:
cur_data.save_curated_data_parquet(split_metadata=True, save_metadata_only=True)

âœ… Metadata saved to ../curated/parquet/replogle_2022_k562_essential_normalized_curated_metadata.parquet


# Upload to BigQuery

In [29]:
upload_parquet_to_bq(
    parquet_path='../curated/parquet/replogle_2022_k562_essential_normalized_curated_metadata.parquet',
    bq_dataset_id='prj-ext-dev-pertcat-437314.perturb_seq',
    bq_table_name='metadata',
    key_columns=['dataset_id', 'sample_id'],
    verbose=True
)

2025-11-14 15:42:18,002 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-11-14 15:42:18,002 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-11-14 15:42:18,686 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-11-14 15:42:18,687 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-11-14 15:42:18,971 DEBUG google.cloud.bigquery.opentelemetry_tracing: This service is instrumented using OpenTelemetry. OpenTelemetry or one of its components could not be imported; please add compatible versions of opentelemetry-api and opentelemetry-instrumentation packages in order to get BigQuery Tracing data.
2025-11-14 15:42:18,973 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-11-14 15:42:18,973 DEBUG google.auth.transport.requests: Making reques

Staging table: loading `.parquet` file ../curated/parquet/replogle_2022_k562_essential_normalized_curated_metadata.parquet to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging...


2025-11-14 15:42:20,053 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable HTTP/1.1" 200 0
2025-11-14 15:42:23,866 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "PUT /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable&upload_id=AOCedOFJgjgvPE0CQU2PBBBxb9zizgF8roiDrDxzm4FGzn9oEJNZPG-H1r-K0wa0Et8sokZgosfSEMy6pmG3wfIu5MH3TSBrMOlx7N0xNDUDcw HTTP/1.1" 200 13370
2025-11-14 15:42:24,085 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/8f75aa6b-bb9d-43c3-81ac-c815079e3b2d?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-11-14 15:42:24,089 DEBUG google.api_core.retry: Retrying due to , sleeping 0.0s ...
2025-11-14 15:42:24,256 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-4

Staging table: loaded 310385 rows to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-11-14 15:42:30,111 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-11-14 15:42:30,112 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-11-14 15:42:30,389 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-11-14 15:42:30,390 DEBUG google.auth.transport.requests: Making request: POST https://oauth2.googleapis.com/token
2025-11-14 15:42:30,391 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): oauth2.googleapis.com:443
2025-11-14 15:42:30,509 DEBUG urllib3.connectionpool: https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
2025-11-14 15:42:30,510 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): bigquery.googleapis.com:443
2025-11-14 15:42:30,962 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?pre

Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-11-14 15:42:38,556 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/datasets/perturb_seq/tables/metadata?prettyPrint=false HTTP/1.1" 200 None
2025-11-14 15:42:39,001 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?prettyPrint=false HTTP/1.1" 200 None
2025-11-14 15:42:39,138 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/176b366e-cdaf-47e3-8ec3-c21a6a964335?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-11-14 15:42:46,233 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/queries/176b366e-cdaf-47e3-8ec3-c21a6a964335?maxResults=0&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-11-14 15:42:46,350 DEBUG urllib3.connectionpool: https://bigquery.go

Merge completed: staging â†’ prj-ext-dev-pertcat-437314.perturb_seq.metadata with type-safe casting.
Staging table: deleted prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


# Upload to GC Storage

In [None]:
!gcloud storage cp ../curated/parquet/replogle_2022_k562_essential_normalized_curated_metadata.parquet gs://perturbation-catalogue-lake/perturbseq/curated/