# Import

In [1]:
import pandas as pd
import json

from curation_tools.curation_tools import (
    CuratedDataset,
    ObsSchema,
    VarSchema,
    Experiment,
    download_file,
    upload_parquet_to_bq
)

import logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("curation.log"),
        logging.StreamHandler(),  # keep console output too
    ],
    force=True,
)

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



# Download the dataset

In [2]:
noncurated_path = "../non_curated/h5ad/adamson_2016_pilot.h5ad"
download_file(
    url="https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
    dest_path=noncurated_path
)

2025-12-12 10:24:36,206 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): zenodo.org:443
2025-12-12 10:24:36,383 DEBUG urllib3.connectionpool: https://zenodo.org:443 "GET /record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad HTTP/1.1" 301 317
2025-12-12 10:24:36,563 DEBUG urllib3.connectionpool: https://zenodo.org:443 "GET /records/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad HTTP/1.1" 200 34557246


Downloaded https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad to ../non_curated/h5ad/adamson_2016_pilot.h5ad


# Initialise the dataset object

In [3]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    noncurated_path=noncurated_path
)

cur_data.load_data()

2025-12-12 10:24:50,493 DEBUG h5py._conv: Creating converter from 3 to 5


Loading data from ../non_curated/h5ad/adamson_2016_pilot.h5ad


In [4]:
cur_data.show_var()

Variable data:
DataFrame shape: (35635, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      0.0       0
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945      1.0       1
...                       ...      ...     ...
MT-ND4L       ENSG00000212907      0.0       0
MT-ND4        ENSG00000198886      0.0       0
MT-ND5        ENSG00000198786      0.0       0
MT-ND6        ENSG00000198695      0.0       0
MT-CYB        ENSG00000198727      0.0       0

[35635 rows x 3 columns]
--------------------------------------------------


In [5]:
cur_data.adata.obs

Unnamed: 0_level_0,perturbation,read count,UMI count,tissue_type,cell_line,cancer,disease,perturbation_type,celltype,organism,ncounts,ngenes,percent_mito,percent_ribo,nperts
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACATACACCGAT,CREB1_pDS269,1286.0,98.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,8138.0,2412,0.0,34.037846,2
AAACATACAGAGAT,SNAI1_pDS266,296.0,19.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,8980.0,2386,0.0,40.011135,2
AAACATACCAGAAA,62(mod)_pBA581,1829.0,162.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,28610.0,4404,0.0,40.003494,2
AAACATACGTTGAC,EP300_pDS268,1580.0,98.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,11346.0,2815,0.0,35.184204,2
AAACATACTGTTCT,62(mod)_pBA581,748.0,51.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,9864.0,2584,0.0,35.817112,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,SNAI1_pDS266,331.0,22.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,5942.0,1856,0.0,31.605520,2
TTTGACTGGACGAG,SNAI1_pDS266,745.0,35.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,8150.0,2263,0.0,36.797546,2
TTTGCATGCCCGTT,SNAI1_pDS266,784.0,37.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,9179.0,2473,0.0,37.128227,2
TTTGCATGCCGTTC,62(mod)_pBA581,1469.0,79.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,15785.0,3167,0.0,37.465950,2


# OBS slot curation

### Show unique perturbations

In [6]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 10
--------------------------------------------------
{nan,
 '*',
 '62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


### Drop NAs

In [7]:
cur_data.remove_na(slot = 'obs', column = 'perturbation')

Removed 10 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [8]:
cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')

Removed 6 entries \* from column perturbation of adata.obs


  cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')


In [9]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 8
--------------------------------------------------
{'62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


### Rename `perturbation` to `perturbation_name`

In [10]:
cur_data.rename_columns(slot = 'obs', name_dict = {'perturbation': 'perturbation_name'})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [11]:
# Not reported by study authors for this dataset
cur_data.create_columns(
    slot='obs',
    col_dict={
        'guide_sequence': None
    }
)

Column guide_sequence added to adata.obs


### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [12]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol_input": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol_input')

Column perturbed_target_symbol_input added to adata.obs
Unique values in adata.obs.perturbed_target_symbol_input: 8
--------------------------------------------------
{'62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


#### Clean up `perturbation_target_symbol` column

In [13]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol_input",
    map_dict={
        r"62\(mod\).*": "control_nontargeting",
        r"_(pD|pB).*": ""
    }
)

cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol_input')

Replaced '62\(mod\).*' with 'control_nontargeting' in column perturbed_target_symbol_input of adata.obs
Replaced '_(pD|pB).*' with '' in column perturbed_target_symbol_input of adata.obs
Unique values in adata.obs.perturbed_target_symbol_input: 8
--------------------------------------------------
{'BHLHE40',
 'CONTROL_NONTARGETING',
 'CREB1',
 'DDIT3',
 'EP300',
 'SNAI1',
 'SPI1',
 'ZNF326'}
--------------------------------------------------


  if df[column].str.upper().str.contains(old_val.upper()).any():


### Standardise perturbation targets

In [14]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol_input',
    input_column_type='gene_symbol',
    multiple_entries=False,
    # remove_version=True,
    # version_sep='.'
)

Mapping gene symbols: 100%|█████████████████████████████████████████| 8/8 [00:00<00:00, 3919.45it/s]


--------------------------------------------------
Successfully mapped 7 out of 8 gene symbols.
--------------------------------------------------
Couldn't map gene symbols: ['CONTROL_NONTARGETING']
--------------------------------------------------


In [15]:
cur_data.adata.obs

Unnamed: 0_level_0,guide_sequence,perturbation_type,ncounts,disease,percent_ribo,tissue_type,cancer,read count,ngenes,organism,...,perturbation_name,percent_mito,perturbed_target_symbol_input,cell_line,UMI count,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,perturbed_target_coord,perturbed_target_chromosome
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT,,CRISPR,8138.0,chronic myelogenous leukemia,34.037846,cell_line,True,1286.0,2412,human,...,CREB1_pDS269,0.0,CREB1,K562,98.0,ENSG00000118260,CREB1,protein_coding,chr2:207529737-207605988;1,2
AAACATACAGAGAT,,CRISPR,8980.0,chronic myelogenous leukemia,40.011135,cell_line,True,296.0,2386,human,...,SNAI1_pDS266,0.0,SNAI1,K562,19.0,ENSG00000124216,SNAI1,protein_coding,chr20:49982980-49988886;1,20
AAACATACCAGAAA,,CRISPR,28610.0,chronic myelogenous leukemia,40.003494,cell_line,True,1829.0,4404,human,...,62(mod)_pBA581,0.0,CONTROL_NONTARGETING,K562,162.0,,CONTROL_NONTARGETING,,,
AAACATACGTTGAC,,CRISPR,11346.0,chronic myelogenous leukemia,35.184204,cell_line,True,1580.0,2815,human,...,EP300_pDS268,0.0,EP300,K562,98.0,ENSG00000100393,EP300,protein_coding,chr22:41091816-41180077;1,22
AAACATACTGTTCT,,CRISPR,9864.0,chronic myelogenous leukemia,35.817112,cell_line,True,748.0,2584,human,...,62(mod)_pBA581,0.0,CONTROL_NONTARGETING,K562,51.0,,CONTROL_NONTARGETING,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,,CRISPR,5942.0,chronic myelogenous leukemia,31.605520,cell_line,True,331.0,1856,human,...,SNAI1_pDS266,0.0,SNAI1,K562,22.0,ENSG00000124216,SNAI1,protein_coding,chr20:49982980-49988886;1,20
TTTGACTGGACGAG,,CRISPR,8150.0,chronic myelogenous leukemia,36.797546,cell_line,True,745.0,2263,human,...,SNAI1_pDS266,0.0,SNAI1,K562,35.0,ENSG00000124216,SNAI1,protein_coding,chr20:49982980-49988886;1,20
TTTGCATGCCCGTT,,CRISPR,9179.0,chronic myelogenous leukemia,37.128227,cell_line,True,784.0,2473,human,...,SNAI1_pDS266,0.0,SNAI1,K562,37.0,ENSG00000124216,SNAI1,protein_coding,chr20:49982980-49988886;1,20
TTTGCATGCCGTTC,,CRISPR,15785.0,chronic myelogenous leukemia,37.465950,cell_line,True,1469.0,3167,human,...,62(mod)_pBA581,0.0,CONTROL_NONTARGETING,K562,79.0,,CONTROL_NONTARGETING,,,


### Add `perturbed_target_number` column

In [16]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [17]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [18]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (5752, 2)
--------------------------------------------------
               perturbation_name  perturbed_target_chromosome_encoding
index                                                                 
AAACATACACCGAT      CREB1_pDS269                                     2
AAACATACAGAGAT      SNAI1_pDS266                                    20
AAACATACCAGAAA    62(mod)_pBA581                                     0
AAACATACGTTGAC      EP300_pDS268                                    22
AAACATACTGTTCT    62(mod)_pBA581                                     0
...                          ...                                   ...
TTTGACTGGAAGGC      SNAI1_pDS266                                    20
TTTGACTGGACGAG      SNAI1_pDS266                                    20
TTTGCATGCCCGTT      SNAI1_pDS266                                    20
TTTGCATGCCGTTC    62(mod)_pBA581                                     0
TTTGCATGTTCTAC    62(mod)_pBA581                    

### Add metadata

In [19]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # treatment
        "treatment_label": None, 
        "treatment_id": None,
        # perturbation type
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None,
        "data_modality": "CRISPR screen",
        "significant": None,
        "significance_criteria": None,
        "score_interpretation": None,
        # model system
        "model_system_label": "cell_line",
        "model_system_id": None,
        # replicates
        "technical_replicate": None,
        "biological_replicate": None,

        'tissue': 'blood',
        "timepoint": "P0DT0H0M0S",
        
        "species": "Homo sapiens",
        "sex_label": "female", 
        "sex_id": None,
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None,

        "study_title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "study_year": 2016,
        "first_author": "Britt Adamson",
        "last_author": "Jonathan Weissman",
        
        "experiment_title": "6000 chronic myeloid leukemia (K562) cells transfected with gRNAs against 7 transcription factors + 1 control",
        "experiment_summary": "In a pilot experiment, single-cell RNA-seq was performed on a pool of individually transduced chronic myeloid leukemia cells (K562) carrying 8 distinct guide barcodes, analyzing \u223c6,000 cells total.",
        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],
        
        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "retrovirus transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentivirus transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive transgene expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive transgene expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",
        
        "library_manufacturer": "Weissman",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "1",
        "library_total_grnas": "8",
        "library_total_variants": None,
        
        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina HiSeq 2500",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "MAGeCK",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh37",
        
        "license_label": "CC BY 4.0",
        "license_id": "SWO:1000065",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSM2406675",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675",
                "dataset_description": "Raw counts",
                "dataset_file_name": "GSE90546_RAW.tar",
            },
            {
                "dataset_accession": "GSM2406675_10X001",
                "dataset_uri": "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
                "dataset_description": "Processed .h5ad file",
                "dataset_file_name": "AdamsonWeissman2016_GSM2406675_10X001.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column data_modality added to adata.obs
Column significant added to adata.obs
Column significance_criteria added to adata.obs
Column score_interpretation added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column technical_replicate added to adata.obs
Column biological_replicate added to adata.obs
Column tissue added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year added to adata.obs
Column first

In [20]:
cur_data.adata.obs

Unnamed: 0_level_0,guide_sequence,perturbation_type,ncounts,disease,percent_ribo,tissue_type,cancer,read count,ngenes,organism,...,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,license_label,license_id,associated_datasets
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT,,CRISPR,8138.0,chronic myelogenous leukemia,34.037846,cell_line,True,1286.0,2412,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACAGAGAT,,CRISPR,8980.0,chronic myelogenous leukemia,40.011135,cell_line,True,296.0,2386,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACCAGAAA,,CRISPR,28610.0,chronic myelogenous leukemia,40.003494,cell_line,True,1829.0,4404,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACGTTGAC,,CRISPR,11346.0,chronic myelogenous leukemia,35.184204,cell_line,True,1580.0,2815,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACTGTTCT,,CRISPR,9864.0,chronic myelogenous leukemia,35.817112,cell_line,True,748.0,2584,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,,CRISPR,5942.0,chronic myelogenous leukemia,31.605520,cell_line,True,331.0,1856,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
TTTGACTGGACGAG,,CRISPR,8150.0,chronic myelogenous leukemia,36.797546,cell_line,True,745.0,2263,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
TTTGCATGCCCGTT,,CRISPR,9179.0,chronic myelogenous leukemia,37.128227,cell_line,True,784.0,2473,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
TTTGCATGCCGTTC,,CRISPR,15785.0,chronic myelogenous leukemia,37.465950,cell_line,True,1469.0,3167,human,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."


### Curate tissue information

In [21]:
cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell type information

In [22]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell line information

In [23]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate disease information

In [24]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Match schema column order

In [25]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [26]:
cur_data.validate_data(slot='obs')

2025-12-12 10:25:00,653 DEBUG curation_tools.curation_tools: Applying dtype casting on adata.obs for columns: ['dataset_id', 'sample_id', 'data_modality', 'significant', 'significance_criteria', 'perturbation_name', 'perturbed_target_coord', 'perturbed_target_chromosome', 'perturbed_target_chromosome_encoding', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_biotype', 'guide_sequence', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'technical_replicate', 'biological_replicate', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_label', 'disease_id', 'study_title', 'study_uri', 'study_year', 'first_author', 'last_author', 'experiment_title', 'experiment_summary', 'number_of_perturbed_targets', 'number_of

Unnamed: 0,dataset_id,sample_id,data_modality,significant,significance_criteria,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,...,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,score_interpretation,reference_genome_id,reference_genome_label,associated_datasets,license_label,license_id
0,adamson_2016_pilot,1,CRISPR screen,,,CREB1_pDS269,chr2:207529737-207605988;1,2,2,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
1,adamson_2016_pilot,2,CRISPR screen,,,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
2,adamson_2016_pilot,3,CRISPR screen,,,62(mod)_pBA581,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
3,adamson_2016_pilot,4,CRISPR screen,,,EP300_pDS268,chr22:41091816-41180077;1,22,22,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
4,adamson_2016_pilot,5,CRISPR screen,,,62(mod)_pBA581,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5747,adamson_2016_pilot,5748,CRISPR screen,,,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
5748,adamson_2016_pilot,5749,CRISPR screen,,,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
5749,adamson_2016_pilot,5750,CRISPR screen,,,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
5750,adamson_2016_pilot,5751,CRISPR screen,,,62(mod)_pBA581,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065


# VAR slot curation

### Standardise genes

In [27]:
cur_data.create_columns(
    slot = 'var',
    col_dict={'gene_symbol_input': cur_data.adata.var.index},
    overwrite=True
)

Column gene_symbol_input added to adata.var


In [28]:
cur_data.show_var()

Variable data:
DataFrame shape: (35635, 4)
--------------------------------------------------
                   ensembl_id  ncounts  ncells gene_symbol_input
gene_symbol                                                     
MIR1302-10    ENSG00000243485      0.0       0        MIR1302-10
FAM138A       ENSG00000237613      0.0       0           FAM138A
OR4F5         ENSG00000186092      0.0       0             OR4F5
RP11-34P13.7  ENSG00000238009      0.0       0      RP11-34P13.7
RP11-34P13.8  ENSG00000239945      1.0       1      RP11-34P13.8
...                       ...      ...     ...               ...
MT-ND4L       ENSG00000212907      0.0       0           MT-ND4L
MT-ND4        ENSG00000198886      0.0       0            MT-ND4
MT-ND5        ENSG00000198786      0.0       0            MT-ND5
MT-ND6        ENSG00000198695      0.0       0            MT-ND6
MT-CYB        ENSG00000198727      0.0       0            MT-CYB

[35635 rows x 4 columns]
-----------------------------------

In [29]:
cur_data.standardize_genes(
    slot="var",
    input_column="ensembl_id",
    input_column_type="ensembl_gene_id",
    remove_version=False,
    multiple_entries=False
)

2025-12-12 10:25:01,630 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Missing Ensembl IDs: ['ENSG00000223367', 'ENSG00000269733', 'ENSG00000240440', 'ENSG00000198374', 'ENSG00000253115', 'ENSG00000267988', 'ENSG00000259243', 'ENSG00000223980', 'ENSG00000261091', 'ENSG00000267964', 'ENSG00000261059', 'ENSG00000269810', 'ENSG00000236632', 'ENSG00000269674', 'ENSG00000269751', 'ENSG00000262028', 'ENSG00000257601', 'ENSG00000260666', 'ENSG00000235769', 'ENSG00000263243', 'ENSG00000267816', 'ENSG00000224313', 'ENSG00000263074', 'ENSG00000268765', 'ENSG00000258414', 'ENSG00000268427', 'ENSG00000234876', 'ENSG00000260141', 'ENSG00000263141', 'ENSG00000264416', 'ENSG00000263060', 'ENSG00000155984', 'ENSG00000237508', 'ENSG00000262088', 'ENSG00000226404', 'ENSG00000270595', 'ENSG00000273266', 'ENSG00000268212', 'ENSG00000229767', 'ENSG00000241180', 'ENSG00000236524', 'ENSG00000234631', 'ENSG00000268637', 'ENSG00000261445', 'ENSG00000251416', 'ENSG00000272552', 'ENSG00000227305', 'ENSG00000244214', 'ENSG00000272993', 'ENSG00000262868', 'ENSG00000223808', 'ENSG0000

2025-12-12 10:25:09,105 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93960
2025-12-12 10:25:09,176 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 501 to 1000...


2025-12-12 10:25:18,105 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93489
2025-12-12 10:25:18,169 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 1001 to 1500...


2025-12-12 10:25:22,723 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 94420
2025-12-12 10:25:22,845 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 1501 to 2000...


2025-12-12 10:25:27,740 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93928
2025-12-12 10:25:27,921 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 2001 to 2500...


2025-12-12 10:25:32,595 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 96192
2025-12-12 10:25:32,766 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 2501 to 3000...


2025-12-12 10:25:39,225 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93767
2025-12-12 10:25:39,427 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 3001 to 3500...


2025-12-12 10:25:44,748 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93165
2025-12-12 10:25:44,925 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 3501 to 4000...


2025-12-12 10:25:49,183 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 94164
2025-12-12 10:25:49,365 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 4001 to 4500...


2025-12-12 10:25:53,840 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93807
2025-12-12 10:25:54,004 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 4501 to 5000...


2025-12-12 10:25:59,163 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93591
2025-12-12 10:25:59,373 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 5001 to 5338...


2025-12-12 10:26:02,502 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 63712


Fetched latest Ensembl IDs: {'ENSG00000269733': 'ENSG00000183571', 'ENSG00000240440': 'ENSG00000223725', 'ENSG00000198374': 'ENSG00000278677', 'ENSG00000253115': 'ENSG00000254288', 'ENSG00000267988': nan, 'ENSG00000259243': nan, 'ENSG00000261091': 'ENSG00000256943', 'ENSG00000267964': nan, 'ENSG00000261059': 'ENSG00000171102', 'ENSG00000269810': nan, 'ENSG00000269674': 'ENSG00000241743', 'ENSG00000269751': nan, 'ENSG00000262028': 'ENSG00000255561', 'ENSG00000257601': nan, 'ENSG00000260666': 'ENSG00000249942', 'ENSG00000267816': 'ENSG00000010404', 'ENSG00000263074': 'ENSG00000141337', 'ENSG00000268765': 'ENSG00000165168', 'ENSG00000258414': 'ENSG00000151338', 'ENSG00000268427': nan, 'ENSG00000260141': 'ENSG00000291268', 'ENSG00000264416': 'ENSG00000215784', 'ENSG00000263060': nan, 'ENSG00000155984': nan, 'ENSG00000262088': 'ENSG00000118181', 'ENSG00000270595': nan, 'ENSG00000273266': nan, 'ENSG00000268212': 'ENSG00000205081', 'ENSG00000241180': 'ENSG00000272438', 'ENSG00000236524': nan,

Because such a large number of ENSG are unmapped, replace the unmapped ones with the original IDs


In [30]:
cur_data.adata.var.loc[cur_data.adata.var['ensembl_gene_id'].isna(), 'ensembl_gene_id'] = cur_data.adata.var.loc[cur_data.adata.var['ensembl_gene_id'].isna(), 'ensembl_id']
cur_data.adata.var.loc[cur_data.adata.var['gene_symbol'].isna(), 'gene_symbol'] = cur_data.adata.var.loc[
    cur_data.adata.var['gene_symbol'].isna()].index

### Validate var metadata

In [31]:
cur_data.validate_data(slot='var')

2025-12-12 10:26:04,012 INFO curation_tools.curation_tools: adata.var is valid according to the var_schema.
2025-12-12 10:26:04,013 DEBUG curation_tools.curation_tools: Validated adata.var preview (shape=(35635, 2)):
              ensembl_gene_id   gene_symbol
index                                      
MIR1302-10    ENSG00000243485   MIR1302-2HG
FAM138A       ENSG00000237613       FAM138A
OR4F5         ENSG00000186092         OR4F5
RP11-34P13.7  ENSG00000238009  RP11-34P13.7
RP11-34P13.8  ENSG00000239945  RP11-34P13.8


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,RP11-34P13.7
RP11-34P13.8,ENSG00000239945,RP11-34P13.8
...,...,...
MT-ND4L,ENSG00000212907,MT-ND4L
MT-ND4,ENSG00000198886,MT-ND4
MT-ND5,ENSG00000198786,MT-ND5
MT-ND6,ENSG00000198695,MT-ND6


# Save the dataset

In [32]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)
... storing 'dataset_id' as categorical
... storing 'data_modality' as categorical
... storing 'significance_criteria' as categorical
... storing 'perturbation_name' as categorical
... storing 'perturbed_target_coord' as categorical
... storing 'perturbed_target_chromosome' as categorical
... storing 'perturbed_target_ensg' as categorical
... storing 'perturbed_target_symbol' as categorical
... storing 'perturbed_target_biotype' as categorical
... storing 'perturbation_type_label' as categorical
... storing 'perturbation_type_id' as categorical
... storing 'timepoint' as categorical
... storing 'treatment_label' as categorical
... storing 'treatment_id' as categorical
... storing 'technical_replicate' as categorical
... storing 'biological_replicate' as categorical
... storing 'model_system_label' as categorical
... storing 'model_system_id' as categorical
... storing 'species' as categorical
... storing 'tissue_label' as categorical
... sto

✅ Curated h5ad data saved to ../curated/h5ad/adamson_2016_pilot_curated.h5ad


In [33]:
cur_data.save_curated_data_parquet(split_metadata=True, save_metadata_only=True)

Files ../curated/parquet/adamson_2016_pilot_curated_data.parquet or ../curated/parquet/adamson_2016_pilot_curated_metadata.parquet already exist. Skipping write.


# Upload to BigQuery


In [34]:
upload_parquet_to_bq(
    parquet_path='../curated/parquet/adamson_2016_pilot_curated_metadata.parquet',
    bq_dataset_id='prj-ext-dev-pertcat-437314.perturb_seq',
    bq_table_name='metadata',
    key_columns=['dataset_id', 'sample_id'],
    verbose=True
)

2025-12-12 10:26:04,579 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-12 10:26:04,580 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-12 10:26:05,703 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-12 10:26:05,706 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-12 10:26:06,429 DEBUG google.cloud.bigquery.opentelemetry_tracing: This service is instrumented using OpenTelemetry. OpenTelemetry or one of its components could not be imported; please add compatible versions of opentelemetry-api and opentelemetry-instrumentation packages in order to get BigQuery Tracing data.
2025-12-12 10:26:06,432 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-12 10:26:06,434 DEBUG google.auth.transport.requests: Making reques

Staging table: loading `.parquet` file ../curated/parquet/adamson_2016_pilot_curated_metadata.parquet to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging...


2025-12-12 10:26:07,495 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable HTTP/1.1" 200 0
2025-12-12 10:26:08,173 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "PUT /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable&upload_id=AHVrFxNgNPJr3qi9JgKv_sTpMvaHbSe2jDuQQi0AS_pOfb8WGd26kPZgHwSxXf3l-rBqzrBsrd9Z8YCwqbWcfmhBMXBEohn3Sv7C95fHTJTERxY HTTP/1.1" 200 14002
2025-12-12 10:26:08,321 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/f1e21525-b30e-4ad9-abcc-0b92383cde37?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:26:08,323 DEBUG google.api_core.retry: Retrying due to , sleeping 0.6s ...
2025-12-12 10:26:09,055 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-

Staging table: loaded 5752 rows to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-12 10:26:10,761 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-12 10:26:10,761 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-12 10:26:11,186 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-12 10:26:11,187 DEBUG google.auth.transport.requests: Making request: POST https://oauth2.googleapis.com/token
2025-12-12 10:26:11,188 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): oauth2.googleapis.com:443
2025-12-12 10:26:11,293 DEBUG urllib3.connectionpool: https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
2025-12-12 10:26:11,295 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): bigquery.googleapis.com:443
2025-12-12 10:26:11,841 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?pre

Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-12 10:26:15,638 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/datasets/perturb_seq/tables/metadata?prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:26:16,043 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:26:16,145 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/74a6b153-a2df-4bdb-81af-b454d9ad767c?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:26:18,454 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/queries/74a6b153-a2df-4bdb-81af-b454d9ad767c?maxResults=0&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:26:18,605 DEBUG urllib3.connectionpool: https://bigquery.go

Merge completed: staging → prj-ext-dev-pertcat-437314.perturb_seq.metadata with type-safe casting.
Staging table: deleted prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


# Upload to GC Storage


In [35]:
!gcloud storage cp ../curated/h5ad/adamson_2016_pilot_curated.h5ad gs://perturbation-catalogue-lake/perturbseq/curated/

Copying file://../curated/h5ad/adamson_2016_pilot_curated.h5ad to gs://perturbation-catalogue-lake/perturbseq/curated/adamson_2016_pilot_curated.h5ad
  Completed files 1/1 | 118.7MiB/118.7MiB | 1.9MiB/s                           

Average throughput: 2.3MiB/s
