# Import

In [1]:
import pandas as pd
import json

from curation_tools.curation_tools import (
    CuratedDataset,
    ObsSchema,
    VarSchema,
    Experiment,
    download_file,
    upload_parquet_to_bq
)

import logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("curation.log"),
        logging.StreamHandler(),  # keep console output too
    ],
    force=True,
)

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



# Download the dataset

In [2]:
noncurated_path = "../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad"
download_file(
    url="https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad",
    dest_path=noncurated_path
)

File ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad already exists. Skipping download.


# Initialise the dataset object

In [3]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    noncurated_path=noncurated_path
)

cur_data.load_data()

2025-12-11 10:21:26,984 DEBUG h5py._conv: Creating converter from 3 to 5


Loading data from ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad


In [4]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      2.0       2
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      1.0       1
RP11-34P13.8  ENSG00000239945      2.0       2
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [5]:
cur_data.adata.obs

Unnamed: 0_level_0,perturbation,read count,UMI count,tissue_type,cell_line,cancer,disease,perturbation_type,celltype,organism,ncounts,ngenes,percent_mito,percent_ribo,nperts
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACATACACTCAG,3x_neg_ctrl_pMJ144-1,261.0,59.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,24343.0,4164,4.563941,32.629505,4
AAACATACTCCTAT,3x_neg_ctrl_pMJ144-2,132.0,37.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,27678.0,4428,4.508996,28.658140,4
AAACATTGCAGAGG,3x_neg_ctrl_pMJ144-2,560.0,117.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,24745.0,4371,3.200647,31.117397,4
AAACATTGGCGAAG,ATF6_PERK_IRE1_pMJ158,215.0,49.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,30836.0,4551,3.132702,29.890388,4
AAACCGTGATACCG,ATF6_PERK_pMJ150,567.0,124.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,33314.0,4539,6.588821,35.984871,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGCTTAG,PERK_only_pMJ146,215.0,60.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,34472.0,4645,3.437572,31.933163,3
TTTGACTGGGGATG,PERK_IRE1_pMJ154,64.0,16.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,27671.0,4655,4.033103,27.219833,3
TTTGACTGTGGTCA,3x_neg_ctrl_pMJ144-1,218.0,43.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,26795.0,4238,4.545624,31.991043,4
TTTGCATGCGGAGA,PERK_IRE1_pMJ154,284.0,45.0,cell_line,K562,True,chronic myelogenous leukemia,CRISPR,lymphoblasts,human,28974.0,4382,2.626493,30.271969,3


# OBS slot curation

### Show unique perturbations

In [6]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 21
--------------------------------------------------
{nan,
 '*',
 '3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


### Drop NAs

In [7]:
cur_data.remove_na(slot = 'obs', column = 'perturbation')

Removed 296 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [8]:
cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')

Removed 13 entries \* from column perturbation of adata.obs


  cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')


In [9]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 19
--------------------------------------------------
{'3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


### Rename `perturbation` to `perturbation_name`

In [10]:
cur_data.rename_columns(slot = 'obs', name_dict = {'perturbation': 'perturbation_name'})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [11]:
# Not reported by study authors for this dataset
cur_data.create_columns(
    slot='obs',
    col_dict={
        'guide_sequence': None
    }
)

Column guide_sequence added to adata.obs


### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [12]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol_input": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol_input')

Column perturbed_target_symbol_input added to adata.obs
Unique values in adata.obs.perturbed_target_symbol_input: 19
--------------------------------------------------
{'3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


#### Clean up `perturbation_target_symbol` column

In [13]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol_input",
    map_dict={
        r"3x_neg_ctrl.*": "control_nontargeting",
        r"Gal4-4.*": "control_nontargeting",
        r"_(pM|pD|pB|only).*": ""
    }
)

cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol_input')

Replaced '3x_neg_ctrl.*' with 'control_nontargeting' in column perturbed_target_symbol_input of adata.obs
Replaced 'Gal4-4.*' with 'control_nontargeting' in column perturbed_target_symbol_input of adata.obs
Replaced '_(pM|pD|pB|only).*' with '' in column perturbed_target_symbol_input of adata.obs
Unique values in adata.obs.perturbed_target_symbol_input: 16
--------------------------------------------------
{'ATF4',
 'ATF6',
 'ATF6_IRE1',
 'ATF6_PERK',
 'ATF6_PERK_IRE1',
 'C7ORF26',
 'CONTROL_NONTARGETING',
 'IER3IP1',
 'IRE1',
 'PERK',
 'PERK_IRE1',
 'PSMA1',
 'PSMD12',
 'SNAI1',
 'XBP1',
 'YIPF5'}
--------------------------------------------------


  if df[column].str.upper().str.contains(old_val.upper()).any():


### Standardise perturbation targets

In [14]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol_input',
    input_column_type='gene_symbol',
    multiple_entries=True,
    multiple_entries_sep='_',
    # remove_version=True,
    # version_sep='.'
)

Mapping gene symbols: 100%|███████████████████████████████████████| 12/12 [00:00<00:00, 9146.22it/s]


--------------------------------------------------
Successfully mapped 11 out of 12 gene symbols.
--------------------------------------------------
Couldn't map gene symbols: ['CONTROL_NONTARGETING']
--------------------------------------------------
Collapsed column cell_barcode using separator |


### Add `perturbed_target_number` column

In [15]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [16]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [17]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (14697, 2)
--------------------------------------------------
                    perturbation_name  perturbed_target_chromosome_encoding
index                                                                      
AAACATACACTCAG   3x_neg_ctrl_pMJ144-1                                     0
AAACATACTCCTAT   3x_neg_ctrl_pMJ144-2                                     0
AAACATTGCAGAGG   3x_neg_ctrl_pMJ144-2                                     0
AAACATTGGCGAAG  ATF6_PERK_IRE1_pMJ158                                     0
AAACCGTGATACCG       ATF6_PERK_pMJ150                                     0
...                               ...                                   ...
TTTGACTGGCTTAG       PERK_only_pMJ146                                     2
TTTGACTGGGGATG       PERK_IRE1_pMJ154                                     0
TTTGACTGTGGTCA   3x_neg_ctrl_pMJ144-1                                     0
TTTGCATGCGGAGA       PERK_IRE1_pMJ154                              

### Add metadata

### Add treatment information

Data is stored in a separate file, as part of the `cell BC` column's suffix


In [18]:
orig_cell_ident_link = r"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2406nnn/GSM2406677/suppl/GSM2406677%5F10X005%5Fcell%5Fidentities.csv.gz"
orig_cell_ident = pd.read_csv(orig_cell_ident_link)

# the number after the dash is the cell barcode indicates the treatment
display(orig_cell_ident[["cell BC", "guide identity"]].head())

treatment_label_map = {"1": "tunicamycin", "2": "thapsigargin", "3": "DMSO"}

treatment_df = (
    orig_cell_ident["cell BC"]
    .str.split("-", expand=True)
    .rename(columns={0: "cell_barcode", 1: "treatment_number"})
)

treatment_df["treatment_temp"] = treatment_df["treatment_number"].map(
    treatment_label_map
)

treatment_df = (
    treatment_df.drop_duplicates(subset=["cell_barcode"])
    .drop(columns=["treatment_number"])
    .set_index("cell_barcode")
)

treatment_df

Unnamed: 0,cell BC,guide identity
0,ACGGTATGCTTAGG-3,PERK_IRE1_pMJ154
1,ACAATCCTACCCTC-1,PERK_IRE1_pMJ154
2,ACGAACACGTGCTA-3,ATF6_PERK_IRE1_pMJ158
3,CTGTGAGATTGGTG-1,ATF6_PERK_IRE1_pMJ158
4,ATGTTGCTAATCGC-2,3x_neg_ctrl_pMJ144-2


Unnamed: 0_level_0,treatment_temp
cell_barcode,Unnamed: 1_level_1
ACGGTATGCTTAGG,DMSO
ACAATCCTACCCTC,tunicamycin
ACGAACACGTGCTA,DMSO
CTGTGAGATTGGTG,tunicamycin
ATGTTGCTAATCGC,thapsigargin
...,...
CGTAACGAGTTGCA,DMSO
CCATGCTGGCTTCC,DMSO
CCCTCAGAAAAGTG,DMSO
TCAAGTCTAGGTCT,thapsigargin


Add treatment information to the dataset

In [19]:
cur_data.adata.obs = cur_data.adata.obs.merge(
    treatment_df,
    left_index=True,
    right_index=True,
    how="left"
)

Map treatment compounds to CHEBI

In [20]:
cur_data.standardize_compounds(column='treatment_temp')

2025-12-11 10:21:52,580 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): www.ebi.ac.uk:443
2025-12-11 10:21:52,736 DEBUG urllib3.connectionpool: https://www.ebi.ac.uk:443 "GET /chebi/backend/api/public/es_search/?term=tunicamycin&page=1&size=1 HTTP/1.1" 200 None
2025-12-11 10:21:52,741 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): www.ebi.ac.uk:443
2025-12-11 10:21:52,861 DEBUG urllib3.connectionpool: https://www.ebi.ac.uk:443 "GET /chebi/backend/api/public/es_search/?term=thapsigargin&page=1&size=1 HTTP/1.1" 200 None
2025-12-11 10:21:52,866 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): www.ebi.ac.uk:443
2025-12-11 10:21:52,989 DEBUG urllib3.connectionpool: https://www.ebi.ac.uk:443 "GET /chebi/backend/api/public/es_search/?term=DMSO&page=1&size=1 HTTP/1.1" 200 None


Successfully mapped 3/3 compounds: ['tunicamycin', 'thapsigargin', 'DMSO']


  return dispatch(args[0].__class__)(*args, **kw)


Unnamed: 0,original_name,treatment_label,treatment_id
0,tunicamycin,tunicamycin,CHEBI:29699
1,thapsigargin,thapsigargin,CHEBI:9516
3,DMSO,dimethyl sulfoxide,CHEBI:28262


In [21]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # treatment
        # "treatment_label": None, 
        # "treatment_id": None,
        # perturbation type
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None,
        "data_modality": "CRISPR screen",
        "significant": None,
        "significance_criteria": None,
        "score_interpretation": None,
        # model system
        "model_system_label": "cell_line",
        "model_system_id": None,
        # replicates
        "technical_replicate": None,
        "biological_replicate": None,

        'tissue': 'blood',
        "timepoint": "P0DT0H0M0S",
        
        "species": "Homo sapiens",
        "sex_label": "female", 
        "sex_id": None,
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None,

        "study_title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "study_year": 2016,
        "first_author": "Britt Adamson",
        "last_author": "Jonathan Weissman",
        
        "experiment_title": "14595 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs in every combination (singly with controls, doubly with a control, or triply).",
        "experiment_summary": "Using our final three-guide Perturb-seq vector to simultaneously deliver 3 sgRNAs, we individually transduced K562 cells expressing dCas9-KRAB (cBA010) with constructs that targeted all three UPR sensor genes in every combination (singly with controls, doubly with a control, or triply). Transduced cells were then pooled and selected. After 2 days of combined growth, the cells were treated with DMSO for 6 hr, 4 μg/mL tunicamycin (Tm) for 6 hr, or 100 nM thapsigargin (Tg) for 4 hr and were profiled by Perturb-seq (24 conditions in total).",
        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],
        
        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "retrovirus transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentivirus transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive transgene expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive transgene expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",
        
        "library_manufacturer": "Weissman",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "1",
        "library_total_grnas": "16",
        "library_total_variants": None,
        
        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina HiSeq 4000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "MAGeCK",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh37",
        
        "license_label": "CC BY 4.0",
        "license_id": "SWO:1000065",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSM2406675",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675",
                "dataset_description": "Raw counts",
                "dataset_file_name": "GSE90546_RAW.tar",
            },
            {
                "dataset_accession": "GSM2406675_10X001",
                "dataset_uri": "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
                "dataset_description": "Processed .h5ad file",
                "dataset_file_name": "AdamsonWeissman2016_GSM2406675_10X001.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column data_modality added to adata.obs
Column significant added to adata.obs
Column significance_criteria added to adata.obs
Column score_interpretation added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column technical_replicate added to adata.obs
Column biological_replicate added to adata.obs
Column tissue added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year added to adata.obs
Column first_author added to adata.obs
Column last_author added to adata.obs
Column experimen

### Curate tissue information

In [22]:
cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell type information

In [23]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell line information

In [24]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate disease information

In [25]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Match schema column order

In [26]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [27]:
cur_data.validate_data(slot='obs')

2025-12-11 10:27:58,358 DEBUG curation_tools.curation_tools: Applying dtype casting on adata.obs for columns: ['dataset_id', 'sample_id', 'data_modality', 'significant', 'significance_criteria', 'perturbation_name', 'perturbed_target_coord', 'perturbed_target_chromosome', 'perturbed_target_chromosome_encoding', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_biotype', 'guide_sequence', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'technical_replicate', 'biological_replicate', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_label', 'disease_id', 'study_title', 'study_uri', 'study_year', 'first_author', 'last_author', 'experiment_title', 'experiment_summary', 'number_of_perturbed_targets', 'number_of

Unnamed: 0,dataset_id,sample_id,data_modality,significant,significance_criteria,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,...,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,score_interpretation,reference_genome_id,reference_genome_label,associated_datasets,license_label,license_id
0,adamson_2016_upr_epistasis,1,CRISPR screen,,,3x_neg_ctrl_pMJ144-1,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
1,adamson_2016_upr_epistasis,2,CRISPR screen,,,3x_neg_ctrl_pMJ144-2,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
2,adamson_2016_upr_epistasis,3,CRISPR screen,,,3x_neg_ctrl_pMJ144-2,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
3,adamson_2016_upr_epistasis,4,CRISPR screen,,,ATF6_PERK_IRE1_pMJ158,chr1:161766261-161977574;1|chr2:88556741-88691...,1|2|17,0,3,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
4,adamson_2016_upr_epistasis,5,CRISPR screen,,,ATF6_PERK_pMJ150,chr1:161766261-161977574;1|chr2:88556741-88691...,1|2,0,2,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,adamson_2016_upr_epistasis,14693,CRISPR screen,,,PERK_only_pMJ146,chr2:88556741-88691518;-1,2,2,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
14693,adamson_2016_upr_epistasis,14694,CRISPR screen,,,PERK_IRE1_pMJ154,chr2:88556741-88691518;-1|chr17:64039080-64130...,2|17,0,2,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
14694,adamson_2016_upr_epistasis,14695,CRISPR screen,,,3x_neg_ctrl_pMJ144-1,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065
14695,adamson_2016_upr_epistasis,14696,CRISPR screen,,,PERK_IRE1_pMJ154,chr2:88556741-88691518;-1|chr17:64039080-64130...,2|17,0,2,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_...",CC BY 4.0,SWO:1000065


# VAR slot curation

### Standardise genes

In [28]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      2.0       2
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      1.0       1
RP11-34P13.8  ENSG00000239945      2.0       2
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [29]:
cur_data.standardize_genes(
    slot="var",
    input_column="ensembl_id",
    input_column_type="ensembl_gene_id",
    remove_version=False,
    multiple_entries=False
)

2025-12-11 10:28:53,351 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Missing Ensembl IDs: ['ENSG00000261720', 'ENSG00000263443', 'ENSG00000232056', 'ENSG00000256316', 'ENSG00000263203', 'ENSG00000268851', 'ENSG00000267242', 'ENSG00000221870', 'ENSG00000239446', 'ENSG00000260065', 'ENSG00000255124', 'ENSG00000270169', 'ENSG00000233906', 'ENSG00000269049', 'ENSG00000241732', 'ENSG00000235242', 'ENSG00000154035', 'ENSG00000269881', 'ENSG00000248986', 'ENSG00000215504', 'ENSG00000228139', 'ENSG00000268000', 'ENSG00000205850', 'ENSG00000226032', 'ENSG00000262194', 'ENSG00000268162', 'ENSG00000235423', 'ENSG00000180081', 'ENSG00000269072', 'ENSG00000233517', 'ENSG00000267037', 'ENSG00000258297', 'ENSG00000227011', 'ENSG00000230925', 'ENSG00000268822', 'ENSG00000261159', 'ENSG00000255080', 'ENSG00000254144', 'ENSG00000203836', 'ENSG00000273395', 'ENSG00000225205', 'ENSG00000269510', 'ENSG00000268759', 'ENSG00000268856', 'ENSG00000264212', 'ENSG00000268948', 'ENSG00000268341', 'ENSG00000266118', 'ENSG00000270147', 'ENSG00000229510', 'ENSG00000250003', 'ENSG0000

2025-12-11 10:29:04,189 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93558
2025-12-11 10:29:04,257 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 501 to 1000...


2025-12-11 10:29:11,043 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 94705
2025-12-11 10:29:11,117 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 1001 to 1500...


2025-12-11 10:29:17,018 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93890
2025-12-11 10:29:17,098 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 1501 to 2000...


2025-12-11 10:29:25,577 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 96093
2025-12-11 10:29:25,641 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 2001 to 2500...


2025-12-11 10:29:31,381 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93536
2025-12-11 10:29:31,431 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 2501 to 2577...


2025-12-11 10:29:32,465 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 14766


Fetched latest Ensembl IDs: {'ENSG00000261720': 'ENSG00000292430', 'ENSG00000263443': 'ENSG00000253819', 'ENSG00000232056': 'ENSG00000271952', 'ENSG00000256316': 'ENSG00000278272', 'ENSG00000263203': nan, 'ENSG00000268851': nan, 'ENSG00000267242': nan, 'ENSG00000221870': nan, 'ENSG00000239446': nan, 'ENSG00000260065': nan, 'ENSG00000255124': nan, 'ENSG00000270169': nan, 'ENSG00000233906': 'ENSG00000283982', 'ENSG00000269049': nan, 'ENSG00000241732': nan, 'ENSG00000235242': 'ENSG00000286513', 'ENSG00000154035': nan, 'ENSG00000269881': nan, 'ENSG00000248986': nan, 'ENSG00000215504': nan, 'ENSG00000228139': 'ENSG00000101883', 'ENSG00000268000': nan, 'ENSG00000205850': nan, 'ENSG00000226032': 'ENSG00000164691', 'ENSG00000262194': nan, 'ENSG00000268162': nan, 'ENSG00000235423': nan, 'ENSG00000180081': nan, 'ENSG00000269072': 'ENSG00000268595', 'ENSG00000233517': 'ENSG00000272568', 'ENSG00000267037': nan, 'ENSG00000258297': nan, 'ENSG00000227011': 'ENSG00000226364', 'ENSG00000230925': nan, '

Because such a large number of ENSG are unmapped, replace the unmapped ones with the original IDs

In [32]:
cur_data.adata.var.loc[cur_data.adata.var['ensembl_gene_id'].isna(), 'ensembl_gene_id'] = cur_data.adata.var.loc[cur_data.adata.var['ensembl_gene_id'].isna(), 'ensembl_id']
cur_data.adata.var.loc[cur_data.adata.var['gene_symbol'].isna(), 'gene_symbol'] = cur_data.adata.var.loc[
    cur_data.adata.var['gene_symbol'].isna()].index

In [37]:
cur_data.adata.var

Unnamed: 0_level_0,ncounts,ncells,ensembl_id,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MIR1302-10,2.0,2,ENSG00000243485,ENSG00000243485,MIR1302-2HG
FAM138A,0.0,0,ENSG00000237613,ENSG00000237613,FAM138A
OR4F5,0.0,0,ENSG00000186092,ENSG00000186092,OR4F5
RP11-34P13.7,1.0,1,ENSG00000238009,ENSG00000238009,RP11-34P13.7
RP11-34P13.8,2.0,2,ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...,...,...,...
AC145205.1,0.0,0,ENSG00000215635,ENSG00000215635,AC145205.1
BAGE5,0.0,0,ENSG00000268590,ENSG00000268590,BAGE5
CU459201.1,0.0,0,ENSG00000251180,ENSG00000251180,CU459201.1
AC002321.2,0.0,0,ENSG00000215616,ENSG00000215616,AC002321.2


### Validate var metadata

In [38]:
cur_data.validate_data(slot='var')

2025-12-11 14:43:48,930 INFO curation_tools.curation_tools: adata.var is valid according to the var_schema.
2025-12-11 14:43:48,932 DEBUG curation_tools.curation_tools: Validated adata.var preview (shape=(32738, 2)):
              ensembl_gene_id   gene_symbol
index                                      
MIR1302-10    ENSG00000243485   MIR1302-2HG
FAM138A       ENSG00000237613       FAM138A
OR4F5         ENSG00000186092         OR4F5
RP11-34P13.7  ENSG00000238009  RP11-34P13.7
RP11-34P13.8  ENSG00000239945  RP11-34P13.8


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,RP11-34P13.7
RP11-34P13.8,ENSG00000239945,RP11-34P13.8
...,...,...
AC145205.1,ENSG00000215635,AC145205.1
BAGE5,ENSG00000268590,BAGE5
CU459201.1,ENSG00000251180,CU459201.1
AC002321.2,ENSG00000215616,AC002321.2


# Save the dataset

In [39]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)
... storing 'dataset_id' as categorical
... storing 'data_modality' as categorical
... storing 'significance_criteria' as categorical
... storing 'perturbation_name' as categorical
... storing 'perturbed_target_coord' as categorical
... storing 'perturbed_target_chromosome' as categorical
... storing 'perturbed_target_ensg' as categorical
... storing 'perturbed_target_symbol' as categorical
... storing 'perturbed_target_biotype' as categorical
... storing 'perturbation_type_label' as categorical
... storing 'perturbation_type_id' as categorical
... storing 'timepoint' as categorical
... storing 'treatment_label' as categorical
... storing 'treatment_id' as categorical
... storing 'technical_replicate' as categorical
... storing 'biological_replicate' as categorical
... storing 'model_system_label' as categorical
... storing 'model_system_id' as categorical
... storing 'species' as categorical
... storing 'tissue_label' as categorical
... sto

✅ Curated h5ad data saved to ../curated/h5ad/adamson_2016_upr_epistasis_curated.h5ad


In [40]:
cur_data.save_curated_data_parquet(split_metadata=True, save_metadata_only=True)

✅ Metadata saved to ../curated/parquet/adamson_2016_upr_epistasis_curated_metadata.parquet


# Upload to BigQuery


In [41]:
upload_parquet_to_bq(
    parquet_path='../curated/parquet/adamson_2016_upr_epistasis_curated_metadata.parquet',
    bq_dataset_id='prj-ext-dev-pertcat-437314.perturb_seq',
    bq_table_name='metadata',
    key_columns=['dataset_id', 'sample_id'],
    verbose=True
)

2025-12-11 14:46:34,218 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-11 14:46:34,221 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-11 14:46:37,564 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-11 14:46:37,565 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-11 14:46:37,933 DEBUG google.cloud.bigquery.opentelemetry_tracing: This service is instrumented using OpenTelemetry. OpenTelemetry or one of its components could not be imported; please add compatible versions of opentelemetry-api and opentelemetry-instrumentation packages in order to get BigQuery Tracing data.
2025-12-11 14:46:37,935 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-11 14:46:37,936 DEBUG google.auth.transport.requests: Making reques

Staging table: loading `.parquet` file ../curated/parquet/adamson_2016_upr_epistasis_curated_metadata.parquet to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging...


2025-12-11 14:46:40,025 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable HTTP/1.1" 200 0
2025-12-11 14:46:40,612 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "PUT /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable&upload_id=AHVrFxOAJWBrMs_8YvqLUg0qPGOvkpBmRGmYo04gl0qwg0sGnH9zriBma1HFDASFQoSvP-CQCCO61_Nd10N2et6kHNLMWGr5rE4hv-qw8SScqQ HTTP/1.1" 200 14002
2025-12-11 14:46:40,780 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/f99cfe35-bee0-41ea-9f1c-4ddfc1dfb5fe?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-11 14:46:40,782 DEBUG google.api_core.retry: Retrying due to , sleeping 0.1s ...
2025-12-11 14:46:40,975 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-4

Staging table: loaded 14697 rows to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-11 14:46:45,613 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-11 14:46:45,613 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-11 14:46:45,954 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-11 14:46:45,955 DEBUG google.auth.transport.requests: Making request: POST https://oauth2.googleapis.com/token
2025-12-11 14:46:45,956 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): oauth2.googleapis.com:443
2025-12-11 14:46:46,065 DEBUG urllib3.connectionpool: https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
2025-12-11 14:46:46,067 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): bigquery.googleapis.com:443
2025-12-11 14:46:46,452 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?pre

Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-11 14:47:07,322 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/datasets/perturb_seq/tables/metadata?prettyPrint=false HTTP/1.1" 200 None
2025-12-11 14:47:07,766 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?prettyPrint=false HTTP/1.1" 200 None
2025-12-11 14:47:07,849 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/19e4e3d1-f911-4ef2-9f17-03a33cc725b7?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-11 14:47:17,798 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/queries/19e4e3d1-f911-4ef2-9f17-03a33cc725b7?maxResults=0&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-11 14:47:17,903 DEBUG urllib3.connectionpool: https://bigquery.go

Merge completed: staging → prj-ext-dev-pertcat-437314.perturb_seq.metadata with type-safe casting.
Staging table: deleted prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


# Upload to GC Storage


In [42]:
!gcloud storage cp ../curated/h5ad/adamson_2016_upr_epistasis_curated.h5ad gs://perturbation-catalogue-lake/perturbseq/curated/

uploading large objects. If you would like to opt-out and instead
perform a normal upload, run:
`gcloud config set storage/parallel_composite_upload_enabled False`
`gcloud config set storage/parallel_composite_upload_enabled True`
Note that with parallel composite uploads, your object might be
uploaded as a composite object
(https://cloud.google.com/storage/docs/composite-objects), which means
that any user who downloads your object will need to use crc32c
checksums to verify data integrity. gcloud storage is capable of
computing crc32c checksums, but this might pose a problem for other
clients.

Copying file://../curated/h5ad/adamson_2016_upr_epistasis_curated.h5ad to gs://perturbation-catalogue-lake/perturbseq/curated/adamson_2016_upr_epistasis_curated.h5ad
  Completed files 11/1 | 483.2MiB/483.2MiB | 9.1MiB/s                          

Average throughput: 2.8MiB/s
