# Import

In [2]:
import pandas as pd
import json

from curation_tools.curation_tools import (
    CuratedDataset,
    ObsSchema,
    VarSchema,
    Experiment,
    download_file,
    upload_parquet_to_bq
)

import logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    handlers=[
        logging.FileHandler("curation.log"),
        logging.StreamHandler(),  # keep console output too
    ],
    force=True,
)

# Download the dataset

In [3]:
noncurated_path = "../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad"
download_file(
    url="https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
    dest_path=noncurated_path
)

2025-12-12 09:46:48,622 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): zenodo.org:443
2025-12-12 09:46:48,917 DEBUG urllib3.connectionpool: https://zenodo.org:443 "GET /record/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad HTTP/1.1" 301 317
2025-12-12 09:46:49,063 DEBUG urllib3.connectionpool: https://zenodo.org:443 "GET /records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad HTTP/1.1" 200 471286951


Downloaded https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad to ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad


# Initialise the dataset object

In [4]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    noncurated_path=noncurated_path
)

cur_data.load_data()

2025-12-12 10:00:08,787 DEBUG h5py._conv: Creating converter from 3 to 5


Loading data from ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad


In [5]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485     11.0      11
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945     43.0      43
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [6]:
cur_data.show_unique(slot="obs", column="perturbation")

Unique values in adata.obs.perturbation: 115
--------------------------------------------------
{nan,
 '*',
 '62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A

### Drop NAs

In [7]:
cur_data.remove_na(slot="obs", column="perturbation")

Removed 2613 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [8]:
cur_data.remove_entries(slot="obs", column="perturbation", to_remove=r"\*")

Removed 101 entries \* from column perturbation of adata.obs


In [9]:
cur_data.show_unique(slot="obs", column="perturbation")

Unique values in adata.obs.perturbation: 113
--------------------------------------------------
{'62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A_pDS393',
 '

### Rename `perturbation` to `perturbation_name`

In [10]:
cur_data.rename_columns(slot="obs", name_dict={"perturbation": "perturbation_name"})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [11]:
# download the protospacer sequences data from supplementary information Table S1
# https://ars.els-cdn.com/content/image/1-s2.0-S0092867416316609-mmc1.xlsx

guides_df = pd.read_csv("../supplementary/adamson_2016_pilot.csv", header=1)
display(guides_df)

guides_dict_vecid = (
    guides_df[["Perturb-seq_Vector_ID", "Protospacer"]]
    .dropna()
    .set_index("Perturb-seq_Vector_ID", drop=True)["Protospacer"]
    .to_dict()
)

guides_dict_vecid

Unnamed: 0,Gene,Protospacer,Guide_ID (synonymous with sgGuide_ID),Perturb-seq_Vector_ID,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,AARS,GAGGGCGGCCTACCTCTCCT,,pDS381,,,
1,AMIGO3/GMPPB,GGAACGCGACACCGGGTAGA,,pDS391,,,
2,AMIGO3/GMPPB,GGGGCCAGCAGCCGTCTACC,,pDS434,,,
3,ARHGAP22,GGTCCGTCCGGAGCCAGGAG,,pDS458,,,
4,ASCC3,GACGCAAAGACGCACAGACC,,pDS051,,,
...,...,...,...,...,...,...,...
97,YIPF5,GTGACACGTAGCAACGGGGC,,pDS226,,,
98,,GGCCAAACGTGCCCTGACGG,NegCtrl-1,,,,
99,,GCGATGGGGGGGTGGGTAGC,NegCtrl-2,,,,
100,,GACGACTAGTTAGGCGTGTA,NegCtrl-3,,,,


{'pDS381': 'GAGGGCGGCCTACCTCTCCT',
 'pDS391': 'GGAACGCGACACCGGGTAGA',
 'pDS434': 'GGGGCCAGCAGCCGTCTACC',
 'pDS458': 'GGTCCGTCCGGAGCCAGGAG',
 'pDS051': 'GACGCAAAGACGCACAGACC',
 'pDS052': 'GCGCACAGACCCGGCGAGGA',
 'pDS055': 'GAGTCTCCGCAAGGCCCCGG',
 'pDS468': 'GTAGGAGCCTCGGGCGCGCT',
 'pDS460': 'GAGCCATGGCAGATTCCTCC',
 'pDS006': 'GCGACGTCCGAGCATTCCA',
 'pDS024': 'GCGCTGGTGGTCGATCGTG',
 'pDS065': 'GCGGCTATGAACCGCAGCAG',
 'pDS462': 'GGATGCTGTGGTGTCCACAG',
 'pDS499': 'GACCTTGCGTGCAGTTATGT',
 'pDS495': 'GTGAGACCCCAGGGTCGGGA',
 'pDS382': 'GTGGGTCCTTCGGCAGGAGG',
 'pDS041': 'GCGGTCCACAAAGGCTCAGA',
 'pDS042': 'GGTAGGCGCGGCTGACCGGT',
 'pDS383': 'GGCGCCCAGCGGAGCTAATC',
 'pDS026': 'GGGCGCCTGTGCTTGAGGTT',
 'pDS074': 'GCTTGCCTGGAACTCCTGTA',
 'pDS463': 'GTAGCTGCCTTCAGCCTTCAC',
 'pDS508': 'GCCATTGGGCTGTCAGTCAG',
 'pDS491': 'GCTGAGGGCGATGGCTGCTG',
 'pDS386': 'GAGACTTGCTTCCCCCTCAC',
 'pDS390': 'GTCACTGTAGGTGCGGCCCA',
 'pDS494': 'GGCAGCCTCGGCCCGAGTCC',
 'pDS043': 'GCAGGGCTCGCGCAGTTACC',
 'pDS506': 'GCAGGGCCG

In [12]:
# extract vector IDs from perturbation_names
cur_data.create_columns(
    slot="obs",
    col_dict={
        "vector_id": cur_data.adata.obs["perturbation_name"].str.split("_").str[1]
    },
)

cur_data.show_unique(slot="obs", column="vector_id")

Column vector_id added to adata.obs
Unique values in adata.obs.vector_id: 113
--------------------------------------------------
{'pBA572',
 'pBA573',
 'pBA574',
 'pBA575',
 'pBA576',
 'pBA577',
 'pBA578',
 'pBA579',
 'pBA580',
 'pBA581',
 'pBA582',
 'pBA586',
 'pBA608',
 'pDS001',
 'pDS002',
 'pDS003',
 'pDS004',
 'pDS005',
 'pDS006',
 'pDS007',
 'pDS008',
 'pDS009',
 'pDS010',
 'pDS011',
 'pDS017',
 'pDS024',
 'pDS026',
 'pDS027',
 'pDS029',
 'pDS031',
 'pDS032',
 'pDS033',
 'pDS036',
 'pDS038',
 'pDS040',
 'pDS041',
 'pDS042',
 'pDS043',
 'pDS044',
 'pDS046',
 'pDS051',
 'pDS052',
 'pDS055',
 'pDS065',
 'pDS074',
 'pDS087',
 'pDS088',
 'pDS089',
 'pDS090',
 'pDS091',
 'pDS096',
 'pDS110',
 'pDS124',
 'pDS156',
 'pDS159',
 'pDS160',
 'pDS162',
 'pDS175',
 'pDS186',
 'pDS218',
 'pDS219',
 'pDS226',
 'pDS284',
 'pDS353',
 'pDS371',
 'pDS373',
 'pDS381',
 'pDS382',
 'pDS383',
 'pDS386',
 'pDS390',
 'pDS391',
 'pDS393',
 'pDS394',
 'pDS395',
 'pDS396',
 'pDS397',
 'pDS398',
 'pDS401',
 '

In [13]:
cur_data.map_values_from_column(
    ref_col="vector_id", target_col="guide_sequence", map_dict=guides_dict_vecid
)

Column guide_sequence created in adata.obs
Mapped value pDS381 in column vector_id to GAGGGCGGCCTACCTCTCCT in column guide_sequence of adata.obs
Mapped value pDS391 in column vector_id to GGAACGCGACACCGGGTAGA in column guide_sequence of adata.obs
Mapped value pDS434 in column vector_id to GGGGCCAGCAGCCGTCTACC in column guide_sequence of adata.obs
Mapped value pDS458 in column vector_id to GGTCCGTCCGGAGCCAGGAG in column guide_sequence of adata.obs
Mapped value pDS051 in column vector_id to GACGCAAAGACGCACAGACC in column guide_sequence of adata.obs
Mapped value pDS052 in column vector_id to GCGCACAGACCCGGCGAGGA in column guide_sequence of adata.obs
Mapped value pDS055 in column vector_id to GAGTCTCCGCAAGGCCCCGG in column guide_sequence of adata.obs
Mapped value pDS468 in column vector_id to GTAGGAGCCTCGGGCGCGCT in column guide_sequence of adata.obs
Mapped value pDS460 in column vector_id to GAGCCATGGCAGATTCCTCC in column guide_sequence of adata.obs
Mapped value pDS006 in column vector_id

In [16]:
cur_data.adata.obs[['perturbation_name', 'guide_sequence']]

Unnamed: 0_level_0,perturbation_name,guide_sequence
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACATACAAGATG,63(mod)_pBA580,
AAACATACACCTAG,OST4_pDS353,GGCTTGTTCGCTGGTGGCGT
AAACATACTTCCCG,SEC61A1_pDS031,GCTGTGCAGTGGAACGCGCT
AAACATTGAAACAG,EIF2B4_pDS491,GCTGAGGGCGATGGCTGCTG
AAACATTGCAGCTA,SRPR_pDS482,GGCGAACGCGGCCTGAATTCC
...,...,...
TTTGCATGCTTTAC,STT3A_pDS011,GGGAGCCCCGCGGATCGTTT
TTTGCATGGAGGAC,ARHGAP22_pDS458,GGTCCGTCCGGAGCCAGGAG
TTTGCATGTAGAGA,63(mod)_pBA580,
TTTGCATGTCAAGC,KCTD16_pDS096,GGTTTCTCAGACCTAGGTGA


### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [17]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol_input": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol_input')

Column perturbed_target_symbol_input added to adata.obs
Unique values in adata.obs.perturbed_target_symbol_input: 113
--------------------------------------------------
{'62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088

#### Clean up `perturbation_target_symbol` column

In [18]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol_input",
    map_dict={
        r"62\(mod\).*": "control_nontargeting",
        r"63\(mod\).*": "control_nontargeting",
        r"Gal4-4.*": "control_nontargeting",
        r"_(pD|pB).*": "",
    }
)


cur_data.show_unique(slot="obs", column="perturbed_target_symbol_input")

Replaced '62\(mod\).*' with 'control_nontargeting' in column perturbed_target_symbol_input of adata.obs
Replaced '63\(mod\).*' with 'control_nontargeting' in column perturbed_target_symbol_input of adata.obs
Replaced 'Gal4-4.*' with 'control_nontargeting' in column perturbed_target_symbol_input of adata.obs
Replaced '_(pD|pB).*' with '' in column perturbed_target_symbol_input of adata.obs
Unique values in adata.obs.perturbed_target_symbol_input: 91
--------------------------------------------------
{'AARS',
 'AMIGO3',
 'ARHGAP22',
 'ASCC3',
 'ATF4',
 'ATF6',
 'ATP5B',
 'C7ORF26',
 'CAD',
 'CARS',
 'CCND3',
 'CHERP',
 'CONTROL_NONTARGETING',
 'COPB1',
 'COPZ1',
 'DAD1',
 'DARS',
 'DDOST',
 'DDRGK1',
 'DERL2',
 'DHDDS',
 'DNAJC19',
 'EIF2AK3',
 'EIF2B2',
 'EIF2B3',
 'EIF2B4',
 'EIF2S1',
 'ERN1',
 'FARSB',
 'FECH',
 'GBF1',
 'GMPPB',
 'GNPNAT1',
 'HARS',
 'HSD17B12',
 'HSPA5',
 'HSPA9',
 'HYOU1',
 'IARS2',
 'IDH3A',
 'IER3IP1',
 'KCTD16',
 'MANF',
 'MARS',
 'MRGBP',
 'MRPL39',
 'MTHFD1',


  if df[column].str.upper().str.contains(old_val.upper()).any():


### Standardise perturbation targets

In [19]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol_input',
    input_column_type='gene_symbol',
    multiple_entries=False,
    # remove_version=True,
    # version_sep='.'
)

Mapping gene symbols: 100%|██████████████████████████████████████| 91/91 [00:00<00:00, 14249.83it/s]


--------------------------------------------------
Successfully mapped 90 out of 91 gene symbols.
--------------------------------------------------
Couldn't map gene symbols: ['CONTROL_NONTARGETING']
--------------------------------------------------


### Add `perturbed_target_number` column

In [20]:
cur_data.count_entries(
    slot="obs",
    input_column="perturbed_target_symbol",
    count_column_name="perturbed_target_number",
    sep="|",
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [21]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [22]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (62623, 2)
--------------------------------------------------
               perturbation_name  perturbed_target_chromosome_encoding
index                                                                 
AAACATACAAGATG    63(mod)_pBA580                                     0
AAACATACACCTAG       OST4_pDS353                                     2
AAACATACTTCCCG    SEC61A1_pDS031                                     3
AAACATTGAAACAG     EIF2B4_pDS491                                     2
AAACATTGCAGCTA       SRPR_pDS482                                    11
...                          ...                                   ...
TTTGCATGCTTTAC      STT3A_pDS011                                    11
TTTGCATGGAGGAC   ARHGAP22_pDS458                                    10
TTTGCATGTAGAGA    63(mod)_pBA580                                     0
TTTGCATGTCAAGC     KCTD16_pDS096                                     5
TTTGCATGTGGAGG     SAMM50_pDS156                   

### Add metadata

In [23]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # treatment
        "treatment_label": None, 
        "treatment_id": None,
        # perturbation type
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None,
        "data_modality": "CRISPR screen",
        "significant": None,
        "significance_criteria": None,
        "score_interpretation": None,
        # model system
        "model_system_label": "cell_line",
        "model_system_id": None,
        # replicates
        "technical_replicate": None,
        "biological_replicate": None,
        
        'tissue': 'blood',
        "timepoint": "P0DT0H0M0S",
        
        "species": "Homo sapiens",
        "sex_label": "female", 
        "sex_id": None,
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None,

        "study_title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "study_year": 2016,
        "first_author": "Britt Adamson",
        "last_author": "Jonathan Weissman",
        
        "experiment_title": "63000 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs.",
        "experiment_summary": "Perturb-seq was applied to a small CRISPRi library of 91 sgRNAs targeting UPR genes in K562 cells.",
        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],
        
        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "retrovirus transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentivirus transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive transgene expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive transgene expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",
        
        "library_manufacturer": "Weissman",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "1",
        "library_total_grnas": "91",
        "library_total_variants": None,
        
        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina HiSeq 4000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "MAGeCK",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh37",
        
        "license_label": "CC BY 4.0",
        "license_id": "SWO:1000065",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSM2406677",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406681",
                "dataset_description": "Barcode, cell identities, raw gene expression matrix",
                "dataset_file_name": "GSM2406677_10X010",
            },
            {
                "dataset_accession": "GSM2406677_10X005",
                "dataset_uri": "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
                "dataset_description": "Processed .h5ad file",
                "dataset_file_name": "AdamsonWeissman2016_GSM2406681_10X010.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column data_modality added to adata.obs
Column significant added to adata.obs
Column significance_criteria added to adata.obs
Column score_interpretation added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column technical_replicate added to adata.obs
Column biological_replicate added to adata.obs
Column tissue added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year added to adata.obs
Column first

In [24]:
cur_data.adata.obs

Unnamed: 0_level_0,nperts,percent_mito,read count,perturbation_type,organism,perturbation_name,tissue_type,celltype,ngenes,cell_line,...,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,license_label,license_id,associated_datasets
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACAAGATG,2,4.917663,282.0,CRISPR,human,63(mod)_pBA580,cell_line,lymphoblasts,2914,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATACACCTAG,2,4.468626,331.0,CRISPR,human,OST4_pDS353,cell_line,lymphoblasts,3818,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATACTTCCCG,2,5.060113,285.0,CRISPR,human,SEC61A1_pDS031,cell_line,lymphoblasts,2616,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATTGAAACAG,2,5.052769,1036.0,CRISPR,human,EIF2B4_pDS491,cell_line,lymphoblasts,3488,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATTGCAGCTA,2,4.514091,863.0,CRISPR,human,SRPR_pDS482,cell_line,lymphoblasts,3620,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCATGCTTTAC,2,5.996971,476.0,CRISPR,human,STT3A_pDS011,cell_line,lymphoblasts,3356,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
TTTGCATGGAGGAC,2,4.612751,539.0,CRISPR,human,ARHGAP22_pDS458,cell_line,lymphoblasts,2961,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
TTTGCATGTAGAGA,2,7.242625,647.0,CRISPR,human,63(mod)_pBA580,cell_line,lymphoblasts,3473,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
TTTGCATGTCAAGC,2,7.296345,98.0,CRISPR,human,KCTD16_pDS096,cell_line,lymphoblasts,3431,K562,...,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,CC BY 4.0,SWO:1000065,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."


### Curate tissue information

In [25]:

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell type information

In [26]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate cell line information

In [27]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Curate disease information

In [28]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


  return dispatch(args[0].__class__)(*args, **kw)


### Match schema column order

In [29]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [30]:
cur_data.validate_data(slot='obs')

2025-12-12 10:04:32,818 DEBUG curation_tools.curation_tools: Applying dtype casting on adata.obs for columns: ['dataset_id', 'sample_id', 'data_modality', 'significant', 'significance_criteria', 'perturbation_name', 'perturbed_target_coord', 'perturbed_target_chromosome', 'perturbed_target_chromosome_encoding', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_biotype', 'guide_sequence', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'technical_replicate', 'biological_replicate', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_label', 'disease_id', 'study_title', 'study_uri', 'study_year', 'first_author', 'last_author', 'experiment_title', 'experiment_summary', 'number_of_perturbed_targets', 'number_of

Unnamed: 0,dataset_id,sample_id,data_modality,significant,significance_criteria,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,...,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,score_interpretation,reference_genome_id,reference_genome_label,associated_datasets,license_label,license_id
0,adamson_2016_upr_perturb_seq,1,CRISPR screen,,,63(mod)_pBA580,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
1,adamson_2016_upr_perturb_seq,2,CRISPR screen,,,OST4_pDS353,chr2:27070470-27071685;-1,2,2,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
2,adamson_2016_upr_perturb_seq,3,CRISPR screen,,,SEC61A1_pDS031,chr3:128051641-128071705;1,3,3,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
3,adamson_2016_upr_perturb_seq,4,CRISPR screen,,,EIF2B4_pDS491,chr2:27363420-27370400;-1,2,2,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
4,adamson_2016_upr_perturb_seq,5,CRISPR screen,,,SRPR_pDS482,chr11:126262912-126269144;-1,11,11,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62618,adamson_2016_upr_perturb_seq,62619,CRISPR screen,,,STT3A_pDS011,chr11:125591712-125625215;1,11,11,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
62619,adamson_2016_upr_perturb_seq,62620,CRISPR screen,,,ARHGAP22_pDS458,chr10:48446036-48656265;-1,10,10,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
62620,adamson_2016_upr_perturb_seq,62621,CRISPR screen,,,63(mod)_pBA580,,,0,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065
62621,adamson_2016_upr_perturb_seq,62622,CRISPR screen,,,KCTD16_pDS096,chr5:144170873-144485686;1,5,5,1,...,,CellRanger,,MAGeCK,,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_...",CC BY 4.0,SWO:1000065


# VAR slot curation

### Standardise genes

In [31]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485     11.0      11
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945     43.0      43
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [32]:
cur_data.standardize_genes(
    slot="var",
    input_column="ensembl_id",
    input_column_type="ensembl_gene_id",
    remove_version=False,
    multiple_entries=False
)

2025-12-12 10:04:39,047 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Missing Ensembl IDs: ['ENSG00000256725', 'ENSG00000270113', 'ENSG00000203871', 'ENSG00000229323', 'ENSG00000267942', 'ENSG00000257183', 'ENSG00000226381', 'ENSG00000240875', 'ENSG00000240618', 'ENSG00000255518', 'ENSG00000261044', 'ENSG00000258297', 'ENSG00000178206', 'ENSG00000253115', 'ENSG00000272049', 'ENSG00000267094', 'ENSG00000232386', 'ENSG00000249677', 'ENSG00000272327', 'ENSG00000187145', 'ENSG00000267194', 'ENSG00000269363', 'ENSG00000233280', 'ENSG00000226434', 'ENSG00000249645', 'ENSG00000244693', 'ENSG00000267502', 'ENSG00000214871', 'ENSG00000181464', 'ENSG00000268111', 'ENSG00000161572', 'ENSG00000269391', 'ENSG00000179294', 'ENSG00000197932', 'ENSG00000198150', 'ENSG00000259209', 'ENSG00000214733', 'ENSG00000269515', 'ENSG00000205664', 'ENSG00000266002', 'ENSG00000269746', 'ENSG00000269027', 'ENSG00000270028', 'ENSG00000256427', 'ENSG00000224911', 'ENSG00000227603', 'ENSG00000233434', 'ENSG00000214479', 'ENSG00000262026', 'ENSG00000226974', 'ENSG00000268656', 'ENSG0000

2025-12-12 10:04:56,725 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93829
2025-12-12 10:04:56,785 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 501 to 1000...


2025-12-12 10:05:10,202 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 93676
2025-12-12 10:05:10,265 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 1001 to 1500...


2025-12-12 10:05:17,428 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 95080
2025-12-12 10:05:17,503 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 1501 to 2000...


2025-12-12 10:05:24,326 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 94383
2025-12-12 10:05:24,385 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 2001 to 2500...


2025-12-12 10:05:31,097 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 95169
2025-12-12 10:05:31,155 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): rest.ensembl.org:443


Processing IDs 2501 to 2577...


2025-12-12 10:05:32,072 DEBUG urllib3.connectionpool: https://rest.ensembl.org:443 "POST /archive/id HTTP/1.1" 200 14403


Fetched latest Ensembl IDs: {'ENSG00000256725': nan, 'ENSG00000270113': nan, 'ENSG00000203871': nan, 'ENSG00000229323': nan, 'ENSG00000267942': nan, 'ENSG00000257183': nan, 'ENSG00000226381': nan, 'ENSG00000240875': 'ENSG00000243926', 'ENSG00000240618': nan, 'ENSG00000255518': nan, 'ENSG00000261044': 'ENSG00000277332', 'ENSG00000258297': nan, 'ENSG00000178206': nan, 'ENSG00000253115': 'ENSG00000254288', 'ENSG00000272049': 'ENSG00000245729', 'ENSG00000267094': nan, 'ENSG00000232386': 'ENSG00000272808', 'ENSG00000249677': nan, 'ENSG00000272327': nan, 'ENSG00000187145': nan, 'ENSG00000267194': nan, 'ENSG00000269363': nan, 'ENSG00000233280': nan, 'ENSG00000226434': nan, 'ENSG00000249645': nan, 'ENSG00000244693': 'ENSG00000289604', 'ENSG00000267502': nan, 'ENSG00000214871': nan, 'ENSG00000181464': nan, 'ENSG00000268111': nan, 'ENSG00000161572': nan, 'ENSG00000269391': 'ENSG00000233570', 'ENSG00000179294': nan, 'ENSG00000197932': 'ENSG00000277203', 'ENSG00000198150': nan, 'ENSG00000259209': 

Because such a large number of ENSG are unmapped, replace the unmapped ones with the original IDs


In [33]:
cur_data.adata.var.loc[cur_data.adata.var['ensembl_gene_id'].isna(), 'ensembl_gene_id'] = cur_data.adata.var.loc[cur_data.adata.var['ensembl_gene_id'].isna(), 'ensembl_id']
cur_data.adata.var.loc[cur_data.adata.var['gene_symbol'].isna(), 'gene_symbol'] = cur_data.adata.var.loc[
    cur_data.adata.var['gene_symbol'].isna()].index

### Validate var metadata

In [34]:
cur_data.validate_data(slot="var")

2025-12-12 10:07:30,278 INFO curation_tools.curation_tools: adata.var is valid according to the var_schema.
2025-12-12 10:07:30,282 DEBUG curation_tools.curation_tools: Validated adata.var preview (shape=(32738, 2)):
              ensembl_gene_id   gene_symbol
index                                      
MIR1302-10    ENSG00000243485   MIR1302-2HG
FAM138A       ENSG00000237613       FAM138A
OR4F5         ENSG00000186092         OR4F5
RP11-34P13.7  ENSG00000238009  RP11-34P13.7
RP11-34P13.8  ENSG00000239945  RP11-34P13.8


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,RP11-34P13.7
RP11-34P13.8,ENSG00000239945,RP11-34P13.8
...,...,...
AC145205.1,ENSG00000215635,AC145205.1
BAGE5,ENSG00000268590,BAGE5
CU459201.1,ENSG00000251180,CU459201.1
AC002321.2,ENSG00000215616,AC002321.2


# Save the dataset

In [35]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)
... storing 'dataset_id' as categorical
... storing 'data_modality' as categorical
... storing 'significance_criteria' as categorical
... storing 'perturbation_name' as categorical
... storing 'perturbed_target_coord' as categorical
... storing 'perturbed_target_chromosome' as categorical
... storing 'perturbed_target_ensg' as categorical
... storing 'perturbed_target_symbol' as categorical
... storing 'perturbed_target_biotype' as categorical
... storing 'guide_sequence' as categorical
... storing 'perturbation_type_label' as categorical
... storing 'perturbation_type_id' as categorical
... storing 'timepoint' as categorical
... storing 'treatment_label' as categorical
... storing 'treatment_id' as categorical
... storing 'technical_replicate' as categorical
... storing 'biological_replicate' as categorical
... storing 'model_system_label' as categorical
... storing 'model_system_id' as categorical
... storing 'species' as categorical
... s

✅ Curated h5ad data saved to ../curated/h5ad/adamson_2016_upr_perturb_seq_curated.h5ad


In [36]:
cur_data.save_curated_data_parquet(split_metadata=True, save_metadata_only=True)

✅ Metadata saved to ../curated/parquet/adamson_2016_upr_perturb_seq_curated_metadata.parquet


# Upload to BigQuery


In [37]:
upload_parquet_to_bq(
    parquet_path='../curated/parquet/adamson_2016_upr_perturb_seq_curated_metadata.parquet',
    bq_dataset_id='prj-ext-dev-pertcat-437314.perturb_seq',
    bq_table_name='metadata',
    key_columns=['dataset_id', 'sample_id'],
    verbose=True
)

2025-12-12 10:08:21,882 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-12 10:08:21,884 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-12 10:08:23,325 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-12 10:08:23,326 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-12 10:08:23,696 DEBUG google.cloud.bigquery.opentelemetry_tracing: This service is instrumented using OpenTelemetry. OpenTelemetry or one of its components could not be imported; please add compatible versions of opentelemetry-api and opentelemetry-instrumentation packages in order to get BigQuery Tracing data.
2025-12-12 10:08:23,697 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-12 10:08:23,698 DEBUG google.auth.transport.requests: Making reques

Staging table: loading `.parquet` file ../curated/parquet/adamson_2016_upr_perturb_seq_curated_metadata.parquet to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging...


2025-12-12 10:08:24,833 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable HTTP/1.1" 200 0
2025-12-12 10:08:26,035 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "PUT /upload/bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?uploadType=resumable&upload_id=AHVrFxPmknirCgCx5fEQvVi2Y0zid-l592QWfvp24PYtVZStPFhvEbYt_oexlhpjEOWM58DI4ZqBITFiXl28FYHM7h1y3vMap0aW0ON1UC2KjkA HTTP/1.1" 200 14002
2025-12-12 10:08:26,190 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/acd760f6-4ea0-4fb7-897b-ddc4c338ae9b?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:08:26,193 DEBUG google.api_core.retry: Retrying due to , sleeping 0.4s ...
2025-12-12 10:08:26,744 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-

Staging table: loaded 62623 rows to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-12 10:08:28,808 DEBUG google.auth._default: Checking None for explicit credentials as part of auth process...
2025-12-12 10:08:28,808 DEBUG google.auth._default: Checking Cloud SDK credentials as part of auth process...
2025-12-12 10:08:29,162 DEBUG urllib3.util.retry: Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
2025-12-12 10:08:29,163 DEBUG google.auth.transport.requests: Making request: POST https://oauth2.googleapis.com/token
2025-12-12 10:08:29,165 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): oauth2.googleapis.com:443
2025-12-12 10:08:29,267 DEBUG urllib3.connectionpool: https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
2025-12-12 10:08:29,269 DEBUG urllib3.connectionpool: Starting new HTTPS connection (1): bigquery.googleapis.com:443
2025-12-12 10:08:29,690 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?pre

Staging table: added ingested_at timestamp column to prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


2025-12-12 10:08:34,132 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/datasets/perturb_seq/tables/metadata?prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:08:34,301 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/datasets/perturb_seq/tables/metadata?prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:08:34,803 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "POST /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs?prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:08:34,930 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects/prj-ext-dev-pertcat-437314/jobs/0eb7b304-6385-4914-8eef-9c65af034ed0?projection=full&location=europe-west2&prettyPrint=false HTTP/1.1" 200 None
2025-12-12 10:08:38,231 DEBUG urllib3.connectionpool: https://bigquery.googleapis.com:443 "GET /bigquery/v2/projects

Merge completed: staging → prj-ext-dev-pertcat-437314.perturb_seq.metadata with type-safe casting.
Staging table: deleted prj-ext-dev-pertcat-437314.perturb_seq.metadata_staging


# Upload to GC Storage


In [39]:
!gcloud storage cp ../curated/h5ad/adamson_2016_upr_perturb_seq_curated.h5ad gs://perturbation-catalogue-lake/perturbseq/curated/

uploading large objects. If you would like to opt-out and instead
perform a normal upload, run:
`gcloud config set storage/parallel_composite_upload_enabled False`
`gcloud config set storage/parallel_composite_upload_enabled True`
Note that with parallel composite uploads, your object might be
uploaded as a composite object
(https://cloud.google.com/storage/docs/composite-objects), which means
that any user who downloads your object will need to use crc32c
checksums to verify data integrity. gcloud storage is capable of
computing crc32c checksums, but this might pose a problem for other
clients.

Copying file://../curated/h5ad/adamson_2016_upr_perturb_seq_curated.h5ad to gs://perturbation-catalogue-lake/perturbseq/curated/adamson_2016_upr_perturb_seq_curated.h5ad
  Completed files 32/1 | 1.7GiB/1.7GiB | 1.3MiB/s                              

Average throughput: 2.2MiB/s


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ 