In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import json
import pandera as pa
from pandera.typing import Series, Index
from typing import Optional
from neofuzz import char_ngram_process, Process

from tools.curation_tools import (
    search_compounds_in_chebi,
    standardize_gene_symbols,
    standardize_ontology,
    standardize_var_genes,
    get_vals,
    get_dict_vals,
)
from tools.perturbseq_anndata_schema import ObsSchema, VarSchema

import sys

sys.path.append("../../")
from unified_metadata_schema.unified_metadata_schema import Experiment

# Read/download anndata from a file

**Change the directory to the location of your file!**

In [2]:
data_source_link = "https://zenodo.org/records/13350497/files/DatlingerBock2017.h5ad"
noncurated_path = "../non_curated/h5ad/datlinger_2017.h5ad"

# Download the data if it doesn't exist
if not os.path.exists(noncurated_path):
    print(f"Downloading data from {data_source_link} to {noncurated_path}")
    os.makedirs(os.path.dirname(noncurated_path), exist_ok=True)
    os.system(f"wget {data_source_link} -O {noncurated_path}")
else:
    print(f"File {noncurated_path} already exists. Skipping download.")

File ../non_curated/h5ad/datlinger_2017.h5ad already exists. Skipping download.


In [3]:
# Load the data
adata = sc.read_h5ad(noncurated_path)
adata

AnnData object with n_obs × n_vars = 5905 × 36722
    obs: 'perturbation', 'perturbation_2', 'replicate', 'target', 'celltype', 'cell_line', 'cancer', 'disease', 'tissue_type', 'organism', 'perturbation_type', 'perturbation_type_2', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'
    var: 'ncounts', 'ncells'

# Load ontologies from parquets

In [4]:
gene_ont = pd.read_parquet('../ontologies/genes.parquet').drop_duplicates()
ctype_ont = pd.read_parquet('../ontologies/cell_types.parquet').drop_duplicates()
cline_ont = pd.read_parquet('../ontologies/cell_lines.parquet').drop_duplicates()
tis_ont = pd.read_parquet('../ontologies/tissues.parquet').drop_duplicates()
dis_ont = pd.read_parquet('../ontologies/diseases.parquet').drop_duplicates()

# OBS curation

### Filter out nans and unknowns before proceeding with further curation

In [5]:
# rename the columns to match the schema
adata.obs = adata.obs.rename(
    columns={
        'perturbation': 'perturbation_name'
    }
)
# drop None values from the perturbation_name column
print(adata.obs.shape)
adata = adata[~adata.obs["perturbation_name"].isna()]
print(adata.obs.shape)

(5905, 17)
(5905, 17)


In [6]:
print(len(set(adata.obs["perturbation_name"])))
set(adata.obs["perturbation_name"])

97


{'Essential_library_DHODH_1',
 'Essential_library_DHODH_2',
 'Essential_library_DHODH_3',
 'Essential_library_MVD_1',
 'Essential_library_MVD_2',
 'Essential_library_MVD_3',
 'Essential_library_TUBB_1',
 'Essential_library_TUBB_2',
 'Essential_library_TUBB_3',
 'Tcrlibrary_BACH2_1',
 'Tcrlibrary_BACH2_2',
 'Tcrlibrary_BACH2_3',
 'Tcrlibrary_DOK2_1',
 'Tcrlibrary_DOK2_2',
 'Tcrlibrary_DOK2_3',
 'Tcrlibrary_EGR1_1',
 'Tcrlibrary_EGR1_2',
 'Tcrlibrary_EGR1_3',
 'Tcrlibrary_EGR2_1',
 'Tcrlibrary_EGR2_2',
 'Tcrlibrary_EGR2_3',
 'Tcrlibrary_EGR3_1',
 'Tcrlibrary_EGR3_2',
 'Tcrlibrary_EGR3_3',
 'Tcrlibrary_EGR4_1',
 'Tcrlibrary_EGR4_2',
 'Tcrlibrary_EGR4_3',
 'Tcrlibrary_ETS1_1',
 'Tcrlibrary_ETS1_2',
 'Tcrlibrary_ETS1_3',
 'Tcrlibrary_FOS_1',
 'Tcrlibrary_FOS_2',
 'Tcrlibrary_FOS_3',
 'Tcrlibrary_GATA3_1',
 'Tcrlibrary_GATA3_2',
 'Tcrlibrary_GATA3_3',
 'Tcrlibrary_JUNB_1',
 'Tcrlibrary_JUNB_2',
 'Tcrlibrary_JUNB_3',
 'Tcrlibrary_JUND_1',
 'Tcrlibrary_JUND_2',
 'Tcrlibrary_JUND_3',
 'Tcrlibra

In [7]:

# extract the perturbed target symbol from the perturbation_name column
# replace NaN values in the perturbation_name column with "control"
adata.obs["perturbed_target_symbol"] = (
    adata.obs["target"].cat.add_categories(["control"]).fillna("control")
)

# drop the rows with unknown perturbed target symbol
print(adata.obs.shape)
print(len(set(adata.obs["perturbed_target_symbol"])))
set(adata.obs["perturbed_target_symbol"])

(5905, 18)
33


  adata.obs["perturbed_target_symbol"] = (


{'BACH2',
 'DHODH',
 'DOK2',
 'EGR1',
 'EGR2',
 'EGR3',
 'EGR4',
 'ETS1',
 'FOS',
 'GATA3',
 'JUN',
 'JUNB',
 'JUND',
 'LAT',
 'LCK',
 'MVD',
 'NFAT5',
 'NFATC1',
 'NFATC2',
 'NFATC3',
 'NFKB1',
 'NFKB2',
 'NR4A1',
 'PTPN11',
 'PTPN6',
 'REL',
 'RELA',
 'RELB',
 'RUNX1',
 'RUNX2',
 'TUBB',
 'ZAP70',
 'control'}

## Proceed with the curation of the adata.obs slot

Standardize perturbed target gene symbols, ENSG IDs and biotypes

In [8]:
obs = adata.obs.copy()

obs = standardize_gene_symbols(obs, "perturbed_target_symbol")

# map the perturbed target symbol to the ENSG
obs['perturbed_target_ensg'] = obs['perturbed_target_symbol'].map(
    gene_ont.set_index('symbol')['ensembl_gene_id'].to_dict()
)

# map the perturbed target ENSG to the biotype
obs['perturbed_target_category'] = obs['perturbed_target_ensg'].map(
    gene_ont.set_index('ensembl_gene_id')['biotype'].to_dict()
)

# add the perturbed target number column based on the number of symbols in the perturbed_target_symbol column
obs['perturbed_target_number'] = [len(x.split("|")) if x is not None else 0 for x in obs['perturbed_target_symbol']]

obs


Loaded gene ontology with 86403 entries
--------------------------------------------------
33 out of 33 gene symbols mapped to standardized symbols
--------------------------------------------------
0 gene symbols could not be mapped to standardized symbols
--------------------------------------------------
All unmatched gene symbols have been mapped to standardized symbols using synonyms
--------------------------------------------------


Unnamed: 0,perturbed_target_symbol,standardized_symbol
0,control,control
1,JUND,JUND
2,BACH2,BACH2
3,NFKB2,NFKB2
4,JUN,JUN
5,NFKB1,NFKB1
6,JUNB,JUNB
7,GATA3,GATA3
8,NFATC1,NFATC1
9,RELA,RELA


Mapped the standardized symbols in column perturbed_target_symbol back to the original DataFrame


Unnamed: 0_level_0,perturbation_name,perturbation_2,replicate,target,celltype,cell_line,cancer,disease,tissue_type,organism,...,perturbation_type_2,ncounts,ngenes,percent_mito,percent_ribo,nperts,perturbed_target_symbol,perturbed_target_ensg,perturbed_target_category,perturbed_target_number
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACTTGACCCCN,control,stimulated,1,,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,8696.0,2722,0.287488,25.091997,1,control,control,control,1
TTACAGCTGAAC,Tcrlibrary_JUND_2,stimulated,1,JUND,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,3198.0,1581,5.065666,15.384615,3,JUND,ENSG00000130522,protein_coding,1
CTAAGGCCCTTA,Tcrlibrary_BACH2_3,stimulated,1,BACH2,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,8137.0,2856,4.350498,20.412930,3,BACH2,ENSG00000112182,protein_coding,1
CTTGACGCAGGT,Tcrlibrary_NFKB2_3,stimulated,1,NFKB2,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,7051.0,2687,9.034180,10.466600,3,NFKB2,ENSG00000077150,protein_coding,1
TAACCCGTACGC,Tcrlibrary_JUN_1,stimulated,1,JUN,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,5453.0,2122,12.140106,3.594352,3,JUN,ENSG00000177606,protein_coding,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGTGTCGGGGA,control,unstimulated,5,,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,4213.0,1928,4.486115,16.496559,1,control,control,control,1
TTTAGTATTCCA,Tcrlibrary_EGR1_3,unstimulated,5,EGR1,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,4197.0,2054,1.834644,14.939242,3,EGR1,ENSG00000120738,protein_coding,1
GGCGCCTAATCG,control,unstimulated,5,,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,11385.0,3655,2.213439,17.891964,1,control,control,control,1
GCTGAGCGTTTN,Tcrlibrary_NR4A1_1,unstimulated,5,NR4A1,T cells,Jurkat cells,True,acute T cell leukemia,cell_line,human,...,TCR stimulation,929.0,689,3.229279,15.392896,3,NR4A1,ENSG00000123358,protein_coding,1


Add treatment information

In [9]:
# treatment
obs['treatment_label'] = obs['perturbation_2'].cat.rename_categories(
    {
        "stimulated": "anti-CD3 antibody|anti-CD28 antibody",
        "unstimulated": "Untretaed Control"
    }
)
obs['treatment_id'] = obs['perturbation_2'].cat.rename_categories(
    {
        "stimulated": "EFO:0003317|EFO:0003304",
        "unstimulated": "NCIT:C184729"
    }
)

Add perturbation information

In [10]:
# perturbation type
obs['perturbation_type_label'] = 'CRISPRko'
obs['perturbation_type_id'] = None


Add timepoint information

In [11]:

# timepoint
obs['timepoint'] = "P0DT0H0M0S" # this follows the ISO 8601 format for time intervals


Add model system and tissue information

In [12]:

# model system
obs['model_system_label'] = "cell line"
obs['model_system_id'] = None

# tissue
obs['tissue_label'] = None
obs['tissue_id'] = None


Add cell type information

In [13]:
# cell type
obs['cell_type_label'] = obs['celltype']
obs = standardize_ontology(obs, "cell_type_label", ctype_ont)
obs['cell_type_id'] = obs['cell_type_label'].map(
    ctype_ont.set_index('name')['ontology_id'].to_dict()
)

1 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
0 ontology label could not be mapped to standardized names
--------------------------------------------------


Unnamed: 0,cell_type_label,standardized_name
0,T cells,T cell


Mapped the standardized ontology labels in column cell_type_label back to the original DataFrame


Add cell line information

In [14]:
# cell line label
obs['cell_line_label'] = obs['cell_line']

obs = standardize_ontology(obs, "cell_line_label", cline_ont)

obs['cell_line_id'] = obs['cell_line_label'].map(
    cline_ont.set_index('name')['ontology_id'].to_dict()
)


1 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
0 ontology label could not be mapped to standardized names
--------------------------------------------------


Unnamed: 0,cell_line_label,standardized_name
0,Jurkat cells,JURKAT cell


Mapped the standardized ontology labels in column cell_line_label back to the original DataFrame


Add disease information

In [15]:

# disease
obs['disease_term_label'] = obs['disease']
obs = standardize_ontology(obs, "disease_term_label", dis_ont)

obs['disease_term_id'] = obs['disease_term_label'].map(
    dis_ont.set_index('name')['ontology_id'].to_dict()
)


0 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
1 ontology label could not be mapped to standardized names
--------------------------------------------------
Trying to match the unmatched ontology labels against known synonyms
--------------------------------------------------
1 ontology label mapped to standardized names using synonyms
--------------------------------------------------
All unmatched ontology labels have been mapped to standardized names using synonyms
--------------------------------------------------


Unnamed: 0,disease_term_label,standardized_name
0,acute T cell leukemia,T-cell acute lymphoblastic leukemia


Mapped the standardized ontology labels in column disease_term_label back to the original DataFrame


Add species, sex and developmental stage information

In [16]:

# species
obs['species'] = 'Homo sapiens'

# sex
obs['sex_label'] = None
obs['sex_id'] = None

# developmental stage
obs['developmental_stage_label'] = None
obs['developmental_stage_id'] = None

# change the order of the columns to match that of the schema
obs = obs[ObsSchema.to_schema().columns.keys()]

obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACTTGACCCCN,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TTACAGCTGAAC,Tcrlibrary_JUND_2,1,ENSG00000130522,JUND,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
CTAAGGCCCTTA,Tcrlibrary_BACH2_3,1,ENSG00000112182,BACH2,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
CTTGACGCAGGT,Tcrlibrary_NFKB2_3,1,ENSG00000077150,NFKB2,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TAACCCGTACGC,Tcrlibrary_JUN_1,1,ENSG00000177606,JUN,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGTGTCGGGGA,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TTTAGTATTCCA,Tcrlibrary_EGR1_3,1,ENSG00000120738,EGR1,protein_coding,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
GGCGCCTAATCG,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
GCTGAGCGTTTN,Tcrlibrary_NR4A1_1,1,ENSG00000123358,NR4A1,protein_coding,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963


## Validate the adata.obs slot

In [17]:
try:
    validated_obs = ObsSchema.validate(obs, lazy=True)
    print("Data is successfully validated!")
    display(validated_obs)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))

Data is successfully validated!


Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACTTGACCCCN,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TTACAGCTGAAC,Tcrlibrary_JUND_2,1,ENSG00000130522,JUND,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
CTAAGGCCCTTA,Tcrlibrary_BACH2_3,1,ENSG00000112182,BACH2,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
CTTGACGCAGGT,Tcrlibrary_NFKB2_3,1,ENSG00000077150,NFKB2,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TAACCCGTACGC,Tcrlibrary_JUN_1,1,ENSG00000177606,JUN,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGTGTCGGGGA,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TTTAGTATTCCA,Tcrlibrary_EGR1_3,1,ENSG00000120738,EGR1,protein_coding,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
GGCGCCTAATCG,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
GCTGAGCGTTTN,Tcrlibrary_NR4A1_1,1,ENSG00000123358,NR4A1,protein_coding,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963


## VAR curation

In [18]:
var = adata.var.copy()
var["gene_symbol"] = var.index

var = standardize_var_genes(var, column='gene_symbol')

var

Converted 22800/36722 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol,original_gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,ENSG00000121410,A1BG,A1BG
A1BG-AS1,ENSG00000268895,A1BG-AS1,A1BG-AS1
A1CF,ENSG00000148584,A1CF,A1CF
A2M,ENSG00000175899,A2M,A2M
A2M-AS1,ENSG00000245105,A2M-AS1,A2M-AS1
...,...,...,...
hsa-mir-1587,ENSG00000263972,MIR1587,hsa-mir-1587
hsa-mir-3149,ENSG00000266712,MIR3149,hsa-mir-3149
hsa-mir-4259,ENSG00000266458,MIR4259,hsa-mir-4259
snosnR66,,,snosnR66


In [19]:
try:
    validated_var = VarSchema.validate(var, lazy=True)
    print("Data is successfully validated!")
    display(validated_var)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))
    

Data is successfully validated!


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol,original_gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,ENSG00000121410,A1BG,A1BG
A1BG-AS1,ENSG00000268895,A1BG-AS1,A1BG-AS1
A1CF,ENSG00000148584,A1CF,A1CF
A2M,ENSG00000175899,A2M,A2M
A2M-AS1,ENSG00000245105,A2M-AS1,A2M-AS1
...,...,...,...
hsa-mir-1587,ENSG00000263972,MIR1587,hsa-mir-1587
hsa-mir-3149,ENSG00000266712,MIR3149,hsa-mir-3149
hsa-mir-4259,ENSG00000266458,MIR4259,hsa-mir-4259
snosnR66,,,snosnR66


## Reassign obs and var

Since we dropped cells that were not uniquely assigned to one or another treatment, we need to filter the original adata before we can reassign standardised obs and var names.

In [20]:
adata = adata[adata.obs.index.isin(obs.index), :]

adata

View of AnnData object with n_obs × n_vars = 5905 × 36722
    obs: 'perturbation_name', 'perturbation_2', 'replicate', 'target', 'celltype', 'cell_line', 'cancer', 'disease', 'tissue_type', 'organism', 'perturbation_type', 'perturbation_type_2', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'perturbed_target_symbol'
    var: 'ncounts', 'ncells'

In [21]:
adata.var = validated_var
adata.obs = validated_obs

In [22]:
adata

AnnData object with n_obs × n_vars = 5905 × 36722
    obs: 'perturbation_name', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_category', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_term_label', 'disease_term_id'
    var: 'ensembl_gene_id', 'gene_symbol', 'original_gene_symbol'

# Metadata curation

### Fill the dictionary below

In [24]:
get_dict_vals("treatment_id", "treatment_label", adata)

[{'term_id': 'EFO:0003317|EFO:0003304',
  'term_label': 'anti-CD3 antibody|anti-CD28 antibody'},
 {'term_id': 'NCIT:C184729', 'term_label': 'Untretaed Control'}]

In [28]:
metadata = {
    "study": {
        "title": "Pooled CRISPR screening with single-cell transcriptome readout",
        "study_uri": "https://doi.org/10.1038/nmeth.4177",
        "year": 2017,
        "first_author": {"first_name": "Paul", "last_name": "Datlinger"},
        "last_author": {"first_name": "Christoph", "last_name": "Bock"},
    },
    "experiment": {
        "title": "Transcriptomics measurements of 5905 Jurkat cells induced with anti-CD3 and anti-CD28 antibodies",
        "summary": """
            Jurkat cells were transduced with a gRNA library targeting high-level
            regulators of T cell receptor signaling and a set of transcription factors. After 10
            days of antibiotic selection and expansion, cells were stimulated with anti-CD3 and
            anti-CD28 antibodies or left untreated. Both conditions were analyzed using CROP-seq,
            measuring TCR activation for each gene knockout. The dataset comprises 5,905 high-quality
            single-cell transcriptomes with uniquely assigned gRNAs.
            """,
        "treatments": get_dict_vals("treatment_id", "treatment_label", adata),
        "timepoints": get_vals(adata.obs["timepoint"], "list"),
        "replicates": "none",
        "number_of_samples": 2,
        "number_of_perturbed_cells": adata.obs.shape[0],
        "perturbation_type": get_dict_vals(
            "perturbation_type_id", "perturbation_type_label", adata
        ),
        "perturbed_target_category": get_vals(
            adata.obs["perturbed_target_category"], "list"
        ),
        "number_of_perturbed_targets": len(
            get_vals(adata.obs["perturbed_target_ensg"], "list")
        ),
        "perturbed_targets": get_vals(adata.obs["perturbed_target_ensg"], "list"),
    },
    "perturbation": {
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022876",
            "term_label": "SpCas9",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "knockout",
                },
            ],
            "manufacturer": "Bock",
            "lentiviral_generation": "3",
            "grnas_per_gene": "3",
            "total_grnas": "116",
            "total_genes": len(get_vals(adata.obs["perturbed_target_ensg"], "list")),
            "total_variants": None,
        },
    },
    "assay": {
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "Nextera XT",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 4000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "Drop-seq Tools"},
        "software_analysis": {"term_id": None, "term_label": "Custom"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh38",
        },
    },
    "model_system": {
        "model_system": get_dict_vals("model_system_id", "model_system_label", adata),
        "species": "Homo sapiens",
        "tissue": get_dict_vals("tissue_id", "tissue_label", adata),
        "cell_type": get_dict_vals("cell_type_id", "cell_type_label", adata),
        "cell_line": get_dict_vals("cell_line_id", "cell_line_label", adata),
        "sex": get_dict_vals("sex_id", "sex_label", adata),
        "developmental_stage": get_dict_vals(
            "developmental_stage_id", "developmental_stage_label", adata
        ),
        "passage_number": None,
        "sample_quantity": {
            "sample_quantity_value": adata.obs.shape[0],
            "sample_quantity_unit": "cells",
        },
    },
    "associated_diseases": get_dict_vals(
        "disease_term_id", "disease_term_label", adata
    ),
    "associated_datasets": [
        {
            "dataset_accession": "GSE92872",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92872",
            "dataset_description": "Digital expression matrix",
            "dataset_file_name": "GSE92872_CROP-seq_Jurkat_TCR.digital_expression.csv.gz",
        },
        {
            "dataset_accession": "DatlingerBock2017.h5ad",
            "dataset_uri": "https://zenodo.org/records/13350497/files/DatlingerBock2017.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "DatlingerBock2017.h5ad",
        },
    ],
}

### Validate the metadata

In [29]:
m = Experiment.model_validate(metadata)

In [30]:
print(m.model_dump_json(indent=4))

{
    "study": {
        "title": "Pooled CRISPR screening with single-cell transcriptome readout",
        "study_uri": "https://doi.org/10.1038/nmeth.4177",
        "year": 2017,
        "first_author": {
            "first_name": "Paul",
            "last_name": "Datlinger"
        },
        "last_author": {
            "first_name": "Christoph",
            "last_name": "Bock"
        }
    },
    "experiment": {
        "title": "Transcriptomics measurements of 5905 Jurkat cells induced with anti-CD3 and anti-CD28 antibodies",
        "summary": "\n            Jurkat cells were transduced with a gRNA library targeting high-level\n            regulators of T cell receptor signaling and a set of transcription factors. After 10\n            days of antibiotic selection and expansion, cells were stimulated with anti-CD3 and\n            anti-CD28 antibodies or left untreated. Both conditions were analyzed using CROP-seq,\n            measuring TCR activation for each gene knockout. T

### Show the metadata

In [32]:
adata.var

Unnamed: 0_level_0,ensembl_gene_id,gene_symbol,original_gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,ENSG00000121410,A1BG,A1BG
A1BG-AS1,ENSG00000268895,A1BG-AS1,A1BG-AS1
A1CF,ENSG00000148584,A1CF,A1CF
A2M,ENSG00000175899,A2M,A2M
A2M-AS1,ENSG00000245105,A2M-AS1,A2M-AS1
...,...,...,...
hsa-mir-1587,ENSG00000263972,MIR1587,hsa-mir-1587
hsa-mir-3149,ENSG00000266712,MIR3149,hsa-mir-3149
hsa-mir-4259,ENSG00000266458,MIR4259,hsa-mir-4259
snosnR66,,,snosnR66


In [33]:
adata.obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TACTTGACCCCN,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TTACAGCTGAAC,Tcrlibrary_JUND_2,1,ENSG00000130522,JUND,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
CTAAGGCCCTTA,Tcrlibrary_BACH2_3,1,ENSG00000112182,BACH2,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
CTTGACGCAGGT,Tcrlibrary_NFKB2_3,1,ENSG00000077150,NFKB2,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TAACCCGTACGC,Tcrlibrary_JUN_1,1,ENSG00000177606,JUN,protein_coding,CRISPRko,,P0DT0H0M0S,anti-CD3 antibody|anti-CD28 antibody,EFO:0003317|EFO:0003304,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGTGTCGGGGA,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
TTTAGTATTCCA,Tcrlibrary_EGR1_3,1,ENSG00000120738,EGR1,protein_coding,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
GGCGCCTAATCG,control,1,control,control,control,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963
GCTGAGCGTTTN,Tcrlibrary_NR4A1_1,1,ENSG00000123358,NR4A1,protein_coding,CRISPRko,,P0DT0H0M0S,Untretaed Control,NCIT:C184729,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,,,,,T-cell acute lymphoblastic leukemia,MONDO:0004963


### Replace None values with np.nan

None values are not supported in anndata

In [34]:
adata.obs = adata.obs.replace({None: np.nan})

  adata.obs = adata.obs.replace({None: np.nan})


# Save the anndata object

In [35]:
curated_path = noncurated_path.replace("non_curated", "curated").replace(
    ".h5ad", "_curated.h5ad"
)

# create the directory if it doesn't exist
if not os.path.exists(os.path.dirname(curated_path)):
    os.makedirs(os.path.dirname(curated_path))

# save the adata object
adata.write_h5ad(curated_path)
print(f"Curated data saved to {curated_path}")

Curated data saved to ../curated/h5ad/datlinger_2017_curated.h5ad


## Save the metadata

In [36]:
# savet the serialized m object
curated_metadata_path = curated_path.replace("/h5ad", "/json").replace(".h5ad", ".json")
with open(curated_metadata_path, "w") as f:
    json.dump(m.model_dump(), f, indent=4)
print(f"Curated metadata saved to {curated_metadata_path}")

Curated metadata saved to ../curated/json/datlinger_2017_curated.json
