In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import json
import pandera as pa
from pandera.typing import Series, Index
from typing import Optional
from neofuzz import char_ngram_process, Process

from tools.curation_tools import add_new_term, remove_term, add_synonym, remove_synonym, standardize_data, get_vals, get_dict_vals

import sys
sys.path.append("../../")
from unified_metadata_schema.unified_metadata_schema import Experiment


# Read/download anndata from a file

**Change the directory to the location of your file!**

In [None]:
data_source_link = "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad"
noncurated_path = "../non_curated/h5ad/adamson_2016_pilot.h5ad"

# Download the data if it doesn't exist
if not os.path.exists(noncurated_path):
    print(f"Downloading data from {data_source_link} to {noncurated_path}")
    os.makedirs(os.path.dirname(noncurated_path), exist_ok=True)
    # os.system returns the command's exit status. A failed wget may leave an
    # empty/partial file behind, which the existence check above would accept
    # on the next run, so remove it and fail loudly instead.
    exit_status = os.system(f"wget {data_source_link} -O {noncurated_path}")
    if exit_status != 0:
        if os.path.exists(noncurated_path):
            os.remove(noncurated_path)
        raise RuntimeError(
            f"Download of {data_source_link} failed (exit status {exit_status})"
        )
else:
    print(f"File {noncurated_path} already exists. Skipping download.")

# Load the dataset; every following cell operates on `adata`
adata = sc.read_h5ad(noncurated_path)
adata

AnnData object with n_obs × n_vars = 5768 × 35635
    obs: 'perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'
    var: 'ensembl_id', 'ncounts', 'ncells'

# Load ontologies from parquets

In [None]:
# Read each ontology table from its parquet file and drop exact duplicate rows.
# The unpacking order matches the order of the paths listed below.
(gene_ont, ctype_ont, cline_ont, tis_ont, dis_ont) = [
    pd.read_parquet(ontology_file).drop_duplicates()
    for ontology_file in (
        '../ontologies/genes.parquet',
        '../ontologies/cell_types.parquet',
        '../ontologies/cell_lines.parquet',
        '../ontologies/tissues.parquet',
        '../ontologies/diseases.parquet',
    )
]

### Run the cell below to enable fast fuzzy mapping of gene names

In [6]:
# Fuzzy-search index over the gene symbols in gene_ont.
# The index is cached on disk: build and save it on the first run, reload it afterwards.

if not os.path.exists("tools/fuzzy_gene_index.joblib"):
    print("Creating new process")
    process = char_ngram_process()
    # index only the distinct, non-missing symbols
    process.index(gene_ont["symbol"].dropna().drop_duplicates())
    # persist the index so later runs can skip the indexing step
    process.to_disk("tools/fuzzy_gene_index.joblib")
else:
    print("Loading existing process from disk")
    process = Process.from_disk("tools/fuzzy_gene_index.joblib")
    

Loading existing process from disk


# Define adata slots schemas

In [7]:
# adata.obs schema
class ObsSchema(pa.DataFrameModel):
    """Validation schema for the curated ``adata.obs`` table.

    Label/ID columns backed by an ontology table are restricted (``isin``) to
    the values present in the corresponding ontology frame (``gene_ont``,
    ``tis_ont``, ``ctype_ont``, ``cline_ont``, ``dis_ont``). Free ``*_id``
    columns only have to contain a ``":"``.

    The Config enforces that no extra columns are present (``strict``), that
    columns appear in the declared order (``ordered``) and that values are
    coerced to the declared dtypes (``coerce``).
    """

    # --- perturbation ---
    perturbation_name: Series[str] = pa.Field(nullable=False)
    perturbed_target_number: Series[int] = pa.Field(nullable=False, ge=1)
    perturbed_target_ensg: Series[str] = pa.Field(
        nullable=False, isin=gene_ont.ensembl_gene_id.values
    )
    perturbed_target_symbol: Optional[Series[str]] = pa.Field(
        nullable=True, isin=gene_ont.symbol.values
    )
    perturbed_target_category: Optional[Series[str]] = pa.Field(
        nullable=True, isin=gene_ont.biotype.values
    )
    perturbation_type_label: Series[str] = pa.Field(nullable=False)
    perturbation_type_id: Series[str] = pa.Field(nullable=True, str_contains=":")
    # Duration string such as "P0DT0H0M0S". The pattern must be passed as
    # `str_matches`: the `regex` argument of pa.Field is a flag that makes the
    # *column name* a regex and does not validate the values.
    timepoint: Optional[Series[str]] = pa.Field(
        nullable=True, str_matches=r"^P\d+DT\d{1,2}H\d{1,2}M\d{1,2}S$"
    )

    # --- treatment ---
    treatment_label: Optional[Series[str]] = pa.Field(nullable=True)
    treatment_id: Optional[Series[str]] = pa.Field(nullable=True, str_contains=":")

    # --- model system ---
    model_system_label: Series[str] = pa.Field(nullable=False)
    model_system_id: Series[str] = pa.Field(nullable=True, str_contains=":")
    species: Series[str] = pa.Field(nullable=False, isin=["Homo sapiens"])
    tissue_label: Optional[Series[str]] = pa.Field(
        nullable=True, isin=tis_ont.name.values
    )
    tissue_id: Optional[Series[str]] = pa.Field(
        nullable=True,
        isin=tis_ont.ontology_id.values,
    )
    cell_type_label: Series[str] = pa.Field(nullable=False, isin=ctype_ont.name.values)
    cell_type_id: Series[str] = pa.Field(
        nullable=False,
        isin=ctype_ont.ontology_id.values,
    )
    cell_line_label: Optional[Series[str]] = pa.Field(
        nullable=True, isin=cline_ont.name.values
    )
    cell_line_id: Optional[Series[str]] = pa.Field(
        nullable=True,
        isin=cline_ont.ontology_id.values,
    )
    sex_label: Optional[Series[str]] = pa.Field(nullable=True)
    sex_id: Optional[Series[str]] = pa.Field(nullable=True, str_contains=":")
    developmental_stage_label: Optional[Series[str]] = pa.Field(nullable=True)
    developmental_stage_id: Optional[Series[str]] = pa.Field(nullable=True, str_contains=":")

    # --- disease ---
    disease_term_label: Optional[Series[str]] = pa.Field(
        nullable=True, isin=dis_ont.name.values
    )
    disease_term_id: Optional[Series[str]] = pa.Field(
        nullable=True,
        isin=dis_ont.ontology_id.values,
    )

    class Config:
        strict = True
        coerce = True
        ordered = True


# adata.var schema
class VarSchema(pa.DataFrameModel):
    """Validation schema for the curated ``adata.var`` table.

    Both the index and a regular column are called ``ensembl_gene_id``. Two
    class attributes with the same Python name cannot coexist (the later one
    replaces the earlier one), so the index field uses its own attribute name
    and is mapped to the public name through ``alias``.

    With ``strict = "filter"`` columns that are not declared here are dropped
    from the validated frame.
    """

    # index: unique, non-null IDs starting with "ENSG" or "control"
    ensembl_gene_id_index: Index[str] = pa.Field(
        alias="ensembl_gene_id",
        nullable=False,
        unique=True,
        str_startswith=("ENSG", 'control'),
        # isin=gene_ont.ensembl_gene_id.values,
        check_name=True,
    )
    # column: same constraints as the index
    ensembl_gene_id: Series[str] = pa.Field(
        nullable=False, 
        unique=True,
        str_startswith=("ENSG", 'control'),
        # isin=gene_ont.ensembl_gene_id.values
    )
    # gene symbol; may be missing
    symbol: Series[str] = pa.Field(
        nullable=True,
        coerce=True
        # isin=gene_ont.symbol.values
    )

    class Config:
        strict = "filter"
        coerce = True
        ordered = True

# OBS curation

### Filter out NaNs and unknown perturbations before proceeding with further curation

In [10]:
# rename the columns to match the schema
adata.obs = adata.obs.rename(
    columns={
        'perturbation': 'perturbation_name'
    }
)
# drop None values from the perturbation_name column
print(adata.obs.shape)
# Subsetting an AnnData returns a view; .copy() materialises it so that the
# column assignments below do not write into a view (which triggers a warning
# and an implicit copy).
adata = adata[~adata.obs["perturbation_name"].isna()].copy()
print(adata.obs.shape)

# extract the perturbed target symbol from the perturbation_name column
# (the part before the first underscore, e.g. "CREB1_pDS269" -> "CREB1")
adata.obs['perturbed_target_symbol'] = adata.obs['perturbation_name'].str.split("_").str[0]
adata.obs['perturbed_target_symbol'] = adata.obs['perturbed_target_symbol'].replace(
    {
        "62(mod)": "control",
        "*": "unknown"
    }
)
# drop the rows with unknown perturbed target symbol
adata = adata[adata.obs['perturbed_target_symbol'] != "unknown"].copy()
print(adata.obs.shape)

(5768, 15)
(5758, 15)
(5752, 16)


  adata.obs['perturbed_target_symbol'] = adata.obs['perturbation_name'].str.split("_").str[0]


## Proceed with the curation of the adata.obs slot

In [11]:
# Work on a copy so that adata.obs stays untouched until validation succeeds
obs = adata.obs.copy()

# check if the perturbed target symbol is in the gene ontology
obs = standardize_data(obs, "perturbed_target_symbol", gene_ont, "symbol")

# map the perturbed target symbol to the ENSG
obs['perturbed_target_ensg'] = obs['perturbed_target_symbol'].map(
    gene_ont.set_index('symbol')['ensembl_gene_id'].to_dict()
)

# map the perturbed target ENSG to the biotype
obs['perturbed_target_category'] = obs['perturbed_target_ensg'].map(
    gene_ont.set_index('ensembl_gene_id')['biotype'].to_dict()
)

# add the perturbed target number column based on the number of "|"-separated
# symbols in the perturbed_target_symbol column.
# pd.notna also catches NaN, which `is not None` lets through and which would
# then fail on `.split`.
obs['perturbed_target_number'] = [
    len(x.split("|")) if pd.notna(x) else 0 for x in obs['perturbed_target_symbol']
]

# perturbation type
obs['perturbation_type_label'] = 'CRISPRi'
obs['perturbation_type_id'] = None

# timepoint
obs['timepoint'] = "P0DT0H0M0S" # this follows the ISO 8601 format for time intervals

# treatment
obs['treatment_label'] = None
obs['treatment_id'] = None

# model system
obs['model_system_label'] = "cell line"
obs['model_system_id'] = None

# tissue
obs['tissue_label'] = None
obs['tissue_id'] = None

# cell type
# NOTE(review): the original `.replace` on this column emitted a warning;
# presumably `celltype` is categorical and pandas deprecates replace on
# categoricals. Converting to object first sidesteps that; the schema coerces
# the column to str during validation anyway.
obs['cell_type_label'] = obs['celltype'].astype(object).replace(
    {
        'lymphoblasts': 'lymphoblast'
    }
)
obs['cell_type_id'] = obs['cell_type_label'].map(
    ctype_ont.set_index('name')['ontology_id'].to_dict()
)

# cell line label
obs['cell_line_label'] = obs['cell_line']
obs = standardize_data(obs, "cell_line_label", cline_ont, "name")

obs['cell_line_id'] = obs['cell_line_label'].map(
    cline_ont.set_index('name')['ontology_id'].to_dict()
)

# disease
obs['disease_term_label'] = obs['disease']
obs = standardize_data(obs, "disease_term_label", dis_ont, "name")

obs['disease_term_id'] = obs['disease_term_label'].map(
    dis_ont.set_index('name')['ontology_id'].to_dict()
)

# species
obs['species'] = 'Homo sapiens'

# sex
obs['sex_label'] = None
obs['sex_id'] = None

# developmental stage
obs['developmental_stage_label'] = None
obs['developmental_stage_id'] = None

# keep only the schema columns, in the order the schema declares them
# (ObsSchema is strict and ordered)
obs = obs[ObsSchema.to_schema().columns.keys()]

obs

Number of perfect matches: 5752
Number of synonym matches: 0
Standardized values:
{'K562': 'K 562 cell'}
Standardized values:
{'chronic myelogenous leukemia': 'chronic myelogenous leukemia, BCR-ABL1 positive'}


  obs['cell_type_label'] = obs['cell_type_label'].replace(


Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT,CREB1_pDS269,1,ENSG00000118260,CREB1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACAGAGAT,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACCAGAAA,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACGTTGAC,EP300_pDS268,1,ENSG00000100393,EP300,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTGTTCT,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGGACGAG,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCCCGTT,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCCGTTC,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


In [12]:
# Validate obs against ObsSchema. lazy=True collects every failure before
# raising, so all problems are reported together.
try:
    validated_obs = ObsSchema.validate(obs, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    # show the collected failures as indented JSON
    print(json.dumps(schema_errors.message, indent=2))
else:
    print("Data is successfully validated!")
    display(validated_obs)

Data is successfully validated!


Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT,CREB1_pDS269,1,ENSG00000118260,CREB1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACAGAGAT,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACCAGAAA,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACGTTGAC,EP300_pDS268,1,ENSG00000100393,EP300,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTGTTCT,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGGACGAG,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCCCGTT,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCCGTTC,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


## VAR curation

In [13]:
var = adata.var.copy()
# the original index is kept as the fallback gene symbol
var["symbol"] = var.index
var = var.rename(columns={"ensembl_id": "ensembl_gene_id"})

# map symbols from gene_ont
# Keep a single reference symbol per Ensembl ID: with duplicated IDs on the
# right side a left merge would duplicate rows of `var`, which would then no
# longer line up with the columns of adata.X.
ensg_to_symbol = (
    gene_ont[['ensembl_gene_id', 'symbol']]
    .dropna()
    .drop_duplicates(subset='ensembl_gene_id')
)
var = var.merge(
    right=ensg_to_symbol,
    how='left',
    left_on='ensembl_gene_id',
    right_on='ensembl_gene_id'
)
assert len(var) == adata.n_vars, "merge changed the number of var rows"

# replace the symbol column with the gene_ont symbol wherever the merge found one
# (symbol_x = original symbol, symbol_y = gene_ont symbol)
has_ref_symbol = var['symbol_y'].notna()
var.loc[has_ref_symbol, 'symbol_x'] = var.loc[has_ref_symbol, 'symbol_y']

var = var.drop(columns=['symbol_y']).rename(columns={'symbol_x': 'symbol'})

var

Unnamed: 0,ensembl_gene_id,ncounts,ncells,symbol
0,ENSG00000243485,0.0,0,MIR1302-2HG
1,ENSG00000237613,0.0,0,FAM138A
2,ENSG00000186092,0.0,0,OR4F5
3,ENSG00000238009,0.0,0,RP11-34P13.7
4,ENSG00000239945,1.0,1,RP11-34P13.8
...,...,...,...,...
35630,ENSG00000212907,0.0,0,MT-ND4L
35631,ENSG00000198886,0.0,0,MT-ND4
35632,ENSG00000198786,0.0,0,MT-ND5
35633,ENSG00000198695,0.0,0,MT-ND6


In [14]:
# Standardise the gene symbols against gene_ont (fuzzy matching disabled)
var = standardize_data(
    obs_df=var, obs_column="symbol", ref_df=gene_ont, ref_column="symbol", return_fuzzy=False
)
# Use the Ensembl ID as index while also keeping it as a regular column
var = var.set_index('ensembl_gene_id', drop=False)
var

Number of perfect matches: 25168
Number of synonym matches: 811


Unnamed: 0_level_0,ensembl_gene_id,ncounts,ncells,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,0.0,0,MIR1302-2HG
ENSG00000237613,ENSG00000237613,0.0,0,FAM138A
ENSG00000186092,ENSG00000186092,0.0,0,OR4F5
ENSG00000238009,ENSG00000238009,0.0,0,RP11-34P13.7
ENSG00000239945,ENSG00000239945,1.0,1,RP11-34P13.8
...,...,...,...,...
ENSG00000212907,ENSG00000212907,0.0,0,MT-ND4L
ENSG00000198886,ENSG00000198886,0.0,0,MT-ND4
ENSG00000198786,ENSG00000198786,0.0,0,MT-ND5
ENSG00000198695,ENSG00000198695,0.0,0,MT-ND6


In [15]:
# Validate var against VarSchema. lazy=True collects every failure before
# raising, so all problems are reported together.
try:
    validated_var = VarSchema.validate(var, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    # show the collected failures as indented JSON
    print(json.dumps(schema_errors.message, indent=2))
else:
    print("Data is successfully validated!")
    display(validated_var)
    

Data is successfully validated!


Unnamed: 0_level_0,ensembl_gene_id,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,RP11-34P13.7
ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...
ENSG00000212907,ENSG00000212907,MT-ND4L
ENSG00000198886,ENSG00000198886,MT-ND4
ENSG00000198786,ENSG00000198786,MT-ND5
ENSG00000198695,ENSG00000198695,MT-ND6


## Reassign obs and var

In [16]:
# Swap the validated tables into the AnnData object; the two assignments are
# independent of each other.
adata.obs = validated_obs
adata.var = validated_var

In [17]:
# Display the AnnData summary to check the obs/var columns after the reassignment
adata

AnnData object with n_obs × n_vars = 5752 × 35635
    obs: 'perturbation_name', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_category', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_term_label', 'disease_term_id'
    var: 'ensembl_gene_id', 'symbol'

# Metadata curation

### Fill the dictionary below

In [18]:
# Dataset-level metadata, grouped into study / experiment / perturbation /
# assay / model_system / associated_diseases / associated_datasets sections.
# Entries built with get_vals / get_dict_vals are derived from the curated
# adata.obs; everything else is filled in by hand. The dict is validated
# against the `Experiment` model in the next code cell.
metadata = {
    # publication the dataset comes from
    "study": {
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {"first_name": "Britt", "last_name": "Adamson"},
        "last_author": {"first_name": "Jonathan", "last_name": "Weissman"},
    },
    # description of this particular experiment
    "experiment": {
        "title": "6000 chronic myeloid leukemia (K562) cells transfected with gRNAs against 7 transcription factors + 1 control",
        "summary": "In a pilot experiment, single-cell RNA-seq was performed on a pool of individually transduced chronic myeloid leukemia cells (K562) carrying 8 distinct guide barcodes, analyzing \u223c6,000 cells total.",
        "treatments": get_dict_vals("treatment_id", "treatment_label", adata),
        "timepoints": get_vals(adata.obs["timepoint"], "list"),
        "replicates": "none",
        "number_of_samples": 1,
        # NOTE(review): this is the total number of rows in adata.obs; the displayed
        # obs table contains 'control' rows as well — confirm they should be counted.
        "number_of_perturbed_cells": adata.obs.shape[0],
        "perturbation_type": get_dict_vals(
            "perturbation_type_id", "perturbation_type_label", adata
        ),
        "perturbed_target_category": get_vals(
            adata.obs["perturbed_target_category"], "list"
        ),
        # NOTE(review): perturbed_target_ensg holds the value 'control' for control
        # cells in the displayed obs table — verify whether get_vals excludes it.
        "number_of_perturbed_targets": len(
            get_vals(adata.obs["perturbed_target_ensg"], "list")
        ),
        "perturbed_targets": get_vals(adata.obs["perturbed_target_ensg"], "list"),
    },
    # how the perturbation was generated and delivered; each entry is a
    # term_id / term_label pair, with term_id None where no ID is given
    "perturbation": {
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022895",
            "term_label": "dCas9-KRAB",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "retroviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        # description of the gRNA library
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "inhibition",
                },
            ],
            "manufacturer": "Weissman",
            "lentiviral_generation": "3",
            "grnas_per_gene": "1",
            "total_grnas": "8",
            # same expression as experiment.number_of_perturbed_targets above
            "total_genes": len(get_vals(adata.obs["perturbed_target_ensg"], "list")),
            "total_variants": None,
        },
    },
    # readout / sequencing details
    "assay": {
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "10x Genomics Single Cell 3-prime",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 2500"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "CellRanger"},
        "software_analysis": {"term_id": None, "term_label": "MAGeCK"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh37",
        },
    },
    # biological model, taken from the curated obs columns
    "model_system": {
        "model_system": get_dict_vals("model_system_id", "model_system_label", adata),
        "species": "Homo sapiens",
        "tissue": get_dict_vals("tissue_id", "tissue_label", adata),
        "cell_type": get_dict_vals("cell_type_id", "cell_type_label", adata),
        "cell_line": get_dict_vals("cell_line_id", "cell_line_label", adata),
        "sex": get_dict_vals("sex_id", "sex_label", adata),
        "developmental_stage": get_dict_vals(
            "developmental_stage_id", "developmental_stage_label", adata
        ),
        "passage_number": None,
        "sample_quantity": {
            # number of rows (cells) in adata.obs
            "sample_quantity_value": adata.obs.shape[0],
            "sample_quantity_unit": "cells",
        },
    },
    "associated_diseases": get_dict_vals("disease_term_id", "disease_term_label", adata),
    # external records this dataset is linked to
    "associated_datasets": [
        {
            "dataset_accession": "GSM2406675",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675",
            "dataset_description": "Raw counts",
            "dataset_file_name": "GSE90546_RAW.tar",
        },
        {
            "dataset_accession": "GSM2406675_10X001",
            "dataset_uri": "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "AdamsonWeissman2016_GSM2406675_10X001.h5ad",
        },
    ],
}

### Validate the metadata

In [19]:
# Validate the metadata dict against the Experiment model imported from
# unified_metadata_schema; `m` is the validated object reused when saving.
# NOTE(review): presumably a pydantic model that raises on invalid input — confirm.
m = Experiment.model_validate(metadata)

In [20]:
# Print the validated metadata serialised as JSON with 4-space indentation
print(m.model_dump_json(indent=4))

{
    "study": {
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {
            "first_name": "Britt",
            "last_name": "Adamson"
        },
        "last_author": {
            "first_name": "Jonathan",
            "last_name": "Weissman"
        }
    },
    "experiment": {
        "title": "6000 chronic myeloid leukemia (K562) cells transfected with gRNAs against 7 transcription factors + 1 control",
        "summary": "In a pilot experiment, single-cell RNA-seq was performed on a pool of individually transduced chronic myeloid leukemia cells (K562) carrying 8 distinct guide barcodes, analyzing ∼6,000 cells total.",
        "treatments": null,
        "timepoints": [
            "P0DT0H0M0S"
        ],
        "replicates": "none",
        "number_of_samples": 1,
        "number_

### Show the curated `var` and `obs` tables

In [21]:
# Display the curated var table as stored on the AnnData object
adata.var

Unnamed: 0_level_0,ensembl_gene_id,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,RP11-34P13.7
ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...
ENSG00000212907,ENSG00000212907,MT-ND4L
ENSG00000198886,ENSG00000198886,MT-ND4
ENSG00000198786,ENSG00000198786,MT-ND5
ENSG00000198695,ENSG00000198695,MT-ND6


In [22]:
# Display the curated obs table as stored on the AnnData object
adata.obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT,CREB1_pDS269,1,ENSG00000118260,CREB1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACAGAGAT,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACCAGAAA,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACGTTGAC,EP300_pDS268,1,ENSG00000100393,EP300,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTGTTCT,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGGACGAG,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCCCGTT,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCCGTTC,62(mod)_pBA581,1,control,control,,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


### Replace None values with np.nan

`None` values are not supported in AnnData, so they are replaced with `np.nan` before the object is saved.

In [23]:
# Replace None with np.nan in every obs column before writing the file.
# NOTE(review): the captured output shows a warning raised from this line —
# check which pandas warning it is and whether the resulting dtypes of the
# all-missing columns are the intended ones.
adata.obs = adata.obs.replace({None: np.nan})

  adata.obs = adata.obs.replace({None: np.nan})


# Save the anndata object

In [24]:
# Derive the output path from the input path:
# ".../non_curated/.../<name>.h5ad" -> ".../curated/.../<name>_curated.h5ad"
curated_path = noncurated_path.replace("non_curated", "curated").replace(
    ".h5ad", "_curated.h5ad"
)

# create the directory if it doesn't exist (same idiom as the download cell)
os.makedirs(os.path.dirname(curated_path), exist_ok=True)

# save the adata object
adata.write_h5ad(curated_path)
print(f"Curated data saved to {curated_path}")

Curated data saved to ../curated/h5ad/adamson_2016_pilot_curated.h5ad


## Save the metadata

In [25]:
# savet the serialized m object
curated_metadata_path = curated_path.replace("/h5ad", "/json").replace(".h5ad", ".json")
with open(curated_metadata_path, "w") as f:
    json.dump(m.model_dump(), f, indent=4)
print(f"Curated metadata saved to {curated_metadata_path}")

Curated metadata saved to ../curated/json/adamson_2016_pilot_curated.json
