In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import json
import pandera as pa
from pandera.typing import Series, Index
from typing import Optional
from neofuzz import char_ngram_process, Process

from tools.curation_tools import (
    search_compounds_in_chebi,
    standardize_gene_symbols,
    standardize_ontology,
    get_vals,
    get_dict_vals,
)
from tools.perturbseq_anndata_schema import ObsSchema, VarSchema

import sys

sys.path.append("../../")
from unified_metadata_schema.unified_metadata_schema import Experiment

# Read/download anndata from a file

**Change the directory to the location of your file!**

In [2]:
data_source_link = "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad"
noncurated_path = "../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad"

# Download the data if it doesn't exist
if not os.path.exists(noncurated_path):
    print(f"Downloading data from {data_source_link} to {noncurated_path}")
    os.makedirs(os.path.dirname(noncurated_path), exist_ok=True)
    os.system(f"wget {data_source_link} -O {noncurated_path}")
else:
    print(f"File {noncurated_path} already exists. Skipping download.")

File ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad already exists. Skipping download.


In [3]:
# Load the data
adata = sc.read_h5ad(noncurated_path)
adata

AnnData object with n_obs × n_vars = 65337 × 32738
    obs: 'perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'
    var: 'ensembl_id', 'ncounts', 'ncells'

# Load ontologies from parquets

In [4]:
gene_ont = pd.read_parquet('../ontologies/genes.parquet').drop_duplicates()
ctype_ont = pd.read_parquet('../ontologies/cell_types.parquet').drop_duplicates()
cline_ont = pd.read_parquet('../ontologies/cell_lines.parquet').drop_duplicates()
tis_ont = pd.read_parquet('../ontologies/tissues.parquet').drop_duplicates()
dis_ont = pd.read_parquet('../ontologies/diseases.parquet').drop_duplicates()

### Run the cell below to enable fast fuzzy mapping of gene names

In [5]:
# index gene_ont for fast fuzzy search

# if os.path.exists("tools/fuzzy_gene_index.joblib"):
#     print("Loading existing process from disk")
#     process = Process.from_disk("tools/fuzzy_gene_index.joblib")
# else:
#     print("Creating new process")
#     # Create a process
#     process = char_ngram_process()
#     # Index the options
#     process.index(gene_ont["symbol"].dropna().drop_duplicates())
#     # save the process to disk
#     process.to_disk("tools/fuzzy_gene_index.joblib")
    

# OBS curation

### Filter out nans and unknowns before proceeding with further curation

In [6]:
# rename the columns to match the schema
adata.obs = adata.obs.rename(
    columns={
        'perturbation': 'perturbation_name'
    }
)
# drop None values from the perturbation_name column
print(adata.obs.shape)
adata = adata[~adata.obs["perturbation_name"].isna()]
print(adata.obs.shape)

(65337, 15)
(62724, 15)


In [7]:
print(len(set(adata.obs["perturbation_name"])))
set(adata.obs["perturbation_name"])

114


{'*',
 '62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A_pDS393',
 'IER3IP1_pDS002',
 'IER3IP1_pDS003',
 'IER3IP1_pDS110',
 'KCTD16_pDS096',
 'MANF_pDS027',
 

In [8]:

# extract the perturbed target symbol from the perturbation_name column
adata.obs['perturbed_target_symbol'] = adata.obs['perturbation_name'].str.rsplit(pat="_", n=1).str[0] # remove plasmid name
# adata.obs['perturbed_target_symbol'] = adata.obs['perturbed_target_symbol'].str.replace('_only', '') # remove "_only" suffix
adata.obs['perturbed_target_symbol'] = adata.obs['perturbed_target_symbol'].replace(
    {
        "62(mod)": "control",
        "63(mod)": "control",
        "Gal4-4(mod)": "control",
        "*": "unknown"
    }
)
# drop the rows with unknown perturbed target symbol
adata = adata[adata.obs['perturbed_target_symbol'] != "unknown"]
print(adata.obs.shape)
print(len(set(adata.obs["perturbed_target_symbol"])))
set(adata.obs["perturbed_target_symbol"])

  adata.obs['perturbed_target_symbol'] = adata.obs['perturbation_name'].str.rsplit(pat="_", n=1).str[0] # remove plasmid name


(62623, 16)
91


{'AARS',
 'AMIGO3',
 'ARHGAP22',
 'ASCC3',
 'ATF4',
 'ATF6',
 'ATP5B',
 'C7orf26',
 'CAD',
 'CARS',
 'CCND3',
 'CHERP',
 'COPB1',
 'COPZ1',
 'DAD1',
 'DARS',
 'DDOST',
 'DDRGK1',
 'DERL2',
 'DHDDS',
 'DNAJC19',
 'EIF2AK3',
 'EIF2B2',
 'EIF2B3',
 'EIF2B4',
 'EIF2S1',
 'ERN1',
 'FARSB',
 'FECH',
 'GBF1',
 'GMPPB',
 'GNPNAT1',
 'HARS',
 'HSD17B12',
 'HSPA5',
 'HSPA9',
 'HYOU1',
 'IARS2',
 'IDH3A',
 'IER3IP1',
 'KCTD16',
 'MANF',
 'MARS',
 'MRGBP',
 'MRPL39',
 'MTHFD1',
 'NEDD8',
 'OST4',
 'P4HB',
 'PDIA6',
 'PPWD1',
 'PSMA1',
 'PSMD12',
 'PSMD4',
 'PTDSS1',
 'QARS',
 'SAMM50',
 'SARS',
 'SCYL1',
 'SEC61A1',
 'SEC61B',
 'SEC61G',
 'SEC63',
 'SEL1L',
 'SLC35B1',
 'SLC39A7',
 'SLMO2',
 'SOCS1',
 'SPCS2',
 'SPCS3',
 'SRP68',
 'SRP72',
 'SRPR',
 'SRPRB',
 'STT3A',
 'SYVN1',
 'TARS',
 'TELO2',
 'TIMM23',
 'TIMM44',
 'TMED10',
 'TMED2',
 'TMEM167A',
 'TTI1',
 'TTI2',
 'UFL1',
 'UFM1',
 'XBP1',
 'XRN1',
 'YIPF5',
 'control'}

## Proceed with the curation of the adata.obs slot

Standardize perturbed target gene symbols, ENSG IDs and biotypes

In [9]:
obs = adata.obs.copy()

# del standardize_gene_symbols
# from tools.curation_tools import standardize_gene_symbols


obs = standardize_gene_symbols(obs, "perturbed_target_symbol")

# map the perturbed target symbol to the ENSG
obs['perturbed_target_ensg'] = obs['perturbed_target_symbol'].map(
    gene_ont.set_index('symbol')['ensembl_gene_id'].to_dict()
)

# map the perturbed target ENSG to the biotype
obs['perturbed_target_category'] = obs['perturbed_target_ensg'].map(
    gene_ont.set_index('ensembl_gene_id')['biotype'].to_dict()
)

# add the perturbed target number column based on the number of symbols in the perturbed_target_symbol column
obs['perturbed_target_number'] = [len(x.split("|")) if x is not None else 0 for x in obs['perturbed_target_symbol']]


Loaded gene ontology with 86403 entries
--------------------------------------------------
79 out of 91 gene symbols mapped to standardized symbols
--------------------------------------------------
12 gene symbols could not be mapped to standardized symbols
--------------------------------------------------
Trying to match the unmatched gene symbols against known synonyms
--------------------------------------------------
12 gene symbols mapped to standardized symbols using synonyms
--------------------------------------------------
All unmatched gene symbols have been mapped to standardized symbols using synonyms
--------------------------------------------------


Unnamed: 0,perturbed_target_symbol,standardized_symbol
0,control,control
1,OST4,OST4
2,SEC61A1,SEC61A1
3,EIF2B4,EIF2B4
4,IER3IP1,IER3IP1
...,...,...
86,TARS,TARS1
87,MARS,MARS1
88,SARS,SARS1
89,AARS,AARS1


Mapped the standardized symbols in column perturbed_target_symbol back to the original DataFrame


Add treatment information

In [10]:
# treatment
obs['treatment_label'] = None
obs['treatment_id'] = None

Add perturbation information

In [11]:
# perturbation type
obs['perturbation_type_label'] = 'CRISPRi'
obs['perturbation_type_id'] = None


Add timepoint information

In [12]:

# timepoint
obs['timepoint'] = "P0DT0H0M0S" # this follows the ISO 8601 format for time intervals


Add model system and tissue information

In [13]:

# model system
obs['model_system_label'] = "cell line"
obs['model_system_id'] = None

# tissue
obs['tissue_label'] = None
obs['tissue_id'] = None


Add cell type information

In [14]:
# cell type
obs['cell_type_label'] = obs['celltype']
obs['cell_type_label'] = obs['cell_type_label'].replace(
    {
        'lymphoblasts': 'lymphoblast'
    }
)
obs['cell_type_id'] = obs['cell_type_label'].map(
    ctype_ont.set_index('name')['ontology_id'].to_dict()
)

  obs['cell_type_label'] = obs['cell_type_label'].replace(


Add cell line information

In [15]:
# cell line label
obs['cell_line_label'] = obs['cell_line']
obs = standardize_ontology(obs, "cell_line_label", cline_ont)

obs['cell_line_id'] = obs['cell_line_label'].map(
    cline_ont.set_index('name')['ontology_id'].to_dict()
)


0 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
1 ontology label could not be mapped to standardized names
--------------------------------------------------
Trying to match the unmatched ontology labels against known synonyms
--------------------------------------------------
1 ontology label mapped to standardized names using synonyms
--------------------------------------------------
All unmatched ontology labels have been mapped to standardized names using synonyms
--------------------------------------------------


Unnamed: 0,cell_line_label,standardized_name
0,K562,K 562 cell


Mapped the standardized ontology labels in column cell_line_label back to the original DataFrame


Add disease information

In [16]:

# disease
obs['disease_term_label'] = obs['disease']
obs = standardize_ontology(obs, "disease_term_label", dis_ont)

obs['disease_term_id'] = obs['disease_term_label'].map(
    dis_ont.set_index('name')['ontology_id'].to_dict()
)


0 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
1 ontology label could not be mapped to standardized names
--------------------------------------------------
Trying to match the unmatched ontology labels against known synonyms
--------------------------------------------------
1 ontology label mapped to standardized names using synonyms
--------------------------------------------------
All unmatched ontology labels have been mapped to standardized names using synonyms
--------------------------------------------------


Unnamed: 0,disease_term_label,standardized_name
0,chronic myelogenous leukemia,"chronic myelogenous leukemia, BCR-ABL1 positive"


Mapped the standardized ontology labels in column disease_term_label back to the original DataFrame


Add species, sex and developmental stage information

In [17]:

# species
obs['species'] = 'Homo sapiens'

# sex
obs['sex_label'] = None
obs['sex_id'] = None

# developmental stage
obs['developmental_stage_label'] = None
obs['developmental_stage_id'] = None

# change the order of the columns to match that of the schema
obs = obs[ObsSchema.to_schema().columns.keys()]

obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACAAGATG,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACACCTAG,OST4_pDS353,1,ENSG00000228474,OST4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTTCCCG,SEC61A1_pDS031,1,ENSG00000058262,SEC61A1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGAAACAG,EIF2B4_pDS491,1,ENSG00000115211,EIF2B4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGCAGCTA,SRPR_pDS482,1,ENSG00000182934,SRPRA,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCATGCTTTAC,STT3A_pDS011,1,ENSG00000134910,STT3A,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGGAGGAC,ARHGAP22_pDS458,1,ENSG00000128805,ARHGAP22,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGTAGAGA,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGTCAAGC,KCTD16_pDS096,1,ENSG00000183775,KCTD16,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


## Validate the adata.obs slot

In [18]:
try:
    validated_obs = ObsSchema.validate(obs, lazy=True)
    print("Data is successfully validated!")
    display(validated_obs)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))

Data is successfully validated!


Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACAAGATG,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACACCTAG,OST4_pDS353,1,ENSG00000228474,OST4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTTCCCG,SEC61A1_pDS031,1,ENSG00000058262,SEC61A1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGAAACAG,EIF2B4_pDS491,1,ENSG00000115211,EIF2B4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGCAGCTA,SRPR_pDS482,1,ENSG00000182934,SRPRA,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCATGCTTTAC,STT3A_pDS011,1,ENSG00000134910,STT3A,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGGAGGAC,ARHGAP22_pDS458,1,ENSG00000128805,ARHGAP22,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGTAGAGA,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGTCAAGC,KCTD16_pDS096,1,ENSG00000183775,KCTD16,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


## VAR curation

In [19]:
var = adata.var.copy()
var["symbol"] = var.index
var = var.rename(columns={"ensembl_id": "ensembl_gene_id"})
# var.index = var["ensembl_gene_id"]

# map symbols from gene_ont
var = var.merge(
    right= gene_ont[['ensembl_gene_id', 'symbol']].dropna(),
    how='left',
    left_on='ensembl_gene_id',
    right_on='ensembl_gene_id'
)
# replace the symbol column with the gene_ont symbol, when ENSG is not null
var.loc[var['symbol_y'].notna(), 'symbol_x'] = var.loc[var['symbol_y'].notna(), 'symbol_y']

var = var.drop(columns=['symbol_y']).rename(columns={'symbol_x': 'symbol'})

var

Unnamed: 0,ensembl_gene_id,ncounts,ncells,symbol
0,ENSG00000243485,11.0,11,MIR1302-2HG
1,ENSG00000237613,0.0,0,FAM138A
2,ENSG00000186092,0.0,0,OR4F5
3,ENSG00000238009,0.0,0,RP11-34P13.7
4,ENSG00000239945,43.0,43,RP11-34P13.8
...,...,...,...,...
32733,ENSG00000215635,0.0,0,AC145205.1
32734,ENSG00000268590,0.0,0,BAGE5
32735,ENSG00000251180,0.0,0,CU459201.1
32736,ENSG00000215616,0.0,0,AC002321.2


In [21]:
var = standardize_gene_symbols(
    obs_df=var,
    column="symbol"
)

var.set_index('ensembl_gene_id', inplace=True, drop=False)
var

Loaded gene ontology with 86403 entries
--------------------------------------------------
23905 out of 32682 gene symbols mapped to standardized symbols
--------------------------------------------------
8777 gene symbols could not be mapped to standardized symbols
--------------------------------------------------
Trying to match the unmatched gene symbols against known synonyms
--------------------------------------------------
176 gene symbols mapped to standardized symbols using synonyms
--------------------------------------------------
8601 gene symbols could not be mapped to standardized symbols using synonyms
--------------------------------------------------
These genes will be kept as is in the final DataFrame
Unmatched gene symbols: ['RP11-34P13.7' 'RP11-34P13.8' 'AL627309.1' ... 'CU459201.1' 'AC002321.2'
 'AC002321.1']


Unnamed: 0,symbol,standardized_symbol
0,MIR1302-2HG,MIR1302-2HG
1,FAM138A,FAM138A
2,OR4F5,OR4F5
3,OR4F29,OR4F29
4,OR4F16,OR4F16
...,...,...
32677,AC145205.1,AC145205.1
32678,BAGE5,BAGE5
32679,CU459201.1,CU459201.1
32680,AC002321.2,AC002321.2


Mapped the standardized symbols in column symbol back to the original DataFrame


Unnamed: 0_level_0,ensembl_gene_id,ncounts,ncells,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,11.0,11,MIR1302-2HG
ENSG00000237613,ENSG00000237613,0.0,0,FAM138A
ENSG00000186092,ENSG00000186092,0.0,0,OR4F5
ENSG00000238009,ENSG00000238009,0.0,0,RP11-34P13.7
ENSG00000239945,ENSG00000239945,43.0,43,RP11-34P13.8
...,...,...,...,...
ENSG00000215635,ENSG00000215635,0.0,0,AC145205.1
ENSG00000268590,ENSG00000268590,0.0,0,BAGE5
ENSG00000251180,ENSG00000251180,0.0,0,CU459201.1
ENSG00000215616,ENSG00000215616,0.0,0,AC002321.2


In [22]:
try:
    validated_var = VarSchema.validate(var, lazy=True)
    print("Data is successfully validated!")
    display(validated_var)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))
    

Data is successfully validated!


Unnamed: 0_level_0,ensembl_gene_id,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,RP11-34P13.7
ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...
ENSG00000215635,ENSG00000215635,AC145205.1
ENSG00000268590,ENSG00000268590,BAGE5
ENSG00000251180,ENSG00000251180,CU459201.1
ENSG00000215616,ENSG00000215616,AC002321.2


## Reassign obs and var

Since we dropped cells that were not uniquely assigned to one or another treatment, we need to filter the original adata before we can reassign standardised obs and var names.

In [23]:
adata = adata[adata.obs.index.isin(obs.index), :]

adata

View of AnnData object with n_obs × n_vars = 62623 × 32738
    obs: 'perturbation_name', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'perturbed_target_symbol'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [24]:
adata.var = validated_var
adata.obs = validated_obs

In [25]:
adata

AnnData object with n_obs × n_vars = 62623 × 32738
    obs: 'perturbation_name', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_category', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_term_label', 'disease_term_id'
    var: 'ensembl_gene_id', 'symbol'

# Metadata curation

### Fill the dictionary below

In [30]:
len(set(adata.obs['perturbed_target_ensg']))

91

In [31]:
metadata = {
    "study": {
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {"first_name": "Britt", "last_name": "Adamson"},
        "last_author": {"first_name": "Jonathan", "last_name": "Weissman"},
    },
    "experiment": {
        "title": "63000 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs.",
        "summary": "Perturb-seq was applied to a small CRISPRi library of 91 sgRNAs targeting UPR genes in K562 cells.",
        "treatments": get_dict_vals("treatment_id", "treatment_label", adata),
        "timepoints": get_vals(adata.obs["timepoint"], "list"),
        "replicates": "none",
        "number_of_samples": 1,
        "number_of_perturbed_cells": adata.obs.shape[0],
        "perturbation_type": get_dict_vals(
            "perturbation_type_id", "perturbation_type_label", adata
        ),
        "perturbed_target_category": get_vals(
            adata.obs["perturbed_target_category"], "list"
        ),
        "number_of_perturbed_targets": len(
            get_vals(adata.obs["perturbed_target_ensg"], "list")
        ),
        "perturbed_targets": get_vals(adata.obs["perturbed_target_ensg"], "list"),
    },
    "perturbation": {
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022895",
            "term_label": "dCas9-KRAB",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "retroviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "inhibition",
                },
            ],
            "manufacturer": "Weissman",
            "lentiviral_generation": "3",
            "grnas_per_gene": "1",
            "total_grnas": "91",
            "total_genes": len(get_vals(adata.obs["perturbed_target_ensg"], "list")),
            "total_variants": None,
        },
    },
    "assay": {
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "10x Genomics Single Cell 3-prime",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 4000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "CellRanger"},
        "software_analysis": {"term_id": None, "term_label": "MAGeCK"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh37",
        },
    },
    "model_system": {
        "model_system": get_dict_vals("model_system_id", "model_system_label", adata),
        "species": "Homo sapiens",
        "tissue": get_dict_vals("tissue_id", "tissue_label", adata),
        "cell_type": get_dict_vals("cell_type_id", "cell_type_label", adata),
        "cell_line": get_dict_vals("cell_line_id", "cell_line_label", adata),
        "sex": get_dict_vals("sex_id", "sex_label", adata),
        "developmental_stage": get_dict_vals(
            "developmental_stage_id", "developmental_stage_label", adata
        ),
        "passage_number": None,
        "sample_quantity": {
            "sample_quantity_value": adata.obs.shape[0],
            "sample_quantity_unit": "cells",
        },
    },
    "associated_diseases": get_dict_vals("disease_term_id", "disease_term_label", adata),
    "associated_datasets": [
        {
            "dataset_accession": "GSM2406677",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406681",
            "dataset_description": "Barcode, cell identities, raw gene expression matrix",
            "dataset_file_name": "GSM2406677_10X010",
        },
        {
            "dataset_accession": "GSM2406677_10X005",
            "dataset_uri": "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "AdamsonWeissman2016_GSM2406681_10X010.h5ad"
        }
    ]
}

### Validate the metadata

In [32]:
m = Experiment.model_validate(metadata)

In [33]:
print(m.model_dump_json(indent=4))

{
    "study": {
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {
            "first_name": "Britt",
            "last_name": "Adamson"
        },
        "last_author": {
            "first_name": "Jonathan",
            "last_name": "Weissman"
        }
    },
    "experiment": {
        "title": "63000 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs.",
        "summary": "Perturb-seq was applied to a small CRISPRi library of 91 sgRNAs targeting UPR genes in K562 cells.",
        "treatments": null,
        "timepoints": [
            "P0DT0H0M0S"
        ],
        "replicates": "none",
        "number_of_samples": 1,
        "number_of_perturbed_cells": 62623,
        "perturbation_type": null,
        "perturbed_target_category": [
           

### Show the metadata

In [34]:
adata.var

Unnamed: 0_level_0,ensembl_gene_id,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,RP11-34P13.7
ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...
ENSG00000215635,ENSG00000215635,AC145205.1
ENSG00000268590,ENSG00000268590,BAGE5
ENSG00000251180,ENSG00000251180,CU459201.1
ENSG00000215616,ENSG00000215616,AC002321.2


In [35]:
adata.obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACAAGATG,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACACCTAG,OST4_pDS353,1,ENSG00000228474,OST4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTTCCCG,SEC61A1_pDS031,1,ENSG00000058262,SEC61A1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGAAACAG,EIF2B4_pDS491,1,ENSG00000115211,EIF2B4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGCAGCTA,SRPR_pDS482,1,ENSG00000182934,SRPRA,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCATGCTTTAC,STT3A_pDS011,1,ENSG00000134910,STT3A,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGGAGGAC,ARHGAP22_pDS458,1,ENSG00000128805,ARHGAP22,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGTAGAGA,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGTCAAGC,KCTD16_pDS096,1,ENSG00000183775,KCTD16,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


### Replace None values with np.nan

None values are not supported in anndata

In [36]:
adata.obs = adata.obs.replace({None: np.nan})

  adata.obs = adata.obs.replace({None: np.nan})


# Save the anndata object

In [37]:
curated_path = noncurated_path.replace("non_curated", "curated").replace(
    ".h5ad", "_curated.h5ad"
)

# create the directory if it doesn't exist
if not os.path.exists(os.path.dirname(curated_path)):
    os.makedirs(os.path.dirname(curated_path))

# save the adata object
adata.write_h5ad(curated_path)
print(f"Curated data saved to {curated_path}")

Curated data saved to ../curated/h5ad/adamson_2016_upr_perturb_seq_curated.h5ad


## Save the metadata

In [38]:
# savet the serialized m object
curated_metadata_path = curated_path.replace("/h5ad", "/json").replace(".h5ad", ".json")
with open(curated_metadata_path, "w") as f:
    json.dump(m.model_dump(), f, indent=4)
print(f"Curated metadata saved to {curated_metadata_path}")

Curated metadata saved to ../curated/json/adamson_2016_upr_perturb_seq_curated.json
