In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import json
import pandera as pa
from pandera.typing import Series, Index
from typing import Optional
from neofuzz import char_ngram_process, Process

from tools.curation_tools import (
    search_compounds_in_chebi,
    standardize_gene_symbols,
    standardize_ontology,
    get_vals,
    get_dict_vals,
)
from tools.perturbseq_anndata_schema import ObsSchema, VarSchema

import sys

sys.path.append("../../")
from unified_metadata_schema.unified_metadata_schema import Experiment

# Read/download anndata from a file

**Change the directory to the location of your file!**

In [2]:
data_source_link = "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad"
noncurated_path = "../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad"

# Download the data if it doesn't exist
if not os.path.exists(noncurated_path):
    print(f"Downloading data from {data_source_link} to {noncurated_path}")
    os.makedirs(os.path.dirname(noncurated_path), exist_ok=True)
    os.system(f"wget {data_source_link} -O {noncurated_path}")
else:
    print(f"File {noncurated_path} already exists. Skipping download.")

File ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad already exists. Skipping download.


In [3]:
# Load the data
adata = sc.read_h5ad(noncurated_path)
adata

AnnData object with n_obs × n_vars = 15006 × 32738
    obs: 'perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'
    var: 'ensembl_id', 'ncounts', 'ncells'

# Load ontologies from parquets

In [4]:
gene_ont = pd.read_parquet('../ontologies/genes.parquet').drop_duplicates()
ctype_ont = pd.read_parquet('../ontologies/cell_types.parquet').drop_duplicates()
cline_ont = pd.read_parquet('../ontologies/cell_lines.parquet').drop_duplicates()
tis_ont = pd.read_parquet('../ontologies/tissues.parquet').drop_duplicates()
dis_ont = pd.read_parquet('../ontologies/diseases.parquet').drop_duplicates()

# OBS curation

### Filter out nans and unknowns before proceeding with further curation

In [5]:
# rename the columns to match the schema
adata.obs = adata.obs.rename(
    columns={
        'perturbation': 'perturbation_name'
    }
)
# drop None values from the perturbation_name column
print(adata.obs.shape)
adata = adata[~adata.obs["perturbation_name"].isna()]
print(adata.obs.shape)


(15006, 15)
(14710, 15)


In [6]:
set(adata.obs["perturbation_name"])

{'*',
 '3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}

In [7]:

# extract the perturbed target symbol from the perturbation_name column
adata.obs['perturbed_target_symbol'] = adata.obs['perturbation_name'].str.rsplit(pat="_", n=1).str[0] # remove plasmid name
adata.obs['perturbed_target_symbol'] = adata.obs['perturbed_target_symbol'].str.replace('_only', '') # remove "_only" suffix
adata.obs['perturbed_target_symbol'] = adata.obs['perturbed_target_symbol'].replace(
    {
        "3x_neg_ctrl": "control",
        "Gal4-4(mod)": "control",
        "*": "unknown"
    }
)
# drop the rows with unknown perturbed target symbol
adata = adata[adata.obs['perturbed_target_symbol'] != "unknown"]
print(adata.obs.shape)

set(adata.obs["perturbed_target_symbol"])

  adata.obs['perturbed_target_symbol'] = adata.obs['perturbation_name'].str.rsplit(pat="_", n=1).str[0] # remove plasmid name


(14697, 16)


{'ATF4',
 'ATF6',
 'ATF6_IRE1',
 'ATF6_PERK',
 'ATF6_PERK_IRE1',
 'C7orf26',
 'IER3IP1',
 'IRE1',
 'PERK',
 'PERK_IRE1',
 'PSMA1',
 'PSMD12',
 'SNAI1',
 'XBP1',
 'YIPF5',
 'control'}

## Proceed with the curation of the adata.obs slot

Standardize perturbed target gene symbols, ENSG IDs and biotypes

In [8]:
obs = adata.obs.copy()

std_gnames = obs.copy()
std_gnames["cell_barcode"] = std_gnames.index
std_gnames = std_gnames[["cell_barcode", "perturbed_target_symbol"]]
std_gnames["perturbed_target_symbol"] = std_gnames["perturbed_target_symbol"].str.split("_")
std_gnames = std_gnames.explode("perturbed_target_symbol", ignore_index=True)

# standardize the perturbed target symbol
# std_gnames = standardize_data(std_gnames, "perturbed_target_symbol", gene_ont, "symbol")
std_gnames = standardize_gene_symbols(std_gnames, "perturbed_target_symbol")


# create perturbed_target_ensg column
std_gnames["perturbed_target_ensg"] = std_gnames["perturbed_target_symbol"].map(
    gene_ont.set_index("symbol")["ensembl_gene_id"].to_dict()
)

std_gnames['perturbed_target_category'] = std_gnames['perturbed_target_ensg'].map(
    gene_ont.set_index('ensembl_gene_id')['biotype'].to_dict()
)

# collapse the perturbed_target_symbol and perturbed_target_ensg columns
std_gnames = std_gnames.groupby("cell_barcode").agg(
    {
        "perturbed_target_symbol": lambda x: "|".join(x),
        "perturbed_target_ensg": lambda x: "|".join(x),
        "perturbed_target_category": lambda x: "|".join(x),
    }
)

obs = obs.drop(columns=["perturbed_target_symbol"])

obs = obs.merge(
    std_gnames,
    on="cell_barcode",
    how="left",
)

# add the perturbed target number column based on the number of symbols in the perturbed_target_symbol column
obs['perturbed_target_number'] = [len(x.split("|")) if x is not None else 0 for x in obs['perturbed_target_symbol']]


Loaded gene ontology with 86403 entries
--------------------------------------------------
9 out of 12 gene symbols mapped to standardized symbols
--------------------------------------------------
3 gene symbols could not be mapped to standardized symbols
--------------------------------------------------
Trying to match the unmatched gene symbols against known synonyms
--------------------------------------------------
3 gene symbols mapped to standardized symbols using synonyms
--------------------------------------------------
All unmatched gene symbols have been mapped to standardized symbols using synonyms
--------------------------------------------------


Unnamed: 0,perturbed_target_symbol,standardized_symbol
0,control,control
1,ATF6,ATF6
2,XBP1,XBP1
3,IER3IP1,IER3IP1
4,PSMA1,PSMA1
5,PSMD12,PSMD12
6,ATF4,ATF4
7,SNAI1,SNAI1
8,YIPF5,YIPF5
9,PERK,EIF2AK3


Mapped the standardized symbols in column perturbed_target_symbol back to the original DataFrame


Add treatment information

In [9]:
orig_cell_ident_link = r"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2406nnn/GSM2406677/suppl/GSM2406677%5F10X005%5Fcell%5Fidentities.csv.gz"
orig_cell_ident = pd.read_csv(orig_cell_ident_link)

# the number after the dash is the cell barcode indicates the treatment
display(orig_cell_ident.head())

Unnamed: 0,cell BC,guide identity,read count,UMI count,coverage,good coverage,number of cells
0,ACGGTATGCTTAGG-3,PERK_IRE1_pMJ154,3907,645,6.057364,True,1
1,ACAATCCTACCCTC-1,PERK_IRE1_pMJ154,2173,455,4.775824,True,1
2,ACGAACACGTGCTA-3,ATF6_PERK_IRE1_pMJ158,2093,322,6.5,True,1
3,CTGTGAGATTGGTG-1,ATF6_PERK_IRE1_pMJ158,1594,369,4.319783,True,1
4,ATGTTGCTAATCGC-2,3x_neg_ctrl_pMJ144-2,1537,228,6.741228,True,1


In [10]:

treatment_label_map = {"1": "tunicamycin", "2": "thapsigargin", "3": "DMSO"}

treatment_df = orig_cell_ident['cell BC'].str.split("-", expand=True).rename(
    columns={0: "cell_barcode", 1: "treatment_number"}
)

treatment_df["treatment_label"] = treatment_df["treatment_number"].map(
    treatment_label_map
)

treatment_df.drop_duplicates()

Unnamed: 0,cell_barcode,treatment_number,treatment_label
0,ACGGTATGCTTAGG,3,DMSO
1,ACAATCCTACCCTC,1,tunicamycin
2,ACGAACACGTGCTA,3,DMSO
3,CTGTGAGATTGGTG,1,tunicamycin
4,ATGTTGCTAATCGC,2,thapsigargin
...,...,...,...
14851,CGTAACGAGTTGCA,3,DMSO
14852,CCATGCTGGCTTCC,3,DMSO
14853,CCCTCAGAAAAGTG,3,DMSO
14854,TCAAGTCTAGGTCT,2,thapsigargin


In [11]:
std_compounds_df = search_compounds_in_chebi(treatment_df["treatment_label"].unique())
std_compounds_df

Unnamed: 0,original_name,standardized_name,chebi_id
0,DMSO,dimethyl sulfoxide,CHEBI:28262
1,tunicamycin,tunicamycin,CHEBI:29699
2,thapsigargin,thapsigargin,CHEBI:9516


In [12]:
clean_treatment_df = treatment_df.merge(
    std_compounds_df,
    left_on="treatment_label",
    right_on="original_name",
    how="left",
).drop(columns=["treatment_number","original_name", "treatment_label"]).rename(
    columns={
        "standardized_name": "treatment_label",
        "chebi_id": "treatment_id"
    }
).set_index("cell_barcode")

clean_treatment_df

Unnamed: 0_level_0,treatment_label,treatment_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
ACGGTATGCTTAGG,dimethyl sulfoxide,CHEBI:28262
ACAATCCTACCCTC,tunicamycin,CHEBI:29699
ACGAACACGTGCTA,dimethyl sulfoxide,CHEBI:28262
CTGTGAGATTGGTG,tunicamycin,CHEBI:29699
ATGTTGCTAATCGC,thapsigargin,CHEBI:9516
...,...,...
CGTAACGAGTTGCA,dimethyl sulfoxide,CHEBI:28262
CCATGCTGGCTTCC,dimethyl sulfoxide,CHEBI:28262
CCCTCAGAAAAGTG,dimethyl sulfoxide,CHEBI:28262
TCAAGTCTAGGTCT,thapsigargin,CHEBI:9516


In [13]:
# add the treatment label and id to the obs dataframe
obs = obs.merge(
    clean_treatment_df,
    left_index=True,
    right_index=True,
    how="left"
)

In [14]:
# check for duplicated cell barcodes
print(f"Duplicated cell barcodes: {obs.index.duplicated().sum()}")

# filter out the duplicated cell barcodes (all of them)
duplicated_cells = obs.index[obs.index.duplicated()]

obs = obs[~obs.index.isin(duplicated_cells)]
print(f"Duplicated cell barcodes: {obs.index.duplicated().sum()}")

Duplicated cell barcodes: 102
Duplicated cell barcodes: 0


Add perturbation information

In [15]:
# perturbation type
obs['perturbation_type_label'] = 'CRISPRi'
obs['perturbation_type_id'] = None


Add timepoint information

In [16]:

# timepoint
obs['timepoint'] = "P0DT0H0M0S" # this follows the ISO 8601 format for time intervals


Add model system and tissue information

In [17]:

# model system
obs['model_system_label'] = "cell line"
obs['model_system_id'] = None

# tissue
obs['tissue_label'] = None
obs['tissue_id'] = None


Add cell type information

In [18]:
# cell type
obs['cell_type_label'] = obs['celltype']
obs['cell_type_label'] = obs['cell_type_label'].replace(
    {
        'lymphoblasts': 'lymphoblast'
    }
)
obs['cell_type_id'] = obs['cell_type_label'].map(
    ctype_ont.set_index('name')['ontology_id'].to_dict()
)

  obs['cell_type_label'] = obs['cell_type_label'].replace(


Add cell line information

In [19]:
# cell line label
obs['cell_line_label'] = obs['cell_line']
obs = standardize_ontology(obs, "cell_line_label", cline_ont)

obs['cell_line_id'] = obs['cell_line_label'].map(
    cline_ont.set_index('name')['ontology_id'].to_dict()
)


0 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
1 ontology label could not be mapped to standardized names
--------------------------------------------------
Trying to match the unmatched ontology labels against known synonyms
--------------------------------------------------
1 ontology label mapped to standardized names using synonyms
--------------------------------------------------
All unmatched ontology labels have been mapped to standardized names using synonyms
--------------------------------------------------


Unnamed: 0,cell_line_label,standardized_name
0,K562,K 562 cell


Mapped the standardized ontology labels in column cell_line_label back to the original DataFrame


Add disease information

In [20]:

# disease
obs['disease_term_label'] = obs['disease']
obs = standardize_ontology(obs, "disease_term_label", dis_ont)

obs['disease_term_id'] = obs['disease_term_label'].map(
    dis_ont.set_index('name')['ontology_id'].to_dict()
)


0 out of 1 ontology labels mapped to standardized names
--------------------------------------------------
1 ontology label could not be mapped to standardized names
--------------------------------------------------
Trying to match the unmatched ontology labels against known synonyms
--------------------------------------------------
1 ontology label mapped to standardized names using synonyms
--------------------------------------------------
All unmatched ontology labels have been mapped to standardized names using synonyms
--------------------------------------------------


Unnamed: 0,disease_term_label,standardized_name
0,chronic myelogenous leukemia,"chronic myelogenous leukemia, BCR-ABL1 positive"


Mapped the standardized ontology labels in column disease_term_label back to the original DataFrame


Add species, sex and developmental stage information

In [21]:

# species
obs['species'] = 'Homo sapiens'

# sex
obs['sex_label'] = None
obs['sex_id'] = None

# developmental stage
obs['developmental_stage_label'] = None
obs['developmental_stage_id'] = None

# change the order of the columns to match that of the schema
obs = obs[ObsSchema.to_schema().columns.keys()]

obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACTCAG,3x_neg_ctrl_pMJ144-1,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTCCTAT,3x_neg_ctrl_pMJ144-2,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGCAGAGG,3x_neg_ctrl_pMJ144-2,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGGCGAAG,ATF6_PERK_IRE1_pMJ158,3,ENSG00000118217|ENSG00000172071|ENSG00000178607,ATF6|EIF2AK3|ERN1,protein_coding|protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACCGTGATACCG,ATF6_PERK_pMJ150,2,ENSG00000118217|ENSG00000172071,ATF6|EIF2AK3,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGCTTAG,PERK_only_pMJ146,1,ENSG00000172071,EIF2AK3,protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGGGGATG,PERK_IRE1_pMJ154,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGTGGTCA,3x_neg_ctrl_pMJ144-1,1,control,control,control,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCGGAGA,PERK_IRE1_pMJ154,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


## Validate the adata.obs slot

In [22]:
try:
    validated_obs = ObsSchema.validate(obs, lazy=True)
    print("Data is successfully validated!")
    display(validated_obs)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))

Data is successfully validated!


Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACTCAG,3x_neg_ctrl_pMJ144-1,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTCCTAT,3x_neg_ctrl_pMJ144-2,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGCAGAGG,3x_neg_ctrl_pMJ144-2,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGGCGAAG,ATF6_PERK_IRE1_pMJ158,3,ENSG00000118217|ENSG00000172071|ENSG00000178607,ATF6|EIF2AK3|ERN1,protein_coding|protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACCGTGATACCG,ATF6_PERK_pMJ150,2,ENSG00000118217|ENSG00000172071,ATF6|EIF2AK3,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGCTTAG,PERK_only_pMJ146,1,ENSG00000172071,EIF2AK3,protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGGGGATG,PERK_IRE1_pMJ154,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGTGGTCA,3x_neg_ctrl_pMJ144-1,1,control,control,control,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCGGAGA,PERK_IRE1_pMJ154,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


## VAR curation

In [23]:
var = adata.var.copy()
var["symbol"] = var.index
var = var.rename(columns={"ensembl_id": "ensembl_gene_id"})

# map symbols from gene_ont
var = var.merge(
    right= gene_ont[['ensembl_gene_id', 'symbol']].dropna(),
    how='left',
    left_on='ensembl_gene_id',
    right_on='ensembl_gene_id'
)
# replace the symbol column with the gene_ont symbol, when ENSG is not null
var.loc[var['symbol_y'].notna(), 'symbol_x'] = var.loc[var['symbol_y'].notna(), 'symbol_y']

var = var.drop(columns=['symbol_y']).rename(columns={'symbol_x': 'symbol'})

var

Unnamed: 0,ensembl_gene_id,ncounts,ncells,symbol
0,ENSG00000243485,2.0,2,MIR1302-2HG
1,ENSG00000237613,0.0,0,FAM138A
2,ENSG00000186092,0.0,0,OR4F5
3,ENSG00000238009,1.0,1,RP11-34P13.7
4,ENSG00000239945,2.0,2,RP11-34P13.8
...,...,...,...,...
32733,ENSG00000215635,0.0,0,AC145205.1
32734,ENSG00000268590,0.0,0,BAGE5
32735,ENSG00000251180,0.0,0,CU459201.1
32736,ENSG00000215616,0.0,0,AC002321.2


In [24]:
var = standardize_gene_symbols(
    obs_df=var,
    column="symbol"
)

var.set_index('ensembl_gene_id', inplace=True, drop=False)
var

Loaded gene ontology with 86403 entries
--------------------------------------------------
23905 out of 32682 gene symbols mapped to standardized symbols
--------------------------------------------------
8777 gene symbols could not be mapped to standardized symbols
--------------------------------------------------
Trying to match the unmatched gene symbols against known synonyms
--------------------------------------------------
176 gene symbols mapped to standardized symbols using synonyms
--------------------------------------------------
8601 gene symbols could not be mapped to standardized symbols using synonyms
--------------------------------------------------
These genes will be kept as is in the final DataFrame
Unmatched gene symbols: ['RP11-34P13.7' 'RP11-34P13.8' 'AL627309.1' ... 'CU459201.1' 'AC002321.2'
 'AC002321.1']


Unnamed: 0,symbol,standardized_symbol
0,MIR1302-2HG,MIR1302-2HG
1,FAM138A,FAM138A
2,OR4F5,OR4F5
3,OR4F29,OR4F29
4,OR4F16,OR4F16
...,...,...
32677,AC145205.1,AC145205.1
32678,BAGE5,BAGE5
32679,CU459201.1,CU459201.1
32680,AC002321.2,AC002321.2


Mapped the standardized symbols in column symbol back to the original DataFrame


Unnamed: 0_level_0,ensembl_gene_id,ncounts,ncells,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,ENSG00000243485,2.0,2,MIR1302-2HG
ENSG00000237613,ENSG00000237613,0.0,0,FAM138A
ENSG00000186092,ENSG00000186092,0.0,0,OR4F5
ENSG00000238009,ENSG00000238009,1.0,1,RP11-34P13.7
ENSG00000239945,ENSG00000239945,2.0,2,RP11-34P13.8
...,...,...,...,...
ENSG00000215635,ENSG00000215635,0.0,0,AC145205.1
ENSG00000268590,ENSG00000268590,0.0,0,BAGE5
ENSG00000251180,ENSG00000251180,0.0,0,CU459201.1
ENSG00000215616,ENSG00000215616,0.0,0,AC002321.2


In [25]:
try:
    validated_var = VarSchema.validate(var, lazy=True)
    print("Data is successfully validated!")
    display(validated_var)
except pa.errors.SchemaErrors as e:
    print(json.dumps(e.message, indent=2))
    

Data is successfully validated!


Unnamed: 0_level_0,ensembl_gene_id,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,RP11-34P13.7
ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...
ENSG00000215635,ENSG00000215635,AC145205.1
ENSG00000268590,ENSG00000268590,BAGE5
ENSG00000251180,ENSG00000251180,CU459201.1
ENSG00000215616,ENSG00000215616,AC002321.2


## Reassign obs and var

Since we dropped cells that were not uniquely assigned to one or another treatment, we need to filter the original adata before we can reassign standardised obs and var names.

In [26]:
adata = adata[adata.obs.index.isin(obs.index), :]

adata

View of AnnData object with n_obs × n_vars = 14595 × 32738
    obs: 'perturbation_name', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'perturbed_target_symbol'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [27]:
adata.var = validated_var
adata.obs = validated_obs

In [28]:
adata

AnnData object with n_obs × n_vars = 14595 × 32738
    obs: 'perturbation_name', 'perturbed_target_number', 'perturbed_target_ensg', 'perturbed_target_symbol', 'perturbed_target_category', 'perturbation_type_label', 'perturbation_type_id', 'timepoint', 'treatment_label', 'treatment_id', 'model_system_label', 'model_system_id', 'species', 'tissue_label', 'tissue_id', 'cell_type_label', 'cell_type_id', 'cell_line_label', 'cell_line_id', 'sex_label', 'sex_id', 'developmental_stage_label', 'developmental_stage_id', 'disease_term_label', 'disease_term_id'
    var: 'ensembl_gene_id', 'symbol'

# Metadata curation

### Fill the dictionary below

In [29]:
metadata = {
    "study": {
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {"first_name": "Britt", "last_name": "Adamson"},
        "last_author": {"first_name": "Jonathan", "last_name": "Weissman"},
    },
    "experiment": {
        "title": "14595 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs in every combination (singly with controls, doubly with a control, or triply).",
        "summary": " Using our final three-guide Perturb-seq vector to simultaneously deliver 3 sgRNAs, we individually transduced K562 cells expressing dCas9-KRAB (cBA010) with constructs that targeted all three UPR sensor genes in every combination (singly with controls, doubly with a control, or triply). Transduced cells were then pooled and selected. After 2 days of combined growth, the cells were treated with DMSO for 6 hr, 4 μg/mL tunicamycin (Tm) for 6 hr, or 100 nM thapsigargin (Tg) for 4 hr and were profiled by Perturb-seq (24 conditions in total).",
        "treatments": get_dict_vals("treatment_id", "treatment_label", adata),
        "timepoints": get_vals(adata.obs["timepoint"], "list"),
        "replicates": "none",
        "number_of_samples": 1,
        "number_of_perturbed_cells": adata.obs.shape[0],
        "perturbation_type": get_dict_vals(
            "perturbation_type_id", "perturbation_type_label", adata
        ),
        "perturbed_target_category": get_vals(
            adata.obs["perturbed_target_category"], "list"
        ),
        "number_of_perturbed_targets": len(
            get_vals(adata.obs["perturbed_target_ensg"], "list")
        ),
        "perturbed_targets": get_vals(adata.obs["perturbed_target_ensg"], "list"),
    },
    "perturbation": {
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022895",
            "term_label": "dCas9-KRAB",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "retroviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "inhibition",
                },
            ],
            "manufacturer": "Weissman",
            "lentiviral_generation": "3",
            "grnas_per_gene": "1",
            "total_grnas": "16",
            "total_genes": len(get_vals(adata.obs["perturbed_target_ensg"], "list")),
            "total_variants": None,
        },
    },
    "assay": {
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "10x Genomics Single Cell 3-prime",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 4000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "CellRanger"},
        "software_analysis": {"term_id": None, "term_label": "MAGeCK"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh37",
        },
    },
    "model_system": {
        "model_system": get_dict_vals("model_system_id", "model_system_label", adata),
        "species": "Homo sapiens",
        "tissue": get_dict_vals("tissue_id", "tissue_label", adata),
        "cell_type": get_dict_vals("cell_type_id", "cell_type_label", adata),
        "cell_line": get_dict_vals("cell_line_id", "cell_line_label", adata),
        "sex": get_dict_vals("sex_id", "sex_label", adata),
        "developmental_stage": get_dict_vals(
            "developmental_stage_id", "developmental_stage_label", adata
        ),
        "passage_number": None,
        "sample_quantity": {
            "sample_quantity_value": adata.obs.shape[0],
            "sample_quantity_unit": "cells",
        },
    },
    "associated_diseases": get_dict_vals("disease_term_id", "disease_term_label", adata),
    "associated_datasets": [
        {
            "dataset_accession": "GSM2406677",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406677",
            "dataset_description": "Barcode, cell identities, raw gene expression matrix",
            "dataset_file_name": "GSM2406677_10X005",
        },
        {
            "dataset_accession": "GSM2406677_10X005",
            "dataset_uri": "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "AdamsonWeissman2016_GSM2406677_10X005.h5ad"
        },
    ],
}

### Validate the metadata

In [30]:
m = Experiment.model_validate(metadata)

In [31]:
print(m.model_dump_json(indent=4))

{
    "study": {
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {
            "first_name": "Britt",
            "last_name": "Adamson"
        },
        "last_author": {
            "first_name": "Jonathan",
            "last_name": "Weissman"
        }
    },
    "experiment": {
        "title": "14595 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs in every combination (singly with controls, doubly with a control, or triply).",
        "summary": " Using our final three-guide Perturb-seq vector to simultaneously deliver 3 sgRNAs, we individually transduced K562 cells expressing dCas9-KRAB (cBA010) with constructs that targeted all three UPR sensor genes in every combination (singly with controls, doubly with a control, or triply). Transduced ce

### Show the metadata

In [32]:
adata.var

Unnamed: 0_level_0,ensembl_gene_id,symbol
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,ENSG00000243485,MIR1302-2HG
ENSG00000237613,ENSG00000237613,FAM138A
ENSG00000186092,ENSG00000186092,OR4F5
ENSG00000238009,ENSG00000238009,RP11-34P13.7
ENSG00000239945,ENSG00000239945,RP11-34P13.8
...,...,...
ENSG00000215635,ENSG00000215635,AC145205.1
ENSG00000268590,ENSG00000268590,BAGE5
ENSG00000251180,ENSG00000251180,CU459201.1
ENSG00000215616,ENSG00000215616,AC002321.2


In [33]:
adata.obs

Unnamed: 0_level_0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_category,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_term_label,disease_term_id
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACTCAG,3x_neg_ctrl_pMJ144-1,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATACTCCTAT,3x_neg_ctrl_pMJ144-2,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGCAGAGG,3x_neg_ctrl_pMJ144-2,1,control,control,control,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACATTGGCGAAG,ATF6_PERK_IRE1_pMJ158,3,ENSG00000118217|ENSG00000172071|ENSG00000178607,ATF6|EIF2AK3|ERN1,protein_coding|protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
AAACCGTGATACCG,ATF6_PERK_pMJ150,2,ENSG00000118217|ENSG00000172071,ATF6|EIF2AK3,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,tunicamycin,CHEBI:29699,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGCTTAG,PERK_only_pMJ146,1,ENSG00000172071,EIF2AK3,protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGGGGATG,PERK_IRE1_pMJ154,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGACTGTGGTCA,3x_neg_ctrl_pMJ144-1,1,control,control,control,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
TTTGCATGCGGAGA,PERK_IRE1_pMJ154,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,CRISPRi,,P0DT0H0M0S,dimethyl sulfoxide,CHEBI:28262,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,,,,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


### Replace None values with np.nan

None values are not supported in anndata

In [34]:
adata.obs = adata.obs.replace({None: np.nan})

  adata.obs = adata.obs.replace({None: np.nan})


# Save the anndata object

In [35]:
curated_path = noncurated_path.replace("non_curated", "curated").replace(
    ".h5ad", "_curated.h5ad"
)

# create the directory if it doesn't exist
if not os.path.exists(os.path.dirname(curated_path)):
    os.makedirs(os.path.dirname(curated_path))

# save the adata object
adata.write_h5ad(curated_path)
print(f"Curated data saved to {curated_path}")

Curated data saved to ../curated/h5ad/adamson_2016_upr_epistasis_curated.h5ad


## Save the metadata

In [36]:
# savet the serialized m object
curated_metadata_path = curated_path.replace("/h5ad", "/json").replace(".h5ad", ".json")
with open(curated_metadata_path, "w") as f:
    json.dump(m.model_dump(), f, indent=4)
print(f"Curated metadata saved to {curated_metadata_path}")

Curated metadata saved to ../curated/json/adamson_2016_upr_epistasis_curated.json
