# Import

In [1]:
import pandas as pd

import sys

sys.path.append("../../")
from curation_tools.curation_tools import CuratedDataset
from curation_tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from curation_tools.unified_metadata_schema.unified_metadata_schema import Experiment

# Initialise the dataset object

In [2]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    data_source_link = "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad",
    noncurated_path = "../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad"
)

# Download the dataset

In [3]:
cur_data.download_data()
cur_data.load_data()
# show the data
cur_data.show_obs()

File ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad
Observation data:
DataFrame shape: (15006, 15)
--------------------------------------------------
                         perturbation  read count  UMI count tissue_type  \
cell_barcode                                                               
AAACATACACTCAG   3x_neg_ctrl_pMJ144-1       261.0       59.0   cell_line   
AAACATACTCCTAT   3x_neg_ctrl_pMJ144-2       132.0       37.0   cell_line   
AAACATTGCAGAGG   3x_neg_ctrl_pMJ144-2       560.0      117.0   cell_line   
AAACATTGGCGAAG  ATF6_PERK_IRE1_pMJ158       215.0       49.0   cell_line   
AAACCGTGATACCG       ATF6_PERK_pMJ150       567.0      124.0   cell_line   
...                               ...         ...        ...         ...   
TTTGACTGGCTTAG       PERK_only_pMJ146       215.0       60.0   cell_line   
TTTGACTGGGGATG       PERK_IRE1_pMJ154        64.0       

In [4]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      2.0       2
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      1.0       1
RP11-34P13.8  ENSG00000239945      2.0       2
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [5]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 21
--------------------------------------------------
{nan,
 '*',
 '3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


### Drop NAs

In [6]:
cur_data.remove_na(slot = 'obs', column = 'perturbation')

Removed 296 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [7]:
cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')

Removed 13 entries \* from column perturbation of adata.obs


  cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')


In [8]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 19
--------------------------------------------------
{'3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


### Rename `perturbation` to `perturbation_name`

In [9]:
cur_data.rename_columns(slot = 'obs', name_dict = {'perturbation': 'perturbation_name'})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [10]:
# Not reported by study authors for this dataset
cur_data.create_columns(
    slot='obs',
    col_dict={
        'guide_sequence': None
    }
)

Column guide_sequence added to adata.obs


### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [11]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 19
--------------------------------------------------
{'3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


#### Clean up `perturbation_target_symbol` column

In [13]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    map_dict={
        r"3x_neg_ctrl.*": "control_nontargeting",
        r"Gal4-4.*": "control_nontargeting",
        r"_(pM|pD|pB|only).*": ""
    }
)

cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Replaced '3x_neg_ctrl.*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced 'Gal4-4.*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced '_(pM|pD|pB|only).*' with '' in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 16
--------------------------------------------------
{'ATF4',
 'ATF6',
 'ATF6_IRE1',
 'ATF6_PERK',
 'ATF6_PERK_IRE1',
 'C7orf26',
 'IER3IP1',
 'IRE1',
 'PERK',
 'PERK_IRE1',
 'PSMA1',
 'PSMD12',
 'SNAI1',
 'XBP1',
 'YIPF5',
 'control_nontargeting'}
--------------------------------------------------


  if df[column].str.contains(old_val).any():


### Standardise perturbation targets

In [14]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol',
    input_column_type='gene_symbol',
    multiple_entries=True,
    multiple_entries_sep='_'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols
Converted 22762/26130 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------
Collapsed column index using separator |


### Add `perturbed_target_number` column

In [15]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Add treatment information

Data is stored in a separate file, as part of the `cell BC` column's suffix


In [16]:
orig_cell_ident_link = r"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2406nnn/GSM2406677/suppl/GSM2406677%5F10X005%5Fcell%5Fidentities.csv.gz"
orig_cell_ident = pd.read_csv(orig_cell_ident_link)

# the number after the dash is the cell barcode indicates the treatment
display(orig_cell_ident[["cell BC", "guide identity"]].head())

treatment_label_map = {"1": "tunicamycin", "2": "thapsigargin", "3": "DMSO"}

treatment_df = (
    orig_cell_ident["cell BC"]
    .str.split("-", expand=True)
    .rename(columns={0: "cell_barcode", 1: "treatment_number"})
)

treatment_df["treatment_temp"] = treatment_df["treatment_number"].map(
    treatment_label_map
)

treatment_df = (
    treatment_df.drop_duplicates(subset=["cell_barcode"])
    .drop(columns=["treatment_number"])
    .set_index("cell_barcode")
)

treatment_df

Unnamed: 0,cell BC,guide identity
0,ACGGTATGCTTAGG-3,PERK_IRE1_pMJ154
1,ACAATCCTACCCTC-1,PERK_IRE1_pMJ154
2,ACGAACACGTGCTA-3,ATF6_PERK_IRE1_pMJ158
3,CTGTGAGATTGGTG-1,ATF6_PERK_IRE1_pMJ158
4,ATGTTGCTAATCGC-2,3x_neg_ctrl_pMJ144-2


Unnamed: 0_level_0,treatment_temp
cell_barcode,Unnamed: 1_level_1
ACGGTATGCTTAGG,DMSO
ACAATCCTACCCTC,tunicamycin
ACGAACACGTGCTA,DMSO
CTGTGAGATTGGTG,tunicamycin
ATGTTGCTAATCGC,thapsigargin
...,...
CGTAACGAGTTGCA,DMSO
CCATGCTGGCTTCC,DMSO
CCCTCAGAAAAGTG,DMSO
TCAAGTCTAGGTCT,thapsigargin


Add treatment information with the dataset

In [17]:
cur_data.adata.obs = cur_data.adata.obs.merge(
    treatment_df,
    left_index=True,
    right_index=True,
    how="left"
)

Map treatment compounds to CHEBI

In [18]:
cur_data.standardize_compounds(column='treatment_temp')

Found standardized name for compound 'tunicamycin': tunicamycin (ChEBI ID: CHEBI:29699)
Found standardized name for compound 'thapsigargin': thapsigargin (ChEBI ID: CHEBI:9516)
Found standardized name for compound 'DMSO': dimethyl sulfoxide (ChEBI ID: CHEBI:28262)
Standardized compound names in column 'treatment_temp' and added 'treatment_label' and 'treatment_id' columns.


### Add perturbation information

In [19]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None
    }
)

Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs


### Add timepoint information

In [20]:
cur_data.create_columns(
    slot="obs",
    col_dict={"timepoint": "P0DT0H0M0S"},
)

Column timepoint added to adata.obs


### Add model system information

In [21]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "model_system_label": "cell line", 
        "model_system_id": None
    }
)

Column model_system_label added to adata.obs
Column model_system_id added to adata.obs


### Add tissue information

In [22]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'tissue': 'blood'
    }
)

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Column tissue added to adata.obs
Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


### Add cell type information

In [23]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


### Add cell line information

In [24]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


### Add disease information

In [25]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


### Add species information

In [26]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "species": "Homo sapiens"
    }
)

Column species added to adata.obs


### Add sex information

In [27]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "sex_label": "female", 
        "sex_id": None
    }
)

Column sex_label added to adata.obs
Column sex_id added to adata.obs


### Add developmental stage information

In [28]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None
    }
)

Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs


### Match schema column order

In [29]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [30]:
cur_data.validate_data(slot='obs')

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,perturbation_name,perturbed_target_coord,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,guide_sequence,perturbation_type_label,perturbation_type_id,timepoint,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_label,disease_id
0,3x_neg_ctrl_pMJ144-1,,2,control|None,control|None,,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
1,3x_neg_ctrl_pMJ144-2,,2,control|None,control|None,,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
2,3x_neg_ctrl_pMJ144-2,,2,control|None,control|None,,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
3,ATF6_PERK_IRE1_pMJ158,chr1:161766298-161977574;1|chr2:88556741-88691...,3,ENSG00000118217|ENSG00000172071|ENSG00000178607,ATF6|EIF2AK3|ERN1,protein_coding|protein_coding|protein_coding,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
4,ATF6_PERK_pMJ150,chr1:161766298-161977574;1|chr2:88556741-88691...,2,ENSG00000118217|ENSG00000172071,ATF6|EIF2AK3,protein_coding|protein_coding,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,PERK_only_pMJ146,chr2:88556741-88691518;-1,1,ENSG00000172071,EIF2AK3,protein_coding,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
14693,PERK_IRE1_pMJ154,chr2:88556741-88691518;-1|chr17:64039080-64130...,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
14694,3x_neg_ctrl_pMJ144-1,,2,control|None,control|None,,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
14695,PERK_IRE1_pMJ154,chr2:88556741-88691518;-1|chr17:64039080-64130...,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,,CRISPRi,,P0DT0H0M0S,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


# VAR slot curation

### Standardise genes

In [31]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      2.0       2
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      1.0       1
RP11-34P13.8  ENSG00000239945      2.0       2
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [32]:
cur_data.standardize_genes(
    slot="var", input_column="ensembl_id", input_column_type="ensembl_gene_id"
)

Converted 30168/32738 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [33]:
cur_data.validate_data(slot='var')

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,
RP11-34P13.8,ENSG00000239945,
...,...,...
AC145205.1,ENSG00000215635,
BAGE5,ENSG00000268590,
CU459201.1,ENSG00000251180,
AC002321.2,ENSG00000215616,


# Metadata curation

### Auto-populate available metadata

In [34]:
cur_data.populate_exp_metadata()

Experiment metadata populated with available fields from adata.obs:
--------------------------------------------------
{'associated_diseases': [{'term_id': 'MONDO:0011996',
                          'term_label': 'chronic myelogenous leukemia, '
                                        'BCR-ABL1 positive'}],
 'experiment': {'number_of_perturbed_entities': 14697,
                'number_of_perturbed_targets': 14,
                'perturbation_type': [{'term_id': None,
                                       'term_label': 'CRISPRi'}],
                'perturbed_target_biotype': ['protein_coding|protein_coding|protein_coding',
                                             'protein_coding|protein_coding',
                                             'protein_coding',
                                             'processed_pseudogene'],
                'perturbed_targets': ['control|None',
                                      'ENSG00000118217|ENSG00000172071|ENSG00000178607',
                

### Manually curate metadata

Study details

In [35]:
cur_data.add_exp_metadata(
    metadata_slot='study',
    metadata={
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {"first_name": "Britt", "last_name": "Adamson"},
        "last_author": {"first_name": "Jonathan", "last_name": "Weissman"},
    }
)

Metadata for 'study' successfully validated:
--------------------------------------------------
{'first_author': {'first_name': 'Britt', 'last_name': 'Adamson'},
 'last_author': {'first_name': 'Jonathan', 'last_name': 'Weissman'},
 'study_uri': 'https://doi.org/10.1016/j.cell.2016.11.048',
 'title': 'A Multiplexed Single-Cell CRISPR Screening Platform Enables '
          'Systematic Dissection of the Unfolded Protein Response',
 'year': 2016}
--------------------------------------------------


Experiment details

In [36]:
cur_data.add_exp_metadata(
    metadata_slot='experiment',
    metadata={
        "title": "14595 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs in every combination (singly with controls, doubly with a control, or triply).",
        "summary": " Using our final three-guide Perturb-seq vector to simultaneously deliver 3 sgRNAs, we individually transduced K562 cells expressing dCas9-KRAB (cBA010) with constructs that targeted all three UPR sensor genes in every combination (singly with controls, doubly with a control, or triply). Transduced cells were then pooled and selected. After 2 days of combined growth, the cells were treated with DMSO for 6 hr, 4 μg/mL tunicamycin (Tm) for 6 hr, or 100 nM thapsigargin (Tg) for 4 hr and were profiled by Perturb-seq (24 conditions in total).",
        "replicates": "none",
        "number_of_samples": 1
    }
)

Metadata for 'experiment' successfully validated:
--------------------------------------------------
{'number_of_perturbed_entities': 14697,
 'number_of_perturbed_targets': 14,
 'number_of_samples': 1,
 'perturbation_type': [{'term_id': None, 'term_label': 'CRISPRi'}],
 'perturbed_target_biotype': ['protein_coding|protein_coding|protein_coding',
                              'protein_coding|protein_coding',
                              'protein_coding',
                              'processed_pseudogene'],
 'perturbed_targets': ['control|None',
                       'ENSG00000118217|ENSG00000172071|ENSG00000178607',
                       'ENSG00000118217|ENSG00000172071',
                       'ENSG00000118217',
                       'ENSG00000172071|ENSG00000178607',
                       'ENSG00000118217|ENSG00000178607',
                       'ENSG00000178607',
                       'ENSG00000172071',
                       'ENSG00000249947',
                       'ENSG000

Perturbation details

In [37]:
cur_data.add_exp_metadata(
    metadata_slot='perturbation',
    metadata={
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022895",
            "term_label": "dCas9-KRAB",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "retroviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "inhibition",
                },
            ],
            "manufacturer": "Weissman",
            "lentiviral_generation": "3",
            "grnas_per_gene": "1",
            "total_grnas": "16",
            "total_variants": None,
        }
    }
)

Metadata for 'perturbation' successfully validated:
--------------------------------------------------
{'enzyme_delivery_method': {'term_id': None,
                            'term_label': 'retroviral transduction'},
 'enzyme_expression_control': {'term_id': None,
                               'term_label': 'constitutive expression'},
 'enzyme_integration_state': {'term_id': None,
                              'term_label': 'random locus integration'},
 'library': {'accession': None,
             'grnas_per_target': None,
             'lentiviral_generation': '3',
             'library_format': {'term_id': None, 'term_label': 'pooled'},
             'library_name': 'custom',
             'library_perturbation_type': [{'term_id': None,
                                            'term_label': 'inhibition'}],
             'library_scope': {'term_id': None, 'term_label': 'focused'},
             'manufacturer': 'Weissman',
             'total_grnas': '16',
             'total_variants':

Assay details

In [38]:
cur_data.add_exp_metadata(
    metadata_slot='assay',
    metadata={
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "10x Genomics Single Cell 3-prime",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 4000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "CellRanger"},
        "software_analysis": {"term_id": None, "term_label": "MAGeCK"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh37",
        }
    }
)

Metadata for 'assay' successfully validated:
--------------------------------------------------
{'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
 'method_uri': None,
 'readout_dimensionality': {'term_id': None,
                            'term_label': 'high-dimensional assay'},
 'readout_technology': {'term_id': None, 'term_label': 'single-cell rna-seq'},
 'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
 'reference_genome': {'term_id': None, 'term_label': 'GRCh37'},
 'sequencing_library_kit': {'term_id': None,
                            'term_label': '10x Genomics Single Cell 3-prime'},
 'sequencing_platform': {'term_id': None, 'term_label': 'Illumina HiSeq 4000'},
 'sequencing_strategy': {'term_id': None, 'term_label': 'barcode sequencing'},
 'software_analysis': {'term_id': None, 'term_label': 'MAGeCK'},
 'software_counts': {'term_id': None, 'term_label': 'CellRanger'}}
--------------------------------------------------


Model system details

In [39]:
cur_data.add_exp_metadata(
    metadata_slot='model_system',
    metadata={
        "species": "Homo sapiens",
        "passage_number": None,
        }
)

Metadata for 'model_system' successfully validated:
--------------------------------------------------
{'cell_line': [{'term_id': 'CLO:0007050', 'term_label': 'K 562 cell'}],
 'cell_type': [{'term_id': 'CL:0017005', 'term_label': 'lymphoblast'}],
 'developmental_stage': [{'term_id': None, 'term_label': 'adult'}],
 'model_system': [{'term_id': None, 'term_label': 'cell line'}],
 'passage_number': None,
 'sex': [{'term_id': None, 'term_label': 'female'}],
 'species': 'Homo sapiens',
 'tissue': [{'term_id': 'UBERON:0000178', 'term_label': 'blood'}]}
--------------------------------------------------


Associated dataset details

In [40]:
cur_data.add_exp_metadata(
    metadata_slot='associated_datasets',
    metadata=[
        {
            "dataset_accession": "GSM2406677",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406677",
            "dataset_description": "Barcode, cell identities, raw gene expression matrix",
            "dataset_file_name": "GSM2406677_10X005",
        },
        {
            "dataset_accession": "GSM2406677_10X005",
            "dataset_uri": "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "AdamsonWeissman2016_GSM2406677_10X005.h5ad"
        }
    ]
)

Metadata for 'associated_datasets' successfully validated:
--------------------------------------------------
[{'dataset_accession': 'GSM2406677',
  'dataset_description': 'Barcode, cell identities, raw gene expression matrix',
  'dataset_file_name': 'GSM2406677_10X005',
  'dataset_uri': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406677'},
 {'dataset_accession': 'GSM2406677_10X005',
  'dataset_description': 'Processed .h5ad file',
  'dataset_file_name': 'AdamsonWeissman2016_GSM2406677_10X005.h5ad',
  'dataset_uri': 'https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad'}]
--------------------------------------------------


### Validate metadata

In [41]:
cur_data.validate_exp_metadata()

Experiment metadata successfully validated:
--------------------------------------------------
{'assay': {'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
           'method_uri': None,
           'readout_dimensionality': {'term_id': None,
                                      'term_label': 'high-dimensional assay'},
           'readout_technology': {'term_id': None,
                                  'term_label': 'single-cell rna-seq'},
           'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
           'reference_genome': {'term_id': None, 'term_label': 'GRCh37'},
           'sequencing_library_kit': {'term_id': None,
                                      'term_label': '10x Genomics Single Cell '
                                                    '3-prime'},
           'sequencing_platform': {'term_id': None,
                                   'term_label': 'Illumina HiSeq 4000'},
           'sequencing_strategy': {'term_id': None,
               

# Save the dataset

In [42]:
cur_data.save_curated_data()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/adamson_2016_upr_epistasis_curated.h5ad
