# Import

In [2]:
import pandas as pd

import sys

sys.path.append("../../")
from tools.curation_tools import CuratedDataset
from tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from unified_metadata_schema.unified_metadata_schema import Experiment

# Initialise the dataset object

In [3]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    data_source_link = "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
    noncurated_path = "../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad"
)

# Download the dataset

In [4]:
cur_data.download_data()
cur_data.load_data()
# show the data
cur_data.show_obs()

File ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad
Observation data:
DataFrame shape: (65337, 15)
--------------------------------------------------
                   perturbation  read count  UMI count tissue_type cell_line  \
cell_barcode                                                                   
AAACATACAAGATG   63(mod)_pBA580       282.0        8.0   cell_line      K562   
AAACATACACCTAG      OST4_pDS353       331.0        7.0   cell_line      K562   
AAACATACTTCCCG   SEC61A1_pDS031       285.0       10.0   cell_line      K562   
AAACATTGAAACAG    EIF2B4_pDS491      1036.0       30.0   cell_line      K562   
AAACATTGCAGCTA      SRPR_pDS482       863.0       25.0   cell_line      K562   
...                         ...         ...        ...         ...       ...   
TTTGCATGCTTTAC     STT3A_pDS011       476.0       17.0   cell_line      K562   
TTTGCATGGAGGAC  

In [5]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485     11.0      11
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945     43.0      43
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [8]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 114
--------------------------------------------------
{'*',
 '62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A_pDS39

### Drop NAs

In [7]:
cur_data.remove_na(slot = 'obs', column = 'perturbation')

Removed 2613 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [10]:
cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = r'\*')

ValueError: Column perturbation has no entries matching \* in adata.obs

In [11]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 113
--------------------------------------------------
{'62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A_pDS393',
 '

### Rename `perturbation` to `perturbation_name`

In [12]:
cur_data.rename_columns(slot = 'obs', name_dict = {'perturbation': 'perturbation_name'})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [13]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 113
--------------------------------------------------
{'62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_p

#### Clean up `perturbation_target_symbol` column

In [14]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace=r"62\(mod\).*",
    replace_value="control"
)
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace=r"63\(mod\).*",
    replace_value="control"
)
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace="Gal4-4.*",
    replace_value="control"
)
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace="_(pD|pB).*",
    replace_value=""
)

cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Replaced entries 62\(mod\).* -> control in column perturbed_target_symbol of adata.obs
Replaced entries 63\(mod\).* -> control in column perturbed_target_symbol of adata.obs
Replaced entries Gal4-4.* -> control in column perturbed_target_symbol of adata.obs
Replaced entries _(pD|pB).* ->  in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 91
--------------------------------------------------
{'AARS',
 'AMIGO3',
 'ARHGAP22',
 'ASCC3',
 'ATF4',
 'ATF6',
 'ATP5B',
 'C7orf26',
 'CAD',
 'CARS',
 'CCND3',
 'CHERP',
 'COPB1',
 'COPZ1',
 'DAD1',
 'DARS',
 'DDOST',
 'DDRGK1',
 'DERL2',
 'DHDDS',
 'DNAJC19',
 'EIF2AK3',
 'EIF2B2',
 'EIF2B3',
 'EIF2B4',
 'EIF2S1',
 'ERN1',
 'FARSB',
 'FECH',
 'GBF1',
 'GMPPB',
 'GNPNAT1',
 'HARS',
 'HSD17B12',
 'HSPA5',
 'HSPA9',
 'HYOU1',
 'IARS2',
 'IDH3A',
 'IER3IP1',
 'KCTD16',
 'MANF',
 'MARS',
 'MRGBP',
 'MRPL39',
 'MTHFD1',
 'NEDD8',
 'OST4',
 'P4HB',
 'PDIA6',
 'PPWD1',
 'PSMA1',
 'PSMD12',
 'PSMD4',
 'PTDSS

### Standardise perturbation targets

In [15]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol',
    input_column_type='gene_symbol',
    multiple_entries=True,
    multiple_entries_sep='_'
)

Exploded column perturbed_target_symbol using separator _


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols
Converted 62611/62623 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------
Collapsed column index using separator |


### Add `perturbed_target_number` column

In [16]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


In [18]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_symbol', 'perturbed_target_ensg'])

Observation data:
DataFrame shape: (62623, 3)
--------------------------------------------------
               perturbation_name perturbed_target_symbol perturbed_target_ensg
index                                                                         
AAACATACAAGATG    63(mod)_pBA580                 control               control
AAACATACACCTAG       OST4_pDS353                    OST4       ENSG00000228474
AAACATACTTCCCG    SEC61A1_pDS031                 SEC61A1       ENSG00000058262
AAACATTGAAACAG     EIF2B4_pDS491                  EIF2B4       ENSG00000115211
AAACATTGCAGCTA       SRPR_pDS482                   SRPRA       ENSG00000182934
...                          ...                     ...                   ...
TTTGCATGCTTTAC      STT3A_pDS011                   STT3A       ENSG00000134910
TTTGCATGGAGGAC   ARHGAP22_pDS458                ARHGAP22       ENSG00000128805
TTTGCATGTAGAGA    63(mod)_pBA580                 control               control
TTTGCATGTCAAGC     KCTD16_pDS096  

### Add treatment information

Add treatment information with the dataset

In [19]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "treatment_label": None, 
        "treatment_id": None
    }
)

Column treatment_label added to adata.obs
Column treatment_id added to adata.obs


### Add perturbation information

In [20]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None
    }
)

Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs


### Add timepoint information

In [21]:
cur_data.create_columns(
    slot="obs",
    col_dict={"timepoint": "P0DT0H0M0S"},
)

Column timepoint added to adata.obs


### Add model system information

In [22]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "model_system_label": "cell line", 
        "model_system_id": None
    }
)

Column model_system_label added to adata.obs
Column model_system_id added to adata.obs


### Add tissue information

In [23]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'tissue': 'blood'
    }
)

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Column tissue added to adata.obs
Mapped 1 tissue ontology terms from `tissue` column to ontology terms
  input_column input_column_lower name_lower     ontology_id   name  \
0        blood              blood      blood  UBERON:0000178  blood   

  matching_type  
0          name  


### Add cell type information

In [24]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
   input_column input_column_lower    name_lower ontology_id         name  \
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005  lymphoblast   

     matching_type  
0  pluralised name  


### Add cell line information

In [25]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
  input_column input_column_lower name_lower  ontology_id        name  \
0         K562               k562       k562  CLO:0007050  K 562 cell   

  matching_type  
0       synonym  


### Add disease information

In [26]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  \
0  chronic myelogenous leukemia  MONDO:0011996   

                                              name matching_type  
0  chronic myelogenous leukemia, BCR-ABL1 positive       synonym  


### Add species information

In [27]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "species": "Homo sapiens"
    }
)

Column species added to adata.obs


### Add sex information

In [28]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "sex_label": "female", 
        "sex_id": None
    }
)

Column sex_label added to adata.obs
Column sex_id added to adata.obs


### Add developmental stage information

In [29]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None
    }
)

Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs


### Match schema column order

In [30]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [31]:
cur_data.validate_data(slot='obs')

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_label,disease_id
0,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
1,OST4_pDS353,1,ENSG00000228474,OST4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
2,SEC61A1_pDS031,1,ENSG00000058262,SEC61A1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
3,EIF2B4_pDS491,1,ENSG00000115211,EIF2B4,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
4,SRPR_pDS482,1,ENSG00000182934,SRPRA,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62618,STT3A_pDS011,1,ENSG00000134910,STT3A,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
62619,ARHGAP22_pDS458,1,ENSG00000128805,ARHGAP22,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
62620,63(mod)_pBA580,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
62621,KCTD16_pDS096,1,ENSG00000183775,KCTD16,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


# VAR slot curation

### Standardise genes

In [32]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485     11.0      11
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945     43.0      43
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [33]:
cur_data.standardize_genes(
    slot="var", input_column="ensembl_id", input_column_type="ensembl_gene_id"
)

Converted 30773/32738 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [34]:
cur_data.validate_data(slot='var')

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,
RP11-34P13.8,ENSG00000239945,
...,...,...
AC145205.1,ENSG00000215635,
BAGE5,ENSG00000268590,
CU459201.1,ENSG00000251180,
AC002321.2,ENSG00000215616,


# Metadata curation

### Auto-populate available metadata

In [35]:
cur_data.populate_exp_metadata()

Experiment metadata populated with available fields from adata.obs:
--------------------------------------------------
{'associated_diseases': [{'term_id': 'MONDO:0011996',
                          'term_label': 'chronic myelogenous leukemia, '
                                        'BCR-ABL1 positive'}],
 'experiment': {'number_of_perturbed_cells': 62623,
                'number_of_perturbed_targets': 90,
                'perturbation_type': [{'term_id': None,
                                       'term_label': 'CRISPRi'}],
                'perturbed_target_biotype': ['control',
                                             'protein_coding',
                                             'processed_pseudogene'],
                'perturbed_targets': ['control',
                                      'ENSG00000228474',
                                      'ENSG00000058262',
                                      'ENSG00000115211',
                                      'ENSG00000182934',


### Manually curate metadata

Study details

In [36]:
cur_data.add_exp_metadata(
    metadata_slot='study',
    metadata={
        "title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": {"first_name": "Britt", "last_name": "Adamson"},
        "last_author": {"first_name": "Jonathan", "last_name": "Weissman"},
    }
)

Metadata for 'study' successfully validated:
--------------------------------------------------
{'first_author': {'first_name': 'Britt', 'last_name': 'Adamson'},
 'last_author': {'first_name': 'Jonathan', 'last_name': 'Weissman'},
 'study_uri': 'https://doi.org/10.1016/j.cell.2016.11.048',
 'title': 'A Multiplexed Single-Cell CRISPR Screening Platform Enables '
          'Systematic Dissection of the Unfolded Protein Response',
 'year': 2016}
--------------------------------------------------


Experiment details

In [37]:
cur_data.add_exp_metadata(
    metadata_slot='experiment',
    metadata={
        "title": "63000 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs.",
        "summary": "Perturb-seq was applied to a small CRISPRi library of 91 sgRNAs targeting UPR genes in K562 cells.",
        "replicates": "none",
        "number_of_samples": 1
    }
)

Metadata for 'experiment' successfully validated:
--------------------------------------------------
{'number_of_perturbed_cells': 62623,
 'number_of_perturbed_targets': 90,
 'number_of_samples': 1,
 'perturbation_type': [{'term_id': None, 'term_label': 'CRISPRi'}],
 'perturbed_target_biotype': ['control',
                              'protein_coding',
                              'processed_pseudogene'],
 'perturbed_targets': ['control',
                       'ENSG00000228474',
                       'ENSG00000058262',
                       'ENSG00000115211',
                       'ENSG00000182934',
                       'ENSG00000134049',
                       'ENSG00000145817',
                       'ENSG00000170445',
                       'ENSG00000101166',
                       'ENSG00000129562',
                       'ENSG00000265354',
                       'ENSG00000115866',
                       'ENSG00000176020',
                       'ENSG00000112249',
         

Perturbation details

In [39]:
cur_data.add_exp_metadata(
    metadata_slot='perturbation',
    metadata={
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022895",
            "term_label": "dCas9-KRAB",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "retroviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "inhibition",
                },
            ],
            "manufacturer": "Weissman",
            "lentiviral_generation": "3",
            "grnas_per_gene": "1",
            "total_grnas": "91",
            "total_variants": None,
        }
    }
)

Metadata for 'perturbation' successfully validated:
--------------------------------------------------
{'enzyme_delivery_method': {'term_id': None,
                            'term_label': 'retroviral transduction'},
 'enzyme_expression_control': {'term_id': None,
                               'term_label': 'constitutive expression'},
 'enzyme_integration_state': {'term_id': None,
                              'term_label': 'random locus integration'},
 'library': {'accession': None,
             'grnas_per_gene': '1',
             'lentiviral_generation': '3',
             'library_format': {'term_id': None, 'term_label': 'pooled'},
             'library_name': 'custom',
             'library_perturbation_type': [{'term_id': None,
                                            'term_label': 'inhibition'}],
             'library_scope': {'term_id': None, 'term_label': 'focused'},
             'manufacturer': 'Weissman',
             'total_grnas': '91',
             'total_variants': No

Assay details

In [40]:
cur_data.add_exp_metadata(
    metadata_slot='assay',
    metadata={
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "10x Genomics Single Cell 3-prime",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 4000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "CellRanger"},
        "software_analysis": {"term_id": None, "term_label": "MAGeCK"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh37",
        }
    }
)

Metadata for 'assay' successfully validated:
--------------------------------------------------
{'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
 'method_uri': None,
 'readout_dimensionality': {'term_id': None,
                            'term_label': 'high-dimensional assay'},
 'readout_technology': {'term_id': None, 'term_label': 'single-cell rna-seq'},
 'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
 'reference_genome': {'term_id': None, 'term_label': 'GRCh37'},
 'sequencing_library_kit': {'term_id': None,
                            'term_label': '10x Genomics Single Cell 3-prime'},
 'sequencing_platform': {'term_id': None, 'term_label': 'Illumina HiSeq 4000'},
 'sequencing_strategy': {'term_id': None, 'term_label': 'barcode sequencing'},
 'software_analysis': {'term_id': None, 'term_label': 'MAGeCK'},
 'software_counts': {'term_id': None, 'term_label': 'CellRanger'}}
--------------------------------------------------


Model system details

In [41]:
cur_data.add_exp_metadata(
    metadata_slot='model_system',
    metadata={
        "species": "Homo sapiens",
        "passage_number": None,
        }
)

Metadata for 'model_system' successfully validated:
--------------------------------------------------
{'cell_line': [{'term_id': 'CLO:0007050', 'term_label': 'K 562 cell'}],
 'cell_type': [{'term_id': 'CL:0017005', 'term_label': 'lymphoblast'}],
 'developmental_stage': [{'term_id': None, 'term_label': 'adult'}],
 'model_system': [{'term_id': None, 'term_label': 'cell line'}],
 'passage_number': None,
 'sex': [{'term_id': None, 'term_label': 'female'}],
 'species': 'Homo sapiens',
 'tissue': [{'term_id': 'UBERON:0000178', 'term_label': 'blood'}]}
--------------------------------------------------


Associated dataset details

In [42]:
cur_data.add_exp_metadata(
    metadata_slot='associated_datasets',
    metadata=[
        {
            "dataset_accession": "GSM2406677",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406681",
            "dataset_description": "Barcode, cell identities, raw gene expression matrix",
            "dataset_file_name": "GSM2406677_10X010",
        },
        {
            "dataset_accession": "GSM2406677_10X005",
            "dataset_uri": "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "AdamsonWeissman2016_GSM2406681_10X010.h5ad"
        }
    ]
)

Metadata for 'associated_datasets' successfully validated:
--------------------------------------------------
[{'dataset_accession': 'GSM2406677',
  'dataset_description': 'Barcode, cell identities, raw gene expression matrix',
  'dataset_file_name': 'GSM2406677_10X010',
  'dataset_uri': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406681'},
 {'dataset_accession': 'GSM2406677_10X005',
  'dataset_description': 'Processed .h5ad file',
  'dataset_file_name': 'AdamsonWeissman2016_GSM2406681_10X010.h5ad',
  'dataset_uri': 'https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad'}]
--------------------------------------------------


### Validate metadata

In [43]:
cur_data.validate_exp_metadata()

Experiment metadata successfully validated:
--------------------------------------------------
{'assay': {'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
           'method_uri': None,
           'readout_dimensionality': {'term_id': None,
                                      'term_label': 'high-dimensional assay'},
           'readout_technology': {'term_id': None,
                                  'term_label': 'single-cell rna-seq'},
           'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
           'reference_genome': {'term_id': None, 'term_label': 'GRCh37'},
           'sequencing_library_kit': {'term_id': None,
                                      'term_label': '10x Genomics Single Cell '
                                                    '3-prime'},
           'sequencing_platform': {'term_id': None,
                                   'term_label': 'Illumina HiSeq 4000'},
           'sequencing_strategy': {'term_id': None,
               

# Save the dataset

In [44]:
cur_data.save_curated_data()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/adamson_2016_upr_perturb_seq_curated.h5ad
