# Import

In [9]:
import pandas as pd

import sys

sys.path.append("../../")

from curation_tools.curation_tools import CuratedDataset
from curation_tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from curation_tools.unified_metadata_schema.unified_metadata_schema import Experiment

# Initialise the dataset object

In [10]:
# Create the curation object: the obs/var/experiment schemas, the remote
# location of the source .h5ad, and the local path of the non-curated copy.
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    data_source_link=(
        "https://zenodo.org/record/7041849/files/"
        "AdamsonWeissman2016_GSM2406675_10X001.h5ad"
    ),
    noncurated_path="../non_curated/h5ad/adamson_2016_pilot.h5ad",
)

# Download and load the dataset

In [11]:
# Fetch the source file; the logged output shows the download is skipped
# when the file already exists at the non-curated path.
cur_data.download_data()
# Read the downloaded .h5ad from the non-curated path (see the "Loading data from" log line).
cur_data.load_data()
# Print the obs (per-cell) table for a first look at the raw annotations.
cur_data.show_obs()

File ../non_curated/h5ad/adamson_2016_pilot.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/adamson_2016_pilot.h5ad
Observation data:
DataFrame shape: (5768, 15)
--------------------------------------------------
                  perturbation  read count  UMI count tissue_type cell_line  \
cell_barcode                                                                  
AAACATACACCGAT    CREB1_pDS269      1286.0       98.0   cell_line      K562   
AAACATACAGAGAT    SNAI1_pDS266       296.0       19.0   cell_line      K562   
AAACATACCAGAAA  62(mod)_pBA581      1829.0      162.0   cell_line      K562   
AAACATACGTTGAC    EP300_pDS268      1580.0       98.0   cell_line      K562   
AAACATACTGTTCT  62(mod)_pBA581       748.0       51.0   cell_line      K562   
...                        ...         ...        ...         ...       ...   
TTTGACTGGAAGGC    SNAI1_pDS266       331.0       22.0   cell_line      K562   
TTTGACTGGACGAG    SNAI1_pDS266       745.0    

In [12]:
# Print the var (per-gene) table before any curation.
cur_data.show_var()

Variable data:
DataFrame shape: (35635, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      0.0       0
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945      1.0       1
...                       ...      ...     ...
MT-ND4L       ENSG00000212907      0.0       0
MT-ND4        ENSG00000198886      0.0       0
MT-ND5        ENSG00000198786      0.0       0
MT-ND6        ENSG00000198695      0.0       0
MT-CYB        ENSG00000198727      0.0       0

[35635 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [13]:
# List the distinct raw perturbation labels (including NaN and placeholder values).
cur_data.show_unique(
    slot="obs",
    column="perturbation",
)

Unique values in adata.obs.perturbation: 10
--------------------------------------------------
{nan,
 '*',
 '62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


### Drop NAs

In [14]:
# Drop cells that have no perturbation label.
cur_data.remove_na(
    slot="obs",
    column="perturbation",
)

Removed 10 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [15]:
# Drop cells whose perturbation label is the literal "*" placeholder.
# The pattern is a raw string: in an ordinary string literal "\*" is an
# invalid escape sequence, which Python flags with a warning (the stray echo of
# this line in the cell output) and may reject in a future version. The
# runtime value (backslash + asterisk) is unchanged.
# NOTE(review): the escaped asterisk suggests `to_remove` is treated as a regex — confirm.
cur_data.remove_entries(slot="obs", column="perturbation", to_remove=r"\*")

Removed 6 entries \* from column perturbation of adata.obs


  cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')


In [16]:
# Confirm that only the eight guide labels remain after the two removals.
cur_data.show_unique(
    slot="obs",
    column="perturbation",
)

Unique values in adata.obs.perturbation: 8
--------------------------------------------------
{'62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


### Rename `perturbation` to `perturbation_name`

In [17]:
# Rename the raw label column to `perturbation_name`.
cur_data.rename_columns(
    slot="obs",
    name_dict=dict(perturbation="perturbation_name"),
)

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Extract perturbation symbols

#### Add `perturbed_target_symbol` column based on the `perturbation_name`

In [18]:
# Seed `perturbed_target_symbol` with a copy of the raw labels; the next
# step strips them down to gene symbols.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(perturbed_target_symbol=cur_data.adata.obs["perturbation_name"]),
    overwrite=True,
)
cur_data.show_unique(slot="obs", column="perturbed_target_symbol")

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 8
--------------------------------------------------
{'62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


#### Clean up `perturbed_target_symbol` column

In [None]:
# Step 1: relabel every `62(mod)...` entry as "control".
cur_data.replace_entries(
    slot="obs", column="perturbed_target_symbol",
    to_replace=r"62\(mod\).*", replace_value="control",
)

# Step 2: strip the trailing "_pD..." / "_pB..." suffix so only the symbol remains.
cur_data.replace_entries(
    slot="obs", column="perturbed_target_symbol",
    to_replace=r"_(pD|pB).*", replace_value="",
)

# Inspect the cleaned values.
cur_data.show_unique(slot="obs", column="perturbed_target_symbol")

Replaced entries 62\(mod\).* -> control in column perturbed_target_symbol of adata.obs
Replaced entries _(pD|pB).* ->  in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 8
--------------------------------------------------
{'SNAI1', 'SPI1', 'EP300', 'control', 'CREB1', 'ZNF326', 'BHLHE40', 'DDIT3'}
--------------------------------------------------


### Standardise perturbation targets

In [12]:
# Standardise the target symbols; entries are treated as possibly
# multi-valued and split on "_" (see the "Exploded column" log line).
cur_data.standardize_genes(
    slot="obs",
    input_column="perturbed_target_symbol",
    input_column_type="gene_symbol",
    multiple_entries=True,
    multiple_entries_sep="_",
)

Exploded column perturbed_target_symbol using separator _


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols
Converted 5752/5752 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------
Collapsed column index using separator |


### Add `perturbed_target_number` column

In [13]:
# Count the "|"-separated targets per cell and store the result in
# `perturbed_target_number`.
cur_data.count_entries(
    slot="obs",
    input_column="perturbed_target_symbol",
    count_column_name="perturbed_target_number",
    sep="|",
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


In [14]:
# Show the raw label next to the standardised symbol and Ensembl ID columns.
cur_data.show_obs(
    ["perturbation_name", "perturbed_target_symbol", "perturbed_target_ensg"]
)

Observation data:
DataFrame shape: (5752, 3)
--------------------------------------------------
               perturbation_name perturbed_target_symbol perturbed_target_ensg
index                                                                         
AAACATACACCGAT      CREB1_pDS269                   CREB1       ENSG00000118260
AAACATACAGAGAT      SNAI1_pDS266                   SNAI1       ENSG00000124216
AAACATACCAGAAA    62(mod)_pBA581                 control               control
AAACATACGTTGAC      EP300_pDS268                   EP300       ENSG00000100393
AAACATACTGTTCT    62(mod)_pBA581                 control               control
...                          ...                     ...                   ...
TTTGACTGGAAGGC      SNAI1_pDS266                   SNAI1       ENSG00000124216
TTTGACTGGACGAG      SNAI1_pDS266                   SNAI1       ENSG00000124216
TTTGCATGCCCGTT      SNAI1_pDS266                   SNAI1       ENSG00000124216
TTTGCATGCCGTTC    62(mod)_pBA581   

### Add treatment information

Add the treatment columns to the dataset; both are created empty (`None`).

In [15]:
# Create both treatment columns with no value.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(treatment_label=None, treatment_id=None),
)

Column treatment_label added to adata.obs
Column treatment_id added to adata.obs


### Add perturbation information

In [16]:
# Label every cell as CRISPRi; the ontology ID column is left empty.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(perturbation_type_label="CRISPRi", perturbation_type_id=None),
)

Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs


### Add timepoint information

In [17]:
# Set a single timepoint for all cells, written as a zero-length duration string.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(timepoint="P0DT0H0M0S"),
)

Column timepoint added to adata.obs


### Add model system information

In [18]:
# Label the model system as a cell line; the ontology ID column is left empty.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(model_system_label="cell line", model_system_id=None),
)

Column model_system_label added to adata.obs
Column model_system_id added to adata.obs


### Add tissue information

In [19]:
# Set a constant tissue name for all cells...
cur_data.create_columns(slot="obs", col_dict=dict(tissue="blood"))

# ...then map that name to a tissue ontology term.
cur_data.standardize_ontology(
    input_column="tissue",
    column_type="term_name",
    ontology_type="tissue",
    overwrite=True,
)

Column tissue added to adata.obs
Mapped 1 tissue ontology terms from `tissue` column to ontology terms
  input_column input_column_lower name_lower     ontology_id   name  \
0        blood              blood      blood  UBERON:0000178  blood   

  matching_type  
0          name  


### Add cell type information

In [20]:
# Map the existing `celltype` names to cell type ontology terms.
cur_data.standardize_ontology(
    input_column="celltype",
    column_type="term_name",
    ontology_type="cell_type",
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
   input_column input_column_lower    name_lower ontology_id         name  \
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005  lymphoblast   

     matching_type  
0  pluralised name  


### Add cell line information

In [21]:
# Map the existing `cell_line` names to cell line ontology terms.
cur_data.standardize_ontology(
    input_column="cell_line",
    column_type="term_name",
    ontology_type="cell_line",
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
  input_column input_column_lower name_lower  ontology_id        name  \
0         K562               k562       k562  CLO:0007050  K 562 cell   

  matching_type  
0       synonym  


### Add disease information

In [22]:
# Map the existing `disease` names to disease ontology terms.
cur_data.standardize_ontology(
    input_column="disease",
    column_type="term_name",
    ontology_type="disease",
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  \
0  chronic myelogenous leukemia  MONDO:0011996   

                                              name matching_type  
0  chronic myelogenous leukemia, BCR-ABL1 positive       synonym  


### Add species information

In [23]:
# Set the species for all cells.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(species="Homo sapiens"),
)

Column species added to adata.obs


### Add sex information

In [24]:
# Set the sex label for all cells; the ontology ID column is left empty.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(sex_label="female", sex_id=None),
)

Column sex_label added to adata.obs
Column sex_id added to adata.obs


### Add developmental stage information

In [25]:
# Set the developmental stage label; the ontology ID column is left empty.
cur_data.create_columns(
    slot="obs",
    col_dict=dict(developmental_stage_label="adult", developmental_stage_id=None),
)

Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs


### Match schema column order

In [26]:
# Align the obs columns with the obs schema before validating.
cur_data.match_schema_columns(slot="obs")

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [27]:
# Validate the curated obs table against the obs schema.
cur_data.validate_data(slot="obs")

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,perturbation_name,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,perturbation_type_label,perturbation_type_id,timepoint,treatment_label,treatment_id,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_label,disease_id
0,CREB1_pDS269,1,ENSG00000118260,CREB1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
1,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
2,62(mod)_pBA581,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
3,EP300_pDS268,1,ENSG00000100393,EP300,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
4,62(mod)_pBA581,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5747,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
5748,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
5749,SNAI1_pDS266,1,ENSG00000124216,SNAI1,protein_coding,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996
5750,62(mod)_pBA581,1,control,control,control,CRISPRi,,P0DT0H0M0S,,,...,lymphoblast,CL:0017005,K 562 cell,CLO:0007050,female,,adult,,"chronic myelogenous leukemia, BCR-ABL1 positive",MONDO:0011996


# VAR slot curation

### Standardise genes

In [28]:
# Re-display the var table as the starting point for gene standardisation.
cur_data.show_var()

Variable data:
DataFrame shape: (35635, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      0.0       0
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945      1.0       1
...                       ...      ...     ...
MT-ND4L       ENSG00000212907      0.0       0
MT-ND4        ENSG00000198886      0.0       0
MT-ND5        ENSG00000198786      0.0       0
MT-ND6        ENSG00000198695      0.0       0
MT-CYB        ENSG00000198727      0.0       0

[35635 rows x 3 columns]
--------------------------------------------------


In [29]:
# Standardise var genes from their Ensembl IDs. The log reports how many IDs
# were converted; rows without a match end up with an empty symbol (see the
# validated table).
cur_data.standardize_genes(
    slot="var",
    input_column="ensembl_id",
    input_column_type="ensembl_gene_id",
)

Converted 30910/35635 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [30]:
# Validate the curated var table against the var schema.
cur_data.validate_data(slot="var")

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,
RP11-34P13.8,ENSG00000239945,
...,...,...
MT-ND4L,ENSG00000212907,MT-ND4L
MT-ND4,ENSG00000198886,MT-ND4
MT-ND5,ENSG00000198786,MT-ND5
MT-ND6,ENSG00000198695,MT-ND6


# Metadata curation

### Auto-populate available metadata

In [31]:
# Pre-fill the experiment metadata with the fields that can be derived from
# adata.obs (as reported in the log); the remaining fields are added manually below.
cur_data.populate_exp_metadata()

Experiment metadata populated with available fields from adata.obs:
--------------------------------------------------
{'associated_diseases': [{'term_id': 'MONDO:0011996',
                          'term_label': 'chronic myelogenous leukemia, '
                                        'BCR-ABL1 positive'}],
 'experiment': {'number_of_perturbed_cells': 5752,
                'number_of_perturbed_targets': 8,
                'perturbation_type': [{'term_id': None,
                                       'term_label': 'CRISPRi'}],
                'perturbed_target_biotype': ['protein_coding', 'control'],
                'perturbed_targets': ['ENSG00000118260',
                                      'ENSG00000124216',
                                      'control',
                                      'ENSG00000100393',
                                      'ENSG00000162664',
                                      'ENSG00000134107',
                                      'ENSG00000066336',
  

### Manually curate metadata

Study details

In [32]:
# Publication-level details for the study.
cur_data.add_exp_metadata(
    metadata_slot="study",
    metadata={
        "title": (
            "A Multiplexed Single-Cell CRISPR Screening Platform Enables "
            "Systematic Dissection of the Unfolded Protein Response"
        ),
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "year": 2016,
        "first_author": dict(first_name="Britt", last_name="Adamson"),
        "last_author": dict(first_name="Jonathan", last_name="Weissman"),
    },
)

Metadata for 'study' successfully validated:
--------------------------------------------------
{'first_author': {'first_name': 'Britt', 'last_name': 'Adamson'},
 'last_author': {'first_name': 'Jonathan', 'last_name': 'Weissman'},
 'study_uri': 'https://doi.org/10.1016/j.cell.2016.11.048',
 'title': 'A Multiplexed Single-Cell CRISPR Screening Platform Enables '
          'Systematic Dissection of the Unfolded Protein Response',
 'year': 2016}
--------------------------------------------------


Experiment details

In [33]:
# Free-text description of the experiment plus replicate/sample counts; these
# are shown merged with the auto-populated experiment fields in the output.
cur_data.add_exp_metadata(
    metadata_slot="experiment",
    metadata={
        "title": (
            "6000 chronic myeloid leukemia (K562) cells transfected with gRNAs "
            "against 7 transcription factors + 1 control"
        ),
        "summary": (
            "In a pilot experiment, single-cell RNA-seq was performed on a pool of "
            "individually transduced chronic myeloid leukemia cells (K562) carrying "
            "8 distinct guide barcodes, analyzing \u223c6,000 cells total."
        ),
        "replicates": "none",
        "number_of_samples": 1,
    },
)

Metadata for 'experiment' successfully validated:
--------------------------------------------------
{'number_of_perturbed_cells': 5752,
 'number_of_perturbed_targets': 8,
 'number_of_samples': 1,
 'perturbation_type': [{'term_id': None, 'term_label': 'CRISPRi'}],
 'perturbed_target_biotype': ['protein_coding', 'control'],
 'perturbed_targets': ['ENSG00000118260',
                       'ENSG00000124216',
                       'control',
                       'ENSG00000100393',
                       'ENSG00000162664',
                       'ENSG00000134107',
                       'ENSG00000066336',
                       'ENSG00000175197'],
 'replicates': 'none',
 'summary': 'In a pilot experiment, single-cell RNA-seq was performed on a '
            'pool of individually transduced chronic myeloid leukemia cells '
            '(K562) carrying 8 distinct guide barcodes, analyzing ∼6,000 cells '
            'total.',
 'timepoints': ['P0DT0H0M0S'],
 'title': '6000 chronic myeloid le

Perturbation details

In [34]:
# Perturbation details: how the enzyme and guide library were generated,
# delivered, integrated and expressed, plus a description of the library itself.
# Each ontology-style field is a {term_id, term_label} pair; term_id is None
# where no ontology ID was assigned.
cur_data.add_exp_metadata(
    metadata_slot="perturbation",
    metadata={
        "library_generation_type": dict(term_id="EFO:0022868", term_label="endogenous"),
        "library_generation_method": dict(term_id="EFO:0022895", term_label="dCas9-KRAB"),
        "enzyme_delivery_method": dict(term_id=None, term_label="retroviral transduction"),
        "library_delivery_method": dict(term_id=None, term_label="lentiviral transduction"),
        "enzyme_integration_state": dict(term_id=None, term_label="random locus integration"),
        "library_integration_state": dict(term_id=None, term_label="random locus integration"),
        "enzyme_expression_control": dict(term_id=None, term_label="constitutive expression"),
        "library_expression_control": dict(term_id=None, term_label="constitutive expression"),
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": dict(term_id=None, term_label="pooled"),
            "library_scope": dict(term_id=None, term_label="focused"),
            "library_perturbation_type": [
                dict(term_id=None, term_label="inhibition"),
            ],
            "manufacturer": "Weissman",
            "lentiviral_generation": "3",
            "grnas_per_gene": "1",
            "total_grnas": "8",
            "total_variants": None,
        },
    },
)

Metadata for 'perturbation' successfully validated:
--------------------------------------------------
{'enzyme_delivery_method': {'term_id': None,
                            'term_label': 'retroviral transduction'},
 'enzyme_expression_control': {'term_id': None,
                               'term_label': 'constitutive expression'},
 'enzyme_integration_state': {'term_id': None,
                              'term_label': 'random locus integration'},
 'library': {'accession': None,
             'grnas_per_gene': '1',
             'lentiviral_generation': '3',
             'library_format': {'term_id': None, 'term_label': 'pooled'},
             'library_name': 'custom',
             'library_perturbation_type': [{'term_id': None,
                                            'term_label': 'inhibition'}],
             'library_scope': {'term_id': None, 'term_label': 'focused'},
             'manufacturer': 'Weissman',
             'total_grnas': '8',
             'total_variants': Non

Assay details

In [35]:
# Assay details: readout, method, sequencing set-up, software and reference
# genome. Every term is given by label only (term_id is None throughout).
cur_data.add_exp_metadata(
    metadata_slot="assay",
    metadata={
        "readout_dimensionality": dict(term_id=None, term_label="high-dimensional assay"),
        "readout_type": dict(term_id=None, term_label="transcriptomic"),
        "readout_technology": dict(term_id=None, term_label="single-cell rna-seq"),
        "method_name": dict(term_id=None, term_label="Perturb-seq"),
        "method_uri": None,
        "sequencing_library_kit": dict(
            term_id=None, term_label="10x Genomics Single Cell 3-prime"
        ),
        "sequencing_platform": dict(term_id=None, term_label="Illumina HiSeq 2500"),
        "sequencing_strategy": dict(term_id=None, term_label="barcode sequencing"),
        "software_counts": dict(term_id=None, term_label="CellRanger"),
        "software_analysis": dict(term_id=None, term_label="MAGeCK"),
        "reference_genome": dict(term_id=None, term_label="GRCh37"),
    },
)

Metadata for 'assay' successfully validated:
--------------------------------------------------
{'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
 'method_uri': None,
 'readout_dimensionality': {'term_id': None,
                            'term_label': 'high-dimensional assay'},
 'readout_technology': {'term_id': None, 'term_label': 'single-cell rna-seq'},
 'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
 'reference_genome': {'term_id': None, 'term_label': 'GRCh37'},
 'sequencing_library_kit': {'term_id': None,
                            'term_label': '10x Genomics Single Cell 3-prime'},
 'sequencing_platform': {'term_id': None, 'term_label': 'Illumina HiSeq 2500'},
 'sequencing_strategy': {'term_id': None, 'term_label': 'barcode sequencing'},
 'software_analysis': {'term_id': None, 'term_label': 'MAGeCK'},
 'software_counts': {'term_id': None, 'term_label': 'CellRanger'}}
--------------------------------------------------


Model system details

In [36]:
# Only species and passage number are supplied here; the output shows the
# other model-system fields already filled from adata.obs.
cur_data.add_exp_metadata(
    metadata_slot="model_system",
    metadata=dict(species="Homo sapiens", passage_number=None),
)

Metadata for 'model_system' successfully validated:
--------------------------------------------------
{'cell_line': [{'term_id': 'CLO:0007050', 'term_label': 'K 562 cell'}],
 'cell_type': [{'term_id': 'CL:0017005', 'term_label': 'lymphoblast'}],
 'developmental_stage': [{'term_id': None, 'term_label': 'adult'}],
 'model_system': [{'term_id': None, 'term_label': 'cell line'}],
 'passage_number': None,
 'sex': [{'term_id': None, 'term_label': 'female'}],
 'species': 'Homo sapiens',
 'tissue': [{'term_id': 'UBERON:0000178', 'term_label': 'blood'}]}
--------------------------------------------------


Associated dataset details

In [37]:
# Source datasets: the GEO record with raw counts and the processed .h5ad
# on Zenodo that this notebook downloads.
cur_data.add_exp_metadata(
    metadata_slot="associated_datasets",
    metadata=[
        dict(
            dataset_accession="GSM2406675",
            dataset_uri="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675",
            dataset_description="Raw counts",
            dataset_file_name="GSE90546_RAW.tar",
        ),
        dict(
            dataset_accession="GSM2406675_10X001",
            dataset_uri="https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
            dataset_description="Processed .h5ad file",
            dataset_file_name="AdamsonWeissman2016_GSM2406675_10X001.h5ad",
        ),
    ],
)

Metadata for 'associated_datasets' successfully validated:
--------------------------------------------------
[{'dataset_accession': 'GSM2406675',
  'dataset_description': 'Raw counts',
  'dataset_file_name': 'GSE90546_RAW.tar',
  'dataset_uri': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675'},
 {'dataset_accession': 'GSM2406675_10X001',
  'dataset_description': 'Processed .h5ad file',
  'dataset_file_name': 'AdamsonWeissman2016_GSM2406675_10X001.h5ad',
  'dataset_uri': 'https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad'}]
--------------------------------------------------


### Validate metadata

In [38]:
# Validate the complete experiment metadata (auto-populated plus manually
# added slots) and print the validated result.
cur_data.validate_exp_metadata()

Experiment metadata successfully validated:
--------------------------------------------------
{'assay': {'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
           'method_uri': None,
           'readout_dimensionality': {'term_id': None,
                                      'term_label': 'high-dimensional assay'},
           'readout_technology': {'term_id': None,
                                  'term_label': 'single-cell rna-seq'},
           'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
           'reference_genome': {'term_id': None, 'term_label': 'GRCh37'},
           'sequencing_library_kit': {'term_id': None,
                                      'term_label': '10x Genomics Single Cell '
                                                    '3-prime'},
           'sequencing_platform': {'term_id': None,
                                   'term_label': 'Illumina HiSeq 2500'},
           'sequencing_strategy': {'term_id': None,
               

# Save the dataset

In [39]:
# Write the curated dataset to disk; the log line reports the output path.
cur_data.save_curated_data()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/adamson_2016_pilot_curated.h5ad
