# Import

In [1]:
import pandas as pd
import json
import sys

# Extend the module search path so the project-local `curation_tools` package
# imported below can be found — presumably "../../" is the repository root
# relative to this notebook; verify.
sys.path.append("../../")

from curation_tools.curation_tools import CuratedDataset
from curation_tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from curation_tools.unified_metadata_schema.unified_metadata_schema import Experiment
from curation_tools.polars_schema import polars_schema

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



# Initialise the dataset object

In [2]:
# Build the curation wrapper for the Adamson 2016 pilot dataset: pass in the
# obs/var/experiment/polars schemas together with the remote source link and
# the local path of the non-curated .h5ad file.
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    polars_schema=polars_schema,
    data_source_link="https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
    noncurated_path="../non_curated/h5ad/adamson_2016_pilot.h5ad",
)

# Download and load the dataset

In [3]:
# Fetch the source file; per the logged output the download is skipped when the
# file already exists locally.
cur_data.download_data()
# Read the non-curated .h5ad file into the dataset object.
cur_data.load_data(path="../non_curated/h5ad/adamson_2016_pilot.h5ad")
# Print the observation table for a first look at the raw columns.
cur_data.show_obs()

File ../non_curated/h5ad/adamson_2016_pilot.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/adamson_2016_pilot.h5ad
Observation data:
DataFrame shape: (5768, 15)
--------------------------------------------------
                  perturbation  read count  UMI count tissue_type cell_line  \
cell_barcode                                                                  
AAACATACACCGAT    CREB1_pDS269      1286.0       98.0   cell_line      K562   
AAACATACAGAGAT    SNAI1_pDS266       296.0       19.0   cell_line      K562   
AAACATACCAGAAA  62(mod)_pBA581      1829.0      162.0   cell_line      K562   
AAACATACGTTGAC    EP300_pDS268      1580.0       98.0   cell_line      K562   
AAACATACTGTTCT  62(mod)_pBA581       748.0       51.0   cell_line      K562   
...                        ...         ...        ...         ...       ...   
TTTGACTGGAAGGC    SNAI1_pDS266       331.0       22.0   cell_line      K562   
TTTGACTGGACGAG    SNAI1_pDS266       745.0    

In [4]:
# Print the variable table (indexed by gene symbol) before any curation.
cur_data.show_var()

Variable data:
DataFrame shape: (35635, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      0.0       0
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945      1.0       1
...                       ...      ...     ...
MT-ND4L       ENSG00000212907      0.0       0
MT-ND4        ENSG00000198886      0.0       0
MT-ND5        ENSG00000198786      0.0       0
MT-ND6        ENSG00000198695      0.0       0
MT-CYB        ENSG00000198727      0.0       0

[35635 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [5]:
# List the distinct raw perturbation labels to spot missing or placeholder values.
cur_data.show_unique(slot="obs", column="perturbation")

Unique values in adata.obs.perturbation: 10
--------------------------------------------------
{nan,
 '*',
 '62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


### Drop NAs

In [6]:
# Remove the entries whose perturbation label is missing.
cur_data.remove_na(slot="obs", column="perturbation")

Removed 10 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [7]:
# Remove the entries labelled with the placeholder "*". The pattern keeps the
# backslash-escaped asterisk, so it is written as a raw string: in a plain
# literal "\*" is an invalid escape sequence and Python warns about it.
cur_data.remove_entries(slot="obs", column="perturbation", to_remove=r"\*")

Removed 6 entries \* from column perturbation of adata.obs


  cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')


In [8]:
# Re-check the labels after dropping the missing and "*" entries.
cur_data.show_unique(slot="obs", column="perturbation")

Unique values in adata.obs.perturbation: 8
--------------------------------------------------
{'62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


### Rename `perturbation` to `perturbation_name`

In [9]:
# Rename the raw `perturbation` column to `perturbation_name`.
cur_data.rename_columns(
    slot="obs",
    name_dict={"perturbation": "perturbation_name"},
)

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [10]:
# Guide sequences were not reported by the study authors for this dataset, so
# the column is created with no value.
cur_data.create_columns(slot="obs", col_dict={"guide_sequence": None})

Column guide_sequence added to adata.obs


### Extract perturbation symbols

#### Add `perturbed_target_symbol` column based on the `perturbation_name`

In [11]:
# Seed `perturbed_target_symbol` with a copy of the raw perturbation names;
# the values still carry their `_pDS…` / `_pBA…` suffixes at this point.
cur_data.create_columns(
    slot="obs",
    col_dict={"perturbed_target_symbol": cur_data.adata.obs["perturbation_name"]},
    overwrite=True,
)
cur_data.show_unique(slot="obs", column="perturbed_target_symbol")

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 8
--------------------------------------------------
{'62(mod)_pBA581',
 'BHLHE40_pDS258',
 'CREB1_pDS269',
 'DDIT3_pDS263',
 'EP300_pDS268',
 'SNAI1_pDS266',
 'SPI1_pDS255',
 'ZNF326_pDS262'}
--------------------------------------------------


#### Clean up `perturbed_target_symbol` column

In [12]:
# Reduce each perturbation name to its target: the `62(mod)…` guide is
# relabelled as the non-targeting control, and the trailing `_pD…` / `_pB…`
# suffix is stripped from the remaining names.
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    map_dict={
        r"62\(mod\).*": "control_nontargeting",
        # Non-capturing group: pandas emits a warning when a pattern given to
        # `str.contains` has capture groups; the matched text is unchanged.
        r"_(?:pD|pB).*": "",
    },
)

cur_data.show_unique(slot="obs", column="perturbed_target_symbol")

Replaced '62\(mod\).*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced '_(pD|pB).*' with '' in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 8
--------------------------------------------------
{'BHLHE40',
 'CONTROL_NONTARGETING',
 'CREB1',
 'DDIT3',
 'EP300',
 'SNAI1',
 'SPI1',
 'ZNF326'}
--------------------------------------------------


  if df[column].str.upper().str.contains(old_val.upper()).any():


### Standardise perturbation targets

In [13]:
# Standardise the cleaned target symbols. The log reports 7 of 8 values
# converted — presumably the control label is the one left unmapped; verify.
cur_data.standardize_genes(
    slot="obs",
    input_column="perturbed_target_symbol",
    input_column_type="gene_symbol",
    multiple_entries=False,
)

Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Converted 7/8 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Add `perturbed_target_number` column

In [14]:
# Count the entries of `perturbed_target_symbol` (using "|" as the separator)
# and store the result in `perturbed_target_number`.
cur_data.count_entries(
    slot="obs",
    input_column="perturbed_target_symbol",
    count_column_name="perturbed_target_number",
    sep="|",
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [15]:
# Adds `perturbed_target_chromosome_encoding` to adata.obs, derived from the
# `perturbed_target_chromosome` column (as reported in the log output).
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


  self.adata.obs["perturbed_target_chromosome_encoding"] = self.adata.obs[chromosome_col].replace(


In [16]:
# Spot-check the chromosome encoding next to the original perturbation names.
cur_data.show_obs(
    [
        "perturbation_name",
        "perturbed_target_chromosome_encoding",
    ]
)

Observation data:
DataFrame shape: (5752, 2)
--------------------------------------------------
               perturbation_name  perturbed_target_chromosome_encoding
index                                                                 
AAACATACACCGAT      CREB1_pDS269                                   2.0
AAACATACAGAGAT      SNAI1_pDS266                                  20.0
AAACATACCAGAAA    62(mod)_pBA581                                   0.0
AAACATACGTTGAC      EP300_pDS268                                  22.0
AAACATACTGTTCT    62(mod)_pBA581                                   0.0
...                          ...                                   ...
TTTGACTGGAAGGC      SNAI1_pDS266                                  20.0
TTTGACTGGACGAG      SNAI1_pDS266                                  20.0
TTTGCATGCCCGTT      SNAI1_pDS266                                  20.0
TTTGCATGCCGTTC    62(mod)_pBA581                                   0.0
TTTGCATGTTCTAC    62(mod)_pBA581                    

### Add metadata

In [17]:
# Attach dataset-, study-, library- and assay-level metadata to adata.obs.
# One column is created per key; `*_id` fields set to None have no identifier
# assigned in this notebook.
cur_data.create_columns(
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        # Sequential 1-based identifier, one per row of adata.obs.
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # treatment
        "treatment_label": None,
        "treatment_id": None,
        # perturbation type
        "perturbation_type_label": "CRISPRi",
        "perturbation_type_id": None,
        # model system
        "model_system_label": "cell line",
        "model_system_id": None,
        "tissue": "blood",
        "timepoint": "P0DT0H0M0S",

        "species": "Homo sapiens",
        "sex_label": "female",
        "sex_id": None,
        "developmental_stage_label": "adult",
        "developmental_stage_id": None,

        "study_title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "study_year": 2016,
        "first_author": "Britt Adamson",
        "last_author": "Jonathan Weissman",

        "experiment_title": "6000 chronic myeloid leukemia (K562) cells transfected with gRNAs against 7 transcription factors + 1 control",
        "experiment_summary": "In a pilot experiment, single-cell RNA-seq was performed on a pool of individually transduced chronic myeloid leukemia cells (K562) carrying 8 distinct guide barcodes, analyzing \u223c6,000 cells total.",
        # Count distinct target coordinates. `nunique()` ignores missing values,
        # so the non-targeting control (which has no coordinate) is not counted
        # as a target; `len(set(...))` would have included the NaN entry.
        "number_of_perturbed_targets": cur_data.adata.obs["perturbed_target_coord"].nunique(),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],

        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "retroviral transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentiviral transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",

        "library_manufacturer": "Weissman",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "1",
        "library_total_grnas": "8",
        "library_total_variants": None,

        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina HiSeq 2500",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "MAGeCK",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh37",

        # Stored as a JSON string so the list of records fits in a single column.
        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSM2406675",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675",
                "dataset_description": "Raw counts",
                "dataset_file_name": "GSE90546_RAW.tar",
            },
            {
                "dataset_accession": "GSM2406675_10X001",
                "dataset_uri": "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
                "dataset_description": "Processed .h5ad file",
                "dataset_file_name": "AdamsonWeissman2016_GSM2406675_10X001.h5ad",
            },
        ]),
    },
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column tissue added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year added to adata.obs
Column first_author added to adata.obs
Column last_author added to adata.obs
Column experiment_title added to adata.obs
Column experiment_summary added to adata.obs
Column number_of_perturbed_targets added to adata.obs
Column number_of_perturbed_samples added to adata.obs
Colum

In [18]:
# Display adata.obs to inspect the newly added metadata columns.
cur_data.adata.obs

Unnamed: 0_level_0,percent_mito,organism,disease,tissue_type,read count,nperts,ngenes,perturbation_type,perturbation_name,ncounts,...,sequencing_platform_label,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,associated_datasets
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCGAT,0.0,human,chronic myelogenous leukemia,cell_line,1286.0,2,2412,CRISPR,CREB1_pDS269,8138.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACAGAGAT,0.0,human,chronic myelogenous leukemia,cell_line,296.0,2,2386,CRISPR,SNAI1_pDS266,8980.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACCAGAAA,0.0,human,chronic myelogenous leukemia,cell_line,1829.0,2,4404,CRISPR,62(mod)_pBA581,28610.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACGTTGAC,0.0,human,chronic myelogenous leukemia,cell_line,1580.0,2,2815,CRISPR,EP300_pDS268,11346.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
AAACATACTGTTCT,0.0,human,chronic myelogenous leukemia,cell_line,748.0,2,2584,CRISPR,62(mod)_pBA581,9864.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGACTGGAAGGC,0.0,human,chronic myelogenous leukemia,cell_line,331.0,2,1856,CRISPR,SNAI1_pDS266,5942.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
TTTGACTGGACGAG,0.0,human,chronic myelogenous leukemia,cell_line,745.0,2,2263,CRISPR,SNAI1_pDS266,8150.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
TTTGCATGCCCGTT,0.0,human,chronic myelogenous leukemia,cell_line,784.0,2,2473,CRISPR,SNAI1_pDS266,9179.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
TTTGCATGCCGTTC,0.0,human,chronic myelogenous leukemia,cell_line,1469.0,2,3167,CRISPR,62(mod)_pBA581,15785.0,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."


### Curate tissue information

In [19]:

# Map the `tissue` term name to a tissue ontology ID
# (the log shows blood -> UBERON:0000178).
cur_data.standardize_ontology(
    input_column="tissue",
    column_type="term_name",
    ontology_type="tissue",
    overwrite=True,
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


### Curate cell type information

In [20]:
# Map the `celltype` term name to a cell-type ontology ID
# (the log shows lymphoblasts -> CL:0017005).
cur_data.standardize_ontology(
    input_column="celltype",
    column_type="term_name",
    ontology_type="cell_type",
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


### Curate cell line information

In [21]:
# Map the `cell_line` term name to a cell-line ontology ID
# (the log shows K562 -> CLO:0007050).
cur_data.standardize_ontology(
    input_column="cell_line",
    column_type="term_name",
    ontology_type="cell_line",
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


### Curate disease information

In [22]:
# Map the `disease` term name to a disease ontology ID
# (the log shows chronic myelogenous leukemia -> MONDO:0011996).
cur_data.standardize_ontology(
    input_column="disease",
    column_type="term_name",
    ontology_type="disease",
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


### Match schema column order

In [23]:
# Align the columns of adata.obs with the obs schema before validation.
cur_data.match_schema_columns(slot="obs")

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [24]:
# Validate adata.obs against the obs schema and display the validated table.
cur_data.validate_data(slot="obs")

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,dataset_id,sample_id,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,...,sequencing_platform_label,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,associated_datasets
0,adamson_2016_pilot,1,CREB1_pDS269,chr2:207529737-207605988;1,2,2,1,ENSG00000118260,CREB1,protein_coding,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
1,adamson_2016_pilot,2,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,ENSG00000124216,SNAI1,protein_coding,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
2,adamson_2016_pilot,3,62(mod)_pBA581,,,0,1,,,,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
3,adamson_2016_pilot,4,EP300_pDS268,chr22:41092510-41180077;1,22,22,1,ENSG00000100393,EP300,protein_coding,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
4,adamson_2016_pilot,5,62(mod)_pBA581,,,0,1,,,,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5747,adamson_2016_pilot,5748,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,ENSG00000124216,SNAI1,protein_coding,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
5748,adamson_2016_pilot,5749,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,ENSG00000124216,SNAI1,protein_coding,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
5749,adamson_2016_pilot,5750,SNAI1_pDS266,chr20:49982980-49988886;1,20,20,1,ENSG00000124216,SNAI1,protein_coding,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
5750,adamson_2016_pilot,5751,62(mod)_pBA581,,,0,1,,,,...,Illumina HiSeq 2500,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."


# VAR slot curation

### Standardise genes

In [25]:
# Print the variable table again as the starting point for gene standardisation.
cur_data.show_var()

Variable data:
DataFrame shape: (35635, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      0.0       0
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945      1.0       1
...                       ...      ...     ...
MT-ND4L       ENSG00000212907      0.0       0
MT-ND4        ENSG00000198886      0.0       0
MT-ND5        ENSG00000198786      0.0       0
MT-ND6        ENSG00000198695      0.0       0
MT-CYB        ENSG00000198727      0.0       0

[35635 rows x 3 columns]
--------------------------------------------------


In [26]:
# Standardise the var table using the `ensembl_id` column as Ensembl gene IDs.
cur_data.standardize_genes(
    slot="var",
    input_column="ensembl_id",
    input_column_type="ensembl_gene_id",
)

Converted 30304/35635 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [27]:
# Validate adata.var against the var schema and display the validated table.
cur_data.validate_data(slot="var")

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,
RP11-34P13.8,ENSG00000239945,
...,...,...
MT-ND4L,ENSG00000212907,MT-ND4L
MT-ND4,ENSG00000198886,MT-ND4
MT-ND5,ENSG00000198786,MT-ND5
MT-ND6,ENSG00000198695,MT-ND6


# Save the dataset

In [28]:
# Write the curated data to an .h5ad file; the destination path is reported in
# the log output.
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/adamson_2016_pilot_curated.h5ad


In [None]:
# Convert the curated data to long format and write it to parquet in chunks
# (per the log output). NOTE(review): `split_metadata=True` presumably stores
# the metadata separately from the expression values — confirm.
cur_data.save_curated_data_parquet(split_metadata=True)

Starting the conversion of adata to a long format DataFrame...
Starting the conversion to long format...
Processing 35635 genes in 179 chunks of size 200...
Created ParquetWriter and wrote chunk 1/179
Appended chunk 2/179 to parquet file
Appended chunk 3/179 to parquet file
Appended chunk 4/179 to parquet file
Appended chunk 5/179 to parquet file
Appended chunk 6/179 to parquet file
Appended chunk 7/179 to parquet file
Appended chunk 8/179 to parquet file
Appended chunk 9/179 to parquet file
Appended chunk 10/179 to parquet file
Appended chunk 11/179 to parquet file
Appended chunk 12/179 to parquet file
Appended chunk 13/179 to parquet file
Appended chunk 14/179 to parquet file
Appended chunk 15/179 to parquet file
Appended chunk 16/179 to parquet file
Appended chunk 17/179 to parquet file
Appended chunk 18/179 to parquet file
Appended chunk 19/179 to parquet file
Appended chunk 20/179 to parquet file
Appended chunk 21/179 to parquet file
Appended chunk 22/179 to parquet file
Appended 

: 

In [None]:
# Convert the curated data to long format and write it to parquet in chunks
# (per the log output). NOTE(review): `split_metadata=False` presumably keeps
# the metadata together with the expression values — confirm.
cur_data.save_curated_data_parquet(split_metadata=False)

Starting the conversion of adata to a long format DataFrame...
Starting the conversion to long format...
Processing 35635 genes in 179 chunks of size 200...
Created ParquetWriter and wrote chunk 1/179
Appended chunk 2/179 to parquet file
Appended chunk 3/179 to parquet file
Appended chunk 4/179 to parquet file
Appended chunk 5/179 to parquet file
Appended chunk 6/179 to parquet file
Appended chunk 7/179 to parquet file
Appended chunk 8/179 to parquet file
Appended chunk 9/179 to parquet file
Appended chunk 10/179 to parquet file
Appended chunk 11/179 to parquet file
Appended chunk 12/179 to parquet file
Appended chunk 13/179 to parquet file
Appended chunk 14/179 to parquet file
Appended chunk 15/179 to parquet file
Appended chunk 16/179 to parquet file
Appended chunk 17/179 to parquet file
Appended chunk 18/179 to parquet file
Appended chunk 19/179 to parquet file
Appended chunk 20/179 to parquet file
Appended chunk 21/179 to parquet file
Appended chunk 22/179 to parquet file
Appended 

: 