# Import

In [20]:
import pandas as pd
import json
import sys

sys.path.append("../../")
from curation_tools.curation_tools import CuratedDataset
from curation_tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from curation_tools.unified_metadata_schema.unified_metadata_schema import Experiment


# Initialise the dataset object

In [2]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    data_source_link = "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406677_10X005.h5ad",
    noncurated_path = "../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad"
)

# Download the dataset

In [3]:
cur_data.download_data()
cur_data.load_data(path="../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad")
# show the data
cur_data.show_obs()

File ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/adamson_2016_upr_epistasis.h5ad
Observation data:
DataFrame shape: (15006, 15)
--------------------------------------------------
                         perturbation  read count  UMI count tissue_type  \
cell_barcode                                                               
AAACATACACTCAG   3x_neg_ctrl_pMJ144-1       261.0       59.0   cell_line   
AAACATACTCCTAT   3x_neg_ctrl_pMJ144-2       132.0       37.0   cell_line   
AAACATTGCAGAGG   3x_neg_ctrl_pMJ144-2       560.0      117.0   cell_line   
AAACATTGGCGAAG  ATF6_PERK_IRE1_pMJ158       215.0       49.0   cell_line   
AAACCGTGATACCG       ATF6_PERK_pMJ150       567.0      124.0   cell_line   
...                               ...         ...        ...         ...   
TTTGACTGGCTTAG       PERK_only_pMJ146       215.0       60.0   cell_line   
TTTGACTGGGGATG       PERK_IRE1_pMJ154        64.0       

In [4]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      2.0       2
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      1.0       1
RP11-34P13.8  ENSG00000239945      2.0       2
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [5]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 21
--------------------------------------------------
{nan,
 '*',
 '3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


### Drop NAs

In [6]:
cur_data.remove_na(slot = 'obs', column = 'perturbation')

Removed 296 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [7]:
cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')

Removed 13 entries \* from column perturbation of adata.obs


  cur_data.remove_entries(slot = 'obs', column = 'perturbation', to_remove = '\*')


In [8]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 19
--------------------------------------------------
{'3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


### Rename `perturbation` to `perturbation_name`

In [9]:
cur_data.rename_columns(slot = 'obs', name_dict = {'perturbation': 'perturbation_name'})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [10]:
# Not reported by study authors for this dataset
cur_data.create_columns(
    slot='obs',
    col_dict={
        'guide_sequence': None
    }
)

Column guide_sequence added to adata.obs


### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [11]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 19
--------------------------------------------------
{'3x_neg_ctrl_pMJ144-1',
 '3x_neg_ctrl_pMJ144-2',
 'ATF4_pBA576',
 'ATF6_IRE1_pMJ152',
 'ATF6_PERK_IRE1_pMJ158',
 'ATF6_PERK_pMJ150',
 'ATF6_only_pMJ145',
 'C7orf26_pDS004',
 'Gal4-4(mod)_pBA582',
 'IER3IP1_pDS003',
 'IRE1_only_pMJ148',
 'PERK_IRE1_pMJ154',
 'PERK_only_pMJ146',
 'PSMA1_pDS007',
 'PSMD12_pDS009',
 'SNAI1_pDS266',
 'XBP1_pBA578',
 'XBP1_pBA579',
 'YIPF5_pDS001'}
--------------------------------------------------


#### Clean up `perturbation_target_symbol` column

In [12]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    map_dict={
        r"3x_neg_ctrl.*": "control_nontargeting",
        r"Gal4-4.*": "control_nontargeting",
        r"_(pM|pD|pB|only).*": ""
    }
)

cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Replaced '3x_neg_ctrl.*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced 'Gal4-4.*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced '_(pM|pD|pB|only).*' with '' in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 16
--------------------------------------------------
{'ATF4',
 'ATF6',
 'ATF6_IRE1',
 'ATF6_PERK',
 'ATF6_PERK_IRE1',
 'C7ORF26',
 'CONTROL_NONTARGETING',
 'IER3IP1',
 'IRE1',
 'PERK',
 'PERK_IRE1',
 'PSMA1',
 'PSMD12',
 'SNAI1',
 'XBP1',
 'YIPF5'}
--------------------------------------------------


  if df[column].str.upper().str.contains(old_val.upper()).any():


### Standardise perturbation targets

In [13]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol',
    input_column_type='gene_symbol',
    multiple_entries=True,
    multiple_entries_sep='_'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols
Converted 11/13 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


Collapsed column index using separator |


### Add `perturbed_target_number` column

In [14]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [15]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [16]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (14697, 2)
--------------------------------------------------
                    perturbation_name  perturbed_target_chromosome_encoding
index                                                                      
AAACATACACTCAG   3x_neg_ctrl_pMJ144-1                                     0
AAACATACTCCTAT   3x_neg_ctrl_pMJ144-2                                     0
AAACATTGCAGAGG   3x_neg_ctrl_pMJ144-2                                     0
AAACATTGGCGAAG  ATF6_PERK_IRE1_pMJ158                                     0
AAACCGTGATACCG       ATF6_PERK_pMJ150                                     0
...                               ...                                   ...
TTTGACTGGCTTAG       PERK_only_pMJ146                                     2
TTTGACTGGGGATG       PERK_IRE1_pMJ154                                     0
TTTGACTGTGGTCA   3x_neg_ctrl_pMJ144-1                                     0
TTTGCATGCGGAGA       PERK_IRE1_pMJ154                              

### Add metadata

### Add treatment information

Data is stored in a separate file, as part of the `cell BC` column's suffix


In [17]:
orig_cell_ident_link = r"https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM2406nnn/GSM2406677/suppl/GSM2406677%5F10X005%5Fcell%5Fidentities.csv.gz"
orig_cell_ident = pd.read_csv(orig_cell_ident_link)

# the number after the dash is the cell barcode indicates the treatment
display(orig_cell_ident[["cell BC", "guide identity"]].head())

treatment_label_map = {"1": "tunicamycin", "2": "thapsigargin", "3": "DMSO"}

treatment_df = (
    orig_cell_ident["cell BC"]
    .str.split("-", expand=True)
    .rename(columns={0: "cell_barcode", 1: "treatment_number"})
)

treatment_df["treatment_temp"] = treatment_df["treatment_number"].map(
    treatment_label_map
)

treatment_df = (
    treatment_df.drop_duplicates(subset=["cell_barcode"])
    .drop(columns=["treatment_number"])
    .set_index("cell_barcode")
)

treatment_df

Unnamed: 0,cell BC,guide identity
0,ACGGTATGCTTAGG-3,PERK_IRE1_pMJ154
1,ACAATCCTACCCTC-1,PERK_IRE1_pMJ154
2,ACGAACACGTGCTA-3,ATF6_PERK_IRE1_pMJ158
3,CTGTGAGATTGGTG-1,ATF6_PERK_IRE1_pMJ158
4,ATGTTGCTAATCGC-2,3x_neg_ctrl_pMJ144-2


Unnamed: 0_level_0,treatment_temp
cell_barcode,Unnamed: 1_level_1
ACGGTATGCTTAGG,DMSO
ACAATCCTACCCTC,tunicamycin
ACGAACACGTGCTA,DMSO
CTGTGAGATTGGTG,tunicamycin
ATGTTGCTAATCGC,thapsigargin
...,...
CGTAACGAGTTGCA,DMSO
CCATGCTGGCTTCC,DMSO
CCCTCAGAAAAGTG,DMSO
TCAAGTCTAGGTCT,thapsigargin


Add treatment information to the dataset

In [18]:
cur_data.adata.obs = cur_data.adata.obs.merge(
    treatment_df,
    left_index=True,
    right_index=True,
    how="left"
)

Map treatment compounds to CHEBI

In [19]:
cur_data.standardize_compounds(column='treatment_temp')

Found standardized name for compound 'tunicamycin': tunicamycin (ChEBI ID: CHEBI:29699)
Found standardized name for compound 'thapsigargin': thapsigargin (ChEBI ID: CHEBI:9516)
Found standardized name for compound 'DMSO': dimethyl sulfoxide (ChEBI ID: CHEBI:28262)
Standardized compound names in column 'treatment_temp' and added 'treatment_label' and 'treatment_id' columns.


In [21]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # treatment
        # "treatment_label": None, 
        # "treatment_id": None,
        # perturbation type
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None,
        # model system
        "model_system_label": "cell line", 
        "model_system_id": None,
        'tissue': 'blood',
        "timepoint": "P0DT0H0M0S",
        
        "species": "Homo sapiens",
        "sex_label": "female", 
        "sex_id": None,
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None,

        "study_title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "study_year": 2016,
        "first_author": "Britt Adamson",
        "last_author": "Jonathan Weissman",
        
        "experiment_title": "14595 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs in every combination (singly with controls, doubly with a control, or triply).",
        "experiment_summary": "Using our final three-guide Perturb-seq vector to simultaneously deliver 3 sgRNAs, we individually transduced K562 cells expressing dCas9-KRAB (cBA010) with constructs that targeted all three UPR sensor genes in every combination (singly with controls, doubly with a control, or triply). Transduced cells were then pooled and selected. After 2 days of combined growth, the cells were treated with DMSO for 6 hr, 4 μg/mL tunicamycin (Tm) for 6 hr, or 100 nM thapsigargin (Tg) for 4 hr and were profiled by Perturb-seq (24 conditions in total).",
        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],
        
        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "retroviral transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentiviral transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",
        
        "library_manufacturer": "Weissman",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "1",
        "library_total_grnas": "16",
        "library_total_variants": None,
        
        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina HiSeq 4000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "MAGeCK",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh37",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSM2406675",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406675",
                "dataset_description": "Raw counts",
                "dataset_file_name": "GSE90546_RAW.tar",
            },
            {
                "dataset_accession": "GSM2406675_10X001",
                "dataset_uri": "https://zenodo.org/record/7041849/files/AdamsonWeissman2016_GSM2406675_10X001.h5ad",
                "dataset_description": "Processed .h5ad file",
                "dataset_file_name": "AdamsonWeissman2016_GSM2406675_10X001.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column tissue added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year added to adata.obs
Column first_author added to adata.obs
Column last_author added to adata.obs
Column experiment_title added to adata.obs
Column experiment_summary added to adata.obs
Column number_of_perturbed_targets added to adata.obs
Column number_of_perturbed_samples added to adata.obs
Column library_generation_type_id added to adata.obs
Column library_generation_type_la

### Curate tissue information

In [22]:

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


### Curate cell type information

In [23]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


### Curate cell line information

In [24]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


### Curate disease information

In [25]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


### Match schema column order

In [26]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [27]:
cur_data.validate_data(slot='obs')

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,dataset_id,sample_id,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,...,sequencing_platform_label,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,associated_datasets
0,adamson_2016_upr_epistasis,1,3x_neg_ctrl_pMJ144-1,,,0,1,None|None,None|None,,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
1,adamson_2016_upr_epistasis,2,3x_neg_ctrl_pMJ144-2,,,0,1,None|None,None|None,,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
2,adamson_2016_upr_epistasis,3,3x_neg_ctrl_pMJ144-2,,,0,1,None|None,None|None,,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
3,adamson_2016_upr_epistasis,4,ATF6_PERK_IRE1_pMJ158,chr1:161766298-161977574;1|chr2:88556741-88691...,1|2|17,0,3,ENSG00000118217|ENSG00000172071|ENSG00000178607,ATF6|EIF2AK3|ERN1,protein_coding|protein_coding|protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
4,adamson_2016_upr_epistasis,5,ATF6_PERK_pMJ150,chr1:161766298-161977574;1|chr2:88556741-88691...,1|2,0,2,ENSG00000118217|ENSG00000172071,ATF6|EIF2AK3,protein_coding|protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,adamson_2016_upr_epistasis,14693,PERK_only_pMJ146,chr2:88556741-88691518;-1,2,2,1,ENSG00000172071,EIF2AK3,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
14693,adamson_2016_upr_epistasis,14694,PERK_IRE1_pMJ154,chr2:88556741-88691518;-1|chr17:64039080-64130...,2|17,0,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
14694,adamson_2016_upr_epistasis,14695,3x_neg_ctrl_pMJ144-1,,,0,1,None|None,None|None,,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."
14695,adamson_2016_upr_epistasis,14696,PERK_IRE1_pMJ154,chr2:88556741-88691518;-1|chr17:64039080-64130...,2|17,0,2,ENSG00000172071|ENSG00000178607,EIF2AK3|ERN1,protein_coding|protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406675"", ""dataset_..."


# VAR slot curation

### Standardise genes

In [28]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485      2.0       2
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      1.0       1
RP11-34P13.8  ENSG00000239945      2.0       2
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [29]:
cur_data.standardize_genes(
    slot="var", input_column="ensembl_id", input_column_type="ensembl_gene_id"
)

Converted 30168/32738 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [30]:
cur_data.validate_data(slot='var')

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,
RP11-34P13.8,ENSG00000239945,
...,...,...
AC145205.1,ENSG00000215635,
BAGE5,ENSG00000268590,
CU459201.1,ENSG00000251180,
AC002321.2,ENSG00000215616,


# Save the dataset

In [31]:
cur_data.save_curated_data_h5ad()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/adamson_2016_upr_epistasis_curated.h5ad


In [32]:
cur_data.save_curated_data_parquet(split_metadata=True)

Starting the conversion of adata to a long format DataFrame...
Starting the conversion to long format...
Processing 32738 genes in 164 chunks of size 200...
Created ParquetWriter and wrote chunk 1/164
Appended chunk 2/164 to parquet file
Appended chunk 3/164 to parquet file
Appended chunk 4/164 to parquet file
Appended chunk 5/164 to parquet file
Appended chunk 6/164 to parquet file
Appended chunk 7/164 to parquet file
Appended chunk 8/164 to parquet file
Appended chunk 9/164 to parquet file
Appended chunk 10/164 to parquet file
Appended chunk 11/164 to parquet file
Appended chunk 12/164 to parquet file
Appended chunk 13/164 to parquet file
Appended chunk 14/164 to parquet file
Appended chunk 15/164 to parquet file
Appended chunk 16/164 to parquet file
Appended chunk 17/164 to parquet file
Appended chunk 18/164 to parquet file
Appended chunk 19/164 to parquet file
Appended chunk 20/164 to parquet file
Appended chunk 21/164 to parquet file
Appended chunk 22/164 to parquet file
Appended 

In [33]:
cur_data.save_curated_data_parquet(split_metadata=False)

Starting the conversion of adata to a long format DataFrame...
Starting the conversion to long format...
Processing 32738 genes in 164 chunks of size 200...
Created ParquetWriter and wrote chunk 1/164
Appended chunk 2/164 to parquet file
Appended chunk 3/164 to parquet file
Appended chunk 4/164 to parquet file
Appended chunk 5/164 to parquet file
Appended chunk 6/164 to parquet file
Appended chunk 7/164 to parquet file
Appended chunk 8/164 to parquet file
Appended chunk 9/164 to parquet file
Appended chunk 10/164 to parquet file
Appended chunk 11/164 to parquet file
Appended chunk 12/164 to parquet file
Appended chunk 13/164 to parquet file
Appended chunk 14/164 to parquet file
Appended chunk 15/164 to parquet file
Appended chunk 16/164 to parquet file
Appended chunk 17/164 to parquet file
Appended chunk 18/164 to parquet file
Appended chunk 19/164 to parquet file
Appended chunk 20/164 to parquet file
Appended chunk 21/164 to parquet file
Appended chunk 22/164 to parquet file
Appended 