# Import

In [1]:
import pandas as pd
import json
import sys

sys.path.append("../../")

from curation_tools.curation_tools import CuratedDataset
from curation_tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from curation_tools.unified_metadata_schema.unified_metadata_schema import Experiment

top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```



# Initialise the dataset object

In [2]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    data_source_link="https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
    noncurated_path="../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad",
)

# Download the dataset

In [3]:
cur_data.download_data()
cur_data.load_data(path="../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad")
# show the data
cur_data.show_obs()

File ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/adamson_2016_upr_perturb_seq.h5ad
Observation data:
DataFrame shape: (65337, 15)
--------------------------------------------------
                   perturbation  read count  UMI count tissue_type cell_line  \
cell_barcode                                                                   
AAACATACAAGATG   63(mod)_pBA580       282.0        8.0   cell_line      K562   
AAACATACACCTAG      OST4_pDS353       331.0        7.0   cell_line      K562   
AAACATACTTCCCG   SEC61A1_pDS031       285.0       10.0   cell_line      K562   
AAACATTGAAACAG    EIF2B4_pDS491      1036.0       30.0   cell_line      K562   
AAACATTGCAGCTA      SRPR_pDS482       863.0       25.0   cell_line      K562   
...                         ...         ...        ...         ...       ...   
TTTGCATGCTTTAC     STT3A_pDS011       476.0       17.0   cell_line      K562   
TTTGCATGGAGGAC  

In [4]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485     11.0      11
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945     43.0      43
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [5]:
cur_data.show_unique(slot="obs", column="perturbation")

Unique values in adata.obs.perturbation: 115
--------------------------------------------------
{nan,
 '*',
 '62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A

### Drop NAs

In [6]:
cur_data.remove_na(slot="obs", column="perturbation")

Removed 2613 NA entries from column perturbation of adata.obs


### Drop "*" entries

In [7]:
cur_data.remove_entries(slot="obs", column="perturbation", to_remove=r"\*")

Removed 101 entries \* from column perturbation of adata.obs


In [8]:
cur_data.show_unique(slot="obs", column="perturbation")

Unique values in adata.obs.perturbation: 113
--------------------------------------------------
{'62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_pDS089',
 'IARS2_pDS090',
 'IARS2_pDS091',
 'IDH3A_pDS393',
 '

### Rename `perturbation` to `perturbation_name`

In [9]:
cur_data.rename_columns(slot="obs", name_dict={"perturbation": "perturbation_name"})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

In [10]:
# download the protospacer sequences data from supplementary information Table S1
# https://ars.els-cdn.com/content/image/1-s2.0-S0092867416316609-mmc1.xlsx

guides_df = pd.read_csv("../supplementary/adamson_2016_pilot.csv", header=1)
display(guides_df)

guides_dict_vecid = (
    guides_df[["Perturb-seq_Vector_ID", "Protospacer"]]
    .dropna()
    .set_index("Perturb-seq_Vector_ID", drop=True)["Protospacer"]
    .to_dict()
)

guides_dict_vecid

Unnamed: 0,Gene,Protospacer,Guide_ID (synonymous with sgGuide_ID),Perturb-seq_Vector_ID,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,AARS,GAGGGCGGCCTACCTCTCCT,,pDS381,,,
1,AMIGO3/GMPPB,GGAACGCGACACCGGGTAGA,,pDS391,,,
2,AMIGO3/GMPPB,GGGGCCAGCAGCCGTCTACC,,pDS434,,,
3,ARHGAP22,GGTCCGTCCGGAGCCAGGAG,,pDS458,,,
4,ASCC3,GACGCAAAGACGCACAGACC,,pDS051,,,
...,...,...,...,...,...,...,...
97,YIPF5,GTGACACGTAGCAACGGGGC,,pDS226,,,
98,,GGCCAAACGTGCCCTGACGG,NegCtrl-1,,,,
99,,GCGATGGGGGGGTGGGTAGC,NegCtrl-2,,,,
100,,GACGACTAGTTAGGCGTGTA,NegCtrl-3,,,,


{'pDS381': 'GAGGGCGGCCTACCTCTCCT',
 'pDS391': 'GGAACGCGACACCGGGTAGA',
 'pDS434': 'GGGGCCAGCAGCCGTCTACC',
 'pDS458': 'GGTCCGTCCGGAGCCAGGAG',
 'pDS051': 'GACGCAAAGACGCACAGACC',
 'pDS052': 'GCGCACAGACCCGGCGAGGA',
 'pDS055': 'GAGTCTCCGCAAGGCCCCGG',
 'pDS468': 'GTAGGAGCCTCGGGCGCGCT',
 'pDS460': 'GAGCCATGGCAGATTCCTCC',
 'pDS006': 'GCGACGTCCGAGCATTCCA',
 'pDS024': 'GCGCTGGTGGTCGATCGTG',
 'pDS065': 'GCGGCTATGAACCGCAGCAG',
 'pDS462': 'GGATGCTGTGGTGTCCACAG',
 'pDS499': 'GACCTTGCGTGCAGTTATGT',
 'pDS495': 'GTGAGACCCCAGGGTCGGGA',
 'pDS382': 'GTGGGTCCTTCGGCAGGAGG',
 'pDS041': 'GCGGTCCACAAAGGCTCAGA',
 'pDS042': 'GGTAGGCGCGGCTGACCGGT',
 'pDS383': 'GGCGCCCAGCGGAGCTAATC',
 'pDS026': 'GGGCGCCTGTGCTTGAGGTT',
 'pDS074': 'GCTTGCCTGGAACTCCTGTA',
 'pDS463': 'GTAGCTGCCTTCAGCCTTCAC',
 'pDS508': 'GCCATTGGGCTGTCAGTCAG',
 'pDS491': 'GCTGAGGGCGATGGCTGCTG',
 'pDS386': 'GAGACTTGCTTCCCCCTCAC',
 'pDS390': 'GTCACTGTAGGTGCGGCCCA',
 'pDS494': 'GGCAGCCTCGGCCCGAGTCC',
 'pDS043': 'GCAGGGCTCGCGCAGTTACC',
 'pDS506': 'GCAGGGCCG

In [11]:
# extract vector IDs from perturbation_names
cur_data.create_columns(
    slot="obs",
    col_dict={
        "vector_id": cur_data.adata.obs["perturbation_name"].str.split("_").str[1]
    },
)

cur_data.show_unique(slot="obs", column="vector_id")

Column vector_id added to adata.obs
Unique values in adata.obs.vector_id: 113
--------------------------------------------------
{'pBA572',
 'pBA573',
 'pBA574',
 'pBA575',
 'pBA576',
 'pBA577',
 'pBA578',
 'pBA579',
 'pBA580',
 'pBA581',
 'pBA582',
 'pBA586',
 'pBA608',
 'pDS001',
 'pDS002',
 'pDS003',
 'pDS004',
 'pDS005',
 'pDS006',
 'pDS007',
 'pDS008',
 'pDS009',
 'pDS010',
 'pDS011',
 'pDS017',
 'pDS024',
 'pDS026',
 'pDS027',
 'pDS029',
 'pDS031',
 'pDS032',
 'pDS033',
 'pDS036',
 'pDS038',
 'pDS040',
 'pDS041',
 'pDS042',
 'pDS043',
 'pDS044',
 'pDS046',
 'pDS051',
 'pDS052',
 'pDS055',
 'pDS065',
 'pDS074',
 'pDS087',
 'pDS088',
 'pDS089',
 'pDS090',
 'pDS091',
 'pDS096',
 'pDS110',
 'pDS124',
 'pDS156',
 'pDS159',
 'pDS160',
 'pDS162',
 'pDS175',
 'pDS186',
 'pDS218',
 'pDS219',
 'pDS226',
 'pDS284',
 'pDS353',
 'pDS371',
 'pDS373',
 'pDS381',
 'pDS382',
 'pDS383',
 'pDS386',
 'pDS390',
 'pDS391',
 'pDS393',
 'pDS394',
 'pDS395',
 'pDS396',
 'pDS397',
 'pDS398',
 'pDS401',
 '

In [12]:
cur_data.map_values_from_column(
    ref_col="vector_id", target_col="guide_sequence", map_dict=guides_dict_vecid
)

Column guide_sequence created in adata.obs
Mapped value pDS381 in column vector_id to GAGGGCGGCCTACCTCTCCT in column guide_sequence of adata.obs
Mapped value pDS391 in column vector_id to GGAACGCGACACCGGGTAGA in column guide_sequence of adata.obs
Mapped value pDS434 in column vector_id to GGGGCCAGCAGCCGTCTACC in column guide_sequence of adata.obs
Mapped value pDS458 in column vector_id to GGTCCGTCCGGAGCCAGGAG in column guide_sequence of adata.obs
Mapped value pDS051 in column vector_id to GACGCAAAGACGCACAGACC in column guide_sequence of adata.obs
Mapped value pDS052 in column vector_id to GCGCACAGACCCGGCGAGGA in column guide_sequence of adata.obs
Mapped value pDS055 in column vector_id to GAGTCTCCGCAAGGCCCCGG in column guide_sequence of adata.obs
Mapped value pDS468 in column vector_id to GTAGGAGCCTCGGGCGCGCT in column guide_sequence of adata.obs
Mapped value pDS460 in column vector_id to GAGCCATGGCAGATTCCTCC in column guide_sequence of adata.obs
Mapped value pDS006 in column vector_id

### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [13]:
cur_data.create_columns(
    slot="obs",
    col_dict={"perturbed_target_symbol": cur_data.adata.obs["perturbation_name"]},
    overwrite=True,
)
cur_data.show_unique(slot="obs", column="perturbed_target_symbol")

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 113
--------------------------------------------------
{'62(mod)_pBA581',
 '63(mod)_pBA580',
 'AARS_pDS381',
 'AMIGO3_pDS434',
 'ARHGAP22_pDS458',
 'ASCC3_pDS051',
 'ASCC3_pDS052',
 'ATF4_pBA576',
 'ATF4_pBA577',
 'ATF4_pBA608',
 'ATF6_pBA586',
 'ATP5B_pDS055',
 'C7orf26_pDS004',
 'CAD_pDS468',
 'CARS_pDS460',
 'CCND3_pDS005',
 'CCND3_pDS006',
 'CHERP_pDS024',
 'COPB1_pDS065',
 'COPZ1_pDS462',
 'DAD1_pDS499',
 'DARS_pDS495',
 'DDOST_pDS382',
 'DDRGK1_pDS041',
 'DERL2_pDS042',
 'DHDDS_pDS383',
 'DNAJC19_pDS026',
 'DNAJC19_pDS074',
 'EIF2AK3_pBA572',
 'EIF2AK3_pBA573',
 'EIF2B2_pDS463',
 'EIF2B3_pDS508',
 'EIF2B4_pDS491',
 'EIF2S1_pDS386',
 'ERN1_pBA574',
 'ERN1_pBA575',
 'FARSB_pDS390',
 'FECH_pDS494',
 'GBF1_pDS043',
 'GBF1_pDS044',
 'GMPPB_pDS391',
 'GNPNAT1_pDS506',
 'Gal4-4(mod)_pBA582',
 'HARS_pDS466',
 'HSD17B12_pDS087',
 'HSPA5_pDS017',
 'HSPA5_pDS371',
 'HSPA9_pDS088',
 'HYOU1_p

#### Clean up `perturbation_target_symbol` column

In [14]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    map_dict={
        r"62\(mod\).*": "control_nontargeting",
        r"63\(mod\).*": "control_nontargeting",
        r"Gal4-4.*": "control_nontargeting",
        r"_(pD|pB).*": "",
    }
)


cur_data.show_unique(slot="obs", column="perturbed_target_symbol")

Replaced '62\(mod\).*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced '63\(mod\).*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced 'Gal4-4.*' with 'control_nontargeting' in column perturbed_target_symbol of adata.obs
Replaced '_(pD|pB).*' with '' in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 91
--------------------------------------------------
{'AARS',
 'AMIGO3',
 'ARHGAP22',
 'ASCC3',
 'ATF4',
 'ATF6',
 'ATP5B',
 'C7ORF26',
 'CAD',
 'CARS',
 'CCND3',
 'CHERP',
 'CONTROL_NONTARGETING',
 'COPB1',
 'COPZ1',
 'DAD1',
 'DARS',
 'DDOST',
 'DDRGK1',
 'DERL2',
 'DHDDS',
 'DNAJC19',
 'EIF2AK3',
 'EIF2B2',
 'EIF2B3',
 'EIF2B4',
 'EIF2S1',
 'ERN1',
 'FARSB',
 'FECH',
 'GBF1',
 'GMPPB',
 'GNPNAT1',
 'HARS',
 'HSD17B12',
 'HSPA5',
 'HSPA9',
 'HYOU1',
 'IARS2',
 'IDH3A',
 'IER3IP1',
 'KCTD16',
 'MANF',
 'MARS',
 'MRGBP',
 'MRPL39',
 'MTHFD1',
 'NEDD8',
 'OST4',
 'P4HB',
 '

  if df[column].str.upper().str.contains(old_val.upper()).any():


### Standardise perturbation targets

In [15]:
cur_data.standardize_genes(
    slot="obs",
    input_column="perturbed_target_symbol",
    input_column_type="gene_symbol",
    multiple_entries=False
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols
Converted 90/91 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Add `perturbed_target_number` column

In [16]:
cur_data.count_entries(
    slot="obs",
    input_column="perturbed_target_symbol",
    count_column_name="perturbed_target_number",
    sep="|",
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


### Encode chromosomes as integers

In [17]:
cur_data.chromosome_encoding()

Chromosome encoding applied to perturbed_target_chromosome in adata.obs and stored as 'perturbed_target_chromosome_encoding'.


In [18]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_chromosome_encoding'])

Observation data:
DataFrame shape: (62623, 2)
--------------------------------------------------
               perturbation_name  perturbed_target_chromosome_encoding
index                                                                 
AAACATACAAGATG    63(mod)_pBA580                                     0
AAACATACACCTAG       OST4_pDS353                                     2
AAACATACTTCCCG    SEC61A1_pDS031                                     3
AAACATTGAAACAG     EIF2B4_pDS491                                     2
AAACATTGCAGCTA       SRPR_pDS482                                    11
...                          ...                                   ...
TTTGCATGCTTTAC      STT3A_pDS011                                    11
TTTGCATGGAGGAC   ARHGAP22_pDS458                                    10
TTTGCATGTAGAGA    63(mod)_pBA580                                     0
TTTGCATGTCAAGC     KCTD16_pDS096                                     5
TTTGCATGTGGAGG     SAMM50_pDS156                   

### Add metadata

In [19]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "dataset_id": cur_data.dataset_id,
        "sample_id": range(1, cur_data.adata.obs.shape[0] + 1),
        # treatment
        "treatment_label": None, 
        "treatment_id": None,
        # perturbation type
        "perturbation_type_label": "CRISPRi", 
        "perturbation_type_id": None,
        # model system
        "model_system_label": "cell line", 
        "model_system_id": None,
        
        'tissue': 'blood',
        "timepoint": "P0DT0H0M0S",
        
        "species": "Homo sapiens",
        "sex_label": "female", 
        "sex_id": None,
        "developmental_stage_label": "adult", 
        "developmental_stage_id": None,

        "study_title": "A Multiplexed Single-Cell CRISPR Screening Platform Enables Systematic Dissection of the Unfolded Protein Response",
        "study_uri": "https://doi.org/10.1016/j.cell.2016.11.048",
        "study_year": 2016,
        "first_author": "Britt Adamson",
        "last_author": "Jonathan Weissman",
        
        "experiment_title": "63000 chronic myeloid leukemia (K562) cells transfected with a UPR sensor gene-targeting gRNAs.",
        "experiment_summary": "Perturb-seq was applied to a small CRISPRi library of 91 sgRNAs targeting UPR genes in K562 cells.",
        "number_of_perturbed_targets": len(set(cur_data.adata.obs['perturbed_target_coord'])),
        "number_of_perturbed_samples": cur_data.adata.obs.shape[0],
        
        "library_generation_type_id": "EFO:0022868",
        "library_generation_type_label": "endogenous",

        "library_generation_method_id": "EFO:0022895",
        "library_generation_method_label": "dCas9-KRAB",

        "enzyme_delivery_method_id": None,
        "enzyme_delivery_method_label": "retroviral transduction",

        "library_delivery_method_id": None,
        "library_delivery_method_label": "lentiviral transduction",

        "enzyme_integration_state_id": None,
        "enzyme_integration_state_label": "random locus integration",

        "library_integration_state_id": None,
        "library_integration_state_label": "random locus integration",

        "enzyme_expression_control_id": None,
        "enzyme_expression_control_label": "constitutive expression",

        "library_expression_control_id": None,
        "library_expression_control_label": "constitutive expression",

        "library_name": "custom",
        "library_uri": None,

        "library_format_id": None,
        "library_format_label": "pooled",

        "library_scope_id": None,
        "library_scope_label": "focused",

        "library_perturbation_type_id": None,
        "library_perturbation_type_label": "inhibition",
        
        "library_manufacturer": "Weissman",
        "library_lentiviral_generation": "3",
        "library_grnas_per_target": "1",
        "library_total_grnas": "91",
        "library_total_variants": None,
        
        "readout_dimensionality_id": None,
        "readout_dimensionality_label": "high-dimensional assay",

        "readout_type_id": None,
        "readout_type_label": "transcriptomic",

        "readout_technology_id": None,
        "readout_technology_label": "single-cell rna-seq",

        "method_name_id": None,
        "method_name_label": "Perturb-seq",

        "method_uri": None,

        "sequencing_library_kit_id": None,
        "sequencing_library_kit_label": "10x Genomics Single Cell 3-prime",

        "sequencing_platform_id": None,
        "sequencing_platform_label": "Illumina HiSeq 4000",

        "sequencing_strategy_id": None,
        "sequencing_strategy_label": "barcode sequencing",

        "software_counts_id": None,
        "software_counts_label": "CellRanger",

        "software_analysis_id": None,
        "software_analysis_label": "MAGeCK",

        "reference_genome_id": None,
        "reference_genome_label": "GRCh37",

        "associated_datasets": json.dumps([
            {
                "dataset_accession": "GSM2406677",
                "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM2406681",
                "dataset_description": "Barcode, cell identities, raw gene expression matrix",
                "dataset_file_name": "GSM2406677_10X010",
            },
            {
                "dataset_accession": "GSM2406677_10X005",
                "dataset_uri": "https://zenodo.org/records/7041849/files/AdamsonWeissman2016_GSM2406681_10X010.h5ad",
                "dataset_description": "Processed .h5ad file",
                "dataset_file_name": "AdamsonWeissman2016_GSM2406681_10X010.h5ad",
            }
        ])
    }
)

Column dataset_id added to adata.obs
Column sample_id added to adata.obs
Column treatment_label added to adata.obs
Column treatment_id added to adata.obs
Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs
Column model_system_label added to adata.obs
Column model_system_id added to adata.obs
Column tissue added to adata.obs
Column timepoint added to adata.obs
Column species added to adata.obs
Column sex_label added to adata.obs
Column sex_id added to adata.obs
Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs
Column study_title added to adata.obs
Column study_uri added to adata.obs
Column study_year added to adata.obs
Column first_author added to adata.obs
Column last_author added to adata.obs
Column experiment_title added to adata.obs
Column experiment_summary added to adata.obs
Column number_of_perturbed_targets added to adata.obs
Column number_of_perturbed_samples added to adata.obs
Colum

In [20]:
cur_data.adata.obs

Unnamed: 0_level_0,tissue_type,celltype,organism,cancer,disease,percent_ribo,read count,cell_line,UMI count,perturbation_type,...,sequencing_platform_label,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,associated_datasets
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACAAGATG,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,21.306112,282.0,K562,8.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATACACCTAG,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,19.492201,331.0,K562,7.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATACTTCCCG,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,23.199894,285.0,K562,10.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATTGAAACAG,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,28.733555,1036.0,K562,30.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
AAACATTGCAGCTA,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,26.729864,863.0,K562,25.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCATGCTTTAC,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,22.679703,476.0,K562,17.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
TTTGCATGGAGGAC,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,26.983313,539.0,K562,19.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
TTTGCATGTAGAGA,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,26.207104,647.0,K562,35.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
TTTGCATGTCAAGC,cell_line,lymphoblasts,human,True,chronic myelogenous leukemia,20.755890,98.0,K562,4.0,CRISPR,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."


### Curate tissue information

In [21]:

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


### Curate cell type information

In [22]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower ontology_id
0  lymphoblasts       lymphoblasts  lymphoblasts  CL:0017005
--------------------------------------------------


### Curate cell line information

In [23]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower  ontology_id
0         K562               k562       k562  CLO:0007050
--------------------------------------------------


### Curate disease information

In [24]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
                   input_column            input_column_lower  \
0  chronic myelogenous leukemia  chronic myelogenous leukemia   

                     name_lower    ontology_id  
0  chronic myelogenous leukemia  MONDO:0011996  
--------------------------------------------------


### Match schema column order

In [25]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [26]:
cur_data.validate_data(slot='obs')

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,dataset_id,sample_id,perturbation_name,perturbed_target_coord,perturbed_target_chromosome,perturbed_target_chromosome_encoding,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,...,sequencing_platform_label,sequencing_strategy_id,sequencing_strategy_label,software_counts_id,software_counts_label,software_analysis_id,software_analysis_label,reference_genome_id,reference_genome_label,associated_datasets
0,adamson_2016_upr_perturb_seq,1,63(mod)_pBA580,,,0,1,,,,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
1,adamson_2016_upr_perturb_seq,2,OST4_pDS353,chr2:27070472-27071654;-1,2,2,1,ENSG00000228474,OST4,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
2,adamson_2016_upr_perturb_seq,3,SEC61A1_pDS031,chr3:128051641-128071705;1,3,3,1,ENSG00000058262,SEC61A1,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
3,adamson_2016_upr_perturb_seq,4,EIF2B4_pDS491,chr2:27364352-27370338;-1,2,2,1,ENSG00000115211,EIF2B4,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
4,adamson_2016_upr_perturb_seq,5,SRPR_pDS482,chr11:126262938-126269144;-1,11,11,1,ENSG00000182934,SRPRA,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62618,adamson_2016_upr_perturb_seq,62619,STT3A_pDS011,chr11:125591712-125625215;1,11,11,1,ENSG00000134910,STT3A,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
62619,adamson_2016_upr_perturb_seq,62620,ARHGAP22_pDS458,chr10:48446036-48656265;-1,10,10,1,ENSG00000128805,ARHGAP22,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
62620,adamson_2016_upr_perturb_seq,62621,63(mod)_pBA580,,,0,1,,,,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."
62621,adamson_2016_upr_perturb_seq,62622,KCTD16_pDS096,chr5:144170873-144485686;1,5,5,1,ENSG00000183775,KCTD16,protein_coding,...,Illumina HiSeq 4000,,barcode sequencing,,CellRanger,,MAGeCK,,GRCh37,"[{""dataset_accession"": ""GSM2406677"", ""dataset_..."


# VAR slot curation

### Standardise genes

In [27]:
cur_data.show_var()

Variable data:
DataFrame shape: (32738, 3)
--------------------------------------------------
                   ensembl_id  ncounts  ncells
gene_symbol                                   
MIR1302-10    ENSG00000243485     11.0      11
FAM138A       ENSG00000237613      0.0       0
OR4F5         ENSG00000186092      0.0       0
RP11-34P13.7  ENSG00000238009      0.0       0
RP11-34P13.8  ENSG00000239945     43.0      43
...                       ...      ...     ...
AC145205.1    ENSG00000215635      0.0       0
BAGE5         ENSG00000268590      0.0       0
CU459201.1    ENSG00000251180      0.0       0
AC002321.2    ENSG00000215616      0.0       0
AC002321.1    ENSG00000215611      0.0       0

[32738 rows x 3 columns]
--------------------------------------------------


In [28]:
cur_data.standardize_genes(
    slot="var", input_column="ensembl_id", input_column_type="ensembl_gene_id"
)

Converted 30168/32738 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [29]:
cur_data.validate_data(slot="var")

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-10,ENSG00000243485,MIR1302-2HG
FAM138A,ENSG00000237613,FAM138A
OR4F5,ENSG00000186092,OR4F5
RP11-34P13.7,ENSG00000238009,
RP11-34P13.8,ENSG00000239945,
...,...,...
AC145205.1,ENSG00000215635,
BAGE5,ENSG00000268590,
CU459201.1,ENSG00000251180,
AC002321.2,ENSG00000215616,


# Save the dataset

In [30]:
cur_data.save_curated_data_h5ad()

Curated data saved to ../curated/h5ad/adamson_2016_upr_perturb_seq_curated.h5ad


In [31]:
cur_data.save_curated_data_parquet(split_metadata=True)

Starting the conversion of adata to a long format DataFrame...
Starting the conversion to long format...
Processing 32738 genes in 164 chunks of size 200...
Created ParquetWriter and wrote chunk 1/164
Appended chunk 2/164 to parquet file
Appended chunk 3/164 to parquet file
Appended chunk 4/164 to parquet file
Appended chunk 5/164 to parquet file
Appended chunk 6/164 to parquet file
Appended chunk 7/164 to parquet file
Appended chunk 8/164 to parquet file
Appended chunk 9/164 to parquet file
Appended chunk 10/164 to parquet file
Appended chunk 11/164 to parquet file
Appended chunk 12/164 to parquet file
Appended chunk 13/164 to parquet file
Appended chunk 14/164 to parquet file
Appended chunk 15/164 to parquet file
Appended chunk 16/164 to parquet file
Appended chunk 17/164 to parquet file
Appended chunk 18/164 to parquet file
Appended chunk 19/164 to parquet file
Appended chunk 20/164 to parquet file
Appended chunk 21/164 to parquet file
Appended chunk 22/164 to parquet file
Appended 

In [None]:
cur_data.save_curated_data_parquet(split_metadata=False)