# Import

In [1]:
import pandas as pd

import sys

sys.path.append("../../")
from curation_tools.curation_tools import CuratedDataset
from curation_tools.perturbseq_anndata_schema import ObsSchema, VarSchema

from curation_tools.unified_metadata_schema.unified_metadata_schema import Experiment

# Initialise the dataset object

In [2]:
cur_data = CuratedDataset(
    obs_schema=ObsSchema,
    var_schema=VarSchema,
    exp_metadata_schema=Experiment,
    data_source_link = "https://zenodo.org/records/13350497/files/DatlingerBock2017.h5ad",
    noncurated_path = "../non_curated/h5ad/datlinger_2017.h5ad"
)

# Download the dataset

In [3]:
cur_data.download_data()
cur_data.load_data()
# show the data
cur_data.show_obs()

File ../non_curated/h5ad/datlinger_2017.h5ad already exists. Skipping download.
Loading data from ../non_curated/h5ad/datlinger_2017.h5ad
Observation data:
DataFrame shape: (5905, 17)
--------------------------------------------------
                    perturbation perturbation_2 replicate target celltype  \
cell_barcode                                                                
TACTTGACCCCN             control     stimulated         1    NaN  T cells   
TTACAGCTGAAC   Tcrlibrary_JUND_2     stimulated         1   JUND  T cells   
CTAAGGCCCTTA  Tcrlibrary_BACH2_3     stimulated         1  BACH2  T cells   
CTTGACGCAGGT  Tcrlibrary_NFKB2_3     stimulated         1  NFKB2  T cells   
TAACCCGTACGC    Tcrlibrary_JUN_1     stimulated         1    JUN  T cells   
...                          ...            ...       ...    ...      ...   
GTGTGTCGGGGA             control   unstimulated         5    NaN  T cells   
TTTAGTATTCCA   Tcrlibrary_EGR1_3   unstimulated         5   EGR1  T cell

In [4]:
cur_data.show_var()

Variable data:
DataFrame shape: (36722, 2)
--------------------------------------------------
              ncounts  ncells
gene_symbol                  
A1BG             60.0      57
A1BG-AS1        324.0     306
A1CF              1.0       1
A2M               6.0       6
A2M-AS1           1.0       1
...               ...     ...
hsa-mir-1587      7.0       7
hsa-mir-3149      0.0       0
hsa-mir-4259      1.0       1
snosnR66          1.0       1
yR211F11.2        3.0       3

[36722 rows x 2 columns]
--------------------------------------------------


# OBS slot curation

### Show unique perturbations

In [5]:
cur_data.show_unique(slot = 'obs', column = 'perturbation')

Unique values in adata.obs.perturbation: 97
--------------------------------------------------
{'Essential_library_DHODH_1',
 'Essential_library_DHODH_2',
 'Essential_library_DHODH_3',
 'Essential_library_MVD_1',
 'Essential_library_MVD_2',
 'Essential_library_MVD_3',
 'Essential_library_TUBB_1',
 'Essential_library_TUBB_2',
 'Essential_library_TUBB_3',
 'Tcrlibrary_BACH2_1',
 'Tcrlibrary_BACH2_2',
 'Tcrlibrary_BACH2_3',
 'Tcrlibrary_DOK2_1',
 'Tcrlibrary_DOK2_2',
 'Tcrlibrary_DOK2_3',
 'Tcrlibrary_EGR1_1',
 'Tcrlibrary_EGR1_2',
 'Tcrlibrary_EGR1_3',
 'Tcrlibrary_EGR2_1',
 'Tcrlibrary_EGR2_2',
 'Tcrlibrary_EGR2_3',
 'Tcrlibrary_EGR3_1',
 'Tcrlibrary_EGR3_2',
 'Tcrlibrary_EGR3_3',
 'Tcrlibrary_EGR4_1',
 'Tcrlibrary_EGR4_2',
 'Tcrlibrary_EGR4_3',
 'Tcrlibrary_ETS1_1',
 'Tcrlibrary_ETS1_2',
 'Tcrlibrary_ETS1_3',
 'Tcrlibrary_FOS_1',
 'Tcrlibrary_FOS_2',
 'Tcrlibrary_FOS_3',
 'Tcrlibrary_GATA3_1',
 'Tcrlibrary_GATA3_2',
 'Tcrlibrary_GATA3_3',
 'Tcrlibrary_JUNB_1',
 'Tcrlibrary_JUNB_2',
 'T

### Rename `perturbation` to `perturbation_name`

In [6]:
cur_data.rename_columns(slot = 'obs', name_dict = {'perturbation': 'perturbation_name'})

Renamed columns in adata.obs: {'perturbation': 'perturbation_name'}


### Add guide RNA information

Data from perturbseq has non-uniquely identifiable control cells labelled as "control". This prevents unique assingnement of guide RNA information to these cells.

However, the original count matrix and supplementary data file contains this information, so we have to add it manually.

Original data: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE92nnn/GSE92872/suppl/GSE92872%5FCROP%2Dseq%5FJurkat%5FTCR.digital%5Fexpression.csv.gz

Supplementary table S2: https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.4177/MediaObjects/41592_2017_BFnmeth4177_MOESM268_ESM.xlsx

In [7]:
# read data from supplementary files
cell_ids_df = pd.read_csv("../supplementary/datlinger_2017_cell_ids.csv")
guide_ids_df = pd.read_csv("../supplementary/datlinger_2017_guide_ids.csv")
guide_ids_df = guide_ids_df[["gRNA_ID", "Sequence"]]

# merge the dataframes to get the guide sequences for each cell
guides_df = cell_ids_df.merge(
    guide_ids_df, left_on="grna", right_on="gRNA_ID", how="left", indicator=True
)
# convert the dataframe to a dictionary with cell barcodes as keys and guide sequences as values
guides_dict_vecid = guides_df.set_index("cell")["Sequence"].to_dict()

guides_dict_vecid

{'TACTTGACCCCN': 'GAGAACGTGATAAGACTCGG',
 'TTACAGCTGAAC': 'GAACTGTGAGCTCGTCGGCG',
 'CTAAGGCCCTTA': 'CAGCACAGCGGATGACCTCG',
 'CTTGACGCAGGT': 'CGAGGGACCAGCCAAGATCG',
 'TAACCCGTACGC': 'AAGGTCCGCTCTCGGACGGG',
 'ATCTAGATACNN': 'GAATGCTGAGTACGGTCTGT',
 'CTATCGTTCTTN': 'CCATCCCATGGTGGACTACC',
 'GTATTGCGAGCN': 'TACGGCGATGAGGCGCTGAG',
 'GTACTGTGTTAN': 'CAGTACTTTTACCCCCGCGG',
 'CGTCTTTCANNN': 'TAGGCTGTTCCACGATCACC',
 'CAGTTTATTCAC': 'CTACTACGGAAACTCGGTCA',
 'CTTGTTGCCTGT': 'TACGTGCCCGAGTACAGCTC',
 'GCATCAGTAATN': 'GGTGATCCTAGTCGACTGGC',
 'CCGCCAGCTCTC': 'CCCGTATGAGCTTCGGATTG',
 'AATCTTGTAGGG': 'CAGCACAGCGGATGACCTCG',
 'AAGCCACCTGAN': 'CAAGTGCGAGGGGCGCTCCG',
 'CAATAAGGTGNN': 'TCCTCCGGCCGACGCCTTCA',
 'CCAGGTTACTTG': 'CAGAAACCCATGTTCGGGAC',
 'GATCAGCCCGAG': 'GAACTGTGAGCTCGTCGGCG',
 'TGATGGCGTGAA': 'GCCCCACGACAACCGCACCA',
 'GTCTAATGTCNN': 'GCGCATCAATATGCCCGCAC',
 'TAGTATTTTCCN': 'CCCGTATGAGCTTCGGATTG',
 'CGGCTGCTCATN': 'GCTCAATGATCTCCACATAG',
 'CTCCGCCCCTGN': 'TCGGTGCGCACCAGCTCGCC',
 'GTTCTACGATNN':

In [8]:
# create a cell barcode column for mapping
cur_data.create_columns(
    slot="obs",
    col_dict={
        "cell_bc": cur_data.adata.obs.index
    },
)

# map the guide sequences to the cell barcodes
cur_data.map_values_from_column(
    ref_col="cell_bc", target_col="guide_sequence", map_dict=guides_dict_vecid
)


Column cell_bc added to adata.obs
Column guide_sequence created in adata.obs
Mapped value TACTTGACCCCN in column cell_bc to GAGAACGTGATAAGACTCGG in column guide_sequence of adata.obs
Mapped value TTACAGCTGAAC in column cell_bc to GAACTGTGAGCTCGTCGGCG in column guide_sequence of adata.obs
Mapped value CTAAGGCCCTTA in column cell_bc to CAGCACAGCGGATGACCTCG in column guide_sequence of adata.obs
Mapped value CTTGACGCAGGT in column cell_bc to CGAGGGACCAGCCAAGATCG in column guide_sequence of adata.obs
Mapped value TAACCCGTACGC in column cell_bc to AAGGTCCGCTCTCGGACGGG in column guide_sequence of adata.obs
Mapped value ATCTAGATACNN in column cell_bc to GAATGCTGAGTACGGTCTGT in column guide_sequence of adata.obs
Mapped value CTATCGTTCTTN in column cell_bc to CCATCCCATGGTGGACTACC in column guide_sequence of adata.obs
Mapped value GTATTGCGAGCN in column cell_bc to TACGGCGATGAGGCGCTGAG in column guide_sequence of adata.obs
Mapped value GTACTGTGTTAN in column cell_bc to CAGTACTTTTACCCCCGCGG in colu

### Extract perturbation symbols

#### Add `perturbation_target_symbol` column based on the `perturbation_name`

In [9]:
cur_data.create_columns(
    slot="obs", col_dict={"perturbed_target_symbol": cur_data.adata.obs["perturbation_name"]},
    overwrite=True
)
cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Column perturbed_target_symbol added to adata.obs
Unique values in adata.obs.perturbed_target_symbol: 97
--------------------------------------------------
{'Essential_library_DHODH_1',
 'Essential_library_DHODH_2',
 'Essential_library_DHODH_3',
 'Essential_library_MVD_1',
 'Essential_library_MVD_2',
 'Essential_library_MVD_3',
 'Essential_library_TUBB_1',
 'Essential_library_TUBB_2',
 'Essential_library_TUBB_3',
 'Tcrlibrary_BACH2_1',
 'Tcrlibrary_BACH2_2',
 'Tcrlibrary_BACH2_3',
 'Tcrlibrary_DOK2_1',
 'Tcrlibrary_DOK2_2',
 'Tcrlibrary_DOK2_3',
 'Tcrlibrary_EGR1_1',
 'Tcrlibrary_EGR1_2',
 'Tcrlibrary_EGR1_3',
 'Tcrlibrary_EGR2_1',
 'Tcrlibrary_EGR2_2',
 'Tcrlibrary_EGR2_3',
 'Tcrlibrary_EGR3_1',
 'Tcrlibrary_EGR3_2',
 'Tcrlibrary_EGR3_3',
 'Tcrlibrary_EGR4_1',
 'Tcrlibrary_EGR4_2',
 'Tcrlibrary_EGR4_3',
 'Tcrlibrary_ETS1_1',
 'Tcrlibrary_ETS1_2',
 'Tcrlibrary_ETS1_3',
 'Tcrlibrary_FOS_1',
 'Tcrlibrary_FOS_2',
 'Tcrlibrary_FOS_3',
 'Tcrlibrary_GATA3_1',
 'Tcrlibrary_GATA3_2',
 'Tcrlibr

#### Clean up `perturbation_target_symbol` column

In [10]:
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace=r"Essential_library_",
    replace_value=""
)
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace=r"Tcrlibrary_",
    replace_value=""
)
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace="_(1|2|3)",
    replace_value=""
)
cur_data.replace_entries(
    slot="obs",
    column="perturbed_target_symbol",
    to_replace="control",
    replace_value="control_nontargeting"
)



cur_data.show_unique(slot = 'obs', column = 'perturbed_target_symbol')

Replaced entries Essential_library_ ->  in column perturbed_target_symbol of adata.obs
Replaced entries Tcrlibrary_ ->  in column perturbed_target_symbol of adata.obs
Replaced entries _(1|2|3) ->  in column perturbed_target_symbol of adata.obs
Replaced entries control -> control_nontargeting in column perturbed_target_symbol of adata.obs
Unique values in adata.obs.perturbed_target_symbol: 33
--------------------------------------------------
{'BACH2',
 'DHODH',
 'DOK2',
 'EGR1',
 'EGR2',
 'EGR3',
 'EGR4',
 'ETS1',
 'FOS',
 'GATA3',
 'JUN',
 'JUNB',
 'JUND',
 'LAT',
 'LCK',
 'MVD',
 'NFAT5',
 'NFATC1',
 'NFATC2',
 'NFATC3',
 'NFKB1',
 'NFKB2',
 'NR4A1',
 'PTPN11',
 'PTPN6',
 'REL',
 'RELA',
 'RELB',
 'RUNX1',
 'RUNX2',
 'TUBB',
 'ZAP70',
 'control_nontargeting'}
--------------------------------------------------


### Standardise perturbation targets

In [11]:
cur_data.standardize_genes(
    slot='obs',
    input_column='perturbed_target_symbol',
    input_column_type='gene_symbol',
    multiple_entries=False
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in perturbed_target_symbol of the provided dataframe to gene symbols
Converted 5905/5905 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Add `perturbed_target_number` column

In [13]:
cur_data.count_entries(
    slot='obs',
    input_column='perturbed_target_symbol',
    count_column_name='perturbed_target_number',
    sep='|'
)

Counted entries in column perturbed_target_symbol of adata.obs and stored in perturbed_target_number


In [15]:
cur_data.show_obs(['perturbation_name', 'perturbed_target_symbol', 'perturbed_target_ensg', 'perturbed_target_coord'])

Observation data:
DataFrame shape: (5905, 4)
--------------------------------------------------
               perturbation_name perturbed_target_symbol  \
index                                                      
TACTTGACCCCN             control    control_nontargeting   
TTACAGCTGAAC   Tcrlibrary_JUND_2                    JUND   
CTAAGGCCCTTA  Tcrlibrary_BACH2_3                   BACH2   
CTTGACGCAGGT  Tcrlibrary_NFKB2_3                   NFKB2   
TAACCCGTACGC    Tcrlibrary_JUN_1                     JUN   
...                          ...                     ...   
GTGTGTCGGGGA             control    control_nontargeting   
TTTAGTATTCCA   Tcrlibrary_EGR1_3                    EGR1   
GGCGCCTAATCG             control    control_nontargeting   
GCTGAGCGTTTN  Tcrlibrary_NR4A1_1                   NR4A1   
ACTCGCAGAGAT    Tcrlibrary_REL_3                     REL   

             perturbed_target_ensg       perturbed_target_coord  
index                                                    

### Add treatment information

Map stimulated and unstimulated to "anti-CD3 antibody|anti-CD28 antibody" and "Untreated control" respectively.

In [16]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "treatment_label": cur_data.adata.obs['perturbation_2'],
        "treatment_id": cur_data.adata.obs['perturbation_2']
    }
)

Column treatment_label added to adata.obs
Column treatment_id added to adata.obs


In [17]:
cur_data.replace_entries(
    slot="obs",
    column="treatment_label",
    to_replace="unstimulated",
    replace_value="Untreated Control"
)

cur_data.replace_entries(
    slot="obs",
    column="treatment_label",
    to_replace="stimulated",
    replace_value="anti-CD3 antibody|anti-CD28 antibody"
)

cur_data.replace_entries(
    slot="obs",
    column="treatment_id",
    to_replace="unstimulated",
    replace_value="NCIT:C184729"
)

cur_data.replace_entries(
    slot="obs",
    column="treatment_id",
    to_replace="stimulated",
    replace_value="EFO:0003317|EFO:0003304"
)

Replaced entries unstimulated -> Untreated Control in column treatment_label of adata.obs
Replaced entries stimulated -> anti-CD3 antibody|anti-CD28 antibody in column treatment_label of adata.obs
Replaced entries unstimulated -> NCIT:C184729 in column treatment_id of adata.obs
Replaced entries stimulated -> EFO:0003317|EFO:0003304 in column treatment_id of adata.obs


In [18]:
cur_data.show_obs(['perturbation_2', 'treatment_label', 'treatment_id'])

Observation data:
DataFrame shape: (5905, 3)
--------------------------------------------------
             perturbation_2                       treatment_label  \
index                                                               
TACTTGACCCCN     stimulated  anti-CD3 antibody|anti-CD28 antibody   
TTACAGCTGAAC     stimulated  anti-CD3 antibody|anti-CD28 antibody   
CTAAGGCCCTTA     stimulated  anti-CD3 antibody|anti-CD28 antibody   
CTTGACGCAGGT     stimulated  anti-CD3 antibody|anti-CD28 antibody   
TAACCCGTACGC     stimulated  anti-CD3 antibody|anti-CD28 antibody   
...                     ...                                   ...   
GTGTGTCGGGGA   unstimulated                     Untreated Control   
TTTAGTATTCCA   unstimulated                     Untreated Control   
GGCGCCTAATCG   unstimulated                     Untreated Control   
GCTGAGCGTTTN   unstimulated                     Untreated Control   
ACTCGCAGAGAT   unstimulated                     Untreated Control   

      

### Add perturbation information

In [19]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "perturbation_type_label": "CRISPRn", 
        "perturbation_type_id": None
    }
)

Column perturbation_type_label added to adata.obs
Column perturbation_type_id added to adata.obs


### Add timepoint information

In [20]:
cur_data.create_columns(
    slot="obs",
    col_dict={"timepoint": "P0DT0H0M0S"},
)

Column timepoint added to adata.obs


### Add model system information

In [21]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "model_system_label": "cell line", 
        "model_system_id": None
    }
)

Column model_system_label added to adata.obs
Column model_system_id added to adata.obs


### Add tissue information

In [22]:
cur_data.create_columns(
    slot='obs',
    col_dict={
        'tissue': 'blood'
    }
)

cur_data.standardize_ontology(
    input_column='tissue',
    column_type='term_name',
    ontology_type='tissue',
    overwrite=True
)

Column tissue added to adata.obs
Mapped 1 tissue ontology terms from `tissue` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower     ontology_id
0        blood              blood      blood  UBERON:0000178
--------------------------------------------------


### Add cell type information

In [23]:
cur_data.standardize_ontology(
    input_column='celltype',
    column_type='term_name',
    ontology_type='cell_type'
)

Mapped 1 cell_type ontology terms from `celltype` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
  input_column input_column_lower name_lower ontology_id
0      T cells            t cells    t cells  CL:0000084
--------------------------------------------------


### Add cell line information

In [24]:
cur_data.standardize_ontology(
    input_column='cell_line',
    column_type='term_name',
    ontology_type='cell_line'
)

Mapped 1 cell_line ontology terms from `cell_line` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
   input_column input_column_lower    name_lower  ontology_id
0  Jurkat cells       jurkat cells  jurkat cells  CLO:0007043
--------------------------------------------------


### Add disease information

In [25]:
cur_data.standardize_ontology(
    input_column='disease',
    column_type='term_name',
    ontology_type='disease'
)

Mapped 1 disease ontology terms from `disease` column to ontology terms
DataFrame shape: (1, 4)
--------------------------------------------------
            input_column     input_column_lower             name_lower  \
0  acute T cell leukemia  acute t cell leukemia  acute t cell leukemia   

     ontology_id  
0  MONDO:0004963  
--------------------------------------------------


### Add species information

In [26]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "species": "Homo sapiens"
    }
)

Column species added to adata.obs


### Add sex information

In [27]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "sex_label": "male", 
        "sex_id": None
    }
)

Column sex_label added to adata.obs
Column sex_id added to adata.obs


### Add developmental stage information

In [28]:
cur_data.create_columns(
    slot="obs",
    col_dict={
        "developmental_stage_label": "adolescent", 
        "developmental_stage_id": None
    }
)

Column developmental_stage_label added to adata.obs
Column developmental_stage_id added to adata.obs


### Match schema column order

In [29]:
cur_data.match_schema_columns(slot='obs')

Matched columns of adata.obs to the obs_schema.


### Validate obs metadata

In [30]:
cur_data.validate_data(slot='obs')

adata.obs is valid according to the obs_schema.
Validated data:


Unnamed: 0,perturbation_name,perturbed_target_coord,perturbed_target_number,perturbed_target_ensg,perturbed_target_symbol,perturbed_target_biotype,guide_sequence,perturbation_type_label,perturbation_type_id,timepoint,...,cell_type_label,cell_type_id,cell_line_label,cell_line_id,sex_label,sex_id,developmental_stage_label,developmental_stage_id,disease_label,disease_id
0,control,,1,control_nontargeting,control_nontargeting,,GAGAACGTGATAAGACTCGG,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
1,Tcrlibrary_JUND_2,chr19:18279694-18281622;-1,1,ENSG00000130522,JUND,protein_coding,GAACTGTGAGCTCGTCGGCG,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
2,Tcrlibrary_BACH2_3,chr6:89926528-90296843;-1,1,ENSG00000112182,BACH2,protein_coding,CAGCACAGCGGATGACCTCG,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
3,Tcrlibrary_NFKB2_3,chr10:102394110-102402524;1,1,ENSG00000077150,NFKB2,protein_coding,CGAGGGACCAGCCAAGATCG,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
4,Tcrlibrary_JUN_1,chr1:58776845-58784048;-1,1,ENSG00000177606,JUN,protein_coding,AAGGTCCGCTCTCGGACGGG,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5900,control,,1,control_nontargeting,control_nontargeting,,GAATGCTGAGTACGGTCTGT,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
5901,Tcrlibrary_EGR1_3,chr5:138465479-138469303;1,1,ENSG00000120738,EGR1,protein_coding,AGTGAGGAAAGGATCCGAAC,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
5902,control,,1,control_nontargeting,control_nontargeting,,GACCCTGAACAACCTAGGAT,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963
5903,Tcrlibrary_NR4A1_1,chr12:52022832-52059507;1,1,ENSG00000123358,NR4A1,protein_coding,TCGCCCAGCCAGACTTACGA,CRISPRn,,P0DT0H0M0S,...,T cell,CL:0000084,JURKAT cell,CLO:0007043,male,,adolescent,,T-cell acute lymphoblastic leukemia,MONDO:0004963


# VAR slot curation

### Standardise genes

In [31]:
cur_data.show_var()

Variable data:
DataFrame shape: (36722, 2)
--------------------------------------------------
              ncounts  ncells
gene_symbol                  
A1BG             60.0      57
A1BG-AS1        324.0     306
A1CF              1.0       1
A2M               6.0       6
A2M-AS1           1.0       1
...               ...     ...
hsa-mir-1587      7.0       7
hsa-mir-3149      0.0       0
hsa-mir-4259      1.0       1
snosnR66          1.0       1
yR211F11.2        3.0       3

[36722 rows x 2 columns]
--------------------------------------------------


In [32]:
cur_data.create_columns(
    slot = 'var', 
    col_dict={'gene_symbol': cur_data.adata.var.index},
    overwrite=True
)

Column gene_symbol added to adata.var


In [33]:
cur_data.standardize_genes(
    slot="var", input_column="gene_symbol", input_column_type="gene_symbol", remove_version=True
)

Removed version numbers from gene_symbol in adata.var


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  map_df["synonyms"] = map_df["synonyms"].str.split("|")


Mapped potential synonyms in gene_symbol of the provided dataframe to gene symbols
Converted 24161/36722 gene symbols/ENSG IDs to standardized gene symbols/ENSG IDs
--------------------------------------------------


### Validate var metadata

In [34]:
cur_data.validate_data(slot='var')

adata.var is valid according to the var_schema.
Validated data:


Unnamed: 0_level_0,ensembl_gene_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,ENSG00000121410,A1BG
A1BG-AS1,ENSG00000268895,A1BG-AS1
A1CF,ENSG00000148584,A1CF
A2M,ENSG00000175899,A2M
A2M-AS1,ENSG00000245105,A2M-AS1
...,...,...
hsa-mir-1587,ENSG00000263972,MIR1587
hsa-mir-3149,ENSG00000266712,MIR3149
hsa-mir-4259,ENSG00000266458,MIR4259
snosnR66,,


# Metadata curation

### Auto-populate available metadata

In [35]:
cur_data.populate_exp_metadata()

Experiment metadata populated with available fields from adata.obs:
--------------------------------------------------
{'associated_diseases': [{'term_id': 'MONDO:0004963',
                          'term_label': 'T-cell acute lymphoblastic leukemia'}],
 'experiment': {'number_of_perturbed_entities': 5905,
                'number_of_perturbed_targets': 32,
                'perturbation_type': [{'term_id': None,
                                       'term_label': 'CRISPRn'}],
                'perturbed_target_biotype': ['protein_coding'],
                'perturbed_targets': ['control_nontargeting',
                                      'ENSG00000130522',
                                      'ENSG00000112182',
                                      'ENSG00000077150',
                                      'ENSG00000177606',
                                      'ENSG00000109320',
                                      'ENSG00000171223',
                                      'ENSG00000107

### Manually curate metadata

Study details

In [36]:
cur_data.add_exp_metadata(
    metadata_slot='study',
    metadata={
        "title": "Pooled CRISPR screening with single-cell transcriptome readout",
        "study_uri": "https://doi.org/10.1038/nmeth.4177",
        "year": 2017,
        "first_author": {"first_name": "Paul", "last_name": "Datlinger"},
        "last_author": {"first_name": "Christoph", "last_name": "Bock"},
    }
)

Metadata for 'study' successfully validated:
--------------------------------------------------
{'first_author': {'first_name': 'Paul', 'last_name': 'Datlinger'},
 'last_author': {'first_name': 'Christoph', 'last_name': 'Bock'},
 'study_uri': 'https://doi.org/10.1038/nmeth.4177',
 'title': 'Pooled CRISPR screening with single-cell transcriptome readout',
 'year': 2017}
--------------------------------------------------


Experiment details

In [37]:
cur_data.add_exp_metadata(
    metadata_slot='experiment',
    metadata={
        "title": "Transcriptomics measurements of 5905 Jurkat cells induced with anti-CD3 and anti-CD28 antibodies",
        "summary": """
            Jurkat cells were transduced with a gRNA library targeting high-level
            regulators of T cell receptor signaling and a set of transcription factors. After 10
            days of antibiotic selection and expansion, cells were stimulated with anti-CD3 and
            anti-CD28 antibodies or left untreated. Both conditions were analyzed using CROP-seq,
            measuring TCR activation for each gene knockout. The dataset comprises 5,905 high-quality
            single-cell transcriptomes with uniquely assigned gRNAs.
            """,
        "replicates": "none",
        "number_of_samples": 2
    }
)

Metadata for 'experiment' successfully validated:
--------------------------------------------------
{'number_of_perturbed_entities': 5905,
 'number_of_perturbed_targets': 32,
 'number_of_samples': 2,
 'perturbation_type': [{'term_id': None, 'term_label': 'CRISPRn'}],
 'perturbed_target_biotype': ['protein_coding'],
 'perturbed_targets': ['control_nontargeting',
                       'ENSG00000130522',
                       'ENSG00000112182',
                       'ENSG00000077150',
                       'ENSG00000177606',
                       'ENSG00000109320',
                       'ENSG00000171223',
                       'ENSG00000107485',
                       'ENSG00000131196',
                       'ENSG00000173039',
                       'ENSG00000134954',
                       'ENSG00000124813',
                       'ENSG00000159216',
                       'ENSG00000170345',
                       'ENSG00000104856',
                       'ENSG00000182866',
     

Perturbation details

In [38]:
cur_data.add_exp_metadata(
    metadata_slot='perturbation',
    metadata={
        "library_generation_type": {
            "term_id": "EFO:0022868",
            "term_label": "endogenous",
        },
        "library_generation_method": {
            "term_id": "EFO:0022876",
            "term_label": "SpCas9",
        },
        "enzyme_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "library_delivery_method": {
            "term_id": None,
            "term_label": "lentiviral transduction",
        },
        "enzyme_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "library_integration_state": {
            "term_id": None,
            "term_label": "random locus integration",
        },
        "enzyme_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library_expression_control": {
            "term_id": None,
            "term_label": "constitutive expression",
        },
        "library": {
            "library_name": "custom",
            "accession": None,
            "library_format": {
                "term_id": None,
                "term_label": "pooled",
            },
            "library_scope": {
                "term_id": None,
                "term_label": "focused",
            },
            "library_perturbation_type": [
                {
                    "term_id": None,
                    "term_label": "knockout",
                },
            ],
            "manufacturer": "Bock",
            "lentiviral_generation": "3",
            "grnas_per_gene": "3",
            "total_grnas": "116",
            "total_variants": None,
        }
    }
)

Metadata for 'perturbation' successfully validated:
--------------------------------------------------
{'enzyme_delivery_method': {'term_id': None,
                            'term_label': 'lentiviral transduction'},
 'enzyme_expression_control': {'term_id': None,
                               'term_label': 'constitutive expression'},
 'enzyme_integration_state': {'term_id': None,
                              'term_label': 'random locus integration'},
 'library': {'accession': None,
             'grnas_per_target': None,
             'lentiviral_generation': '3',
             'library_format': {'term_id': None, 'term_label': 'pooled'},
             'library_name': 'custom',
             'library_perturbation_type': [{'term_id': None,
                                            'term_label': 'knockout'}],
             'library_scope': {'term_id': None, 'term_label': 'focused'},
             'manufacturer': 'Bock',
             'total_grnas': '116',
             'total_variants': None

Assay details

In [39]:
cur_data.add_exp_metadata(
    metadata_slot='assay',
    metadata={
        "readout_dimensionality": {
            "term_id": None,
            "term_label": "high-dimensional assay",
        },
        "readout_type": {
            "term_id": None,
            "term_label": "transcriptomic",
        },
        "readout_technology": {
            "term_id": None,
            "term_label": "single-cell rna-seq",
        },
        "method_name": {
            "term_id": None,
            "term_label": "Perturb-seq",
        },
        "method_uri": None,
        "sequencing_library_kit": {
            "term_id": None,
            "term_label": "Nextera XT",
        },
        "sequencing_platform": {"term_id": None, "term_label": "Illumina HiSeq 4000"},
        "sequencing_strategy": {"term_id": None, "term_label": "barcode sequencing"},
        "software_counts": {"term_id": None, "term_label": "Drop-seq Tools"},
        "software_analysis": {"term_id": None, "term_label": "Custom"},
        "reference_genome": {
            "term_id": None,
            "term_label": "GRCh38",
        }
    }
)

Metadata for 'assay' successfully validated:
--------------------------------------------------
{'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
 'method_uri': None,
 'readout_dimensionality': {'term_id': None,
                            'term_label': 'high-dimensional assay'},
 'readout_technology': {'term_id': None, 'term_label': 'single-cell rna-seq'},
 'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
 'reference_genome': {'term_id': None, 'term_label': 'GRCh38'},
 'sequencing_library_kit': {'term_id': None, 'term_label': 'Nextera XT'},
 'sequencing_platform': {'term_id': None, 'term_label': 'Illumina HiSeq 4000'},
 'sequencing_strategy': {'term_id': None, 'term_label': 'barcode sequencing'},
 'software_analysis': {'term_id': None, 'term_label': 'Custom'},
 'software_counts': {'term_id': None, 'term_label': 'Drop-seq Tools'}}
--------------------------------------------------


Model system details

In [40]:
cur_data.add_exp_metadata(
    metadata_slot='model_system',
    metadata={
        "species": "Homo sapiens",
        "passage_number": None,
        }
)

Metadata for 'model_system' successfully validated:
--------------------------------------------------
{'cell_line': [{'term_id': 'CLO:0007043', 'term_label': 'JURKAT cell'}],
 'cell_type': [{'term_id': 'CL:0000084', 'term_label': 'T cell'}],
 'developmental_stage': [{'term_id': None, 'term_label': 'adolescent'}],
 'model_system': [{'term_id': None, 'term_label': 'cell line'}],
 'passage_number': None,
 'sex': [{'term_id': None, 'term_label': 'male'}],
 'species': 'Homo sapiens',
 'tissue': [{'term_id': 'UBERON:0000178', 'term_label': 'blood'}]}
--------------------------------------------------


Associated dataset details

In [41]:
cur_data.add_exp_metadata(
    metadata_slot='associated_datasets',
    metadata=[
        {
            "dataset_accession": "GSE92872",
            "dataset_uri": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92872",
            "dataset_description": "Digital expression matrix",
            "dataset_file_name": "GSE92872_CROP-seq_Jurkat_TCR.digital_expression.csv.gz",
        },
        {
            "dataset_accession": "DatlingerBock2017.h5ad",
            "dataset_uri": "https://zenodo.org/records/13350497/files/DatlingerBock2017.h5ad",
            "dataset_description": "Processed .h5ad file",
            "dataset_file_name": "DatlingerBock2017.h5ad",
        }
    ]
)

Metadata for 'associated_datasets' successfully validated:
--------------------------------------------------
[{'dataset_accession': 'GSE92872',
  'dataset_description': 'Digital expression matrix',
  'dataset_file_name': 'GSE92872_CROP-seq_Jurkat_TCR.digital_expression.csv.gz',
  'dataset_uri': 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE92872'},
 {'dataset_accession': 'DatlingerBock2017.h5ad',
  'dataset_description': 'Processed .h5ad file',
  'dataset_file_name': 'DatlingerBock2017.h5ad',
  'dataset_uri': 'https://zenodo.org/records/13350497/files/DatlingerBock2017.h5ad'}]
--------------------------------------------------


### Validate metadata

In [42]:
cur_data.validate_exp_metadata()

Experiment metadata successfully validated:
--------------------------------------------------
{'assay': {'method_name': {'term_id': None, 'term_label': 'Perturb-seq'},
           'method_uri': None,
           'readout_dimensionality': {'term_id': None,
                                      'term_label': 'high-dimensional assay'},
           'readout_technology': {'term_id': None,
                                  'term_label': 'single-cell rna-seq'},
           'readout_type': {'term_id': None, 'term_label': 'transcriptomic'},
           'reference_genome': {'term_id': None, 'term_label': 'GRCh38'},
           'sequencing_library_kit': {'term_id': None,
                                      'term_label': 'Nextera XT'},
           'sequencing_platform': {'term_id': None,
                                   'term_label': 'Illumina HiSeq 4000'},
           'sequencing_strategy': {'term_id': None,
                                   'term_label': 'barcode sequencing'},
           'software

# Save the dataset

In [43]:
cur_data.save_curated_data()

  adata.obs = adata.obs.fillna(value=np.nan)


Curated data saved to ../curated/h5ad/datlinger_2017_curated.h5ad
