ASAP CRN Metadata validation 
# Team Scherzer. ASAP CRN Metadata validation 

10 Oct 2024
Andy Henrie

## scrnaseq_hybsel




In [1]:
import pandas as pd
from pathlib import Path
import os, sys

sys.path.append(os.path.abspath((os.path.join(os.getcwd(), 'src/crn_utils'))))

from util import read_CDE, NULL, prep_table, read_meta_table, create_metadata_package
from validate import validate_table, ReportCollector
from update_schema import v1_to_v2, v2_to_v3_PMDBS, intervention_typer
from checksums import extract_md5_from_details2, get_md5_hashes 
from bucket_util import authenticate_with_service_account, gsutil_ls, gsutil_cp, gsutil_mv 

%load_ext autoreload
%autoreload 2

root_path = Path.home() / ("Projects/ASAP/data/teams")


Streamlit NOT successfully imported


## CDEs
load the relavent CDEs

In [2]:
schema_version = "v1"
schema_path = Path.home() / "Projects/ASAP/crn-utils/resource/CDE"
CDEv1 = read_CDE(schema_version, local_path=schema_path)
schema_version = "v2.1"
CDEv2 = read_CDE(schema_version, local_path=schema_path)
schema_version = "v3.0"
CDEv3 = read_CDE(schema_version, local_path=schema_path)

metadata_version: ASAP_CDE_v1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v1.csv
read local file
metadata_version: ASAP_CDE_v2.1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v2.1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v2.1.csv
read local file
metadata_version: ASAP_CDE_v3.0
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v3.0
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v3.0.csv
read local file


## Load original tables 
These were submitted as v2.1 (actually early v3.0.0 beta)


Team Scherzer


In [3]:
## convert 
team = "scherzer"
dataset_name = "sn-rnaseq-mtg-hybsel"

metadata_path = root_path / f"{team}/{dataset_name}/metadata"
og_path = metadata_path / "og"



In [4]:
sheets = ["SAMPLE","SUBJECT","CLINPATH","STUDY","PROTOCOL","DATA"]

excel_path = og_path / "ASAP CDE v3.0.0-beta_sks_new.xlsx"
STUDY = pd.read_excel(excel_path,sheet_name="STUDY")#,header=1)#.drop(columns="Field")
CLINPATH = pd.read_excel(excel_path,sheet_name="CLINPATH")#,header=1)#.drop(columns="Field")
SUBJECT = pd.read_excel(excel_path,sheet_name="SUBJECT")#,header=1)#.drop(columns="Field")
SAMPLE = pd.read_excel(excel_path,sheet_name="SAMPLE")#,header=1)#.drop(columns="Field")
PROTOCOL = pd.read_excel(excel_path,sheet_name="PROTOCOL")#,header=1)#.drop(columns="Field")
DATA = pd.read_excel(excel_path,sheet_name="DATA")#,header=1)#.drop(columns="Field")

metadata_version = "v2"

METADATA_VERSION_DATE = f"{metadata_version}_{pd.Timestamp.now().strftime('%Y%m%d')}"

In [5]:
# create csvs 
STUDY = STUDY.rename({'submittor_email':'submitter_email'}, axis=1)

STUDY.to_csv(og_path / f"STUDY.csv", index=False)
SAMPLE.to_csv(og_path / f"SAMPLE.csv", index=False)
SUBJECT.to_csv(og_path / f"SUBJECT.csv", index=False)
PROTOCOL.to_csv(og_path / f"PROTOCOL.csv", index=False)
CLINPATH.to_csv(og_path / f"CLINPATH.csv", index=False)
DATA.to_csv(og_path / f"DATA.csv", index=False)


In [6]:
CDE = CDEv2
tables = CDE['Table'].unique()

dfs = {}
for table in tables:
    df = read_meta_table(og_path / f"{table}.csv")
    schema = CDE[CDE['Table'] == table]

    report = ReportCollector(destination="NA")
    full_table, report = validate_table(df.copy(), table, schema, report)
    report.print_log()
    dfs[table] = full_table

recoding number_of_brain_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **Missing Optional Fields in STUDY: PI_ORCHID**
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)

	- preprocessing_references: 1/1 empty rows (OPTIONAL)

	- metadata_version_date: 1/1 empty rows (OPTIONAL)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
🚨⚠️❗ **1 Fields with invalid entries:**
- _*ASAP_team_name*_:  invalid values 💩'Team Scherzer'
    - valid ➡️ 'TEAM-LEE', 'TEAM-HAFLER', 'TEAM-HARDY', 'TEAM-JAKOBSSON', 'TEAM-SCHERZER', 'TEAM-SULZER', 'TEAM-VOET', 'TEAM-WOOD', 'NA'
🚨⚠️❗ **Extra field in STUDY: PI_ORCID**

All required fields are present in *PROTOCOL* table.
🚨⚠️❗ **6 Fields with empty (NULL) values:**

	- sample_collection_summary: 1/1 empty rows (REQUIRED)

	- cell_extraction_summar

In [7]:
STUDY = prep_table(dfs['STUDY'],CDEv2)
STUDY['team_dataset_id'] = dataset_name.replace(" ", "_").replace("-", "_")
STUDY['metadata_version_date'] = METADATA_VERSION_DATE
STUDY['ASAP_team_name'] = "TEAM-SCHERZER"


In [8]:
STUDY

Unnamed: 0,ASAP_team_name,ASAP_lab_name,project_name,team_dataset_id,project_dataset,project_description,PI_full_name,PI_email,contributor_names,submitter_name,...,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCID,PI_google_scholar_id,DUA_version,preprocessing_references,metadata_version_date,alternate_dataset_id,PI_ORCHID
0,TEAM-SCHERZER,"Scherzer, Dong, and Levin",Parkinson5D: deconstructing proximal disease m...,sn_rnaseq_mtg_hybsel,PD5D_MTG_snRNAseq_hybsel,Here we will develop a molecular atlas of Park...,Dr. Joshua Levin,Jlevin@broadinstitute.org,"Clemens, Scherzer; Xianjun, Dong; Joshua, Levi...",Dr. Sean Simmons,...,94,MTG,PD and control postmortem brains,,,ASAP Access & Use Policy,,v2_20241107,,


In [9]:
SUBJECT = prep_table(dfs['SUBJECT'],CDEv2)
SUBJECT['hx_melanoma'] = SUBJECT['hx_melanoma'].replace({"Present":"Yes"})

In [10]:
CLINPATH = prep_table(dfs['CLINPATH'],CDEv2)

# autopsy coding is too too idosyncratic.  leave 
# the path_nia_aa_a are coded as path_nia_ri... swap them 


# 'Low', 'Intermediate', 'High', 'None'
CLINPATH['path_nia_ri'] = CLINPATH['path_nia_aa_a'].replace({'Low ADNC': 'Low', 
                                                            'Not AD': 'None', 
                                                            'Intermediate ADNC': 'Intermediate'})
CLINPATH['path_nia_aa_a'] = NULL


# path_thal
CLINPATH['path_thal']= CLINPATH['path_thal'].replace({'Phase 0 (A0)':'0', 
                                'Phase 1 (A1)':'1', 
                                'Phase 2 (A1)':'2', 
                                'Phase 3 (A2)':'3', 
                                'Phase 4 (A3)':'4',
                                'Phase 5 (A3)':'5', 
                                'Unknown':NULL, 
                                'Missing/unknown':NULL})

CLINPATH['path_infarcs'] = CLINPATH['path_infarcs'].replace('Unknown', NULL)


# no idea what No, undx, yes maps to.  leave
CLINPATH['TDP43'] = CLINPATH['TDP43'].replace({'Unknown': NULL,'Undx':NULL})

# sn_neuronal_loss has alternate 0-3 coding.  leave

# add 'replicate' coding (nans)
mckeith_map = {'l. Olfactory Bulb-Only':"Olfactory bulb only", 'lla. Brainstem Predominant':"Brainstem",
       'lb. Limbic Predominant':"Limbic (transitional)", 'lV. Neocortical':"Neocortical",
       'lll. Brainstem/Limbic':"Amygdala Predominant", '0. No Lewy bodies':"Absent", "Unknown":NULL,
       'L. Olfactory Bulb-Only':"Olfactory bulb only", 'Lla. Brainstem Predominant':"Brainstem",
       'Lb. Limbic Predominant':"Limbic (transitional)", 'LV. Neocortical':"Neocortical",
       'Lll. Brainstem/Limbic':"Amygdala Predominant", 'Llb. Limbic Predominant': 'Limbic (amygdala) predominant'
       }
    


CLINPATH['path_mckeith'] = CLINPATH['path_mckeith'].replace(mckeith_map)

# leave path_nia_ri like this for now. not sure how to map "criteria not met" and "Not AD"

# leave amyloid_angiopathy_severity_scale like this for now. not sure how to map 'Cerebral amyloid angiopathy, temporal and occipital lobe','Cerebral amyloid angiopathy, frontal lobe']
CLINPATH



Unnamed: 0,subject_id,source_subject_id,duration_pmi,path_autopsy_dx_main,path_autopsy_second_dx,path_autopsy_third_dx,path_autopsy_fourth_dx,path_autopsy_fifth_dx,path_autopsy_sixth_dx,path_autopsy_seventh_dx,...,path_nia_ri,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level,dig_slide_avail,quant_path_avail
0,BN0009,00-09,4.00,PD/Dem,Charcot-Marie-Tooth disease (history),"GBA L444P/WT, L444P mutation",,,,,...,,,,,,,,,,
1,BN0329,03-29,4.50,PD/Dem,Seizure disorder (history),,,,,,...,,,,,,,,,,
2,BN0339,03-39,2.75,Control,Non-diagnostic Alzheimer's changes,CAA,,,,,...,,,,,,Mild,Moderate,,,
3,BN0341,03-41,2.50,Control,Non-diagnostic Alzheimer's changes,CWMR,,,,,...,,,,,No,Moderate,Mild,,,
4,BN0347,03-47,3.50,Control (MCI),Non-diagnostic Alzheimer's changes,"Argyrophilic grains, mesial temporal lobe",Infarct(s),CWMR,Several microscopic foci of cerebellar cortica...,"Etat crible, putamen and caudate nucleus",...,,,,,,Mild,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,BN2003,20-03,2.65,PD/Dem,"Microscopic changes of Alzheimer's disease, in...","Focal non-specific glial tauopathy, cortex of ...",,,,,...,Intermediate,,,,No,,,,,
90,BN2015,20-15,5.40,Control (history),"Microscopic changes of Alzheimer's disease, in...",Incidental Lewy body disease,,,,,...,,,,,No,,,,,
91,BN9944,99-44,2.16,Control,Non-diagnostic Alzheimer's changes,Alzheimer Type II astrocytosis consistent with...,,,,,...,,,,,,,,,,
92,BN9947,99-47,2.50,Control,Non-diagnostic Alzheimer's changes,Alzheimer type II astrocytosis,Inc LBs,,,,...,,,,,,,,,,



> leaving these incorrect encodings in place.  They are consistent between both team-scherzer-pmdbs datasets. 

🚨⚠️❗ **3 Fields with invalid entries:**
- _*path_autopsy_dx_main*_:  invalid values 💩'PD/Dem', 'Control', 'Control (MCI)', 'PD/ND', 'Control, Non-motoric', 'PDD', 'MCI', 'PD (MCI)', 'Control (history)', 'Control (diseased)', 'MCI (clinical history)', 'Control (clinical history)'
    - valid ➡️ 'Lewy body disease nos', 'Parkinson's disease', 'Parkinson's disease with dementia', 'Dementia with Lewy bodies', 'Multiple system atrophy (SND>OPCA)', 'Multiple system atrophy (OPCA<SND)', 'Multiple system atrophy (SND=OPCA)', 'Progressive supranuclear palsy', 'Corticobasal degeneration', 'Globular glial tauoapathy (GGT)', 'Chronic traumatic encephalopathy (CTE)', 'FTLD-Tau (Pick's)', 'FTLD-Tau (MAPT)', 'FTLD-Tau (AGD)', 'FTLD-TDP43, Type A', 'FTLD-TDP43, Type B', 'FTLD-TDP43, Type C', 'FTLD-TDP43, Type D', 'FTLD-TDP43, Type E', 'Motor neurone disease-TDP43 (MND or ALS)', 'FTLD-MND-TDP43', 'Huntington's disease', 'Spinocerebellar ataxia, nos', 'Prion disease, nos', 'Alzheimer's disease (high level neuropathological change)', 'Alzheimer's disease (intermediate level neuropathological change)', 'Control, Low level AD neuropathological change', 'Control, Limbic predominant age-related TDP43 proteinopathy (LATE)', 'Control, Argyrophilic grain disease', 'Control, Primary age-related tauopathy (PART)', 'Control, Ageing-related tau astrogliopathy (ARTAG)', 'Control, Cerebrovascular disease (atherosclerosis)', 'Control, Cerebrovascular disease (hyaline arteriolosclerosis)', 'Control, Cerebrovascular disease (cerebral amyloid angiopathy)', 'Control, no misfolded protein or significant vascular pathology', 'Other neurological disorder', 'NA'
- _*sn_neuronal_loss*_:  invalid values 💩'3.0', '0.0', '1.0', '2.0'
    - valid ➡️ 'None', 'Mild', 'Moderate', 'Severe', 'Not assessed', 'Unknown', 'NA'
- _*TDP43*_:  invalid values 💩'No', 'Yes'
    - valid ➡️ 'None in medial temporal lobe', 'Present in amygdala, only', 'Present in hippocampus, only', 'Present in amygdala and hippocampus, only', 'Present in medial temporal lobe and middle frontal gyrus (not FTLD pattern)', 'Unknown', 'NA'

In [11]:
schema = CDE[CDE['Table'] == "CLINPATH"]

report = ReportCollector(destination="NA")
full_table, report = validate_table(CLINPATH.copy(), table, schema, report)
report.print_log()


All required fields are present in *DATA* table.
🚨⚠️❗ **29 Fields with empty (NULL) values:**

	- path_autopsy_dx_main: 1/94 empty rows (REQUIRED)

	- path_autopsy_second_dx: 3/94 empty rows (OPTIONAL)

	- path_autopsy_third_dx: 11/94 empty rows (OPTIONAL)

	- path_autopsy_fourth_dx: 36/94 empty rows (OPTIONAL)

	- path_autopsy_fifth_dx: 59/94 empty rows (OPTIONAL)

	- path_autopsy_sixth_dx: 69/94 empty rows (OPTIONAL)

	- path_autopsy_seventh_dx: 81/94 empty rows (OPTIONAL)

	- path_autopsy_eight_dx: 91/94 empty rows (OPTIONAL)

	- path_year_death: 2/94 empty rows (REQUIRED)

	- other_cause_death_1: 94/94 empty rows (OPTIONAL)

	- other_cause_death_2: 94/94 empty rows (OPTIONAL)

	- path_braak_asyn: 94/94 empty rows (REQUIRED)

	- path_cerad: 44/94 empty rows (REQUIRED)

	- path_thal: 21/94 empty rows (REQUIRED)

	- known_pathogenic_mutation: 68/94 empty rows (OPTIONAL)

	- PD_pathogenic_mutation: 68/94 empty rows (OPTIONAL)

	- path_mckeith: 6/94 empty rows (OPTIONAL)

	- sn_neuronal

In [12]:
SAMPLE = prep_table(dfs['SAMPLE'],CDEv2)
# force the right sex_ontology_term_id
SAMPLE["organism_ontology_term_id"] = "NCBITaxon:9606"
SAMPLE["cell_type_ontology_term_id"] = "CL:0002319" # recode to fix encoding error
SAMPLE["molecular_source"] = "PolyA RNA" # fix encoding with underscore
SAMPLE["development_stage_ontology_term_id"] = "HsapDv:0000258"
hybrid_selection = SAMPLE['hybrid_selection'][0]
SAMPLE["alternate_sample_id"] = hybrid_selection

SAMPLE['region_level_1'] = SAMPLE['region_level_1'].replace({"Temporal Lobe": "Temporal lobe"})
SAMPLE['sex_ontology_term_id'] = SAMPLE['sex_ontology_term_id'].replace({"PATO:0000384":"PATO:0000384 (male)","PATO:0000383":"PATO:0000383 (female)"})

In [13]:
schema = CDE[CDE['Table'] == "SAMPLE"]

report = ReportCollector(destination="NA")
full_table, report = validate_table(SAMPLE.copy(), table, schema, report)
report.print_log()

recoding replicate_count as int
recoding repeated_sample as int
recoding input_cell_count as int
All required fields are present in *DATA* table.
🚨⚠️❗ **5 Fields with empty (NULL) values:**

	- self_reported_ethnicity_ontology_term_id: 94/94 empty rows (REQUIRED)

	- suspension_type: 94/94 empty rows (REQUIRED)

	- DV200: 94/94 empty rows (OPTIONAL)

	- pm_PH: 94/94 empty rows (OPTIONAL)

	- donor_id: 94/94 empty rows (OPTIONAL)
🚨⚠️❗ **1 Fields with invalid entries:**
- _*sequencing_length*_:  invalid values 💩'200'
    - valid ➡️ '25', '50', '100', '150', 'NA'
🚨⚠️❗ **Extra field in DATA: hybrid_selection**



In [14]:
DATA = prep_table(dfs['DATA'],CDEv2)
DATA

Unnamed: 0,sample_id,replicate,replicate_count,repeated_sample,batch,file_type,file_name,file_description,file_MD5,technology,omic,adjustment,content,time,header,annotation,configuration_file
0,BN1204_hybsel,rep1,1,0,batch30,fastq,novaseq_230914a_BN1204_BN0615_S6_L001_I1_001.f...,Index 1 Lane 1,d2220622e5e3e83a7537e82b4184bd64,SN,RNA,Raw,Reads,,,,
1,BN1204_hybsel,rep1,1,0,batch30,fastq,novaseq_230914a_BN1204_BN0615_S6_L001_I2_001.f...,Index 2 Lane 1,ea30b06f7583bc46c6a8d636f42eac25,SN,RNA,Raw,Reads,,,,
2,BN1204_hybsel,rep1,1,0,batch30,fastq,novaseq_230914a_BN1204_BN0615_S6_L001_R1_001.f...,Read 1 Lane 1,4de7de820f840117b32038cecf2efdb1,SN,RNA,Raw,Reads,,,,
3,BN1204_hybsel,rep1,1,0,batch30,fastq,novaseq_230914a_BN1204_BN0615_S6_L001_R2_001.f...,Read 2 Lane 1,a15ecacea5b8b3328535b08fcbe1dd0e,SN,RNA,Raw,Reads,,,,
4,BN1204_hybsel,rep1,1,0,batch30,fastq,novaseq_230914a_BN1204_BN0615_S6_L002_I1_001.f...,Index 1 Lane 2,490f9acf9d4651ce3896f286227a7262,SN,RNA,Raw,Reads,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,BN0952_hybsel,rep1,1,0,batch7,fastq,novaseq_230914c_BN1317_BN0952_S7_L001_R2_001.f...,Read 2 Lane 1,1cc4397535a2638c1e8db96563cd83a0,SN,RNA,Raw,Reads,,,,
804,BN0952_hybsel,rep1,1,0,batch7,fastq,novaseq_230914c_BN1317_BN0952_S7_L002_I1_001.f...,Index 1 Lane 2,7bca81bb1b5d7d6b9da4750170dfe1e6,SN,RNA,Raw,Reads,,,,
805,BN0952_hybsel,rep1,1,0,batch7,fastq,novaseq_230914c_BN1317_BN0952_S7_L002_I2_001.f...,Index 2 Lane 2,f8f366baec1c7a7e0b2a8eda4d3837f4,SN,RNA,Raw,Reads,,,,
806,BN0952_hybsel,rep1,1,0,batch7,fastq,novaseq_230914c_BN1317_BN0952_S7_L002_R1_001.f...,Read 1 Lane 2,1bf3c8c30cb8b6d4d127084bd694b2fa,SN,RNA,Raw,Reads,,,,


In [15]:
PROTOCOL = prep_table(dfs['PROTOCOL'],CDEv2)
PROTOCOL

Unnamed: 0,sample_collection_summary,cell_extraction_summary,lib_prep_summary,data_processing_summary,github_url,protocols_io_DOI,other_reference
0,,,,,,Dx.doi.org/10.17504/protocols.io.j8nlk4k55g5r/v1,


In [16]:
dfs['STUDY'] = STUDY
dfs['SUBJECT'] = SUBJECT
dfs['CLINPATH'] = CLINPATH
dfs['SAMPLE'] = SAMPLE
dfs['DATA'] = DATA 
dfs['PROTOCOL'] = PROTOCOL

In [17]:
### save extras as auxillary tables


In [18]:
# make tables conform to CDE and save extra columns as "auxiliary"
v2_path = metadata_path / "v2"

for table in tables:
    df = dfs[table]
    schema = CDE[CDE['Table'] == table]
    valid_fields = schema['Field'].unique()
    df_out = df[valid_fields]
    aux_fields = set(df.columns) - set(valid_fields)
    if aux_fields:
        df_aux = df[list(aux_fields)]
        df_aux.to_csv(v2_path / f"{table}_auxiliary.csv", index=False)
        print(f"Saved {table}_auxiliary.csv")
    df_out.to_csv(v2_path / f"{table}.csv", index=False)

Saved STUDY_auxiliary.csv
Saved SAMPLE_auxiliary.csv


In [19]:
dfs["SAMPLE"].columns

Index(['sample_id', 'subject_id', 'source_sample_id', 'replicate',
       'replicate_count', 'repeated_sample', 'batch', 'tissue', 'brain_region',
       'hemisphere', 'region_level_1', 'region_level_2', 'region_level_3',
       'RIN', 'source_RIN', 'molecular_source', 'input_cell_count', 'assay',
       'sequencing_end', 'sequencing_length', 'sequencing_instrument',
       'organism_ontology_term_id', 'development_stage_ontology_term_id',
       'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'disease_ontology_term_id', 'tissue_ontology_term_id',
       'cell_type_ontology_term_id', 'assay_ontology_term_id',
       'suspension_type', 'DV200', 'pm_PH', 'donor_id', 'alternate_sample_id',
       'hybrid_selection'],
      dtype='object')

## v2->v3

In [20]:
v3_meta_tables = ['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']
metadata_version = "v3.0"
METADATA_VERSION_DATE = f"{metadata_version}_{pd.Timestamp.now().strftime('%Y%m%d')}"


f"{v3_meta_tables}"

"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']"

In [21]:
v3_path = metadata_path / "v3"

v3_tables, aux_tables = v2_to_v3_PMDBS(v2_path, v3_path, CDEv2, CDEv3)

recoding number_of_brain_samples as int
recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
recoding replicate_count as int
recoding repeated_sample as int
recoding input_cell_count as int
recoding replicate_count as int
recoding repeated_sample as int


### validate v3 tables


In [22]:
CDE = CDEv3
for table,df in v3_tables.items():
    schema = CDE[CDE['Table'] == table]

    report = ReportCollector(destination="NA")
    full_table, report = validate_table(df.copy(), table, schema, report)
    report.print_log()

recoding number_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_ORCID: 1/1 empty rows (OPTIONAL)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)

	- preprocessing_references: 1/1 empty rows (OPTIONAL)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
🚨⚠️❗ **6 Fields with empty (NULL) values:**

	- sample_collection_summary: 1/1 empty rows (REQUIRED)

	- cell_extraction_summary: 1/1 empty rows (REQUIRED)

	- lib_prep_summary: 1/1 empty rows (REQUIRED)

	- data_processing_summary: 1/1 empty rows (REQUIRED)

	- github_url: 1/1 empty rows (REQUIRED)

	- other_reference: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *SUBJECT

In [23]:
v3_tables['CONDITION']

Field,condition_id,intervention_name,intervention_id,protocol_id,intervention_aux_table
0,idiopathic_pd,Case-Control,Case,,
1,healthy_control,Case-Control,Control,,
2,prodromal_motor_pd,Case-Control,Other,,


In [25]:
STUDY = v3_tables['STUDY']
STUDY

Unnamed: 0,ASAP_team_name,ASAP_lab_name,project_name,team_dataset_id,project_dataset,project_description,PI_full_name,PI_email,contributor_names,submitter_name,...,number_samples,sample_types,types_of_samples,DUA_version,metadata_tables,PI_ORCID,PI_google_scholar_id,preprocessing_references,metadata_version_date,alternate_dataset_id
0,TEAM-SCHERZER,"Scherzer, Dong, and Levin",Parkinson5D: deconstructing proximal disease m...,sn_rnaseq_mtg_hybsel,PD5D_MTG_snRNAseq_hybsel,Here we will develop a molecular atlas of Park...,Dr. Joshua Levin,Jlevin@broadinstitute.org,"Clemens, Scherzer; Xianjun, Dong; Joshua, Levi...",Dr. Sean Simmons,...,94,MTG,PD and control postmortem brains,ASAP Access & Use Policy,"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DA...",,,,v3.0_20241107,


In [26]:
STUDY["metadata_tables"].values[0]

"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']"

-------------------------
## check md5s



In [27]:
print(team)

source = "pmdbs"

bucket = f"asap-raw-team-{team}-{source}-{dataset_name}"
# bucket = f"asap-raw-data-team-{team}" # for now old locations


key_file_path = Path.home() / f"Projects/ASAP/{team}-credentials.json"

res = authenticate_with_service_account(key_file_path)
print(res)

# make sure to get ALL the fastq files in the bucket
prefix = "fastqs/*.gz"
bucket_files_md5 = get_md5_hashes( bucket, prefix)

scherzer
CompletedProcess(args='gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/scherzer-credentials.json', returncode=0, stdout='', stderr='Activated service account credentials for: [raw-admin-scherzer@dnastack-asap-parkinsons.iam.gserviceaccount.com]\n')
gsutil -u dnastack-asap-parkinsons hash -h gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mtg-hybsel/fastqs/*.gz


In [30]:
bucket_files_md5

{'nextseq041323_HybSel_Pooled_BN1412_S1_L001_I1_001.fastq.gz': '4f8e4f7bc5046066017b89bb31cfc5d5',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L001_I2_001.fastq.gz': '1fe521ad07dc6dc4988489603ce857c4',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L001_R1_001.fastq.gz': '0610c913dde97acc4db9cad26fe475ad',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L001_R2_001.fastq.gz': '318c67db1b35e31dfcca31bca78b8c36',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L002_I1_001.fastq.gz': 'da7d909c672aad17afb5b22b67ae933f',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L002_I2_001.fastq.gz': '3421a9cb928a2c482281ff406ee2dea1',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L002_R1_001.fastq.gz': 'a8dfca294e2b519407f98b35b5326865',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L002_R2_001.fastq.gz': 'e9628d673821bbc54ec70f964679925f',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L003_I1_001.fastq.gz': '8c2705def0906bd23fbfbb9cd9148432',
 'nextseq041323_HybSel_Pooled_BN1412_S1_L003_I2_001.fastq.gz': 'ed0f50db17acea49c8264477aa886ea4',
 'nextseq0

In [29]:
# def check_md5_sums()


checksum = v3_tables['DATA'][['sample_id','file_name','file_MD5']]
checksum['check2'] = checksum['file_name'].map(bucket_files_md5)
checksum['check1'] = checksum['file_MD5']
checksum[checksum.check1 != checksum.check2].file_name.to_list()
#empty means success!!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check2'] = checksum['file_name'].map(bucket_files_md5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check1'] = checksum['file_MD5']


[]

_____

## prep metadata in raw data bucket

steps:
- 1. archive whats there.  i.e. move to metadata/upload
- 2. copy metadata/upload to dataset upload (upload subdir)

In [31]:
metadata_subdir = "metadata"
bucket = f"asap-raw-team-{team}-{source}-{dataset_name}"
dataset_name, bucket



('sn-rnaseq-mtg-hybsel', 'asap-raw-team-scherzer-pmdbs-sn-rnaseq-mtg-hybsel')

In [34]:


current_files = gsutil_ls(bucket,"")
current_files

gsutil -u dnastack-asap-parkinsons ls gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mtg-hybsel/
gsutil command succeeded: gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mtg-hybsel/fastqs/



['gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mtg-hybsel/fastqs/', '']

Archive the uploaded metadata locally

But the email was shared via email, so no "upload" directory exists.

Will make a copy of og/"ASAP CDE v3.0.0-beta_sks_new.xlsx" to upload/ for completeness


PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata')

In [38]:
og_path

PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata/og')

In [39]:
import shutil 

upload_path = metadata_path / "upload"
upload_path.mkdir(exist_ok=True)

og_file_name = "ASAP CDE v3.0.0-beta_sks_new.xlsx"
shutil.copy(og_path / og_file_name , upload_path / og_file_name)

PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata/upload/ASAP CDE v3.0.0-beta_sks_new.xlsx')

--------------------
## Create metadata package

This will copy the final updated to v3.0 metadata to `asap-could-processing-resources`


In [40]:
metadata_source = metadata_path


source = "pmdbs"
archive_root = Path.home() / "Projects/ASAP/asap-crn-metadata/datasets"
dataset_path = archive_root / f"{team}-{source}-{dataset_name}"
# bucket = f"asap-raw-data-team-{team}" # for now old locations
metadata_source, dataset_path

(PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata'),
 PosixPath('/Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg-hybsel'))

________

In [41]:
fnms = create_metadata_package(metadata_source, dataset_path)


Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata/v2 to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg-hybsel/metadata/v2
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata/v3 to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg-hybsel/metadata/v3
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata/og to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg-hybsel/metadata/og
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg-hybsel/metadata/upload to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg-hybsel/metadata/upload


_____

generate ASAP IDs + transfering back to raw data bucket via `asap-crn-metadata` 