ASAP CRN Metadata validation 
# Team Scherzer. ASAP CRN Metadata validation 

10 Oct 2024
Andy Henrie




In [14]:
import pandas as pd
from pathlib import Path
import os, sys

sys.path.append(os.path.abspath((os.path.join(os.getcwd(), 'src/crn_utils'))))

from util import read_CDE, NULL, prep_table, read_meta_table, create_metadata_package
from validate import validate_table, ReportCollector
from update_schema import v1_to_v2, v2_to_v3_PMDBS, intervention_typer
from checksums import extract_md5_from_details2, get_md5_hashes 
from bucket_util import authenticate_with_service_account, gsutil_ls, gsutil_cp, gsutil_mv 

%load_ext autoreload
%autoreload 2

root_path = Path.home() / ("Projects/ASAP/data/teams")



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## CDEs
load the relavent CDEs

In [15]:
schema_version = "v1"
schema_path = Path.home() / "Projects/ASAP/crn-utils/resource/CDE"
CDEv1 = read_CDE(schema_version, local_path=schema_path)
schema_version = "v2.1"
CDEv2 = read_CDE(schema_version, local_path=schema_path)
schema_version = "v3.0"
CDEv3 = read_CDE(schema_version, local_path=schema_path)

metadata_version: ASAP_CDE_v1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v1.csv
read local file
metadata_version: ASAP_CDE_v2.1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v2.1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v2.1.csv
read local file
metadata_version: ASAP_CDE_v3.0
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v3.0
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v3.0.csv
read local file


## Load original tables 
These were submitted as v2.1


Team Scherzer

In [16]:
## convert 
team = "scherzer"
dataset_name = "sn-rnaseq-mtg"

metadata_path = root_path / f"{team}/{dataset_name}/metadata"

og_path = metadata_path / "og"



In [17]:
sheets = ["SAMPLE","SUBJECT","CLINPATH","STUDY","PROTOCOL","DATA"]

excel_path = og_path / "ASAP_CDE-ScherzerTeam.xlsx"
STUDY = pd.read_excel(excel_path,sheet_name="STUDY")#,header=1)#.drop(columns="Field")
CLINPATH = pd.read_excel(excel_path,sheet_name="CLINPATH")#,header=1)#.drop(columns="Field")
SUBJECT = pd.read_excel(excel_path,sheet_name="SUBJECT")#,header=1)#.drop(columns="Field")
SAMPLE = pd.read_excel(excel_path,sheet_name="SAMPLE")#,header=1)#.drop(columns="Field")
PROTOCOL = pd.read_excel(excel_path,sheet_name="PROTOCOL")#,header=1)#.drop(columns="Field")
DATA = pd.read_excel(excel_path,sheet_name="DATA")#,header=1)#.drop(columns="Field")

metadata_version = "v2"

METADATA_VERSION_DATE = f"{metadata_version}_{pd.Timestamp.now().strftime('%Y%m%d')}"

In [18]:
STUDY = prep_table(STUDY,CDEv2)
STUDY['metadata_version_date'] = METADATA_VERSION_DATE
STUDY.rename({'submittor_email':'submitter_email'}, axis=1, inplace=True)
STUDY['ASAP_team_name'] = "TEAM-SCHERZER"
# STUDY['team_dataset_id'] = "sc_pmdbs" # "MTG_snRNAseq" #"Scherzer_MTG_snRNAseq"
STUDY['team_dataset_id'] = dataset_name.replace(" ", "_").replace("-", "_")


In [19]:
STUDY['alternate_dataset_id'] = NULL
STUDY

Unnamed: 0,ASAP_team_name,ASAP_lab_name,project_name,team_dataset_id,project_dataset,project_description,PI_full_name,PI_email,contributor_names,submitter_name,...,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,preprocessing_references,metadata_version_date,alternate_dataset_id
0,TEAM-SCHERZER,"Scherzer, Dong, and Levin",Parkinson5D: deconstructing proximal disease m...,sn_rnaseq_mtg,PD5D_MTG_snRNAseq,Here we will develop a molecular atlas of Park...,Dr.Clemens Scherzer,Clemens.scherzer@yale.edu,"Clemens, Scherzer; Xianjun, Dong; Joshua, Levi...",Dr.Xianjun Dong,...,Nan,94,MTG,PD and control postmortem brains,Nan,Nan,ASAP Access & Use Policy,Nan,v2_20241102,


In [20]:
# fix the excel saving source_subject_id as a date
SUBJECT = prep_table(SUBJECT,CDEv2)

recoder = {'2023-03-29 00:00:00': '03-29',
       '2023-04-15 00:00:00': '04-15', 
       '2023-06-02 00:00:00': '06-02',
        '2023-06-15 00:00:00': '06-15',
        '2023-07-04 00:00:00': '07-04', 
       '2023-10-22 00:00:00': '10-22',
       '2023-11-28 00:00:00': '11-28',
        '2023-12-04 00:00:00': '12-04',
       '2023-12-21 00:00:00': '12-21'}

SUBJECT['source_subject_id'] = SUBJECT['source_subject_id'].replace(recoder)

SUBJECT['source_subject_id'].unique()




array(['00-09', '03-29', '03-39', '03-41', '03-47', '03-48', '04-15',
       '04-52', '04-64', '05-36', '06-02', '06-15', '06-35', '06-44',
       '06-51', '06-55', '06-62', '07-04', '07-37', '07-46', '08-55',
       '09-34', '09-52', '10-22', '10-76', '11-28', '11-44', '11-60',
       '12-04', '12-06', '12-21', '12-66', '13-05', '13-08', '13-17',
       '13-39', '13-40', '13-51', '13-61', '14-12', '14-24', '14-30',
       '15-04', '15-06', '15-18', '15-35', '15-46', '15-54', '15-67',
       '15-78', '15-85', '16-14', '16-44', '17-19', '17-22', '17-26',
       '17-30', '17-37', '17-41', '17-47', '17-49', '17-52', '17-56',
       '17-62', '18-09', '18-12', '18-14', '18-05', '18-17', '18-22',
       '18-27', '18-36', '18-39', '18-42', '18-48', '18-49', '18-55',
       '18-62', '18-65', '18-72', '18-78', '19-02', '19-10', '19-34',
       '19-39', '19-57', '19-59', '19-60', '19-74', '20-03', '20-15',
       '99-44', '99-47', '99-66'], dtype=object)

In [21]:

SUBJECT['hx_melanoma'] = SUBJECT['hx_melanoma'].replace({'Present':'Yes', 'Unknown': NULL})
SUBJECT['cognitive_status'] = SUBJECT['cognitive_status'].replace({'Unknown': NULL })

# SUBJECT.replace("Unknown", NULL, inplace=True)

# eductation levels 
SUBJECT['education_level'] = SUBJECT['education_level'].replace({'Unknown': NULL })

# map "Unknown" to NULL

# map ['14', '12', '16', '18', '13', '8', '15', '20', '19', '17', '11', '22', '10', '21'] ???
SUBJECT

Unnamed: 0,subject_id,source_subject_id,AMPPD_id,GP2_id,biobank_name,organism,sex,age_at_collection,race,ethnicity,...,hx_dementia_mci,hx_melanoma,education_level,smoking_status,smoking_years,APOE_e4_status,cognitive_status,time_from_baseline,primary_diagnosis,primary_diagnosis_text
0,BN0009,00-09,Nan,Nan,BSHRI,Human,Male,64,White,Unknown,...,Yes,,,Former smoker,30,33,Dementia,Unknown,Idiopathic PD,
1,BN0329,03-29,Nan,Nan,BSHRI,Human,Male,79,White,Unknown,...,Yes,,14,Unknown,Unknown,23,Dementia,Unknown,Idiopathic PD,
2,BN0339,03-39,Nan,Nan,BSHRI,Human,Male,86,White,Unknown,...,No,,12,Former smoker,50,33,Normal,Unknown,Healthy Control,
3,BN0341,03-41,Nan,Nan,BSHRI,Human,Male,89,White,Unknown,...,No,,16,Never,0,34,Normal,Unknown,Healthy Control,
4,BN0347,03-47,Nan,Nan,BSHRI,Human,Female,95,White,Unknown,...,No,,12,Former smoker,Unknown,33,MCI,Unknown,Healthy Control,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,BN2003,20-03,Nan,Nan,BSHRI,Human,Male,72,White,Unknown,...,Yes,,16,Former smoker,Unknown,33,Dementia,Unknown,Idiopathic PD,
90,BN2015,20-15,Nan,Nan,BSHRI,Human,Male,90,White,Unknown,...,No,,20,Never,Unknown,34,Normal,Unknown,Prodromal motor PD,
91,BN9944,99-44,Nan,Nan,BSHRI,Human,Male,69,White,Unknown,...,No,,15,Former smoker,Unknown,34,Normal,Unknown,Healthy Control,
92,BN9947,99-47,Nan,Nan,BSHRI,Human,Male,84,White,Unknown,...,No,,20,Former smoker,Unknown,33,Normal,Unknown,Prodromal motor PD,


In [22]:
SAMPLE = prep_table(SAMPLE,CDEv2)

SAMPLE['region_level_1'] = SAMPLE['region_level_1'].replace({'Temporal Lobe':'Temporal lobe'})
SAMPLE['molecular_source'] = SAMPLE['molecular_source'].replace({'PolyA_RNA':'PolyA RNA'})

In [23]:
SAMPLE["organism_ontology_term_id"] = "NCBITaxon:9606"
subj_sex_mapper = dict(zip(SUBJECT['subject_id'], SUBJECT['sex']))
SAMPLE['sex_ontology_term_id'] = SAMPLE['subject_id'].map(subj_sex_mapper).replace({"Male":"PATO:0000384 (male)", "Female":"PATO:0000383 (female)" })

In [24]:

# force the right sex_ontology_term_id
SAMPLE["organism_ontology_term_id"] = "NCBITaxon:9606"

# autopsy coding is too too idosyncratic.  leave 
# the path_nia_aa_a are coded as path_nia_ri... swap them 

CLINPATH = prep_table(CLINPATH,CDEv2)


# 'Low', 'Intermediate', 'High', 'None'
CLINPATH['path_nia_ri'] = CLINPATH['path_nia_aa_a'].replace({'Low ADNC': 'Low', 
                                                            'Not AD': 'None', 
                                                            'Intermediate ADNC': 'Intermediate'})
CLINPATH['path_nia_aa_a'] = NULL


# path_thal
CLINPATH['path_thal']= CLINPATH['path_thal'].replace({'Phase 0 (A0)':'0', 
                                'Phase 1 (A1)':'1', 
                                'Phase 2 (A1)':'2', 
                                'Phase 3 (A2)':'3', 
                                'Phase 4 (A3)':'4',
                                'Phase 5 (A3)':'5', 
                                'Unknown':NULL, 
                                'Missing/unknown':NULL})

CLINPATH['path_infarcs'] = CLINPATH['path_infarcs'].replace('Unknown', NULL)


# no idea what No, undx, yes maps to.  leave
CLINPATH['TDP43'] = CLINPATH['TDP43'].replace('Unknown', NULL)

# sn_neuronal_loss has alternate 0-3 coding.  leave

# add 'replicate' coding (nans)

In [25]:

mckeith_map = {'l. Olfactory Bulb-Only':"Olfactory bulb only", 'lla. Brainstem Predominant':"Brainstem",
       'llb. Limbic Predominant':"Limbic (transitional)", 'lV. Neocortical':"Neocortical",
       'lll. Brainstem/Limbic':"Amygdala Predominant", '0. No Lewy bodies':"Absent", "Unknown":NULL}


CLINPATH['path_mckeith'] = CLINPATH['path_mckeith'].replace(mckeith_map)

# leave path_nia_ri like this for now. not sure how to map "criteria not met" and "Not AD"

# leave amyloid_angiopathy_severity_scale like this for now. not sure how to map 'Cerebral amyloid angiopathy, temporal and occipital lobe','Cerebral amyloid angiopathy, frontal lobe']
CLINPATH

Unnamed: 0,subject_id,source_subject_id,duration_pmi,path_autopsy_dx_main,path_autopsy_second_dx,path_autopsy_third_dx,path_autopsy_fourth_dx,path_autopsy_fifth_dx,path_autopsy_sixth_dx,path_autopsy_seventh_dx,...,path_nia_ri,path_nia_aa_a,path_nia_aa_b,path_nia_aa_c,TDP43,arteriolosclerosis_severity_scale,amyloid_angiopathy_severity_scale,path_ad_level,dig_slide_avail,quant_path_avail
0,BN0009,00-09,4.00,PD/Dem,Charcot-Marie-Tooth disease (history),"GBA L444P/WT, L444P mutation",,,,,...,Nan,,Nan,Nan,,Nan,Nan,Unknown,Nan,Nan
1,BN0329,03-29,4.50,PD/Dem,Seizure disorder (history),,,,,,...,Nan,,Nan,Nan,,Nan,Nan,Unknown,Nan,Nan
2,BN0339,03-39,2.75,Control,Non-diagnostic Alzheimer's changes,CAA,,,,,...,Nan,,Nan,Nan,,Mild,Moderate,Unknown,Nan,Nan
3,BN0341,03-41,2.50,Control,Non-diagnostic Alzheimer's changes,CWMR,,,,,...,Nan,,Nan,Nan,No,Moderate,Mild,Unknown,Nan,Nan
4,BN0347,03-47,3.50,Control (MCI),Non-diagnostic Alzheimer's changes,"Argyrophilic grains, mesial temporal lobe",Infarct(s),CWMR,Several microscopic foci of cerebellar cortica...,"Etat crible, putamen and caudate nucleus",...,Nan,,Nan,Nan,,Mild,Nan,Unknown,Nan,Nan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,BN2003,20-03,2.65,PD/Dem,"Microscopic changes of Alzheimer's disease, in...","Focal non-specific glial tauopathy, cortex of ...",,,,,...,Intermediate,,Nan,Nan,No,Unknown,Unknown,Unknown,Nan,Nan
90,BN2015,20-15,5.40,Control (history),"Microscopic changes of Alzheimer's disease, in...",Incidental Lewy body disease,,,,,...,Nan,,Nan,Nan,No,Unknown,Unknown,Unknown,Nan,Nan
91,BN9944,99-44,2.16,Control,Non-diagnostic Alzheimer's changes,Alzheimer Type II astrocytosis consistent with...,,,,,...,Nan,,Nan,Nan,,Unknown,Nan,Unknown,Nan,Nan
92,BN9947,99-47,2.50,Control,Non-diagnostic Alzheimer's changes,Alzheimer type II astrocytosis,Inc LBs,,,,...,Nan,,Nan,Nan,,Unknown,Nan,Unknown,Nan,Nan


Chat GPT help

```python

mapping = {
    'PD/Dem': ['Parkinson\'s disease with dementia', 'Dementia with Lewy bodies'],
    'Control': ['Control, no misfolded protein or significant vascular pathology'],
    'Control (MCI)': ['Control, Low level AD neuropathological change', 
                       'Control, Limbic predominant age-related TDP43 proteinopathy (LATE)'],
    'PD/ND': ['Parkinson\'s disease'],
    'Control, Non-motoric': ['Control, no misfolded protein or significant vascular pathology'],
    'PDD': ['Parkinson\'s disease with dementia', 'Dementia with Lewy bodies'],
    'MCI': ['Alzheimer\'s disease (intermediate level neuropathological change)', 
            'Control, Low level AD neuropathological change'],
    'PD (MCI)': ['Parkinson\'s disease with dementia', 'Parkinson\'s disease'],
    'Control (history)': ['Control, no misfolded protein or significant vascular pathology'],
    'Control (diseased)': ['Control, Cerebrovascular disease (atherosclerosis)', 
                           'Control, Cerebrovascular disease (hyaline arteriolosclerosis)', 
                           'Control, Cerebrovascular disease (cerebral amyloid angiopathy)', 
                           'Other neurological disorder'],
    'MCI (clinical history)': ['Control, Low level AD neuropathological change', 
                               'Control, Limbic predominant age-related TDP43 proteinopathy (LATE)'],
    'Control (clinical history)': ['Control, no misfolded protein or significant vascular pathology']
}


```

In [26]:
# path_autopsy_main_mapper = {
#     'PD/Dem': 'Parkinson\'s disease with dementia',
#     'Control': 'Control, no misfolded protein or significant vascular pathology',
#     'Control (MCI)': 'Control, Low level AD neuropathological change',
#     'PD/ND': 'Parkinson\'s disease',
#     'Control, Non-motoric': 'Control, no misfolded protein or significant vascular pathology',
#     'PDD': 'Parkinson\'s disease with dementia',
#     'MCI': 'Alzheimer\'s disease (intermediate level neuropathological change)',
#     'PD (MCI)': 'Parkinson\'s disease with dementia',
#     'Control (history)': 'Control, no misfolded protein or significant vascular pathology',
#     'Control (diseased)': 'Control, Cerebrovascular disease (atherosclerosis)',
#     'MCI (clinical history)': 'Control, Low level AD neuropathological change',
#     'Control (clinical history)': 'Control, no misfolded protein or significant vascular pathology'
# }

# CLINPATH['path_autopsy_dx_main'] = CLINPATH['path_autopsy_dx_main'].replace(path_autopsy_main_mapper)

In [30]:
DATA['replicate'] = DATA['replicate'].apply(lambda x: x[0].upper()+x[1:] if x is not NULL else x)
DATA

Unnamed: 0,sample_id,source_sample_id,replicate,replicate_count,repeated_sample,batch,file_type,file_name_source,file_name,local_path,file_description,file_MD5,technology,omic,adjustment,content,time,header,annotation,configuration_file
0,PD_BN0009_MTG_snRNAseq_rep1,BN0009,Rep1,1,0,batch1,fastq,BN0009_S2_L001_I1_001.fastq.gz,PD_BN0009_MTG_snRNAseq_rep1_L001_I1_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch1_ASAP_...,Index 1 Lane 1,bc037ea92d5359f3da1ba6da4f974e1c,SN,RNA,Raw,Reads,,,,
1,PD_BN0009_MTG_snRNAseq_rep1,BN0009,Rep1,1,0,batch1,fastq,BN0009_S2_L001_I2_001.fastq.gz,PD_BN0009_MTG_snRNAseq_rep1_L001_I2_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch1_ASAP_...,Index 2 Lane 1,0c8ef844b76212e82428af03d341048b,SN,RNA,Raw,Reads,,,,
2,PD_BN0009_MTG_snRNAseq_rep1,BN0009,Rep1,1,0,batch1,fastq,BN0009_S2_L001_R1_001.fastq.gz,PD_BN0009_MTG_snRNAseq_rep1_L001_R1_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch1_ASAP_...,Read 1 Lane 1,c6a41f29849152df2ba2391dc559149b,SN,RNA,Raw,Reads,,,,
3,PD_BN0009_MTG_snRNAseq_rep1,BN0009,Rep1,1,0,batch1,fastq,BN0009_S2_L001_R2_001.fastq.gz,PD_BN0009_MTG_snRNAseq_rep1_L001_R2_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch1_ASAP_...,Read 2 Lane 1,4d610fdf5a72c8206b2f1505d8fcecd7,SN,RNA,Raw,Reads,,,,
4,PD_BN0009_MTG_snRNAseq_rep1,BN0009,Rep1,1,0,batch1,fastq,BN0009_S2_L002_I1_001.fastq.gz,PD_BN0009_MTG_snRNAseq_rep1_L002_I1_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch1_ASAP_...,Index 1 Lane 2,e34917b3ec01cd09ea8b52c82db1dc83,SN,RNA,Raw,Reads,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,HC_BN1762_MTG_snRNAseq_rep1,BN1762,Rep1,1,0,batch31,fastq,BN1762_S3_L003_R2_001.fastq.gz,HC_BN1762_MTG_snRNAseq_rep1_L003_R2_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch31_ASAP...,Read 2 Lane 3,887e3d000587b2774ce3cd7022078577,SN,RNA,Raw,Reads,,,,
1548,HC_BN1762_MTG_snRNAseq_rep1,BN1762,Rep1,1,0,batch31,fastq,BN1762_S3_L004_I1_001.fastq.gz,HC_BN1762_MTG_snRNAseq_rep1_L004_I1_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch31_ASAP...,Index 1 Lane 4,44ac0c6d4f7f3b4502eead39564c8723,SN,RNA,Raw,Reads,,,,
1549,HC_BN1762_MTG_snRNAseq_rep1,BN1762,Rep1,1,0,batch31,fastq,BN1762_S3_L004_I2_001.fastq.gz,HC_BN1762_MTG_snRNAseq_rep1_L004_I2_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch31_ASAP...,Index 2 Lane 4,dc114df490e1857072462ca090799005,SN,RNA,Raw,Reads,,,,
1550,HC_BN1762_MTG_snRNAseq_rep1,BN1762,Rep1,1,0,batch31,fastq,BN1762_S3_L004_R1_001.fastq.gz,HC_BN1762_MTG_snRNAseq_rep1_L004_R1_001.fastq.gz,/data/neurogen/ASAP/scRNAseq/data/batch31_ASAP...,Read 1 Lane 4,e24eddb1ac0b1f4d2bdd32b5f69dae27,SN,RNA,Raw,Reads,,,,


In [31]:
v2_path = metadata_path / "v2"
STUDY.to_csv(v2_path / f"STUDY.csv", index=False)
SAMPLE.to_csv(v2_path / f"SAMPLE.csv", index=False)
SUBJECT.to_csv(v2_path / f"SUBJECT.csv", index=False)
PROTOCOL.to_csv(v2_path / f"PROTOCOL.csv", index=False)
CLINPATH.to_csv(v2_path / f"CLINPATH.csv", index=False)
DATA.to_csv(v2_path / f"DATA.csv", index=False)

## validate v2 tables


In [32]:
CDE = CDEv2
tables = CDE['Table'].unique()

dfs = {}
for table in tables:
    df = read_meta_table(v2_path / f"{table}.csv")
    schema = CDE[CDE['Table'] == table]

    report = ReportCollector(destination="NA")
    full_table, report = validate_table(df.copy(), table, schema, report)
    report.print_log()
    dfs[table] = full_table
    # df.to_csv(v1_path / f"{table}.csv", index=False)

recoding number_of_brain_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_ORCHID: 1/1 empty rows (OPTIONAL)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)

	- preprocessing_references: 1/1 empty rows (OPTIONAL)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
No empty entries (NULL) found .
No invalid entries found in Enum fields.

recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
All required fields are present in *SUBJECT* table.
🚨⚠️❗ **15 Fields with empty (NULL) values:**

	- AMPPD_id: 94/94 empty rows (REQUIRED)

	- GP2_id: 94/94 empty rows (REQUIRED)

	- ethnicity: 94/94 empty rows (REQUIRED)

	- family_history: 86/94 emp

### save extras as auxillary tables


In [33]:
# make tables conform to CDE and save extra columns as "auxiliary"
v2_path = metadata_path / "v2"

for table in tables:
    df = dfs[table]
    schema = CDE[CDE['Table'] == table]
    valid_fields = schema['Field'].unique()
    df_out = df[valid_fields]
    aux_fields = set(df.columns) - set(valid_fields)
    if aux_fields:
        df_aux = df[list(aux_fields)]
        df_aux.to_csv(v2_path / f"{table}_auxiliary.csv", index=False)
        print(f"Saved {table}_auxiliary.csv")
    df_out.to_csv(v2_path / f"{table}.csv", index=False)

Saved SAMPLE_auxiliary.csv
Saved DATA_auxiliary.csv


## v2->v3

In [34]:
v3_meta_tables = ['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']

f"{v3_meta_tables}"

"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']"

In [35]:
v3_path = metadata_path / "v3"

v3_tables, aux_tables = v2_to_v3_PMDBS(v2_path, v3_path, CDEv2, CDEv3)

recoding number_of_brain_samples as int
recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
recoding replicate_count as int
recoding repeated_sample as int
recoding input_cell_count as int
recoding replicate_count as int
recoding repeated_sample as int


### validate v3 tables


In [36]:
CDE = CDEv3
for table,df in v3_tables.items():
    schema = CDE[CDE['Table'] == table]

    report = ReportCollector(destination="NA")
    full_table, report = validate_table(df.copy(), table, schema, report)
    report.print_log()

recoding number_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_ORCID: 1/1 empty rows (OPTIONAL)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)

	- preprocessing_references: 1/1 empty rows (OPTIONAL)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
No empty entries (NULL) found .
No invalid entries found in Enum fields.

All required fields are present in *SUBJECT* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- primary_diagnosis_text: 94/94 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

recoding replicate_count as int
recoding repeated_sample as int
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **13 Fields with empty (NULL) values:**

	- o

In [37]:
v3_tables['CONDITION']

Field,condition_id,intervention_name,intervention_id,protocol_id,intervention_aux_table
0,idiopathic_pd,Case-Control,Case,,
1,healthy_control,Case-Control,Control,,
2,prodromal_motor_pd,Case-Control,Other,,


In [38]:
STUDY = v3_tables['STUDY']
STUDY

Unnamed: 0,ASAP_team_name,ASAP_lab_name,project_name,team_dataset_id,project_dataset,project_description,PI_full_name,PI_email,contributor_names,submitter_name,...,number_samples,sample_types,types_of_samples,DUA_version,metadata_tables,PI_ORCID,PI_google_scholar_id,preprocessing_references,metadata_version_date,alternate_dataset_id
0,TEAM-SCHERZER,"Scherzer, Dong, and Levin",Parkinson5D: deconstructing proximal disease m...,sn_rnaseq_mtg,PD5D_MTG_snRNAseq,Here we will develop a molecular atlas of Park...,Dr.Clemens Scherzer,Clemens.scherzer@yale.edu,"Clemens, Scherzer; Xianjun, Dong; Joshua, Levi...",Dr.Xianjun Dong,...,94,MTG,PD and control postmortem brains,ASAP Access & Use Policy,"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DA...",,,,v3.0_20241102,


-------------------------
## check md5s

Team Scherzer uploaded their data originally with gcloud tools which by default don't create md5s.  We have prefiously checked the crc hashes instead.  


In [39]:
print(team)

source = "pmdbs"

bucket = f"asap-raw-team-{team}-{source}-{dataset_name}"


key_file_path = Path.home() / f"Projects/ASAP/{team}-credentials.json"

res = authenticate_with_service_account(key_file_path)
print(res)

# # make sure to get ALL the fastq files in the bucket
# prefix = "fastqs/*.gz"
# bucket_files_md5 = get_md5_hashes( bucket, prefix)


scherzer
CompletedProcess(args='gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/scherzer-credentials.json', returncode=0, stdout='', stderr='Activated service account credentials for: [raw-admin-scherzer@dnastack-asap-parkinsons.iam.gserviceaccount.com]\n')


In [40]:
# # def check_md5_sums()


# checksum = v3_tables['DATA'][['file_name','file_MD5']]
# checksum['check2'] = checksum['file_name'].map(bucket_files_md5)
# checksum['check1'] = checksum['file_MD5']
# checksum[checksum.check1 != checksum.check2].file_name.to_list()
# #empty means success!!

_____

## prep metadata in raw data bucket

steps:
- 1. archive whats there.  i.e. move to metadata/upload
- 2. copy metadata/upload to dataset upload (upload subdir)

In [41]:
metadata_subdir = "metadata"
bucket = f"asap-raw-team-{team}-{source}-{dataset_name}"
dataset_name_ = "sn-rnaseq-mfc"
bucket = f"asap-raw-team-{team}-{source}-{dataset_name_}"

current_files = gsutil_ls(bucket,metadata_subdir)

gsutil -u dnastack-asap-parkinsons ls gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata


gsutil command succeeded: gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata/upload/



In [42]:
metadata_subdir2 = "metadata/upload"
bucket = current_files[0].split("/")[2]

for file in current_files:
    if file == "":
        continue
    file_nm = Path(file).name
    is_dir = not file_nm.__contains__(".")

    source = f"gs://{bucket}/{metadata_subdir}/{file_nm}"

    destination = f"gs://{bucket}/{metadata_subdir2}/{file_nm}"
    gsutil_mv(source, destination, is_dir)

gsutil -u dnastack-asap-parkinsons mv -r gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata/upload gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata/upload/upload
gsutil command succeeded: 


In [43]:

metadata_subdir = "metadata/upload"
current_files = gsutil_ls(bucket,metadata_subdir)

gsutil -u dnastack-asap-parkinsons ls gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata/upload
gsutil command succeeded: gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata/upload/upload/



Archive the uploaded metadata locally



In [44]:
file_source = f"gs://{bucket}/{metadata_subdir}"
destination = f"{metadata_path}"

In [45]:

is_dir = True
gsutil_cp(file_source, destination, is_dir)

gsutil -u dnastack-asap-parkinsons cp -r gs://asap-raw-team-scherzer-pmdbs-sn-rnaseq-mfc/metadata/upload /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata
gsutil command succeeded: 


''

--------------------
## Create metadata package

This will copy the final updated to v3.0 metadata to `asap-could-processing-resources`


In [46]:
metadata_source = metadata_path


source = "pmdbs"
archive_root = Path.home() / "Projects/ASAP/asap-crn-metadata/datasets"
dataset_path = archive_root / f"{team}-{source}-{dataset_name}"
# bucket = f"asap-raw-data-team-{team}" # for now old locations
metadata_source, dataset_path

(PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata'),
 PosixPath('/Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg'))

________

In [47]:
fnms = create_metadata_package(metadata_source, dataset_path)


Skipping empty folder /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata/v1
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata/v2 to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg/metadata/v2
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata/v3 to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg/metadata/v3
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata/og to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg/metadata/og
Copied /Users/ergonyc/Projects/ASAP/data/teams/scherzer/sn-rnaseq-mtg/metadata/upload to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/scherzer-pmdbs-sn-rnaseq-mtg/metadata/upload


_____

generate ASAP IDs + transfering back to raw data bucket via `asap-crn-metadata` 

_____

## transfer metadata to raw data bucket

steps:
- 1. archive whats there.  i.e. move to metadata/archive
- 2. copy package to metadata/ . i.e. /og/*.csv, /v??/*.csv

In [44]:

from utils.checksums import extract_md5_from_details, extract_md5_from_details2,  extract_crc32c_from_details2, extract_hashes_from_gcloudstorage

In [45]:
source_hash = Path.home() / "Projects/ASAP/team-scherzer/scherzer_hash.log"

source_file_crc = extract_crc32c_from_details2(source_hash)
source_file_md5 = extract_md5_from_details2(source_hash)

source_hash.exists()

True

In [46]:
source_file_crc,source_file_md5 = extract_hashes_from_gcloudstorage(source_hash)

## crc32c hashes 

due to gcloud defaults we have to use crc hashes to validate the fastqs

In [47]:
bucket_files_md5 = extract_md5_from_details2("scherzer_hexhash.log")


In [48]:
bucket_files_crc = extract_crc32c_from_details2("scherzer_hexhash.log")


In [49]:

checksum = DATA[['file_name','file_MD5']]


checksum['check_src_crc'] = checksum['file_name'].map(source_file_crc)
checksum['check_bucket_crc'] = checksum['file_name'].map(bucket_files_crc)


checksum['meta_MD5'] = checksum['file_MD5'].str.strip()
checksum['check_src_md5'] = checksum['file_name'].map(source_file_md5)
checksum['check_bucket_md5'] = checksum['file_name'].map(bucket_files_md5)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check_src_crc'] = checksum['file_name'].map(source_file_crc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check_bucket_crc'] = checksum['file_name'].map(bucket_files_crc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['meta_MD5'] = checksum['file_MD5'].str.strip()
A v

In [50]:
checksum

Unnamed: 0,file_name,file_MD5,check_src_crc,check_bucket_crc,meta_MD5,check_src_md5,check_bucket_md5
0,PD_BN0009_MTG_snRNAseq_rep1_L001_I1_001.fastq.gz,bc037ea92d5359f3da1ba6da4f974e1c,c99915b4,c99915b4,bc037ea92d5359f3da1ba6da4f974e1c,bc037ea92d5359f3da1ba6da4f974e1c,
1,PD_BN0009_MTG_snRNAseq_rep1_L001_I2_001.fastq.gz,0c8ef844b76212e82428af03d341048b,f0af5e35,f0af5e35,0c8ef844b76212e82428af03d341048b,0c8ef844b76212e82428af03d341048b,
2,PD_BN0009_MTG_snRNAseq_rep1_L001_R1_001.fastq.gz,c6a41f29849152df2ba2391dc559149b,151d9d97,151d9d97,c6a41f29849152df2ba2391dc559149b,c6a41f29849152df2ba2391dc559149b,
3,PD_BN0009_MTG_snRNAseq_rep1_L001_R2_001.fastq.gz,4d610fdf5a72c8206b2f1505d8fcecd7,58bc244e,58bc244e,4d610fdf5a72c8206b2f1505d8fcecd7,4d610fdf5a72c8206b2f1505d8fcecd7,
4,PD_BN0009_MTG_snRNAseq_rep1_L002_I1_001.fastq.gz,e34917b3ec01cd09ea8b52c82db1dc83,600d0703,600d0703,e34917b3ec01cd09ea8b52c82db1dc83,e34917b3ec01cd09ea8b52c82db1dc83,
...,...,...,...,...,...,...,...
1547,HC_BN1762_MTG_snRNAseq_rep1_L003_R2_001.fastq.gz,887e3d000587b2774ce3cd7022078577,93e1775e,93e1775e,887e3d000587b2774ce3cd7022078577,887e3d000587b2774ce3cd7022078577,
1548,HC_BN1762_MTG_snRNAseq_rep1_L004_I1_001.fastq.gz,44ac0c6d4f7f3b4502eead39564c8723,a361f850,a361f850,44ac0c6d4f7f3b4502eead39564c8723,44ac0c6d4f7f3b4502eead39564c8723,
1549,HC_BN1762_MTG_snRNAseq_rep1_L004_I2_001.fastq.gz,dc114df490e1857072462ca090799005,b617330f,b617330f,dc114df490e1857072462ca090799005,dc114df490e1857072462ca090799005,
1550,HC_BN1762_MTG_snRNAseq_rep1_L004_R1_001.fastq.gz,e24eddb1ac0b1f4d2bdd32b5f69dae27,e49ff1eb,e49ff1eb,e24eddb1ac0b1f4d2bdd32b5f69dae27,e24eddb1ac0b1f4d2bdd32b5f69dae27,
