ASAP CRN Metadata validation - wave 1

# Team Lee. ASAP CRN Metadata validation - wave 1

9 Oct 2024

Andy Henrie




In [3]:
import pandas as pd
from pathlib import Path
import os, sys

# Put <cwd>/src/crn_utils on the import path before the project-local imports below.
# NOTE(review): this depends on the kernel's working directory — confirm the notebook
# is always launched from the directory that contains src/crn_utils.
sys.path.append(os.path.abspath((os.path.join(os.getcwd(), 'src/crn_utils'))))

from util import read_CDE, NULL, prep_table, read_meta_table
from validate import validate_table, ReportCollector
from update_schema import v1_to_v2, v2_to_v3_PMDBS, create_upload_medadata_package

%load_ext autoreload
%autoreload 2



## CDEs
Load the relevant CDEs.

In [4]:
# Read the three CDE versions used in this notebook from the local resource folder.
schema_path = Path.home() / "Projects/ASAP/crn-utils/resource/CDE"
CDEv1, CDEv2, CDEv3 = (
    read_CDE(version, local_path=schema_path)
    for version in ("v1", "v2.1", "v3.0")
)
# The original cell left schema_version at the last version it loaded.
schema_version = "v3.0"

metadata_version: ASAP_CDE_v1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v1.csv
read local file
metadata_version: ASAP_CDE_v2.1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v2.1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v2.1.csv
read local file
metadata_version: ASAP_CDE_v3.0
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v3.0
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v3.0.csv
read local file


> SANITY CHECK: verify reading from google doc works.

```python
CDEv1_ = read_CDE("v1")
CDEv2_ = read_CDE("v2.1")
CDEv3_ = read_CDE("v3.0")
```


## Clean V1 Table
write clean metadata tables according to CDE v1

### Team Lee

In [5]:
## locate and load the original ("og") Team Lee tables
team = "lee"
collection = "sn_pmdbs"

root_path = Path.home() / "Projects/ASAP/data/teams"
metadata_path = root_path / team / collection / "metadata"
og_path = metadata_path / "og"

# NOTE: an earlier, retired variant of this cell loaded STUDY/PROTOCOL/SUBJECT/
# CLINPATH/SAMPLE from "<og_path>/<TABLE>.csv" via read_meta_table; the files are
# read directly with pandas here instead.

# Every table is tab-separated except CLINPATH, which is a comma-separated file.
SUBJECT = pd.read_csv(og_path / "SUBJECT.tsv", sep="\t")
SAMPLE = pd.read_csv(og_path / "SAMPLE.tsv", sep="\t")

CLINPATH = pd.read_csv(og_path / "CLINPATH.csv", sep=",")
STUDY = pd.read_csv(og_path / "STUDY.tsv", sep="\t")
PROTOCOL = pd.read_csv(og_path / "PROTOCOL.tsv", sep="\t")

# Version stamp written into STUDY later on: "<version>_<YYYYMMDD of this run>".
metadata_version = "v1"
METADATA_VERSION_DATE = "_".join(
    [metadata_version, pd.Timestamp.now().strftime("%Y%m%d")]
)

In [6]:
# Round-trip the raw STUDY sheet through "STUDY_.csv". The frame index is written
# too, so the re-read frame gains an extra leading unnamed column (see output).
study_roundtrip_csv = og_path / "STUDY_.csv"
pd.read_csv(og_path / "STUDY.tsv", sep="\t").to_csv(study_roundtrip_csv)
STUDY = pd.read_csv(study_roundtrip_csv)
STUDY.head()


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Team-Lee-Bras-Lab-Info,Field,Description,Data type,Validation,Note,Required/Optional
0,Is senescence a component of human PD and does...,project_name,Project Name/Title,String,,Unique and clear title.,Required,,,
1,Human snRNA-seq PD Senesence Jose Bras Team Lee,project_dataset,Dataset name,String,,A Dataset name is required for each submission...,Required,,,
2,Characterize the neuropathological progression...,project_description,Brief description of the goals and objectives ...,String,,,Required,,,
3,TEAM-LEE,ASAP_team_name,"ASAP Team e.g. ""Scherzer""",Enum,"[""TEAM-LEE"",""TEAM-HAFLER"",""TEAM-HARDY"",....]",,Required,,,
4,Bras,ASAP_lab_name,"ASAP Lab under the above team e.g. ""Dong""",String,,,Required,,,


In [7]:
# Reshape STUDY from a long "field / value" sheet into one wide row:
# "Unnamed: 1" holds the field names and "Unnamed: 0" the submitted values.
study_wide = STUDY[["Unnamed: 1", "Unnamed: 0"]].T.reset_index(drop=True)
# Row 0 (the field names) becomes the header and is then dropped from the data.
study_wide.columns = study_wide.iloc[0]
STUDY = study_wide.drop(index=[0])
STUDY.head()

Unnamed: 0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,submitter_id,submitter_name,submittor_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Is senescence a component of human PD and does...,Human snRNA-seq PD Senesence Jose Bras Team Lee,Characterize the neuropathological progression...,TEAM-LEE,Bras,"Jose, Bras",jose.bras@vai.org,"Lee, L, Marshall ; Kimberly, E, Paquette ; Kai...",Kaitlyn E Westra,kaitlyn.westra@vai.org,...,,,,75,hippocampus; middle frontal gyrus; substantia ...,human PD and control postmortem brains,,,unsure,


In [8]:
# Legacy label; nothing in this cell reads it (the stamp comes from METADATA_VERSION_DATE).
metadata_version_date = "v1, SEPT2023"

# Stamp the version date, blank out literal "Nan" cells, and rename
# submitter_id to contributor_names.
STUDY = (
    STUDY.assign(metadata_version_date=METADATA_VERSION_DATE)
    .replace("Nan", "")
    .rename(columns={"submitter_id": "contributor_names"})
)
STUDY


Unnamed: 0,project_name,project_dataset,project_description,ASAP_team_name,ASAP_lab_name,PI_full_name,PI_email,contributor_names,submitter_name,submittor_email,...,other_funding_source,publication_DOI,publication_PMID,number_of_brain_samples,brain_regions,types_of_samples,PI_ORCHID,PI_google_scholar_id,DUA_version,metadata_version_date
1,Is senescence a component of human PD and does...,Human snRNA-seq PD Senesence Jose Bras Team Lee,Characterize the neuropathological progression...,TEAM-LEE,Bras,"Jose, Bras",jose.bras@vai.org,"Lee, L, Marshall ; Kimberly, E, Paquette ; Kai...",Kaitlyn E Westra,kaitlyn.westra@vai.org,...,,,,75,hippocampus; middle frontal gyrus; substantia ...,human PD and control postmortem brains,,,unsure,v1_20241016


Collect additional metadata (e.g. batch) from each brain region's covar.csv and sample batch-information files.

In [9]:

aux_metadata_path = Path.home() / "Projects/ASAP/team-lee/metadata"


def _read_region_tables(region):
    """Read covar, cases and controls CSVs from ``aux_metadata_path / region``.

    Rows that are entirely empty are dropped from the cases sheet only.
    Returns a ``(covar, cases, controls)`` tuple of DataFrames.
    """
    region_dir = aux_metadata_path / region
    covar = pd.read_csv(region_dir / "covar.csv")
    cases = pd.read_csv(
        region_dir / "PD_ASAP_Sample_batch_information_banner_cases.csv"
    ).dropna(axis=0, how="all")
    controls = pd.read_csv(
        region_dir / "PD_ASAP_Sample_batch_information_banner_controls.csv"
    )
    return covar, cases, controls


HIP_covar, HIP_cases, HIP_control = _read_region_tables("HIP")
MFG_covar, MFG_cases, MFG_control = _read_region_tables("MFG")  # MFG covar includes 'PMI' ?
SN_covar, SN_cases, SN_control = _read_region_tables("SN")

In [10]:
# Hippocampus samples: stack the cases and controls sheets, then derive the group
# label from the "PD" column ("yes" -> "PD", anything else -> "HC").
HIP_meta = pd.concat([HIP_cases, HIP_control], axis=0, ignore_index=True)
HIP_meta["GROUPcv"] = HIP_meta["PD"].apply(
    lambda pd_flag: "HC" if pd_flag != "yes" else "PD"
)


In [11]:


# Build the join key "HIP_<group>_<CaseID without dashes>" on the meta side.
hip_case_digits = HIP_meta['CaseID'].str.replace('-','')
HIP_meta['MERGE_ID'] = "HIP_" + HIP_meta['GROUPcv'] + "_" + hip_case_digits

# On the covar side COUNT_ID is the join key; it is also copied into SEQ_ID because
# the fastqs follow the COUNT_ID rather than the SEQ_ID naming convention.
for alias in ('MERGE_ID', 'SEQ_ID'):
    HIP_covar[alias] = HIP_covar['COUNT_ID']



In [12]:
# Author's note: there's a bug in the meta table (the plain-covar fallback,
# HIP_TABLE = HIP_covar, was left disabled). The outer join keeps unmatched
# rows from both sides.
HIP_TABLE = HIP_covar.merge(HIP_meta, on='MERGE_ID', how='outer')

# Tag every row with its region sub-directory.
HIP_TABLE = HIP_TABLE.assign(subdir="HIP")


In [13]:
# Narrow view of the merged hippocampus table for eyeballing the join result.
hip_check_columns = ["MERGE_ID", "SEQ_ID", "GROUPcv", "subdir", 'PD']
test = HIP_TABLE.loc[:, hip_check_columns]

In [14]:
### Middle frontal gyrus (MFG) samples
# Stack cases and controls, then label each row "PD" when the PD column is "yes",
# otherwise "HC".
MFG_meta = pd.concat([MFG_cases, MFG_control], axis=0, ignore_index=True)
MFG_meta["GROUPcv"] = MFG_meta["PD"].apply(lambda x: "PD" if (x == "yes") else "HC")

# Make a MERGE_ID column because the ID formatting is inconsistent between tables:
# "MFG_<group>_<CaseID without dashes>" on the meta side, SAMPLE on the covar side.
MFG_meta['MERGE_ID'] = "MFG_" + MFG_meta['GROUPcv'] + "_" + MFG_meta['CaseID'].str.replace('-', '')
MFG_covar['MERGE_ID'] = MFG_covar['SAMPLE']
# the fastqs are in SEQ_ID

# Author's note: there's a bug in the meta table... skip for now.
# NOTE(review): this is an inner join, whereas the HIP and SN tables use outer
# joins — confirm the difference is intentional.
MFG_TABLE = pd.merge(MFG_covar, MFG_meta, on='MERGE_ID', how='inner')
MFG_TABLE['subdir'] = "MFG"


### Substantia nigra (SN) samples
SN_meta = pd.concat([SN_cases, SN_control], axis=0, ignore_index=True)
SN_meta["GROUPcv"] = SN_meta["PD"].apply(lambda x: "PD" if (x == "yes") else "HC")

# FIX: build the SN join key from SN_meta's own columns. The previous version used
# MFG_meta['GROUPcv'] / MFG_meta['CaseID'], which index-aligned the SN rows against
# the MFG table and produced wrong (or NaN) ids whenever the two tables differ in
# row order or length.
SN_meta['MERGE_ID'] = "SN_" + SN_meta['GROUPcv'] + "_" + SN_meta['CaseID'].str.replace('-', '')
SN_covar['MERGE_ID'] = SN_covar['SAMPLE']

# Author's note: there's a bug in the meta table... skip for now.
SN_TABLE = pd.merge(SN_covar, SN_meta, on='MERGE_ID', how='outer')
SN_TABLE['subdir'] = "SN"


### Concatenate the HIP, MFG and SN tables into one 'all_samples' table
all_samples = pd.concat([HIP_TABLE, MFG_TABLE, SN_TABLE], axis=0, ignore_index=True)


In [15]:

# Attach the per-region auxiliary columns to every SAMPLE row (left join keeps all
# SAMPLE rows) and keep a copy on disk beside the original tables.
SAMPLE_ALL = pd.merge(
    SAMPLE,
    all_samples,
    how='left',
    left_on='sample_id',
    right_on='MERGE_ID',
)

alternate_metadata_csv = og_path / "alternate_metadata.csv"
SAMPLE_ALL.to_csv(alternate_metadata_csv)

In [16]:
# Keep an untouched copy of SAMPLE, then pull the batch label over from SAMPLE_ALL.
# NOTE(review): the assignment aligns on the row index, so it presumably relies on
# the left merge not having duplicated any SAMPLE rows — confirm MERGE_ID is unique.
SAMPLE_og = SAMPLE.copy()
SAMPLE = SAMPLE.assign(batch=SAMPLE_ALL['BATCH'])

In [17]:
# Prepare SAMPLE with the project's prep_table helper, using the v1 CDE as the schema.
# NOTE(review): prep_table is defined in util (not visible here); presumably it casts
# the Enum/String fields to str, as the retired snippet below did — confirm.
SAMPLE = prep_table(SAMPLE, CDEv1)
# for field in string_enum_fields:
#     if field in SAMPLE.columns:
#         SAMPLE[field] = SAMPLE[field].astype(str)


In [18]:
# Write the updated tables back to the og folder as "<TABLE>_.csv".
# Note that this overwrites the STUDY_.csv produced by the earlier round-trip cell.
og_exports = (
    ("SAMPLE", SAMPLE),
    ("SUBJECT", SUBJECT),
    ("CLINPATH", CLINPATH),
    ("STUDY", STUDY),
    ("PROTOCOL", PROTOCOL),
)
for table_stem, table_frame in og_exports:
    table_frame.to_csv(og_path / f"{table_stem}_.csv")



In [49]:

# Reload the five "<TABLE>_.csv" files through read_meta_table.
SAMPLE, SUBJECT, CLINPATH, STUDY, PROTOCOL = (
    read_meta_table(og_path / f"{table_stem}_.csv")
    for table_stem in ("SAMPLE", "SUBJECT", "CLINPATH", "STUDY", "PROTOCOL")
)



In [50]:
# Inspect the distinct primary_diagnosis values before deciding on any recoding.
SUBJECT['primary_diagnosis'].unique()

array(['No PD nor other neurological disorder',
       'Other neurological disorder', 'Idiopathic PD'], dtype=object)

In [51]:
# SUBJECT['primary_diagnosis'] = SUBJECT['primary_diagnosis'].replace('No PD nor other neurological disorder','No PD or other neurological disorder')

In [52]:
# Normalise the capitalised "Fastq" label to lower-case "fastq".
SAMPLE['file_type'] = SAMPLE['file_type'].replace("Fastq", "fastq")


In [53]:
# Turn the comma-separated file_name / file_MD5(R1,R2) cells into one row per file.
# This is done last so the earlier steps could work with one row per sample.
# NOTE(review): items are split on "," only, so any whitespace around the commas is
# kept in the resulting values — confirm the source has none.
paired_file_columns = ['file_name', 'file_MD5(R1,R2)']
SAMPLE = (
    SAMPLE.assign(
        **{column: SAMPLE[column].str.split(',') for column in paired_file_columns}
    )
    # Exploding both list columns together keeps each file paired with its checksum.
    .explode(paired_file_columns)
    .rename(columns={"file_MD5(R1,R2)": "file_MD5"})
)



In [54]:
# Translate SAMPLE.subject_id from the source subject ids to the SUBJECT table's ids.
# Ids with no entry in the lookup become NaN, so re-running this cell on already
# translated ids would blank them.
samp_id_mapper = {
    source_id: subject_id
    for source_id, subject_id in zip(SUBJECT['source_subject_id'], SUBJECT['subject_id'])
}
SAMPLE['subject_id'] = SAMPLE['subject_id'].map(samp_id_mapper)

In [55]:
# Preview SAMPLE after the explode: each sample_id (and index label) repeats once per file.
SAMPLE.head()

Unnamed: 0,sample_id,source_sample_id,subject_id,replicate,replicate_count,repeated_sample,tissue,brain_region,source_RIN,RIN,...,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV2000,pm_PH,donor_id,batch
0,MFG_HC_1225,12-25,HC_1225,rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
0,MFG_HC_1225,12-25,HC_1225,rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
1,MFG_HC_0602,06-02,HC_0602,rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
1,MFG_HC_0602,06-02,HC_0602,rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4
2,MFG_PD_0009,00-09,PD_0009,rep1,1,0,Brain,Middle_Frontal_Gyrus,,,...,Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,,BATCH_4


In [56]:

# Repair the two file names that carry a doubled ".gzgz" extension.
gzgz_fixes = {
    'HIP_PD_0348_S0_L000_R2_001.fastq.gzgz': 'HIP_PD_0348_S0_L000_R2_001.fastq.gz',
    'HIP_HC_1939_S6_L001_R2_001.fastq.gzgz': 'HIP_HC_1939_S6_L001_R2_001.fastq.gz',
}
SAMPLE['file_name'] = SAMPLE['file_name'].replace(gzgz_fixes)
SAMPLE['file_name'].to_list()


['MFGHC1225_S9_L001_R1_001.fastq.gz',
 'MFGHC1225_S9_L001_R2_001.fastq.gz',
 'MFGHC0602_S2_L001_R1_001.fastq.gz',
 'MFGHC0602_S2_L001_R2_001.fastq.gz',
 'MFGPD0009_S3_L001_R1_001.fastq.gz',
 'MFGPD0009_S3_L001_R2_001.fastq.gz',
 'MFGPD1921_S9_L001_R1_001.fastq.gz',
 'MFGPD1921_S9_L001_R2_001.fastq.gz',
 'MFGPD2058_S5_L001_R1_001.fastq.gz',
 'MFGPD2058_S5_L001_R2_001.fastq.gz',
 'MFGPD1441_S9_L001_R1_001.fastq.gz',
 'MFGPD1441_S9_L001_R2_001.fastq.gz',
 'MFGPD1344_S7_L001_R1_001.fastq.gz',
 'MFGPD1344_S7_L001_R2_001.fastq.gz',
 'MFGHC1939_S15_L001_R1_001.fastq.gz',
 'MFGHC1939_S15_L001_R2_001.fastq.gz',
 'MFGHC1308_S8_L002_R1_001.fastq.gz',
 'MFGHC1308_S8_L002_R2_001.fastq.gz',
 'MFGHC1862_S5_L004_R1_001.fastq.gz',
 'MFGHC1862_S5_L004_R2_001.fastq.gz',
 'MFGHC1864_S1_L002_R1_001.fastq.gz',
 'MFGHC1864_S1_L002_R2_001.fastq.gz',
 'MFGHC2057_S7_L002_R1_001.fastq.gz',
 'MFGHC2057_S7_L002_R2_001.fastq.gz',
 'MFGHC2061_S7_L004_R1_001.fastq.gz',
 'MFGHC2061_S7_L004_R2_001.fastq.gz',
 'MFGHC206

In [57]:
# region_level_2: recode "Hippocampus" as "CA1-CA4".
CLINPATH['region_level_2'] = CLINPATH['region_level_2'].replace('Hippocampus', 'CA1-CA4')

# Left untouched for now:
#   - hx_melanoma and education level: the CDE has no "Unknown" / "Not Reported" option
#   - APOE_e4_status: several rows are coded as "2,3"
#   - cognitive status: the CDE has no "Unknown" / "Not Reported" option
#   - path_nia_ri: unclear how to map "criteria not met" and "Not AD"
#   - amyloid_angiopathy_severity_scale: unclear how to map 'Cerebral amyloid
#     angiopathy, temporal and occipital lobe' / 'Cerebral amyloid angiopathy, frontal lobe'

# Candidate Braak coding for the submitted labels. It is not applied: the submitted
# values are really McKeith categories, so path_braak_asyn is blanked (empty string).
braak_map = {
    'L. Olfactory Bulb-Only': "1/2",
    'Lla. Brainstem Predominant': "3",
    'Llb. Limbic Predominant': "3/4",
    'LV. Neocortical': "5",
    'Lll. Brainstem/Limbic': "3/4",
    '0. No Lewy bodies': "0",
}
CLINPATH['path_braak_asyn'] = ""

# McKeith recoding. The stage labels occur with both an upper-case "L" and a
# lower-case "l" as their first character, so both spellings get the same target.
_mckeith_stage_labels = {
    'L. Olfactory Bulb-Only': "Olfactory bulb only",
    'Lla. Brainstem Predominant': "Brainstem",
    'Llb. Limbic Predominant': "Limbic (transitional)",
    'LV. Neocortical': "Neocortical",
    'Lll. Brainstem/Limbic': "Amygdala Predominant",
}
mckeith_map = {
    **_mckeith_stage_labels,
    **{"l" + label[1:]: target for label, target in _mckeith_stage_labels.items()},
    '0. No Lewy bodies': "Absent",
}

CLINPATH['path_mckeith'] = CLINPATH['path_mckeith'].replace(mckeith_map)


In [58]:
# Recode the CLINPATH values that the validator reported as invalid Enum entries.
# Valid targets, per the validation report:
#   region_level_3: 'Grey matter', 'White matter', 'Grey and white matter', 'Unknown', 'NA'
#   path_nia_ri: 'Low', 'Intermediate', 'High', 'None', 'NA'
#   TDP43: 'None in medial temporal lobe', 'Present in amygdala, only',
#       'Present in hippocampus, only', 'Present in amygdala and hippocampus, only',
#       'Present in medial temporal lobe and middle frontal gyrus (not FTLD pattern)',
#       'Unknown', 'NA'
#   amyloid_angiopathy_severity_scale: 'None', 'Mild', 'Moderate', 'Severe',
#       'Not assessed', 'Unknown', 'NA'
#   path_ad_level: 'No evidence of ...', 'Low level ...', 'At least low level ...',
#       'Intermediate level ...', 'At least intermediate level ...', 'High level ...'
#       (each followed by "Alzheimer's disease neuropathological change"), 'Unknown', 'NA'
# path_mckeith was also reported; it is recoded with mckeith_map in the previous
# cell (applying braak_map to it was tried and left disabled).
# NOTE(review): both amyloid angiopathy descriptions are mapped to 'Severe', and the
# "changes" / "lesions" path_ad_level variants get different targets — confirm these
# choices with the data owner.
clinpath_recodes = {
    'region_level_3': {'unknown': 'Unknown'},
    'path_nia_ri': {
        'Criteria not met': 'None',
        'Not AD': 'None',
        'criteria not met': 'None',
    },
    'TDP43': {'Na': 'NA'},
    'amyloid_angiopathy_severity_scale': {
        'Cerebral amyloid angiopathy, temporal and occipital lobe': 'Severe',
        'Cerebral amyloid angiopathy, frontal lobe': 'Severe',
    },
    'path_ad_level': {
        "Microscopic changes of Alzheimer's disease, insufficient for diagnosis":
            "Low level Alzheimer's disease neuropathological change",
        "Microscopic lesions of Alzheimer's disease, insufficient for diagnosis":
            'Unknown',
    },
}
for clinpath_column, recode in clinpath_recodes.items():
    CLINPATH[clinpath_column] = CLINPATH[clinpath_column].replace(recode)


In [59]:

# Combine the auxiliary sample table with CLINPATH, keeping unmatched rows from both.
SAMPLE_ALL_CP = pd.merge(SAMPLE_ALL, CLINPATH, how='outer', on='sample_id')


In [60]:
# Output folder for the cleaned v1 tables; echoed to confirm the resolved location.
v1_path = metadata_path / "v1"
v1_path

PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/lee/sn_pmdbs/metadata/v1')

In [61]:

# Write the cleaned v1 tables plus the merged auxiliary table into v1_path.
# Create the output folder first: DataFrame.to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
v1_path.mkdir(parents=True, exist_ok=True)

# The "auxiluary" spelling is kept as-is because later upload cells copy that file name.
SAMPLE_ALL_CP.to_csv(v1_path / "auxiluary_metadata.csv")
SAMPLE.to_csv(v1_path / "SAMPLE.csv")
SUBJECT.to_csv(v1_path / "SUBJECT.csv")
CLINPATH.to_csv(v1_path / "CLINPATH.csv")
STUDY.to_csv(v1_path / "STUDY.csv")
PROTOCOL.to_csv(v1_path / "PROTOCOL.csv")


In [62]:
# Validate every v1 table on disk against the v1 CDE, print each report, and keep
# the frames returned by validate_table in `dfs` for the next cell.
CDE = CDEv1
tables = CDE['Table'].unique()
v1_path = metadata_path / "v1"

dfs = {}
for table in tables:
    table_schema = CDE.loc[CDE['Table'] == table]
    loaded = read_meta_table(v1_path / f"{table}.csv")

    collector = ReportCollector(destination="NA")
    # validate_table receives a copy, so `loaded` itself is left untouched.
    validated, collector = validate_table(loaded.copy(), table, table_schema, collector)
    collector.print_log()
    dfs[table] = validated
    # (writing the frame back to v1_path / f"{table}.csv" is intentionally left disabled)

recoding number_of_brain_samples as int
🚨⚠️❗ **Missing Required Fields in STUDY: submitter_email**
🚨⚠️❗ **5 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_ORCHID: 1/1 empty rows (OPTIONAL)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.
🚨⚠️❗ **Extra field in STUDY: submittor_email**

All required fields are present in *SUBJECT* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- primary_diagnosis_text: 23/25 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

recoding replicate_count as int
recoding repeated_sample as int
recoding input_cell_count as int
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **Missing Optional Fields in SAMPLE: DV200**
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- source_RIN: 150/150 empty rows (REQUIRED)

	- RIN: 150/150 empty rows (REQUIRED)

	- hea

In [63]:
# Make each table conform to the CDE: write only the CDE fields to "<table>.csv" and
# save any extra columns separately as "<table>_auxiliary.csv".
for table in tables:
    df = dfs[table]
    schema = CDE[CDE['Table'] == table]
    valid_fields = schema['Field'].unique()
    df_out = df[valid_fields]

    # Select the extra columns in their original table order. The previous
    # list(set(...)) selection had no defined order, so the column order of the
    # auxiliary CSV could change from one kernel run to the next.
    valid_field_set = set(valid_fields)
    aux_fields = [column for column in df.columns if column not in valid_field_set]
    if aux_fields:
        df_aux = df[aux_fields]
        # NOTE(review): the auxiliary file carries no id column and is written with
        # index=False, so it lines up with "<table>.csv" by row position only.
        df_aux.to_csv(v1_path / f"{table}_auxiliary.csv", index=False)
        print(f"Saved {table}_auxiliary.csv")
    df_out.to_csv(v1_path / f"{table}.csv", index=False)
    

Saved STUDY_auxiliary.csv
Saved SAMPLE_auxiliary.csv


In [64]:
# Re-check the distinct primary_diagnosis values in the validated SUBJECT frame.
dfs['SUBJECT']['primary_diagnosis'].unique()

array(['No PD nor other neurological disorder',
       'Other neurological disorder', 'Idiopathic PD'], dtype=object)

## Update the table to v2

In [65]:
# Upgrade the v1 tables to the v2 schema with the project's v1_to_v2 helper, which
# is given the v1 and v2 folders and both CDE versions.
v2_path = metadata_path / "v2"
# NOTE(review): "sc_pmdbs" differs from collection = "sn_pmdbs" used for the data
# paths earlier in this notebook — confirm which id is intended.
team_dataset_id = "sc_pmdbs"

v2_tables, aux_tables = v1_to_v2(v1_path, v2_path, CDEv1, CDEv2, team_dataset_id=team_dataset_id)




recoding number_of_brain_samples as int
recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
recoding replicate_count as int
recoding repeated_sample as int
recoding input_cell_count as int


In [66]:
# Validate each in-memory v2 table against the v2 CDE and print its report.
CDE = CDEv2
for table_name, table_frame in v2_tables.items():
    table_schema = CDE.loc[CDE['Table'] == table_name]
    collector = ReportCollector(destination="NA")
    # Only the printed report matters here; the validated frame is not kept.
    validated, collector = validate_table(
        table_frame.copy(), table_name, table_schema, collector
    )
    collector.print_log()



recoding number_of_brain_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- submitter_email: 1/1 empty rows (REQUIRED)

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_ORCHID: 1/1 empty rows (OPTIONAL)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- other_reference: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
All required fields are present in *SUBJECT* table.
🚨⚠️❗ **12 Fields with empty (NULL) values:**

	- AMPPD_id: 25/25 empty rows (REQUIRED)

	- GP2_id: 25/25 empty rows (REQUIRED)

	- last_diagnosis: 25

In [67]:
# Names of the tables that make up a v3 metadata set, shown as their string form.
v3_meta_tables = [
    'STUDY',
    'PROTOCOL',
    'SUBJECT',
    'SAMPLE',
    'DATA',
    'CLINPATH',
    'PMDBS',
    'CONDITION',
    'ASSAY_RNAseq',
]

str(v3_meta_tables)


"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']"

In [68]:
# Upgrade the v2 tables to the v3 (PMDBS) schema with the project's v2_to_v3_PMDBS
# helper. Note that aux_tables from the v1 -> v2 step is overwritten here.
v3_path = metadata_path / "v3"

v3_tables, aux_tables = v2_to_v3_PMDBS(v2_path, v3_path, CDEv2, CDEv3)

recoding number_of_brain_samples as int
recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
recoding replicate_count as int
recoding repeated_sample as int
recoding input_cell_count as int
recoding replicate_count as int
recoding repeated_sample as int


In [69]:
# Validate each in-memory v3 table against the v3 CDE and print its report.
CDE = CDEv3
for table_name in v3_tables:
    table_schema = CDE.loc[CDE['Table'] == table_name]
    collector = ReportCollector(destination="NA")
    # validate_table works on a copy, leaving the frames in v3_tables unchanged.
    validated, collector = validate_table(
        v3_tables[table_name].copy(), table_name, table_schema, collector
    )
    collector.print_log()

recoding number_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **7 Fields with empty (NULL) values:**

	- submitter_email: 1/1 empty rows (REQUIRED)

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- PI_ORCID: 1/1 empty rows (OPTIONAL)

	- PI_google_scholar_id: 1/1 empty rows (OPTIONAL)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- other_reference: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *SUBJECT* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- primary_diagnosis_text: 23/25 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

recoding replicate_count as int
recoding repeated_sample as int
All required fields are present in *SAMPLE* table.


In [70]:
# Rebind STUDY to the v3 version of the table and display it.
# NOTE: from here on STUDY no longer refers to the v1 frame built earlier.
STUDY = v3_tables['STUDY']
STUDY

Unnamed: 0,ASAP_team_name,ASAP_lab_name,project_name,team_dataset_id,project_dataset,project_description,PI_full_name,PI_email,contributor_names,submitter_name,...,number_samples,sample_types,types_of_samples,DUA_version,metadata_tables,PI_ORCID,PI_google_scholar_id,preprocessing_references,metadata_version_date,alternate_dataset_id
0,TEAM-LEE,Bras,Is senescence a component of human PD and does...,sc_pmdbs,Human snRNA-seq PD Senesence Jose Bras Team Lee,Characterize the neuropathological progression...,"Jose, Bras",jose.bras@vai.org,"Lee, L, Marshall ; Kimberly, E, Paquette ; Kai...",Kaitlyn E Westra,...,75,hippocampus; middle frontal gyrus; substantia ...,human PD and control postmortem brains,unsure,"['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DA...",,,NA(raw data),v1_20241016,


### Create metadata package


In [71]:
# Hand the v3 tables to the project's create_upload_medadata_package helper,
# targeting the team's folder under root_path.
export_path = root_path / f"{team}"

create_upload_medadata_package(export_path, v3_tables)

In [72]:
# Inspect the condition_id assigned to each v3 SAMPLE row.
v3_tables['SAMPLE']['condition_id']

0     no_pd_nor_other_neurological_disorder
1     no_pd_nor_other_neurological_disorder
2     no_pd_nor_other_neurological_disorder
3               other_neurological_disorder
4               other_neurological_disorder
                      ...                  
70                            idiopathic_pd
71                            idiopathic_pd
72                            idiopathic_pd
73                            idiopathic_pd
74                            idiopathic_pd
Name: condition_id, Length: 75, dtype: object

In [73]:
# Show the v3 CONDITION table for comparison with the SAMPLE condition_id values.
v3_tables['CONDITION']

Field,condition_id,intervention_name,intervention_id,protocol_id,intervention_aux_table
0,no_pd_nor_other_neurological_disorder,Case-Control,Control,,
1,other_neurological_disorder,Case-Control,Other,,
2,idiopathic_pd,Case-Control,Case,,


Transfer cleaned metadata to raw buckets 




## Lee

In [131]:
# Lee: switch gcloud to the service account stored in the local key file.
# NOTE(review): absolute, user-specific path to a credentials file — consider
# reading the location from an environment variable instead.
!gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/lee-credentials.json 


Activated service account credentials for: [raw-admin-lee@dnastack-asap-parkinsons.iam.gserviceaccount.com]


In [132]:

# Long listing (-l) of all object versions (-a) under metadata/v2 in the Team Lee
# raw bucket; -u names the project used for the request.
!gsutil -u dnastack-asap-parkinsons ls -al "gs://asap-raw-data-team-lee/metadata/v2"


      6243  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/CLINPATH.csv#1701296766077305  metageneration=1
     24218  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/DATA.csv#1701296766689376  metageneration=1
       968  2023-11-29T22:26:07Z  gs://asap-raw-data-team-lee/metadata/v2/PROTOCOL.csv#1701296767085192  metageneration=1
     23092  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/SAMPLE.csv#1701296766486332  metageneration=1
      1054  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/STUDY.csv#1701296766878081  metageneration=1
      4277  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/SUBJECT.csv#1701296766288192  metageneration=1
TOTAL: 6 objects, 59852 bytes (58.45 KiB)


In [47]:
# DESTRUCTIVE: recursively removes the dated v2_20231128 folder from the raw bucket.
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# do not include it in a blind "Run All".
!gsutil -u dnastack-asap-parkinsons rm -r "gs://asap-raw-data-team-lee/metadata/v2/v2_20231128"

Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/CLINPATH.csv#1701213902124849...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/DATA.csv#1701213902800745...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/PROTOCOL.csv#1701213903243233...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/SAMPLE.csv#1701213902582568...
/ [4 objects]                                                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m rm ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/STUDY.csv#1701213903050409...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/SUBJECT.csv#1701213902358354...
/ [6 objects]                                                                   
Operation completed over 6 objec

In [None]:
# Show the working directory; the relative "./clean/..." paths in the copy cells
# below are resolved against it.
Path.cwd()

PosixPath('/Users/ergonyc/Projects/ASAP/meta-clean')

In [133]:

# Copy the CSVs from the local v2_20231130 folder into metadata/v2 of the raw bucket.
# NOTE(review): the source is ./clean/team-Lee/..., not the metadata_path folders
# written earlier in this notebook — confirm it is the intended upload source.
!gsutil -u dnastack-asap-parkinsons cp -r "./clean/team-Lee/v2_20231130/*.csv"  "gs://asap-raw-data-team-lee/metadata/v2"

Copying file://./clean/team-Lee/v2_20231130/CLINPATH.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SUBJECT.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SAMPLE.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/DATA.csv [Content-Type=text/csv]... 
- [4 files][ 56.5 KiB/ 56.5 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./clean/team-Lee/v2_20231130/STUDY.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/PROTOCOL.csv [Content-Type=text/csv]...
\ [6 files][ 58.4 KiB/ 58.4 KiB]                                                
Operation completed over 6 objects/58.4 KiB.                                     


In [44]:
# Copy every CSV directly under ./clean/team-Lee into metadata/clean of the raw bucket.
!gsutil -u dnastack-asap-parkinsons cp -r "./clean/team-Lee/*.csv"  "gs://asap-raw-data-team-lee/metadata/clean/"

Copying file://./clean/team-Lee/CLINPATH.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/SUBJECT.csv [Content-Type=text/csv]...          
Copying file://./clean/team-Lee/auxiluary_metadata.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/SAMPLE.csv [Content-Type=text/csv]...           
\ [4 files][187.4 KiB/187.4 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./clean/team-Lee/STUDY.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/auxilarry_metadata.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/PROTOCOL.csv [Content-Type=text/csv]...         
Copying file://./clean/team-Lee/CDE.csv [Content-Type=text/csv]...              
| [8 files][328.0 KiB/328.0 KiB]      

In [49]:
# List metadata/clean in the raw bucket to check what it currently holds.
!gsutil -u dnastack-asap-parkinsons ls -al "gs://asap-raw-data-team-lee/metadata/clean"


     37741  2023-11-28T21:58:33Z  gs://asap-raw-data-team-lee/metadata/clean/CDE.csv#1701208713502213  metageneration=1
     24241  2023-11-28T21:58:31Z  gs://asap-raw-data-team-lee/metadata/clean/CLINPATH.csv#1701208711158586  metageneration=1
       968  2023-11-28T21:58:33Z  gs://asap-raw-data-team-lee/metadata/clean/PROTOCOL.csv#1701208713057031  metageneration=1
     60412  2023-11-28T21:58:32Z  gs://asap-raw-data-team-lee/metadata/clean/SAMPLE.csv#1701208712230186  metageneration=1
       969  2023-11-28T21:58:32Z  gs://asap-raw-data-team-lee/metadata/clean/STUDY.csv#1701208712490381  metageneration=1
      3078  2023-11-28T21:58:31Z  gs://asap-raw-data-team-lee/metadata/clean/SUBJECT.csv#1701208711431682  metageneration=1
    104336  2023-11-28T21:58:32Z  gs://asap-raw-data-team-lee/metadata/clean/auxilarry_metadata.csv#1701208712852367  metageneration=1
    109904  2023-09-22T10:45:50Z  gs://asap-raw-data-team-lee/metadata/clean/auxiliary_metadata.csv#1695379550068707  metagene

### copy to workflow-dev bucket

First copy each set of metadata locally ...

In [135]:
# Switch gcloud to the service account stored in wf-credentials.json.
# NOTE(review): absolute, user-specific path to a credentials file.
!gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/wf-credentials.json

Activated service account credentials for: [admin-workflow-dev@dnastack-asap-parkinsons.iam.gserviceaccount.com]


In [51]:
# List the Team Lee v2 metadata currently in the workflow-dev bucket
# (the CDE folder listing below is left disabled).
# !gsutil  ls -al "gs://asap-workflow-dev/CDE"

!gsutil  ls -al "gs://asap-workflow-dev/metadata/v2/lee"


      6243  2023-11-28T22:21:42Z  gs://asap-workflow-dev/metadata/v2/lee/CLINPATH.csv#1701210102669978  metageneration=1
     24218  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/DATA.csv#1701210103715354  metageneration=1
       968  2023-11-28T22:21:44Z  gs://asap-workflow-dev/metadata/v2/lee/PROTOCOL.csv#1701210104255218  metageneration=1
     23092  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SAMPLE.csv#1701210103292269  metageneration=1
      1054  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/STUDY.csv#1701210103992608  metageneration=1
      4277  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SUBJECT.csv#1701210103010639  metageneration=1
                                 gs://asap-workflow-dev/metadata/v2/lee/v2_20231128/
TOTAL: 6 objects, 59852 bytes (58.45 KiB)


In [136]:
# Copy the CSVs from the local v2_20231130 folder into the workflow-dev bucket.
!gsutil  cp -r "./clean/team-Lee/v2_20231130/*.csv" "gs://asap-workflow-dev/metadata/v2/lee"


Copying file://./clean/team-Lee/v2_20231130/CLINPATH.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SUBJECT.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SAMPLE.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/DATA.csv [Content-Type=text/csv]... 
- [4 files][ 56.5 KiB/ 56.5 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./clean/team-Lee/v2_20231130/STUDY.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/PROTOCOL.csv [Content-Type=text/csv]...
\ [6 files][ 58.4 KiB/ 58.4 KiB]                                                
Operation completed over 6 objects/58.4 KiB.                                     


In [56]:
# List the workflow-dev folder again to inspect its contents.
!gsutil ls -al "gs://asap-workflow-dev/metadata/v2/lee"


      6243  2023-11-28T22:21:42Z  gs://asap-workflow-dev/metadata/v2/lee/CLINPATH.csv#1701210102669978  metageneration=1
     24218  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/DATA.csv#1701210103715354  metageneration=1
       968  2023-11-28T22:21:44Z  gs://asap-workflow-dev/metadata/v2/lee/PROTOCOL.csv#1701210104255218  metageneration=1
     23092  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SAMPLE.csv#1701210103292269  metageneration=1
      1054  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/STUDY.csv#1701210103992608  metageneration=1
      4277  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SUBJECT.csv#1701210103010639  metageneration=1
TOTAL: 6 objects, 59852 bytes (58.45 KiB)


## check file md5s

In [39]:
from utils.checksums import extract_md5_from_details, extract_md5_from_details2


In [36]:
# Activate the Team Lee service account from its JSON key file.
# NOTE(review): the key path is absolute and user-specific, so this cell only runs on
# the original author's machine; consider deriving it from a configurable location.
!gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/lee-credentials.json  



Activated service account credentials for: [raw-admin-lee@dnastack-asap-parkinsons.iam.gserviceaccount.com]


In [37]:

# Compute hex-encoded hashes for every .gz object in the Team Lee raw-data bucket and
# save gsutil's report to lee_hexhash.log (`-u` names the project billed for the request).
# The commented-out line is a `gcloud storage hash` alternative kept for reference.
# NOTE(review): it redirects to hardy_hexhash.log, which looks like a leftover from
# another team's notebook - confirm before re-enabling.
# !gcloud storage hash "gs://asap-raw-data-team-lee/**/*.gz"  --skip-crc32c --hex  --billing-project dnastack-asap-parkinsons > hardy_hexhash.log

!gsutil -u dnastack-asap-parkinsons hash -h "gs://asap-raw-data-team-lee/**/*.gz" > lee_hexhash.log


using the module's C extension, so checksumming will run very slowly. For help
installing the extension, please see "gsutil help crcmod".



In [40]:
# Parse the gsutil hash report written to lee_hexhash.log.
# Presumably returns a mapping of file name -> MD5 hex digest, since the result is
# later passed to Series.map on `file_name` - TODO confirm against utils.checksums.
bucket_files_md5 = extract_md5_from_details2("lee_hexhash.log")



# Build the checksum comparison table from the DATA metadata.
# Take an explicit copy of the two-column selection: assigning new columns onto a
# slice of DATAv2 triggers pandas' SettingWithCopyWarning and leaves it ambiguous
# whether DATAv2 itself is being modified.
checksum = DATAv2[['file_name','file_MD5']].copy()
# check1: MD5 recorded in the metadata, with surrounding whitespace removed.
checksum['check1'] = checksum['file_MD5'].str.strip()
# check2: value looked up in bucket_files_md5 by file name (NaN when the name is absent).
checksum['check2'] = checksum['file_name'].map(bucket_files_md5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check'] = checksum['file_name'].map(bucket_files_md5)


In [41]:
# List the file names whose metadata MD5 (check1) differs from the bucket MD5 (check2).
# An empty list means every checksum matched.
checksum.loc[checksum["check1"].ne(checksum["check2"]), "file_name"].tolist()


[]

In [42]:
# Display the comparison table for a visual spot check of the checksums.
checksum

Unnamed: 0,file_name,file_MD5,check
0,MFGHC1225_S9_L001_R1_001.fastq.gz,9977258e598d6a52130c29c71aef6925,9977258e598d6a52130c29c71aef6925
1,MFGHC1225_S9_L001_R2_001.fastq.gz,fe2cf93257801227b7072a4fb7d18792,fe2cf93257801227b7072a4fb7d18792
2,MFGHC0602_S2_L001_R1_001.fastq.gz,110ca4864cf6938faca67567bebfb6cc,110ca4864cf6938faca67567bebfb6cc
3,MFGHC0602_S2_L001_R2_001.fastq.gz,0dcc67217e43ab53bae0d0676f9bfe8b,0dcc67217e43ab53bae0d0676f9bfe8b
4,MFGPD0009_S3_L001_R1_001.fastq.gz,a2608d0bd192333b0076d7091c1c50ea,a2608d0bd192333b0076d7091c1c50ea
...,...,...,...
145,SN_1973_PD_S1_L000_R2_001.fastq.gz,53f6c6b4a00299fb41f32b34509835fa,53f6c6b4a00299fb41f32b34509835fa
146,SN_2005_PD_S1_L001_R1_001.fastq.gz,5bde8fdd8ab28e2c00e3ae327fecc80b,5bde8fdd8ab28e2c00e3ae327fecc80b
147,SN_2005_PD_S1_L001_R2_001.fastq.gz,14364dc52760fbd3ad08b5ca582d849b,14364dc52760fbd3ad08b5ca582d849b
148,SN_2038_PD_S1_L000_R1_001.fastq.gz,3215d4e4a68a85546183e820b74cac1f,3215d4e4a68a85546183e820b74cac1f
