ASAP CRN Metadata compilation

# Team Wood. ASAP CRN Metadata scrubbing

confirm with Team Wood bulkRNAseq metadata

29 Oct 2024
Andy Henrie





In [45]:
import pandas as pd
from pathlib import Path
import os, sys

sys.path.append(os.path.abspath((os.path.join(os.getcwd(), 'src/crn_utils'))))

from util import read_CDE, NULL, prep_table, read_meta_table, create_metadata_package
from validate import validate_table, ReportCollector
from update_schema import v1_to_v2, v2_to_v3_PMDBS, intervention_typer
from checksums import extract_md5_from_details2, get_md5_hashes 
from bucket_util import authenticate_with_service_account, gsutil_ls, gsutil_cp, gsutil_mv 

%load_ext autoreload
%autoreload 2

root_path = Path.home() / ("Projects/ASAP/data/teams")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## CDEs
load the relavent CDEs

In [46]:
schema_version = "v1"
schema_path = Path.home() / "Projects/ASAP/crn-utils/resource/CDE"
CDEv1 = read_CDE(schema_version, local_path=schema_path)
schema_version = "v2.1"
CDEv2 = read_CDE(schema_version, local_path=schema_path)
schema_version = "v3.0"
CDEv3 = read_CDE(schema_version, local_path=schema_path)

metadata_version: ASAP_CDE_v1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v1.csv
read local file
metadata_version: ASAP_CDE_v2.1
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v2.1
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v2.1.csv
read local file
metadata_version: ASAP_CDE_v3.0
https://docs.google.com/spreadsheets/d/1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc/gviz/tq?tqx=out:csv&sheet=v3.0
/Users/ergonyc/Projects/ASAP/crn-utils/resource/CDE/ASAP_CDE_v3.0.csv
read local file


## Load original tables 
These were submitted v3.0 (actually  v3.0.0 beta)


Team Wood bulk-rnaseq


In [47]:
## convert 
team = "wood"
dataset_name = "bulk-rnaseq"

metadata_path = root_path / f"{team}/{dataset_name}/metadata/"
v3_path = metadata_path / "v3"
og_path = metadata_path / "og"

v3_meta_tables = ['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']
og_meta_tables = ['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']

in_tables = [table_name for table_name in v3_meta_tables if f"{table_name}.csv" in os.listdir(og_path)]


In [48]:
og_tables = {}
for table_name in in_tables:
    df = read_meta_table(f"{og_path}/{table_name}.csv")
    og_tables[table_name] = df
    

In [49]:
og_tables.keys()

dict_keys(['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq'])

In [50]:
# rationalize the team_dataset_id
og_tables['STUDY']['team_dataset_id'] = dataset_name.replace("-", "_").replace(" ", "_")


In [51]:
og_tables['STUDY']

Unnamed: 0,ASAP_team_name,ASAP_lab_name,project_name,team_dataset_id,project_dataset,project_description,PI_full_name,PI_email,contributor_names,submitter_name,...,number_samples,sample_types,types_of_samples,DUA_version,metadata_tables,PI_ORCID,PI_google_scholar_id,preprocessing_references,metadata_version_date,alternate_dataset_id
0,TEAM-WOOD,Ryten Lab,Parkinson's aggregate mapping,bulk_rnaseq,Wood_pmdbs_bulk-rnaseq,Oligomer mapping and functional genomic charac...,Mina Ryten,mina.ryten@ucl.ac.uk,Aine Fairbrother-Browne ; Christina Toomey ; J...,Jonathan Brenton,...,234,"Substantia Nigra, Caudate, Putamen, Parahippoc...",(Braak 3-4) PD and control post-mortem brains,,"STUDY, PROTOCOL, SUBJECT, SAMPLE, DATA, CLINPA...",0000-0001-9520-6957,https://scholar.google.co.uk/citations?user=lt...,https://github.com/Jbrenton191/RNAseq_splicing...,"Version 3, 10/2024",


In [52]:
CDE = CDEv3
dfs = {}
for table,df in og_tables.items():

    schema = CDE[CDE['Table'] == table]

    report = ReportCollector(destination="NA")
    full_table, report = validate_table(df.copy(), table, schema, report)
    report.print_log()
    dfs[table] = full_table

recoding number_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **5 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- DUA_version: 1/1 empty rows (REQUIRED)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
🚨⚠️❗ **3 Fields with empty (NULL) values:**

	- cell_extraction_summary: 1/1 empty rows (REQUIRED)

	- protocols_io_DOI: 1/1 empty rows (REQUIRED)

	- other_reference: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *SUBJECT* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- race: 234/234 empty rows (REQUIRED)
No invalid entries found in Enum fields.

recoding replicate_count as int
recoding repeated_sample as int
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **Mi

In [53]:
STUDY = dfs['STUDY']
PROTOCOL = dfs['PROTOCOL']
SUBJECT = dfs['SUBJECT']
SAMPLE = dfs['SAMPLE']
DATA = dfs['DATA']
CLINPATH = dfs['CLINPATH']
PMDBS = dfs['PMDBS']
ASSAY_RNAseq = dfs['ASSAY_RNAseq']
CONDITION = dfs['CONDITION']

In [54]:
CONDITION

Unnamed: 0,condition_id,intervention_name,intervention_id,protocol_id,intervention_aux_table
0,idiopathic_pd,Case-Control,Case,,
1,no_pd_nor_other_neurological_disorder,Case-Control,Control,,
2,Hemiparkinson/hemiatrophy syndrome,Case-Control,Case,,
3,other_neurological_disorder,Case-Control,Other,,


In [55]:
STUDY['metadata_tables'] = f"{v3_meta_tables}"
STUDY['metadata_tables']

0    ['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DA...
Name: metadata_tables, dtype: object

In [56]:
metadata_version = "v3.0"
METADATA_VERSION_DATE = f"{metadata_version}_{pd.Timestamp.now().strftime('%Y%m%d')}"
STUDY['metadata_version_date'] = METADATA_VERSION_DATE


In [57]:

# SUBJECT["primary_diagnosis"] = SUBJECT["primary_diagnosis"].replace({"NA":"Hemiparkinson/hemiatrophy syndrome"})
# SUBJECT["primary_diagnosis"].unique()
SUBJECT_ = SUBJECT.drop_duplicates(subset=["subject_id"]).reset_index(drop=True)


In [None]:
SUBJECT_ = SUBJECT.drop_duplicates(subset=["subject_id"]).reset_index(drop=True)

# SAMPLE 
SAMPLE['condition_id'] = SAMPLE['condition_id'].str.lower().str.replace(" ", "_").str.replace("/", "_")

# CONDITION
# construct this table.  needs to be checked by hand
diagnosis_mapper = dict(zip(SUBJECT['subject_id'], SUBJECT['primary_diagnosis']))

v3_tables["SAMPLE"]['condition_id'] = v3_tables["SAMPLE"]['subject_id'].map(diagnosis_mapper)

CONDITIONv3 = pd.DataFrame(columns=CDEv3[CDEv3['Table'] == "CONDITION"]['Field'])
# 
CONDITIONv3['condition_id'] = SUBJECT['primary_diagnosis'].unique()
CONDITIONv3['intervention_name'] = "Case-Control"
CONDITIONv3['intervention_id'] = CONDITIONv3['condition_id'].apply(intervention_typer)
CONDITIONv3['condition_id'] = CONDITIONv3['condition_id'].str.lower().str.replace(" ", "_").str.replace("/", "_")
CONDITIONv3 = CONDITIONv3.fillna(NULL)
CONDITIONv3

Field,condition_id,intervention_name,intervention_id,protocol_id,intervention_aux_table
0,no_pd_nor_other_neurological_disorder,Case-Control,Control,,
1,idiopathic_pd,Case-Control,Case,,
2,hemiparkinson_hemiatrophy_syndrome,Case-Control,Case,,


In [59]:
table = "SUBJECT"
schema = CDE[CDE['Table'] == table]

report = ReportCollector(destination="NA")
full_table, report = validate_table(dfs[table].copy(), table, schema, report)
report.print_log()


All required fields are present in *SUBJECT* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- race: 234/234 empty rows (REQUIRED)
No invalid entries found in Enum fields.



In [60]:
table = "SAMPLE"
schema = CDE[CDE['Table'] == table]

report = ReportCollector(destination="NA")
full_table, report = validate_table(dfs[table].copy(), table, schema, report)
report.print_log()

recoding replicate_count as int
recoding repeated_sample as int
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **13 Fields with empty (NULL) values:**

	- time: 234/234 empty rows (OPTIONAL)

	- development_stage_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- sex_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- self_reported_ethnicity_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- disease_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- tissue_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- assay_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- donor_id: 234/234 empty rows (OPTIONAL)

	- pm_PH: 234/234 empty rows (OPTIONAL)

	- cell_type_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- source_RIN: 234/234 empty rows (OPTIONAL)

	- DV200: 234/234 empty rows (OPTIONAL)

	- suspension_type: 234/234 empty rows (OPTIONAL)
No invalid entries found in Enum fields.



In [61]:
table = "CLINPATH"
schema = CDE[CDE['Table'] == table]

report = ReportCollector(destination="NA")
full_table, report = validate_table(dfs[table].copy(), table, schema, report)
report.print_log()

recoding age_at_onset as int
recoding age_at_diagnosis as int
recoding first_motor_symptom as int
All required fields are present in *CLINPATH* table.
🚨⚠️❗ **43 Fields with empty (NULL) values:**

	- AMPPD_id: 234/234 empty rows (OPTIONAL)

	- GP2_id: 234/234 empty rows (OPTIONAL)

	- ethnicity: 234/234 empty rows (OPTIONAL)

	- family_history: 200/234 empty rows (OPTIONAL)

	- last_diagnosis: 234/234 empty rows (OPTIONAL)

	- age_at_onset: 113/234 empty rows (OPTIONAL)

	- age_at_diagnosis: 113/234 empty rows (OPTIONAL)

	- first_motor_symptom: 234/234 empty rows (OPTIONAL)

	- hx_dementia_mci: 24/234 empty rows (OPTIONAL)

	- hx_melanoma: 234/234 empty rows (OPTIONAL)

	- education_level: 234/234 empty rows (OPTIONAL)

	- smoking_status: 234/234 empty rows (OPTIONAL)

	- smoking_years: 234/234 empty rows (OPTIONAL)

	- APOE_e4_status: 234/234 empty rows (OPTIONAL)

	- cognitive_status: 234/234 empty rows (OPTIONAL)

	- time_from_baseline: 234/234 empty rows (OPTIONAL)

	- path_autops

In [62]:
table = "SAMPLE"
schema = CDE[CDE['Table'] == table]

report = ReportCollector(destination="NA")
full_table, report = validate_table(dfs[table].copy(), table, schema, report)
report.print_log()

recoding replicate_count as int
recoding repeated_sample as int
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **13 Fields with empty (NULL) values:**

	- time: 234/234 empty rows (OPTIONAL)

	- development_stage_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- sex_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- self_reported_ethnicity_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- disease_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- tissue_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- assay_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- donor_id: 234/234 empty rows (OPTIONAL)

	- pm_PH: 234/234 empty rows (OPTIONAL)

	- cell_type_ontology_term_id: 234/234 empty rows (OPTIONAL)

	- source_RIN: 234/234 empty rows (OPTIONAL)

	- DV200: 234/234 empty rows (OPTIONAL)

	- suspension_type: 234/234 empty rows (OPTIONAL)
No invalid entries found in Enum fields.



In [63]:
CLINPATH

Unnamed: 0,subject_id,source_subject_id,duration_pmi,age_at_death,family_history,age_at_onset,age_at_diagnosis,hx_dementia_mci,path_autopsy_dx_main,path_autopsy_second_dx,...,last_diagnosis,first_motor_symptom,hx_melanoma,education_level,smoking_status,smoking_years,APOE_e4_status,cognitive_status,time_from_baseline,PD_pathogenic_mutation
0,C073,C073,29.0,71,,,,,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
1,C073,C073,29.0,71,,,,,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
2,C073,C073,29.0,71,,,,,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
3,C073,C073,29.0,71,,,,,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
4,C073,C073,29.0,71,,,,,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,PDC085,PDC085,20.0,82,,,,No,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
230,PDC085,PDC085,20.0,82,,,,No,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
231,PDC085,PDC085,20.0,82,,,,No,"Control, Low level AD neuropathological change",,...,,,,,,,,,,
232,PDC085,PDC085,20.0,82,,,,No,"Control, Low level AD neuropathological change",,...,,,,,,,,,,


In [64]:
CONDITION

Unnamed: 0,condition_id,intervention_name,intervention_id,protocol_id,intervention_aux_table
0,idiopathic_pd,Case-Control,Case,,
1,no_pd_nor_other_neurological_disorder,Case-Control,Control,,
2,Hemiparkinson/hemiatrophy syndrome,Case-Control,Case,,
3,other_neurological_disorder,Case-Control,Other,,


In [None]:
# now save the tables
v3_tables = {
    "STUDY": STUDY,
    "PROTOCOL": PROTOCOL,
    "SUBJECT": SUBJECT,
    "SAMPLE": SAMPLE,
    "DATA": DATA,
    "CLINPATH": CLINPATH,
    "PMDBS": PMDBS,
    "CONDITION": CONDITIONv3,
    "ASSAY_RNAseq": ASSAY_RNAseq
}

### save extras as auxillary tables


In [66]:

for table,df in v3_tables.items():
    schema = CDE[CDE['Table'] == table]
    valid_fields = schema['Field'].unique()
    df_out = df[valid_fields]
    aux_fields = set(df.columns) - set(valid_fields)
    if aux_fields:
        df_aux = df[list(aux_fields)]
        df_aux.to_csv(og_path / f"{table}_auxiliary.csv", index=False)
        print(f"Saved {table}_auxiliary.csv")
    df_out.to_csv(v3_path / f"{table}.csv", index=False)

### validate v3 tables


In [67]:
CDE = CDEv3
tables = CDE['Table'].unique()
v3_meta_tables = ['STUDY', 'PROTOCOL', 'SUBJECT', 'SAMPLE', 'DATA', 'CLINPATH', 'PMDBS', 'CONDITION', 'ASSAY_RNAseq']

v3_tables = {}
for table in v3_meta_tables:
    df = read_meta_table(v3_path / f"{table}.csv")
    schema = CDE[CDE['Table'] == table]

    report = ReportCollector(destination="NA")
    full_table, report = validate_table(df.copy(), table, schema, report)
    report.print_log()
    v3_tables[table] = full_table

recoding number_samples as int
All required fields are present in *STUDY* table.
🚨⚠️❗ **5 Fields with empty (NULL) values:**

	- other_funding_source: 1/1 empty rows (REQUIRED)

	- publication_DOI: 1/1 empty rows (REQUIRED)

	- publication_PMID: 1/1 empty rows (REQUIRED)

	- DUA_version: 1/1 empty rows (REQUIRED)

	- alternate_dataset_id: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *PROTOCOL* table.
🚨⚠️❗ **3 Fields with empty (NULL) values:**

	- cell_extraction_summary: 1/1 empty rows (REQUIRED)

	- protocols_io_DOI: 1/1 empty rows (REQUIRED)

	- other_reference: 1/1 empty rows (OPTIONAL)
No invalid entries found in Enum fields.

All required fields are present in *SUBJECT* table.
🚨⚠️❗ **1 Fields with empty (NULL) values:**

	- race: 234/234 empty rows (REQUIRED)
No invalid entries found in Enum fields.

recoding replicate_count as int
recoding repeated_sample as int
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **13

-------------------------
## check md5s



In [69]:
print(team)

source = "pmdbs"

bucket = f"asap-raw-team-{team}-{source}-{dataset_name}"
bucket = f"asap-raw-data-team-{team}" # for now old locations


key_file_path = Path.home() / f"Projects/ASAP/{team}-credentials.json"

res = authenticate_with_service_account(key_file_path)
print(res)

# make sure to get ALL the fastq files in the bucket
prefix = "Wood_bulkrnaseq_pm_hs/**/*.gz" #"**/*.gz" # Wood_bulkrnaseq_pm_hs/
bucket_files_md5 = get_md5_hashes( bucket, prefix)

wood
CompletedProcess(args='gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/wood-credentials.json', returncode=0, stdout='', stderr='Activated service account credentials for: [raw-admin-wood@dnastack-asap-parkinsons.iam.gserviceaccount.com]\n')
gsutil -u dnastack-asap-parkinsons hash -h gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/**/*.gz


In [70]:
# def check_md5_sums()


checksum = v3_tables['DATA'][['file_name','file_MD5']]
checksum['check2'] = checksum['file_name'].map(bucket_files_md5)
checksum['check1'] = checksum['file_MD5']
checksum[checksum.check1 != checksum.check2].file_name.to_list()
#empty means success!!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check2'] = checksum['file_name'].map(bucket_files_md5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check1'] = checksum['file_MD5']


[]

_____

## prep metadata in raw data bucket

steps:
- 1. archive whats there.  i.e. move to metadata/upload
- 2. copy metadata/upload to dataset upload (upload subdir)

In [71]:
metadata_subdir = "Wood_bulkrnaseq_pm_hs/metadata"
current_files = gsutil_ls(bucket,metadata_subdir)

gsutil -u dnastack-asap-parkinsons ls gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata
gsutil command succeeded: gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/ASSAY_RNAseq.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/CLINPATH.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/CONDITION.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/DATA.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/PMDBS.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/PROTOCOL.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/SAMPLE.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/STUDY.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/SUBJECT.csv



In [72]:
metadata_subdir2 = "Wood_bulkrnaseq_pm_hs/metadata/upload"
bucket = current_files[0].split("/")[2]

for file in current_files:
    if file == "":
        continue
    file_nm = Path(file).name
    is_dir = not file_nm.__contains__(".")

    source = f"gs://{bucket}/{metadata_subdir}/{file_nm}"

    destination = f"gs://{bucket}/{metadata_subdir2}/{file_nm}"
    gsutil_mv(source, destination, is_dir)

gsutil -u dnastack-asap-parkinsons mv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/ASSAY_RNAseq.csv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/ASSAY_RNAseq.csv
gsutil command succeeded: 
gsutil -u dnastack-asap-parkinsons mv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/CLINPATH.csv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/CLINPATH.csv
gsutil command succeeded: 
gsutil -u dnastack-asap-parkinsons mv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/CONDITION.csv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/CONDITION.csv
gsutil command succeeded: 
gsutil -u dnastack-asap-parkinsons mv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/DATA.csv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/DATA.csv
gsutil command succeeded: 
gsutil -u dnastack-asap-parkinsons mv gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/PMDBS.csv gs://asap-raw-data

In [73]:

metadata_subdir = "Wood_bulkrnaseq_pm_hs/metadata/upload"
current_files = gsutil_ls(bucket,metadata_subdir)

gsutil -u dnastack-asap-parkinsons ls gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload
gsutil command succeeded: gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/ASSAY_RNAseq.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/CLINPATH.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/CONDITION.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/DATA.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/PMDBS.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/PROTOCOL.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/SAMPLE.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/STUDY.csv
gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload/SUBJECT.csv



Archive the uploaded metadata locally



In [74]:
file_source = f"gs://{bucket}/{metadata_subdir}"
destination = f"{metadata_path}"
file_source,destination

('gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload',
 '/Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata')

In [75]:

is_dir = True
gsutil_cp(file_source, destination, is_dir)

gsutil -u dnastack-asap-parkinsons cp -r gs://asap-raw-data-team-wood/Wood_bulkrnaseq_pm_hs/metadata/upload /Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata
gsutil command succeeded: 


''

--------------------
## Create metadata package

This will copy the final updated to v3.0 metadata to `asap-could-processing-resources`


In [76]:
metadata_source = metadata_path


source = "pmdbs"
archive_root = Path.home() / "Projects/ASAP/asap-crn-metadata/datasets"
dataset_path = archive_root / f"{team}-{source}-{dataset_name}"
# bucket = f"asap-raw-data-team-{team}" # for now old locations
metadata_source, dataset_path

(PosixPath('/Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata'),
 PosixPath('/Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/wood-pmdbs-bulk-rnaseq'))

________

In [77]:
fnms = create_metadata_package(metadata_source, dataset_path)


Skipping empty folder /Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata/v1
Skipping empty folder /Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata/v2
Copied /Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata/v3 to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/wood-pmdbs-bulk-rnaseq/metadata/v3
Copied /Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata/og to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/wood-pmdbs-bulk-rnaseq/metadata/og
Copied /Users/ergonyc/Projects/ASAP/data/teams/wood/bulk-rnaseq/metadata/upload to /Users/ergonyc/Projects/ASAP/asap-crn-metadata/datasets/wood-pmdbs-bulk-rnaseq/metadata/upload


_____

generate ASAP IDs + transfering back to raw data bucket via `asap-crn-metadata` 

_____

## transfer metadata to raw data bucket

steps:
- 1. archive whats there.  i.e. move to metadata/archive
- 2. copy package to metadata/ . i.e. /og/*.csv, /v??/*.csv