ASAP CRN Metadata compilation

# Team Lee (Biederer). ASAP CRN Metadata construction


This is a bulk RNA-seq dataset that was originally transferred along with the scRNAseq dataset.  The metadata should be identical, apart from the assay details, which describe bulk RNA-seq rather than the sc/snRNAseq data that was platformed previously.



15 Sept 2024
Andy Henrie




In [1]:
import pandas as pd

from pathlib import Path


%load_ext autoreload
%autoreload 2


In [3]:
# Dump gsutil's hash report (the "Hashes [hex] ..." blocks parsed by the helpers
# below) for every .gz object under fastqs/bulk_MFG into lee_hexhash.log.
!gsutil -u dnastack-asap-parkinsons hash -h "gs://asap-raw-data-team-lee/fastqs/bulk_MFG/*.gz" > lee_hexhash.log


using the module's C extension, so checksumming will run very slowly. For help
installing the extension, please see "gsutil help crcmod".



### Helpers (eventually load these from validate.py + io_helpers.py, etc)

In [101]:
def read_CDE(metadata_version:str="v3.0-beta", local_path:str|bool|Path=False):
    """
    Load CDE from local csv and cache it, return a dataframe and dictionary of dtypes
    """
    # Construct the path to CSD.csv
    GOOGLE_SHEET_ID = "1c0z5KvRELdT2AtQAH2Dus8kwAyyLrR0CROhKOjpU4Vc"

    if metadata_version == "v1":
        sheet_name = "ASAP_CDE_v1"
    elif metadata_version == "v2":
        sheet_name = "ASAP_CDE_v2"
    elif metadata_version == "v2.1":
        sheet_name = "ASAP_CDE_v2.1"
    elif metadata_version in ["v3.0","v3.0-beta"]:
        sheet_name = "ASAP_CDE_v3.0.0-beta"
    else:
        sheet_name = "ASAP_CDE_v2.1"


    if metadata_version in ["v1","v2","v2.1","v3.0-beta"]:
        print(f"metadata_version: {sheet_name}")
    else:
        print(f"Unsupported metadata_version: {sheet_name}")
        return 0,0
    
    cde_url = f"https://docs.google.com/spreadsheets/d/{GOOGLE_SHEET_ID}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    if local_path:
        cde_url = Path(local_path) / f"{sheet_name}.csv"
        print(cde_url)

    
    try:
        CDE_df = pd.read_csv(cde_url)
        read_source = "url" if not local_path else "local file"
        print(f"read {read_source}")
    except:
        CDE_df = pd.read_csv(f"{sheet_name}.csv")
        print("read local file")

    # drop rows with no table name (i.e. ASAP_ids)
    CDE_df.dropna(subset=['Table'], inplace=True)

    return CDE_df


# Function to parse the file to extract MD5 and filenames
def extract_md5_from_details(md5_file):
    """
    Parse a listing whose object headers start with ``gs://`` and return
    ``{file name: md5}``.

    The file name is the last ``/``-separated component of the most recent
    ``gs://`` header line (trailing ``:`` removed); the md5 is whatever follows
    the first ``:`` on a line containing ``Hash (md5)``.
    """
    md5_by_name = {}
    object_name = None
    with open(md5_file, "r") as handle:
        for entry in handle:
            if entry.startswith("gs://"):
                header = entry.strip().rstrip(":")
                object_name = header.rsplit("/", 1)[-1]
            if "Hash (md5)" in entry:
                md5_by_name[object_name] = entry.split(":")[1].strip()
    return md5_by_name


# Function to parse the file to extract MD5 and filenames
def extract_md5_from_details2(md5_file):
    """
    Parse a hash report whose object headers start with ``Hashes [hex]`` and
    return ``{file name: md5}``.

    The file name is the last ``/``-separated component of the most recent
    header line (trailing ``:`` removed); the md5 is whatever follows the
    first ``:`` on a line containing ``Hash (md5)``.
    """
    md5_by_name = {}
    object_name = None
    with open(md5_file, "r") as handle:
        for entry in handle:
            if entry.startswith("Hashes [hex]"):
                header = entry.strip().rstrip(":")
                object_name = header.rsplit("/", 1)[-1]
            if "Hash (md5)" in entry:
                md5_by_name[object_name] = entry.split(":")[1].strip()
    return md5_by_name



# Function to parse the file to extract crc32c and filenames
def extract_crc32c_from_details2(md5_file):
    """
    Parse a hash report whose object headers start with ``Hashes [hex]`` and
    return ``{file name: crc32c}``.

    Despite its name, ``md5_file`` is simply the path of the report to read.
    The file name is the last ``/``-separated component of the most recent
    header line (trailing ``:`` removed); the crc32c is whatever follows the
    first ``:`` on a line containing ``Hash (crc32c)``.
    """
    crc_by_name = {}
    object_name = None
    with open(md5_file, "r") as handle:
        for entry in handle:
            if entry.startswith("Hashes [hex]"):
                header = entry.strip().rstrip(":")
                object_name = header.rsplit("/", 1)[-1]
            if "Hash (crc32c)" in entry:
                crc_by_name[object_name] = entry.split(":")[1].strip()
    return crc_by_name



# Function to parse the file to extract crc32c and filenames
def extract_hashes_from_gcloudstorage(source_hash):
    """
    Parse a report made of ``crc32c_hash:``, ``md5_hash:`` and ``url:`` lines
    and return ``(crcs, md5s)``, two dicts keyed by file name.

    A record is closed by its ``url:`` line: the hashes seen since the previous
    ``url:`` line are stored under the last ``/``-separated component of the
    url. This assumes the hash lines of a record come before its ``url:`` line.

    A hash that is absent from a record is left out of the corresponding dict
    rather than being inherited from the previous record.

    Args:
        source_hash: path of the text report to parse.

    Returns:
        ``(crcs, md5s)`` mapping file name to crc32c and to md5 respectively.
    """
    crcs = {}
    md5s = {}

    # hashes collected for the record currently being read
    curr_crc = None
    curr_md5 = None

    with open(source_hash, "r") as f:
        for line in f:

            if line.startswith("crc32c_hash:"):
                curr_crc = line.split(":", 1)[1].strip()

            elif line.startswith("md5_hash:"):
                curr_md5 = line.split(":", 1)[1].strip()

            elif line.startswith("url:"):
                current_file = line.split("/")[-1].strip()
                if curr_crc is not None:
                    crcs[current_file] = curr_crc
                if curr_md5 is not None:
                    md5s[current_file] = curr_md5
                # start the next record clean so nothing leaks across files
                curr_crc = None
                curr_md5 = None

    return crcs, md5s



# Function to parse the file to extract crc32c and filenames
def extract_hashes_from_gsutil(source_hash):
    """
    Parse a hash report whose object headers start with ``Hashes [hex]`` and
    return ``(crcs, md5s)``, two dicts keyed by file name.

    The file name is the last ``/``-separated component of the most recent
    header line (trailing ``:`` removed). A line containing ``Hash (crc32c)``
    or ``Hash (md5)`` contributes the text after its first ``:`` to the
    matching dict.
    """
    crc_by_name = {}
    md5_by_name = {}
    object_name = None

    # label to look for -> dict that collects its values (crc32c checked first)
    collectors = (
        ("Hash (crc32c)", crc_by_name),
        ("Hash (md5)", md5_by_name),
    )

    with open(source_hash, "r") as handle:
        for entry in handle:
            if entry.startswith("Hashes [hex]"):
                header = entry.strip().rstrip(":")
                object_name = header.rsplit("/", 1)[-1]
            for label, collected in collectors:
                if label in entry:
                    collected[object_name] = entry.split(":")[1].strip()

    return crc_by_name, md5_by_name

# Placeholder string that read_file and validate_table substitute for empty
# strings and pd.NA.
NULL = "NA"

def read_file(data_file):
    """
    Read a csv as an all-string table, using its first column as the index.

    The file is decoded as latin1; every object-dtype column is then encoded
    back to latin1 bytes and decoded as UTF-8 (undecodable bytes are replaced).
    Finally empty strings and ``pd.NA`` are replaced by ``NULL``.

    TODO: deprecate dtypes
    """
    encoding = 'latin1'
    print(f"reading {data_file} txt/csv, encoding={encoding}")

    table = pd.read_csv(data_file, dtype="str", encoding=encoding, index_col=0)

    text_columns = table.select_dtypes(include='object').columns
    for name in text_columns:
        raw_bytes = table[name].str.encode('latin1', errors='replace')
        table[name] = raw_bytes.str.decode('utf-8', errors='replace')

    table = table.replace({"":NULL, pd.NA:NULL})
    return table
    

# filename -> md5 for every fastq listed in the gsutil hash log written above
bucket_files_md5 = extract_md5_from_details2("lee_hexhash.log")


In [93]:
# Inspect the parsed filename -> md5 mapping.
bucket_files_md5

{'0009PD_MFG_bulk_L000_R1_001.fastq.gz': 'c9d64ec5b02de6b09a45dd7ac6610854',
 '0009PD_MFG_bulk_L000_R2_001.fastq.gz': '43a19a5fafd6892a0726660a560c4993',
 '0348PD_MFG_bulk_L000_R1_001.fastq.gz': '0ad336467cb40bfa3f6e7f582ce53203',
 '0348PD_MFG_bulk_L000_R2_001.fastq.gz': '8b6c00b59f2f0e08c053cd437a17584d',
 '0413PD_MFG_bulk_L000_R1_001.fastq.gz': 'a1e499ed8fd73cc2ae75835792ea23d0',
 '0413PD_MFG_bulk_L000_R2_001.fastq.gz': 'e2be134da750bc4cd0672dee9b507965',
 '0602HC_MFG_bulk_L000_R1_001.fastq.gz': '653a5622dcc2b0e8aa3b43a5ef364deb',
 '0602HC_MFG_bulk_L000_R2_001.fastq.gz': '39c58383dbad4b68645a0f601046e0ef',
 '1225HC_MFG_bulk_L000_R1_001.fastq.gz': 'cff31843f6024c7d1440378b3cbbfadd',
 '1225HC_MFG_bulk_L000_R2_001.fastq.gz': '47fc7147767b66c6049c5e647495c49a',
 '1308HC_MFG_bulk_L000_R1_001.fastq.gz': 'c57154c6ae4921eb67cb577cac60b216',
 '1308HC_MFG_bulk_L000_R2_001.fastq.gz': 'ef773e48069d43944a4b5dd94984d910',
 '1312PD_MFG_bulk_L000_R1_001.fastq.gz': '8e8df75e784d2593bdec8d8d201f4ca6',

In [94]:
# One row per fastq in the bucket: its file name and md5.
df = pd.DataFrame(bucket_files_md5.items(), columns=["filename", "md5"])


In [95]:
# The text before the first "_" of the file name is the team's id, e.g. "0009PD".
df['id'] = df['filename'].str.split("_").str[0]

In [96]:
# Move the last two characters of the id in front of the rest: "0009PD" -> "PD_0009".
df['subject_id'] = df['id'].apply(lambda x: f"{x[-2:]}_{x[:-2]}")
df.head()

Unnamed: 0,filename,md5,id,subject_id
0,0009PD_MFG_bulk_L000_R1_001.fastq.gz,c9d64ec5b02de6b09a45dd7ac6610854,0009PD,PD_0009
1,0009PD_MFG_bulk_L000_R2_001.fastq.gz,43a19a5fafd6892a0726660a560c4993,0009PD,PD_0009
2,0348PD_MFG_bulk_L000_R1_001.fastq.gz,0ad336467cb40bfa3f6e7f582ce53203,0348PD,PD_0348
3,0348PD_MFG_bulk_L000_R2_001.fastq.gz,8b6c00b59f2f0e08c053cd437a17584d,0348PD,PD_0348
4,0413PD_MFG_bulk_L000_R1_001.fastq.gz,a1e499ed8fd73cc2ae75835792ea23d0,0413PD,PD_0413


In [97]:
# Every file in this bucket folder is an MFG sample, so sample_id is "MFG_" + subject_id
# (this matches the sample_id format already used in the clean SAMPLE/DATA tables).
df['sample_id'] = "MFG_" + df['subject_id']
df.head()

Unnamed: 0,filename,md5,id,subject_id,sample_id
0,0009PD_MFG_bulk_L000_R1_001.fastq.gz,c9d64ec5b02de6b09a45dd7ac6610854,0009PD,PD_0009,MFG_PD_0009
1,0009PD_MFG_bulk_L000_R2_001.fastq.gz,43a19a5fafd6892a0726660a560c4993,0009PD,PD_0009,MFG_PD_0009
2,0348PD_MFG_bulk_L000_R1_001.fastq.gz,0ad336467cb40bfa3f6e7f582ce53203,0348PD,PD_0348,MFG_PD_0348
3,0348PD_MFG_bulk_L000_R2_001.fastq.gz,8b6c00b59f2f0e08c053cd437a17584d,0348PD,PD_0348,MFG_PD_0348
4,0413PD_MFG_bulk_L000_R1_001.fastq.gz,a1e499ed8fd73cc2ae75835792ea23d0,0413PD,PD_0413,MFG_PD_0413


## Clean metadata tables
Load the clean v2 metadata tables for Team Lee and validate them against CDE v3.0-beta.

### Team Lee

In [102]:
# Load the previously cleaned Team Lee tables (v2) that the bulk metadata is derived from.
# NOTE(review): the location is hard-coded relative to the user's home directory.
metadata_path = Path.home() / ("Projects/ASAP/meta-clean") / "clean/team-Lee/v2"

# read_file is used for every table in place of the earlier
# pd.read_csv(..., index_col=0) calls.
SUBJECT = read_file(f"{metadata_path}/SUBJECT.csv")
SAMPLE = read_file(f"{metadata_path}/SAMPLE.csv")
CLINPATH = read_file(f"{metadata_path}/CLINPATH.csv")

STUDY = read_file(f"{metadata_path}/STUDY.csv")
PROTOCOL = read_file(f"{metadata_path}/PROTOCOL.csv")
DATA = read_file(f"{metadata_path}/DATA.csv")


reading /Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2/SUBJECT.csv txt/csv, encoding=latin1
reading /Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2/SAMPLE.csv txt/csv, encoding=latin1
reading /Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2/CLINPATH.csv txt/csv, encoding=latin1
reading /Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2/STUDY.csv txt/csv, encoding=latin1
reading /Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2/PROTOCOL.csv txt/csv, encoding=latin1
reading /Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2/DATA.csv txt/csv, encoding=latin1


In [103]:
# Scratch check: a Path is truthy, so passing Path.cwd() as local_path makes
# read_CDE take its `if local_path:` branch.
# NOTE(review): looks like leftover debugging; consider removing.
bool(Path.cwd())

True

In [104]:
# Read the v3.0-beta CDE from a csv in the current working directory instead of the Google Sheet.
metadata_version = "v3.0-beta"
CDE_df = read_CDE(metadata_version, local_path=Path.cwd())




metadata_version: ASAP_CDE_v3.0.0-beta
/Users/ergonyc/Projects/ASAP/harmonized-wf-dev/data/teams/lee/ASAP_CDE_v3.0.0-beta.csv
read local file


In [105]:
# Preview the CDE definition rows.
CDE_df.head()

Unnamed: 0,Table,Field,Description,DataType,Required,Validation,V0,comment,denormalized,dataset relavent
2,STUDY,ASAP_team_name,ASAP Team Name: Name of the ASAP CRN Team. i...,Enum,Required,"[""TEAM-LEE"",""TEAM-HAFLER"",""TEAM-HARDY"", ""TEAM-...",,,,
3,STUDY,ASAP_lab_name,Lab Name. : Lab name that is submitting data...,String,Required,,,,,
4,STUDY,project_name,Project Name: A Title of the overall project...,String,Required,,,,,
5,STUDY,team_dataset_id,"The ""project_name"" is often too verbose for pr...",String,Required,,,,,
6,STUDY,project_dataset,Dataset Name: A unique name is required for ...,String,Required,,,,,


In [106]:
# Preview the existing DATA table that the bulk rows will be modelled on.
DATA.head()

Unnamed: 0,sample_id,replicate,replicate_count,repeated_sample,batch,file_type,file_name,file_description,file_MD5,technology,omic,adjustment,content,time,header,annotation,configuration_file
0,MFG_HC_1225,rep1,1,0,BATCH_4,fastq,MFGHC1225_S9_L001_R1_001.fastq.gz,Raw sequencing data,9977258e598d6a52130c29c71aef6925,SN,RNA,Raw,Reads,0,,,NA(raw data)
1,MFG_HC_1225,rep1,1,0,BATCH_4,fastq,MFGHC1225_S9_L001_R2_001.fastq.gz,Raw sequencing data,fe2cf93257801227b7072a4fb7d18792,SN,RNA,Raw,Reads,0,,,NA(raw data)
2,MFG_HC_0602,rep1,1,0,BATCH_4,fastq,MFGHC0602_S2_L001_R1_001.fastq.gz,Raw sequencing data,110ca4864cf6938faca67567bebfb6cc,SN,RNA,Raw,Reads,0,,,NA(raw data)
3,MFG_HC_0602,rep1,1,0,BATCH_4,fastq,MFGHC0602_S2_L001_R2_001.fastq.gz,Raw sequencing data,0dcc67217e43ab53bae0d0676f9bfe8b,SN,RNA,Raw,Reads,0,,,NA(raw data)
4,MFG_PD_0009,rep1,1,0,BATCH_4,fastq,MFGPD0009_S3_L001_R1_001.fastq.gz,Raw sequencing data,a2608d0bd192333b0076d7091c1c50ea,SN,RNA,Raw,Reads,0,,,NA(raw data)


In [107]:
# Preview the existing SAMPLE table (one row per sample_id, several brain regions per subject).
SAMPLE.head()

Unnamed: 0,sample_id,subject_id,source_sample_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,hemisphere,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV200,pm_PH,donor_id
0,MFG_HC_1225,HC_1225,12-25,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
1,HIP_HC_1225,HC_1225,12-25,rep1,1,0,BATCH_9,Brain,Hippocampus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
2,SN_HC_1225,HC_1225,12-25,rep1,1,0,BATCH_7,Brain,Substantia_Nigra,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
3,MFG_HC_0602,HC_0602,06-02,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
4,HIP_HC_0602,HC_0602,06-02,rep1,1,0,BATCH_9,Brain,Hippocampus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,


In [108]:
# Left-merge the bulk fastq table with the existing DATA rows on sample_id so the
# bulk rows inherit DATA's columns. DATA holds several rows per sample_id, so each
# fastq row is repeated once per matching DATA row; the next cell collapses them.
DATA_ = df.merge(DATA, left_on="sample_id", right_on="sample_id", how="left")


In [109]:
# Keep a single merged row per bulk fastq (md5 identifies the file), then compare
# shapes: DATA_ should end up with as many rows as df.
DATA_.drop_duplicates(subset="md5", inplace=True)

df.shape, DATA.shape, DATA_.shape

((50, 5), (150, 17), (50, 21))

In [110]:
# List the merged columns: df's five columns followed by the columns inherited from DATA.
DATA_.columns

Index(['filename', 'md5', 'id', 'subject_id', 'sample_id', 'replicate',
       'replicate_count', 'repeated_sample', 'batch', 'file_type', 'file_name',
       'file_description', 'file_MD5', 'technology', 'omic', 'adjustment',
       'content', 'time', 'header', 'annotation', 'configuration_file'],
      dtype='object')

Copy `filename` -> `file_name`.

Copy `md5` -> `file_MD5`.

Set `technology` to `bulk` (the same cell also sets `batch` to the placeholder `batch_x`).

In [111]:
# Overwrite the values inherited from the merged DATA rows with the bulk file's own
# md5 and name, and mark the technology as bulk.
DATA_['file_MD5'] = DATA_['md5']
DATA_['file_name'] = DATA_['filename']
DATA_['technology'] = 'bulk'

# set batch to the placeholder "batch_x"
DATA_['batch'] = "batch_x"

Next, take the unique `sample_id` and `subject_id` values and subset SAMPLE, CLINPATH, and SUBJECT so that they include only the bulk samples.


In [112]:
# Identifiers of the samples and subjects that have bulk fastqs.
sample_ids = DATA_['sample_id'].unique()
subject_ids = DATA_['subject_id'].unique()
sample_ids, subject_ids

(array(['MFG_PD_0009', 'MFG_PD_0348', 'MFG_PD_0413', 'MFG_HC_0602',
        'MFG_HC_1225', 'MFG_HC_1308', 'MFG_PD_1312', 'MFG_PD_1317',
        'MFG_PD_1344', 'MFG_PD_1441', 'MFG_PD_1504', 'MFG_PD_1858',
        'MFG_HC_1862', 'MFG_HC_1864', 'MFG_PD_1902', 'MFG_PD_1921',
        'MFG_HC_1939', 'MFG_PD_1973', 'MFG_PD_2005', 'MFG_PD_2038',
        'MFG_HC_2057', 'MFG_PD_2058', 'MFG_HC_2061', 'MFG_HC_2062',
        'MFG_HC_2067'], dtype=object),
 array(['PD_0009', 'PD_0348', 'PD_0413', 'HC_0602', 'HC_1225', 'HC_1308',
        'PD_1312', 'PD_1317', 'PD_1344', 'PD_1441', 'PD_1504', 'PD_1858',
        'HC_1862', 'HC_1864', 'PD_1902', 'PD_1921', 'HC_1939', 'PD_1973',
        'PD_2005', 'PD_2038', 'HC_2057', 'PD_2058', 'HC_2061', 'HC_2062',
        'HC_2067'], dtype=object))

In [113]:
# Restrict SAMPLE and CLINPATH to the bulk samples/subjects; .copy() so that later
# edits do not touch the original tables.
SAMPLE_ = SAMPLE[SAMPLE['sample_id'].isin(sample_ids)].copy()
CLINPATH_ = CLINPATH[CLINPATH['subject_id'].isin(subject_ids)].copy()


In [114]:

# Restrict SUBJECT to the subjects that have bulk fastqs.
SUBJECT_ = SUBJECT[SUBJECT['subject_id'].isin(subject_ids)].copy()

In [115]:
# Renumber the rows of SAMPLE_ and CLINPATH_ from 0 after filtering.
# NOTE(review): SUBJECT_ is not reset here and keeps the row labels of SUBJECT.
SAMPLE_.reset_index(inplace=True, drop=True)
CLINPATH_.reset_index(inplace=True, drop=True)


# set batch to the same placeholder used in DATA_
SAMPLE_['batch'] = "batch_x"

In [116]:
# Inspect the SAMPLE rows retained for the bulk samples.
SAMPLE_

Unnamed: 0,sample_id,subject_id,source_sample_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,hemisphere,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV200,pm_PH,donor_id
0,MFG_HC_1225,HC_1225,12-25,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
1,MFG_HC_0602,HC_0602,06-02,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
2,MFG_PD_0009,PD_0009,00-09,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
3,MFG_PD_1921,PD_1921,19-21,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
4,MFG_PD_2058,PD_2058,20-58,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
5,MFG_PD_1441,PD_1441,14-41,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000383 (female),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
6,MFG_PD_1344,PD_1344,13-44,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000383 (female),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
7,MFG_HC_1939,HC_1939,19-39,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000383 (female),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
8,MFG_HC_1308,HC_1308,13-08,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
9,MFG_HC_1862,HC_1862,18-62,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,


In [117]:
# Stand-in for the streamlit module so that ReportCollector stays usable
# outside a streamlit app.
# TODO: wrap a real streamlit import in try/except to make using the
# ReportCollector portable; an abstract base class would probably be better.

class DummyStreamlit:
    """
    No-op replacement for the streamlit calls made by ReportCollector.

    Every method accepts the same arguments as the call it replaces and does
    nothing. All of them are plain instance methods, so they can be called on
    the module-level ``st`` instance.
    """
    def markdown(self,msg):
        pass
    def error(self,msg):
        pass
    def header(self,msg):
        pass
    def subheader(self,msg):
        pass
    def divider(self):
        pass
st = DummyStreamlit()


def get_log(log_file):
    """Return the entire text content of ``log_file``."""
    with open(log_file, 'r') as handle:
        return handle.read()

def columnize( itemlist ):
    """
    Format items as a dash-bulleted list, one item per line.

    Args:
        itemlist: sequence of items; each is converted with ``str``.

    Returns:
        ``"- a \\n- b \\n- c"`` for ``["a", "b", "c"]``, ``"- a"`` for a single
        item, and an empty string for an empty sequence.
    """
    NEWLINE_DASH = ' \n- '
    if len(itemlist) == 0:
        return ""
    # join over ALL items so the first and second are separated as well
    return "- " + NEWLINE_DASH.join(str(item) for item in itemlist)
    
def read_meta_table(table_path):
    """
    Read a metadata csv with every column as ``str``.

    The default encoding is tried first and latin1 is used if that raises
    ``UnicodeDecodeError``. A leading ``"Unnamed: 0"`` column (a saved index)
    is dropped.
    """
    try:
        table = pd.read_csv(table_path, dtype=str)
    except UnicodeDecodeError:
        table = pd.read_csv(table_path, encoding='latin1', dtype=str)

    # a first column called "Unnamed: 0" is just the index written by to_csv
    leading_column = table.columns[0]
    if leading_column == "Unnamed: 0":
        table = table.drop(columns=[leading_column])

    return table


class ReportCollector:
    """
    Collect report entries (markdown, errors, headers, subheaders, dividers),
    render them as text, and optionally echo each one to streamlit via ``st``.
    """

    def __init__(self, destination="both"):
        self.entries = []
        self.filename = None
        # only "both" and "streamlit" switch on echoing to `st`
        self.publish_to_streamlit = destination in ("both", "streamlit")

    def add_markdown(self, msg):
        """Record a markdown entry (and echo it when publishing)."""
        self.entries.append(("markdown", msg))
        if self.publish_to_streamlit:
            st.markdown(msg)

    def add_error(self, msg):
        """Record an error entry (and echo it when publishing)."""
        self.entries.append(("error", msg))
        if self.publish_to_streamlit:
            st.error(msg)

    def add_header(self, msg):
        """Record a header entry (and echo it when publishing)."""
        self.entries.append(("header", msg))
        if self.publish_to_streamlit:
            st.header(msg)

    def add_subheader(self, msg):
        """Record a subheader entry (and echo it when publishing)."""
        self.entries.append(("subheader", msg))
        if self.publish_to_streamlit:
            st.subheader(msg)

    def add_divider(self):
        """Record a divider entry (and echo it when publishing)."""
        self.entries.append(("divider", None))
        if self.publish_to_streamlit:
            st.divider()

    def write_to_file(self, filename):
        """Remember ``filename`` and write the rendered report to it."""
        self.filename = filename
        with open(filename, 'w') as handle:
            handle.write(self.get_log())

    def get_log(self):
        """Render the collected entries, in order, as one string."""
        pieces = []
        for kind, text in self.entries:
            if kind == "markdown":
                pieces.append(text + '\n')
            elif kind == "error":
                pieces.append(f"🚨⚠️❗ **{text}**\n")
            elif kind == "header":
                pieces.append(f"# {text}\n")
            elif kind == "subheader":
                pieces.append(f"## {text}\n")
            elif kind == "divider":
                pieces.append(60*'-' + '\n')

        return "".join(pieces)

    def reset(self):
        """Forget all collected entries and the last output file name."""
        self.entries = []
        self.filename = None

    def print_log(self):
        """Print the rendered report."""
        print(self.get_log())


def validate_table(df: pd.DataFrame, table_name: str, specific_cde_df: pd.DataFrame, out: ReportCollector ):
    """
    Validate `df` against the CDE rows for one table and log the findings to `out`.

    For every CDE field the function records whether the column is missing
    (required vs optional), how many entries equal NULL, and, for Enum fields,
    which values are not in the CDE's Validation list. Integer and Float fields
    are parsed with int()/float() as a check only; the stored values stay
    strings.

    Args:
        df: table to validate. It is modified in place: empty strings and pd.NA
            become NULL, and "Unknown"/"unknown" become NULL in Integer and
            Float fields only.
        table_name: table label used in the report messages.
        specific_cde_df: CDE rows for this table; the "Field", "Required",
            "DataType" and "Validation" columns are read.
        out: collector that receives the report entries.

    Returns:
        (df, df_out, out): the (modified) input table, a table holding only the
        CDE fields in CDE order (NULL-filled where `df` has no such column),
        and the collector.

    Raises:
        ValueError: if a non-NULL entry of an Integer/Float field cannot be
            parsed. (`.item()` presumably also raises when a field name is not
            unique within `specific_cde_df` — verify.)
    """
    df.replace({"":NULL, pd.NA:NULL}, inplace=True)
    def my_str(x):
        return f"'{str(x)}'"
    missing_required = []
    missing_optional = []
    null_fields = []
    invalid_entries = []
    total_rows = df.shape[0]
    # Share df's index so that NULL-filled columns (assigned from a list) and
    # columns copied from df stay aligned row by row, whatever df's index is.
    df_out = pd.DataFrame(index=df.index)
    for field in specific_cde_df["Field"]:
        entry_idx = specific_cde_df["Field"]==field
        print(f"validating {field}")

        opt_req = "REQUIRED" if specific_cde_df.loc[entry_idx, "Required"].item()=="Required" else "OPTIONAL"

        if field not in df.columns:
            if opt_req == "REQUIRED":
                missing_required.append(field)
            else:
                missing_optional.append(field)

            # missing columns are still emitted, filled with NULL
            df_out[field] = [NULL]*total_rows

        else:
            datatype = specific_cde_df.loc[entry_idx,"DataType"]
            if datatype.item() == "Integer":
                # recode "Unknown" as NULL in THIS field only; other columns
                # (e.g. Enum fields where "Unknown" may be valid) are untouched
                df[field] = df[field].replace({"Unknown":NULL, "unknown":NULL})
                # check only: int() raises ValueError on a non-integer entry
                df[field].apply(lambda x: int(x) if x!=NULL else x )
            elif datatype.item() == "Float":
                # recode "Unknown" as NULL in THIS field only
                df[field] = df[field].replace({"Unknown":NULL, "unknown":NULL})
                # check only: float() raises ValueError on a non-numeric entry
                df[field].apply(lambda x: float(x) if x!=NULL else x )
            elif datatype.item() == "Enum":
                # NOTE(security): eval executes whatever text the CDE sheet holds.
                # ast.literal_eval would be safer if every Validation cell is a
                # plain list literal — confirm against the sheet before switching.
                valid_values = eval(specific_cde_df.loc[entry_idx,"Validation"].item())
                entries = df[field]
                valid_entries = entries.apply(lambda x: x in valid_values)
                invalid_values = entries[~valid_entries].unique()
                n_invalid = invalid_values.shape[0]
                if n_invalid > 0:
                    valstr = ', '.join(map(my_str, valid_values))
                    invalstr = ', '.join(map(my_str,invalid_values))
                    invalid_entries.append((opt_req, field, n_invalid, valstr, invalstr))
            else: #dtype == String
                pass

            n_null = (df[field]==NULL).sum()
            if n_null > 0:
                null_fields.append((opt_req, field, n_null))

            df_out[field] = df[field]


    # now compose report...
    if len(missing_required) > 0:
        out.add_error(f"Missing Required Fields in {table_name}: {', '.join(missing_required)}")
    else:
        out.add_markdown(f"All required fields are present in *{table_name}* table.")

    if len(missing_optional) > 0:
        out.add_error(f"Missing Optional Fields in {table_name}: {', '.join(missing_optional)}")


    if len(null_fields) > 0:
        out.add_error(f"{len(null_fields)} Fields with empty (NULL) values:")
        for opt_req, field, count in null_fields:
            out.add_markdown(f"\n\t- {field}: {count}/{total_rows} empty rows ({opt_req})")
    else:
        out.add_markdown(f"No empty entries (NULL) found .")


    if len(invalid_entries) > 0:
        out.add_error(f"{len(invalid_entries)} Fields with invalid entries:")
        for opt_req, field, count, valstr, invalstr in invalid_entries:
            str_out = f"- _*{field}*_:  invalid values 💩{invalstr}\n"
            str_out += f"    - valid ➡️ {valstr}"
            out.add_markdown(str_out)
    else:
        out.add_markdown(f"No invalid entries found in Enum fields.")


    return df, df_out, out


# Validate the bulk DATA table against the CDE rows whose Table starts with "DATA".
# destination="log" keeps the report in memory only (nothing is sent to `st`).
report = ReportCollector(destination="log")

# NOTE: `df` is rebound here; until now it held the fastq/md5 table built above.
df = DATA_.copy()
table_choice = "DATA"
# perform the validation

# specific_cde_df = CDE_df[CDE_df['Table'] == table_choice]
specific_cde_df = CDE_df[CDE_df['Table'].str.startswith(table_choice)]

retval = validate_table(df, table_choice, specific_cde_df, report)

report.print_log()

# validate_table returns (validated input table, CDE-fields-only table, report)
DATA_export, DATA_out, report = retval

validating sample_id
validating replicate
validating replicate_count
validating repeated_sample
validating batch
validating file_type
validating file_name
validating file_description
validating file_MD5
validating adjustment
validating content
validating header
validating annotation
validating configuration_file
All required fields are present in *DATA* table.
🚨⚠️❗ **2 Fields with empty (NULL) values:**

	- header: 50/50 empty rows (OPTIONAL)

	- annotation: 50/50 empty rows (OPTIONAL)
No invalid entries found in Enum fields.



In [118]:
# Validate the bulk SAMPLE table against the CDE rows whose Table starts with "SAMPLE".
report = ReportCollector(destination="log")

df = SAMPLE_.copy()
table_choice = "SAMPLE"
# perform the validation

# specific_cde_df = CDE_df[CDE_df['Table'] == table_choice]
specific_cde_df = CDE_df[CDE_df['Table'].str.startswith(table_choice)]

retval = validate_table(df, table_choice, specific_cde_df, report)

report.print_log()

validating sample_id
validating subject_id
validating source_sample_id
validating replicate
validating replicate_count
validating repeated_sample
validating batch
validating condition
validating tissue
validating time
validating alternate_id
All required fields are present in *SAMPLE* table.
🚨⚠️❗ **Missing Optional Fields in SAMPLE: condition, time, alternate_id**
No empty entries (NULL) found .
No invalid entries found in Enum fields.



In [119]:
# Unpack the SAMPLE validation result: (validated input table, CDE-fields-only table, report).
SAMPLE_export, SAMPLE_out, report = retval

In [120]:
# Compare shapes: the export keeps every SAMPLE_ column, SAMPLE_out only the CDE fields.
SAMPLE_export.shape, SAMPLE_.shape, SAMPLE.shape, SAMPLE_out.shape

((25, 33), (25, 33), (75, 33), (25, 11))

In [121]:
# Validate the bulk CLINPATH table against the CDE rows whose Table starts with "CLINPATH".
report = ReportCollector(destination="log")

df = CLINPATH_.copy()
table_choice = "CLINPATH"
# perform the validation

# specific_cde_df = CDE_df[CDE_df['Table'] == table_choice]
specific_cde_df = CDE_df[CDE_df['Table'].str.startswith(table_choice)]

retval = validate_table(df, table_choice, specific_cde_df, report)

report.print_log()

# validate_table returns (validated input table, CDE-fields-only table, report)
CLINPATH_export, CLINPATH_out, report = retval



validating subject_id
validating source_subject_id
validating duration_pmi
validating age_at_death
validating path_autopsy_dx_main
validating path_autopsy_second_dx
validating path_autopsy_third_dx
validating path_autopsy_fourth_dx
validating path_autopsy_fifth_dx
validating path_autopsy_sixth_dx
validating path_autopsy_seventh_dx
validating path_autopsy_eight_dx
validating path_year_death
validating cause_death
validating other_cause_death_1
validating other_cause_death_2
validating brain_weight
validating path_braak_nft
validating path_braak_asyn
validating path_cerad
validating path_thal
validating known_pathogenic_mutation
validating PD_pathogenic_mutation
validating path_mckeith
validating sn_neuronal_loss
validating path_infarcs
validating path_nia_ri
validating path_nia_aa_a
validating path_nia_aa_b
validating path_nia_aa_c
validating TDP43
validating arteriolosclerosis_severity_scale
validating amyloid_angiopathy_severity_scale
validating path_ad_level
validating dig_slide_avai

In [122]:
import numpy as np
# Also map float NaN (what pandas uses for missing cells) to NULL.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
# NOTE(review): `df` is the CLINPATH_ copy from the previous cell at this point.
df.replace({"":NULL, pd.NA:NULL, np.nan:NULL}, inplace=True)


In [123]:
# Confirm the value of the missing-data placeholder.
NULL

'NA'

In [124]:
# Spot-check how a missing value is represented in SUBJECT_ (here: the NULL string).
SUBJECT_['age_at_onset'].values[0]

'NA'

In [125]:
# Validate the bulk SUBJECT table against the CDE rows whose Table starts with "SUBJECT".
report = ReportCollector(destination="log")

df = SUBJECT_.copy()
table_choice = "SUBJECT"
# perform the validation

# specific_cde_df = CDE_df[CDE_df['Table'] == table_choice]
specific_cde_df = CDE_df[CDE_df['Table'].str.startswith(table_choice)]

retval = validate_table(df, table_choice, specific_cde_df, report)

report.print_log()

# validate_table returns (validated input table, CDE-fields-only table, report)
SUBJECT_export, SUBJECT_out, report = retval



validating subject_id
validating source_subject_id
validating biobank_name
validating organism
validating sex
validating age_at_collection
validating race
validating primary_diagnosis
validating primary_diagnosis_text
validating AMPPD_id
validating GP2_id
validating ethnicity
validating family_history
validating last_diagnosis
validating age_at_onset
validating age_at_diagnosis
validating first_motor_symptom
validating hx_dementia_mci
validating hx_melanoma
validating education_level
validating smoking_status
validating smoking_years
validating APOE_e4_status
validating cognitive_status
validating time_from_baseline
All required fields are present in *SUBJECT* table.
🚨⚠️❗ **12 Fields with empty (NULL) values:**

	- primary_diagnosis_text: 23/25 empty rows (OPTIONAL)

	- AMPPD_id: 25/25 empty rows (OPTIONAL)

	- GP2_id: 25/25 empty rows (OPTIONAL)

	- last_diagnosis: 25/25 empty rows (OPTIONAL)

	- age_at_onset: 25/25 empty rows (OPTIONAL)

	- age_at_diagnosis: 25/25 empty rows (OPTIONA

Now compose the PROTOCOL and STUDY tables.


ASSAY:
>> VAI’s Genomics Core (RRID:SCR_022913; they sequenced this data for us), and I can make some inferences after looking through it. From what I can see, this was likely sequenced on the NovaSeq 6000, with a run length of 2x100bp, using stranded total RNA, with a target genome coverage of 30M reads

In [126]:
# Inspect the PROTOCOL table loaded from the clean v2 tables.
# NOTE(review): its lib_prep/data_processing text mentions 10x and cellranger,
# i.e. the single-cell assay — confirm whether it needs updating for bulk.
PROTOCOL

Unnamed: 0,sample_collection_summary,cell_extraction_summary,lib_prep_summary,data_processing_summary,github_url,protocols_io_DOI,other_reference
0,The samples were collected by the Banner Sun H...,See protocol.,Library preps were generated via the 5' 10x Ge...,We use cellranger count (6.0.1) to align and g...,private repo at https://github.com/TheGBLab/ASAP,https://dx.doi.org/10.17504/protocols.io.bzmgp43w,


In [127]:
# Adjust the STUDY table (loaded from the clean v2 tables) for the bulk dataset.
# NOTE(review): "Senesence" looks like a misspelling of "Senescence" — confirm
# the intended dataset id before exporting.
STUDY['brain_regions'] = "middle frontal gyrus"
STUDY['team_dataset_id'] = "Human_bulkRNA_seq_PD_Senesence_Jose_Bras_Team_Lee"

Create ASSAY table:


In [128]:
# Human-readable dataset name; same wording as team_dataset_id with spaces.
# NOTE(review): same "Senesence" spelling as team_dataset_id — confirm.
STUDY['project_dataset']= 'Human bulkRNA-seq PD Senesence Jose Bras Team Lee'

In [129]:

# Load the team's own MFG covariate and batch-information sheets.
# NOTE: `metadata_path` is re-pointed here; it no longer refers to the clean v2 tables.
metadata_path = Path.home() / ("Projects/ASAP/team-lee/metadata")

MFG_covar = pd.read_csv(f"{metadata_path}/MFG/covar.csv") # includes 'PMI' ?
# rows that are entirely empty are dropped from the cases sheet
MFG_cases = pd.read_csv(f"{metadata_path}/MFG/PD_ASAP_Sample_batch_information_banner_cases.csv").dropna(axis=0,how='all')
MFG_control = pd.read_csv(f"{metadata_path}/MFG/PD_ASAP_Sample_batch_information_banner_controls.csv")


In [130]:
### middle frontal gyrus (MFG) samples
# Stack cases and controls, then label each row "PD" when its PD column is "yes", else "HC".
MFG_meta = pd.concat([MFG_cases, MFG_control], axis=0, ignore_index=True)
MFG_meta["GROUPcv"]= MFG_meta["PD"].apply(lambda x: "PD" if (x=="yes") else "HC")

# make a MERGE_ID column because the formatting is inconsistent:
# "MFG_<group>_<CaseID without dashes>" on the meta side, SAMPLE as-is on the covar side
MFG_meta['MERGE_ID'] = "MFG_" + MFG_meta['GROUPcv'] +"_" + MFG_meta['CaseID'].str.replace('-','')
MFG_covar['MERGE_ID'] = MFG_covar['SAMPLE']
# the fastqs are in SEQ_ID

# there's a bug in the meta table... skip for now
# (inner merge: only rows whose MERGE_ID occurs in both tables are kept)
MFG_TABLE = pd.merge(MFG_covar, MFG_meta, on='MERGE_ID', how='inner')
MFG_TABLE['subdir']="MFG"



Adapt ASSAY_bulkRNAseq table from team Hardy.

In [131]:
# Load Team Hardy's ASSAY_bulkRNAseq table as a template for the columns an ASSAY table needs.
# NOTE: `metadata_path` is re-pointed once more, now to Team Hardy's metadata folder.
metadata_path = Path.home() / "Projects/ASAP/meta-clean/data/teams/hardy/bulkRNAseq/metadata"

ASSAY_comp = pd.read_csv(metadata_path / "ASSAY_bulkRNAseq.csv")

In [132]:
# Inspect the Team Hardy ASSAY table used as a reference.
ASSAY_comp

Unnamed: 0,sample_id,tissue,technology,omic,RIN,molecular_source,assay,sequencing_end,sequencing_length,sequencing_instrument,hemisphere,region_level_1,region_level_2,region_level_3,fragment_library_types
0,Hardy_001,Brain,Bulk,RNA,4.1,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Left,Frontal lobe,Middle frontal gyrus,Grey matter,ISR
1,Hardy_002,Brain,Bulk,RNA,3.1,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Left,Temporal lobe,Middle temporal gyrus,Grey matter,ISR
2,Hardy_003,Brain,Bulk,RNA,3.2,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Left,Parietal lobe,Inferior parietal lobule,Grey matter,ISR
3,Hardy_004,Brain,Bulk,RNA,3.9,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Left,Cingulate gyrus,Anterior cingulate gyrus,Grey matter,ISR
4,Hardy_005,Brain,Bulk,RNA,4.3,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Right,Frontal lobe,Middle frontal gyrus,Grey matter,ISR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,Hardy_284,Brain,Bulk,RNA,6.8,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Right,Cingulate gyrus,Anterior cingulate gyrus,Grey matter,ISR
284,Hardy_285,Brain,Bulk,RNA,6.4,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Right,Frontal lobe,Middle frontal gyrus,Grey matter,ISR
285,Hardy_286,Brain,Bulk,RNA,6.2,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Right,Temporal lobe,Middle temporal gyrus,Grey matter,ISR
286,Hardy_287,Brain,Bulk,RNA,5.6,PolyA RNA,Other,Paired-end,150,Illumina NovaSeq 6000,Right,Parietal lobe,Inferior parietal lobule,Grey matter,ISR


In [133]:
# NOTE(review): `SAMPLE_` looks like a truncated variable name -- nothing called
# `SAMPLE_` is defined in the visible cells, so this line presumably fails on a
# fresh kernel. Confirm which table ASSAY is meant to start from (the export cell
# further down uses `SAMPLE_export`) and how the Hardy columns are to be adapted.
ASSAY = SAMPLE_ 



In [134]:
# Validate the ASSAY table against the matching rows of the CDE and show the result.
table_choice = "ASSAY_bulkRNAseq"
report = ReportCollector(destination="log")
df = ASSAY.copy()  # validate a copy so ASSAY itself is left untouched

# CDE rows whose 'Table' value starts with the table name
# (prefix match, rather than the exact match used previously).
specific_cde_df = CDE_df.loc[CDE_df['Table'].str.startswith(table_choice)]

# perform the validation
retval = validate_table(df, table_choice, specific_cde_df, report)

# Print the collected log before `report` is rebound by the unpacking below.
report.print_log()

ASSAY_export, ASSAY_out, report = retval

ASSAY_export

All required fields are present in *ASSAY_bulkRNAseq* table.
No empty entries (NULL) found .
No invalid entries found in Enum fields.



Unnamed: 0,sample_id,subject_id,source_sample_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,hemisphere,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV200,pm_PH,donor_id
0,MFG_HC_1225,HC_1225,12-25,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
1,MFG_HC_0602,HC_0602,06-02,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
2,MFG_PD_0009,PD_0009,00-09,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
3,MFG_PD_1921,PD_1921,19-21,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
4,MFG_PD_2058,PD_2058,20-58,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
5,MFG_PD_1441,PD_1441,14-41,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000383 (female),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
6,MFG_PD_1344,PD_1344,13-44,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000383 (female),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
7,MFG_HC_1939,HC_1939,19-39,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000383 (female),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
8,MFG_HC_1308,HC_1308,13-08,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
9,MFG_HC_1862,HC_1862,18-62,rep1,1,0,batch_x,Brain,Middle_Frontal_Gyrus,Unknown,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,


In [135]:
# Directory, relative to the current working directory, that receives the
# exported metadata tables.
export_path = Path.cwd() / "data/lee/bulk_rna_seq/metadata"

# A single idempotent call replaces the check-then-create guard: it creates any
# missing parents, is a no-op when the directory already exists, and raises
# FileExistsError if a non-directory occupies the path instead of silently
# continuing.
export_path.mkdir(parents=True, exist_ok=True)


In [136]:
# TODO: fix the column order (nothing in this cell reorders columns yet)

# write the clean metadata: output file name -> table, in the order they are written
_tables_to_write = {
    "STUDY.csv": STUDY,
    "PROTOCOL.csv": PROTOCOL,
    "CLINPATH.csv": CLINPATH_export,
    "SAMPLE.csv": SAMPLE_export,
    "SUBJECT.csv": SUBJECT_export,
    "DATA.csv": DATA_export,
    # merged covariate / batch-sheet table, kept as auxiliary metadata
    "aux_meta.csv": MFG_TABLE,
    # also write them to clean...
    "ASSAY_bulkRNAseq.csv": ASSAY_export,
}

# Each table is written with pandas' default to_csv settings (index included).
for _csv_name, _table in _tables_to_write.items():
    _table.to_csv(export_path / _csv_name)

In [137]:
# Show the full path the ASSAY table was written to.
# NOTE(review): the recorded output nests ".../data/teams/lee/data/lee/..." because
# export_path is built from Path.cwd() -- confirm that location is intended.
export_path / "ASSAY_bulkRNAseq.csv"

PosixPath('/Users/ergonyc/Projects/ASAP/harmonized-wf-dev/data/teams/lee/data/lee/bulk_rna_seq/metadata/ASSAY_bulkRNAseq.csv')

## Update the table to v2

In [126]:
# Preview the first rows of SAMPLE before upgrading the tables to the v2 schema.
# NOTE(review): the recorded output repeats index labels and rows (0, 0, 1, 1, ...);
# check whether SAMPLE contains duplicated records.
SAMPLE.head()

Unnamed: 0,sample_id,source_sample_id,subject_id,replicate,replicate_count,repeated_sample,batch,tissue,brain_region,source_RIN,...,sex_ontology_term_id,self_reported_ethnicity_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,assay_ontology_term_id,suspension_type,DV200,pm_PH,donor_id
0,MFG_HC_1225,12-25,HC_1225,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
0,MFG_HC_1225,12-25,HC_1225,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
1,MFG_HC_0602,06-02,HC_0602,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
1,MFG_HC_0602,06-02,HC_0602,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,,...,PATO:0000384 (male),Unknown,PATO:0000461,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,
2,MFG_PD_0009,00-09,PD_0009,rep1,1,0,BATCH_4,Brain,Middle_Frontal_Gyrus,,...,PATO:0000384 (male),Unknown,MONDO:0005180,UBERON:0002702,NA(multiple),EFO:0030004,nucleus,,,


In [127]:
from update_schema import update_tables_to_CDEv2

# Directory containing the cleaned tables that are to be upgraded.
tables_path = Path.cwd().joinpath("clean/team-Lee")

# Both CDE definitions are read from CSV files in the working directory.
CDEv1 = pd.read_csv(Path.cwd().joinpath("ASAP_CDE_v1.csv"))
CDEv2 = pd.read_csv(Path.cwd().joinpath("ASAP_CDE_v2.csv"))

# The helper returns six tables; unpack them in the order it returns them.
(
    STUDYv2,
    PROTOCOLv2,
    SAMPLEv2,
    SUBJECTv2,
    CLINPATHv2,
    DATAv2,
) = update_tables_to_CDEv2(tables_path, CDEv1, CDEv2)


In [128]:

# Destination directory for the v2 tables, relative to the working directory.
export_root = Path.cwd() / "clean/team-Lee/v2"
# `exist_ok=True` already makes the call safe to repeat, so the separate
# `exists()` guard was redundant (and would have hidden a non-directory
# sitting at this path).
export_root.mkdir(parents=True, exist_ok=True)


In [129]:

# Write each upgraded table to export_root under its canonical file name
# (pandas default to_csv settings, index included).
_v2_tables = [
    ("STUDY.csv", STUDYv2),
    ("PROTOCOL.csv", PROTOCOLv2),
    ("SAMPLE.csv", SAMPLEv2),
    ("SUBJECT.csv", SUBJECTv2),
    ("CLINPATH.csv", CLINPATHv2),
    ("DATA.csv", DATAv2),
]
for _csv_name, _table in _v2_tables:
    _table.to_csv(export_root / _csv_name)


In [130]:
# Show where the v2 tables were written.
export_root

PosixPath('/Users/ergonyc/Projects/ASAP/meta-clean/clean/team-Lee/v2')

Transfer cleaned metadata to raw buckets 




## Lee

In [131]:
# Lee
# Switch gcloud/gsutil to the Team Lee raw-bucket service account.
# NOTE(review): the key file is an absolute path on one user's machine; consider
# reading it from an environment variable before sharing this notebook.
!gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/lee-credentials.json 


Activated service account credentials for: [raw-admin-lee@dnastack-asap-parkinsons.iam.gserviceaccount.com]


In [132]:

# Long listing (including object generations) of the v2 metadata currently in the
# Team Lee raw bucket; `-u` names the project to bill for the request.
!gsutil -u dnastack-asap-parkinsons ls -al "gs://asap-raw-data-team-lee/metadata/v2"


      6243  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/CLINPATH.csv#1701296766077305  metageneration=1
     24218  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/DATA.csv#1701296766689376  metageneration=1
       968  2023-11-29T22:26:07Z  gs://asap-raw-data-team-lee/metadata/v2/PROTOCOL.csv#1701296767085192  metageneration=1
     23092  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/SAMPLE.csv#1701296766486332  metageneration=1
      1054  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/STUDY.csv#1701296766878081  metageneration=1
      4277  2023-11-29T22:26:06Z  gs://asap-raw-data-team-lee/metadata/v2/SUBJECT.csv#1701296766288192  metageneration=1
TOTAL: 6 objects, 59852 bytes (58.45 KiB)


In [47]:
# DESTRUCTIVE: recursively removes the dated v2_20231128 copy from the raw bucket.
# Do not re-run blindly as part of "Run All".
!gsutil -u dnastack-asap-parkinsons rm -r "gs://asap-raw-data-team-lee/metadata/v2/v2_20231128"

Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/CLINPATH.csv#1701213902124849...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/DATA.csv#1701213902800745...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/PROTOCOL.csv#1701213903243233...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/SAMPLE.csv#1701213902582568...
/ [4 objects]                                                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m rm ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/STUDY.csv#1701213903050409...
Removing gs://asap-raw-data-team-lee/metadata/v2/v2_20231128/SUBJECT.csv#1701213902358354...
/ [6 objects]                                                                   
Operation completed over 6 objec

In [None]:
# Confirm the working directory, since the relative paths in the copy commands
# below are resolved against it.
Path.cwd()

PosixPath('/Users/ergonyc/Projects/ASAP/meta-clean')

In [133]:

# Upload the v2 CSVs to the Team Lee raw bucket.
# NOTE(review): the source directory is ./clean/team-Lee/v2_20231130, while the
# export cell above writes to clean/team-Lee/v2 -- confirm the dated copy is current.
!gsutil -u dnastack-asap-parkinsons cp -r "./clean/team-Lee/v2_20231130/*.csv"  "gs://asap-raw-data-team-lee/metadata/v2"

Copying file://./clean/team-Lee/v2_20231130/CLINPATH.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SUBJECT.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SAMPLE.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/DATA.csv [Content-Type=text/csv]... 
- [4 files][ 56.5 KiB/ 56.5 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./clean/team-Lee/v2_20231130/STUDY.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/PROTOCOL.csv [Content-Type=text/csv]...
\ [6 files][ 58.4 KiB/ 58.4 KiB]                                                
Operation completed over 6 objects/58.4 KiB.                                     


In [44]:
# Upload every CSV in ./clean/team-Lee to metadata/clean/ in the raw bucket.
# NOTE(review): the wildcard also picked up two differently misspelled auxiliary
# files (auxiluary_metadata.csv, auxilarry_metadata.csv) per the recorded output.
!gsutil -u dnastack-asap-parkinsons cp -r "./clean/team-Lee/*.csv"  "gs://asap-raw-data-team-lee/metadata/clean/"

Copying file://./clean/team-Lee/CLINPATH.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/SUBJECT.csv [Content-Type=text/csv]...          
Copying file://./clean/team-Lee/auxiluary_metadata.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/SAMPLE.csv [Content-Type=text/csv]...           
\ [4 files][187.4 KiB/187.4 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./clean/team-Lee/STUDY.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/auxilarry_metadata.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/PROTOCOL.csv [Content-Type=text/csv]...         
Copying file://./clean/team-Lee/CDE.csv [Content-Type=text/csv]...              
| [8 files][328.0 KiB/328.0 KiB]      

In [49]:
# Verify the upload: long listing of metadata/clean in the Team Lee raw bucket.
!gsutil -u dnastack-asap-parkinsons ls -al "gs://asap-raw-data-team-lee/metadata/clean"


     37741  2023-11-28T21:58:33Z  gs://asap-raw-data-team-lee/metadata/clean/CDE.csv#1701208713502213  metageneration=1
     24241  2023-11-28T21:58:31Z  gs://asap-raw-data-team-lee/metadata/clean/CLINPATH.csv#1701208711158586  metageneration=1
       968  2023-11-28T21:58:33Z  gs://asap-raw-data-team-lee/metadata/clean/PROTOCOL.csv#1701208713057031  metageneration=1
     60412  2023-11-28T21:58:32Z  gs://asap-raw-data-team-lee/metadata/clean/SAMPLE.csv#1701208712230186  metageneration=1
       969  2023-11-28T21:58:32Z  gs://asap-raw-data-team-lee/metadata/clean/STUDY.csv#1701208712490381  metageneration=1
      3078  2023-11-28T21:58:31Z  gs://asap-raw-data-team-lee/metadata/clean/SUBJECT.csv#1701208711431682  metageneration=1
    104336  2023-11-28T21:58:32Z  gs://asap-raw-data-team-lee/metadata/clean/auxilarry_metadata.csv#1701208712852367  metageneration=1
    109904  2023-09-22T10:45:50Z  gs://asap-raw-data-team-lee/metadata/clean/auxiliary_metadata.csv#1695379550068707  metagene

### copy to workflow-dev bucket

First copy each set of metadata locally ...

In [135]:
# Switch credentials to the workflow-dev service account before touching that bucket.
# NOTE(review): absolute, user-specific key-file path -- see the earlier auth cell.
!gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/wf-credentials.json

Activated service account credentials for: [admin-workflow-dev@dnastack-asap-parkinsons.iam.gserviceaccount.com]


In [51]:
# !gsutil  ls -al "gs://asap-workflow-dev/CDE"

# List the Team Lee v2 metadata currently stored in the workflow-dev bucket.
!gsutil  ls -al "gs://asap-workflow-dev/metadata/v2/lee"


      6243  2023-11-28T22:21:42Z  gs://asap-workflow-dev/metadata/v2/lee/CLINPATH.csv#1701210102669978  metageneration=1
     24218  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/DATA.csv#1701210103715354  metageneration=1
       968  2023-11-28T22:21:44Z  gs://asap-workflow-dev/metadata/v2/lee/PROTOCOL.csv#1701210104255218  metageneration=1
     23092  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SAMPLE.csv#1701210103292269  metageneration=1
      1054  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/STUDY.csv#1701210103992608  metageneration=1
      4277  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SUBJECT.csv#1701210103010639  metageneration=1
                                 gs://asap-workflow-dev/metadata/v2/lee/v2_20231128/
TOTAL: 6 objects, 59852 bytes (58.45 KiB)


In [136]:
# Copy the same dated v2 CSVs (./clean/team-Lee/v2_20231130) to the workflow-dev bucket.
!gsutil  cp -r "./clean/team-Lee/v2_20231130/*.csv" "gs://asap-workflow-dev/metadata/v2/lee"


Copying file://./clean/team-Lee/v2_20231130/CLINPATH.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SUBJECT.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/SAMPLE.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/DATA.csv [Content-Type=text/csv]... 
- [4 files][ 56.5 KiB/ 56.5 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://./clean/team-Lee/v2_20231130/STUDY.csv [Content-Type=text/csv]...
Copying file://./clean/team-Lee/v2_20231130/PROTOCOL.csv [Content-Type=text/csv]...
\ [6 files][ 58.4 KiB/ 58.4 KiB]                                                
Operation completed over 6 objects/58.4 KiB.                                     


In [56]:
# Re-list the workflow-dev destination to check the copy.
# NOTE(review): the recorded timestamps (2023-11-28) and the execution count predate
# the copy cell above, so this output appears stale -- re-run to confirm.
!gsutil ls -al "gs://asap-workflow-dev/metadata/v2/lee"


      6243  2023-11-28T22:21:42Z  gs://asap-workflow-dev/metadata/v2/lee/CLINPATH.csv#1701210102669978  metageneration=1
     24218  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/DATA.csv#1701210103715354  metageneration=1
       968  2023-11-28T22:21:44Z  gs://asap-workflow-dev/metadata/v2/lee/PROTOCOL.csv#1701210104255218  metageneration=1
     23092  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SAMPLE.csv#1701210103292269  metageneration=1
      1054  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/STUDY.csv#1701210103992608  metageneration=1
      4277  2023-11-28T22:21:43Z  gs://asap-workflow-dev/metadata/v2/lee/SUBJECT.csv#1701210103010639  metageneration=1
TOTAL: 6 objects, 59852 bytes (58.45 KiB)


## check file md5s

In [39]:
from utils.checksums import extract_md5_from_details, extract_md5_from_details2


In [36]:
# Switch back to the Team Lee raw-bucket service account for the checksum pass.
!gcloud auth activate-service-account --key-file=/Users/ergonyc/Projects/ASAP/lee-credentials.json  



Activated service account credentials for: [raw-admin-lee@dnastack-asap-parkinsons.iam.gserviceaccount.com]


In [37]:

# Alternative using `gcloud storage hash` (disabled).
# NOTE(review): this variant writes to hardy_hexhash.log, which looks like a
# copy/paste from the Team Hardy notebook.
# !gcloud storage hash "gs://asap-raw-data-team-lee/**/*.gz"  --skip-crc32c --hex  --billing-project dnastack-asap-parkinsons > hardy_hexhash.log

# Hash every .gz object in the Team Lee raw bucket and save the report to
# lee_hexhash.log; `-h` prints the hashes in hex instead of base64.
!gsutil -u dnastack-asap-parkinsons hash -h "gs://asap-raw-data-team-lee/**/*.gz" > lee_hexhash.log


using the module's C extension, so checksumming will run very slowly. For help
installing the extension, please see "gsutil help crcmod".



In [40]:
# Parse the gsutil hash report written above.
# Presumably returns a mapping of file name -> hex MD5 (it is passed to
# Series.map below) -- confirm against utils.checksums.
bucket_files_md5 = extract_md5_from_details2("lee_hexhash.log")

# Take an explicit copy of the two columns: adding columns to a slice of DATAv2
# is what triggered pandas' SettingWithCopyWarning, and with a copy DATAv2 is
# guaranteed to stay untouched.
checksum = DATAv2[['file_name', 'file_MD5']].copy()
# check1: MD5 recorded in the DATA table, stripped of surrounding whitespace.
checksum['check1'] = checksum['file_MD5'].str.strip()
# check2: MD5 looked up by file name in the bucket report (NaN if the file is absent).
checksum['check2'] = checksum['file_name'].map(bucket_files_md5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checksum['check'] = checksum['file_name'].map(bucket_files_md5)


In [41]:
# File names whose recorded MD5 (check1) differs from the MD5 computed from the
# bucket (check2). An empty list means success.
checksum.loc[checksum["check1"].ne(checksum["check2"]), "file_name"].to_list()


[]

In [42]:
# Display the comparison table.
# NOTE(review): the recorded output has a single `check` column, whereas the cell
# above now creates `check1`/`check2` -- the output is stale; re-run before sharing.
checksum

Unnamed: 0,file_name,file_MD5,check
0,MFGHC1225_S9_L001_R1_001.fastq.gz,9977258e598d6a52130c29c71aef6925,9977258e598d6a52130c29c71aef6925
1,MFGHC1225_S9_L001_R2_001.fastq.gz,fe2cf93257801227b7072a4fb7d18792,fe2cf93257801227b7072a4fb7d18792
2,MFGHC0602_S2_L001_R1_001.fastq.gz,110ca4864cf6938faca67567bebfb6cc,110ca4864cf6938faca67567bebfb6cc
3,MFGHC0602_S2_L001_R2_001.fastq.gz,0dcc67217e43ab53bae0d0676f9bfe8b,0dcc67217e43ab53bae0d0676f9bfe8b
4,MFGPD0009_S3_L001_R1_001.fastq.gz,a2608d0bd192333b0076d7091c1c50ea,a2608d0bd192333b0076d7091c1c50ea
...,...,...,...
145,SN_1973_PD_S1_L000_R2_001.fastq.gz,53f6c6b4a00299fb41f32b34509835fa,53f6c6b4a00299fb41f32b34509835fa
146,SN_2005_PD_S1_L001_R1_001.fastq.gz,5bde8fdd8ab28e2c00e3ae327fecc80b,5bde8fdd8ab28e2c00e3ae327fecc80b
147,SN_2005_PD_S1_L001_R2_001.fastq.gz,14364dc52760fbd3ad08b5ca582d849b,14364dc52760fbd3ad08b5ca582d849b
148,SN_2038_PD_S1_L000_R1_001.fastq.gz,3215d4e4a68a85546183e820b74cac1f,3215d4e4a68a85546183e820b74cac1f
