In [13]:
import os
import json
import xml.etree.ElementTree as ET

In [14]:
def get_text_element(parent, path, ns):
    """
    Return the text of the first sub-element of ``parent`` that matches ``path``.

    :param parent: Element to search in. May be None (for example when an earlier
                   ``find`` did not match), in which case None is returned.
    :param path: Path expression passed to ``Element.find``; may use prefixes declared in ``ns``.
    :param ns: Mapping of namespace prefix to namespace URI.
    :return: The text of the matched element, or None if ``parent`` is None or nothing matches.
    """
    # A missing parent means the whole branch is absent; report "no value" instead of raising.
    if parent is None:
        return None

    element = parent.find(path, ns)
    return element.text if element is not None else None

def extract_dna_methylation_filenames(dna_methylation_folder_path):
    """
    Collect the names of the DNA methylation files found in a folder.

    Every directory entry whose name ends with "betas.txt" is selected. The names are
    returned in sorted order so that the result does not depend on the arbitrary order
    in which ``os.listdir`` reports entries.

    :param dna_methylation_folder_path: Path to the folder containing the methylation data files.
    :return: A sorted list of filenames ending with "betas.txt", or None if no such files are found.
    """
    dna_methylation_files = sorted(
        filename
        for filename in os.listdir(dna_methylation_folder_path)
        if filename.endswith("betas.txt")
    )

    # An empty list is reported as None, which callers treat as "no data".
    return dna_methylation_files or None

def extract_clinical_data(clinical_folder_path):
    """
    Locate the clinical XML file in a folder and extract key patient fields from it.

    The first directory entry whose name contains "clinical." and ends with ".xml" is parsed.
    Patient demographics, AJCC staging information and pathologic T/N/M categories are read
    from the ``coad:patient`` element. Any field that is absent from the XML is stored as None.

    :param clinical_folder_path: Path to the folder containing clinical data files.
    :return: JSON string (indent=4) with the extracted clinical data, or None if no clinical
             XML file is found or the file has no ``coad:patient`` element.
    """
    xml_file_path = None
    for filename in os.listdir(clinical_folder_path):
        if "clinical." in filename and filename.endswith(".xml"):
            xml_file_path = os.path.join(clinical_folder_path, filename)
            break

    if xml_file_path is None:
        print("No clinical XML file found.")
        return None

    root = ET.parse(xml_file_path).getroot()

    # XML namespace definition
    namespaces = {
        'coad': 'http://tcga.nci/bcr/xml/clinical/coad/2.7',
        'admin': 'http://tcga.nci/bcr/xml/administration/2.7',
        'clin_shared': 'http://tcga.nci/bcr/xml/clinical/shared/2.7',
        'shared': 'http://tcga.nci/bcr/xml/shared/2.7',
        'shared_stage': 'http://tcga.nci/bcr/xml/clinical/shared/stage/2.7',
        'coad_read_shared': 'http://tcga.nci/bcr/xml/clinical/shared/coad_read/2.7',
        'coad_nte': 'http://tcga.nci/bcr/xml/clinical/coad/shared/new_tumor_event/2.7/1.0',
        'nte': 'http://tcga.nci/bcr/xml/clinical/shared/new_tumor_event/2.7',
        'rx': 'http://tcga.nci/bcr/xml/clinical/pharmaceutical/2.7',
        'rad': 'http://tcga.nci/bcr/xml/clinical/radiation/2.7'
    }

    patient = root.find('coad:patient', namespaces)
    if patient is None:
        # Without a patient element there is nothing to extract; report it like a missing file
        # so the caller can flag the case as having no clinical data instead of crashing.
        print("No coad:patient element found in clinical XML file.")
        return None

    # Staging nodes are optional: a missing stage_event also implies missing TNM categories.
    stage_event = patient.find('shared_stage:stage_event', namespaces)
    tnm = None
    if stage_event is not None:
        tnm = stage_event.find(
            'shared_stage:tnm_categories/shared_stage:pathologic_categories', namespaces
        )

    # (output key, element to search in, path below that element), in output order.
    fields = [
        # Key patient data
        ('patient_id', patient, 'shared:patient_id'),
        ('bcr_patient_barcode', patient, 'shared:bcr_patient_barcode'),
        ('gender', patient, 'shared:gender'),
        ('tumor_tissue_site', patient, 'clin_shared:tumor_tissue_site'),
        ('days_to_birth', patient, 'clin_shared:days_to_birth'),
        ('days_to_last_followup', patient, 'clin_shared:days_to_last_followup'),
        ('days_to_death', patient, 'clin_shared:days_to_death'),
        ('histological_type', patient, 'shared:histological_type'),
        # AJCC staging information
        ('ajcc_stage_version', stage_event, 'shared_stage:system_version'),
        ('pathologic_stage', stage_event, 'shared_stage:pathologic_stage'),
        # TNM categories (T, N, M)
        ('pathologic_T', tnm, 'shared_stage:pathologic_T'),
        ('pathologic_N', tnm, 'shared_stage:pathologic_N'),
        ('pathologic_M', tnm, 'shared_stage:pathologic_M'),
    ]

    clinical_data = {}
    for key, parent, path in fields:
        # Guard here so a missing optional parent yields None rather than an AttributeError.
        if parent is None:
            clinical_data[key] = None
        else:
            clinical_data[key] = get_text_element(parent, path, namespaces)

    return json.dumps(clinical_data, indent=4)

def extract_biospecimen_data(biospecimen_folder_path, slide_files):
    """
    Locate the biospecimen XML file in a folder and extract per-sample slide information.

    The first directory entry whose name contains "biospecimen." and ends with ".xml" is parsed.
    For every sample, only slides whose ``image_file_name`` appears in ``slide_files`` are kept,
    and samples without any such slide are left out of the result.

    :param biospecimen_folder_path: Path to the folder containing biospecimen data files.
    :param slide_files: Names of the slide image files that are available for this case.
    :return: JSON string (indent=4) with a list of samples and their matching slides,
             or None if no biospecimen XML file is found.
    """
    xml_file_path = None
    for filename in os.listdir(biospecimen_folder_path):
        if "biospecimen." in filename and filename.endswith(".xml"):
            xml_file_path = os.path.join(biospecimen_folder_path, filename)
            break

    if xml_file_path is None:
        print("No biospecimen XML file found.")
        return None

    # XML namespace definitions
    namespaces = {
        'bio': 'http://tcga.nci/bcr/xml/biospecimen/2.7',
        'admin': 'http://tcga.nci/bcr/xml/administration/2.7',
        'shared': 'http://tcga.nci/bcr/xml/shared/2.7'
    }

    # Build the lookup once so each slide is checked in constant time.
    available_slides = set(slide_files)

    root = ET.parse(xml_file_path).getroot()

    biospecimen_data = []
    for sample in root.findall("bio:patient/bio:samples/bio:sample", namespaces):
        slides = []

        # Slides live below the sample's portions.
        for slide in sample.findall("bio:portions/bio:portion/bio:slides/bio:slide", namespaces):
            slide_image_name = get_text_element(slide, 'shared:image_file_name', namespaces)
            if slide_image_name not in available_slides:
                continue

            slides.append({
                "slide_barcode": get_text_element(slide, 'shared:bcr_slide_barcode', namespaces),
                "slide_uuid": get_text_element(slide, 'shared:bcr_slide_uuid', namespaces),
                "image_file_name": slide_image_name,
                "section_location": get_text_element(slide, 'bio:section_location', namespaces),
                "percent_tumor_cells": get_text_element(slide, 'bio:percent_tumor_cells', namespaces),
                "percent_tumor_nuclei": get_text_element(slide, 'bio:percent_tumor_nuclei', namespaces),
                "percent_normal_cells": get_text_element(slide, 'bio:percent_normal_cells', namespaces),
                "percent_necrosis": get_text_element(slide, 'bio:percent_necrosis', namespaces),
                "percent_stromal_cells": get_text_element(slide, 'bio:percent_stromal_cells', namespaces),
            })

        # Only samples with at least one available slide image are reported.
        if slides:
            biospecimen_data.append({
                "sample_type": get_text_element(sample, 'bio:sample_type', namespaces),
                "sample_barcode": get_text_element(sample, 'bio:bcr_sample_barcode', namespaces),
                "sample_uuid": get_text_element(sample, 'bio:bcr_sample_uuid', namespaces),
                "slides": slides,
            })

    return json.dumps(biospecimen_data, indent=4)


In [15]:
def aggregate_case_data(cases_folder="./cases", biospecimen_folder="Biospecimen", clinical_folder="Clinical", dna_methylation_folder="DNA Methylation"):
    """
    Build one aggregated JSON file per case folder.

    For every sub-directory of ``cases_folder`` this function checks for the Biospecimen,
    Clinical and DNA Methylation folders, extracts whatever data is available and writes
    the result to ``<case>/aggregated_data/<case_id>_data.json``. Each section carries a
    ``has_data`` flag that stays False when its folder is missing, is not a directory,
    or yields no data.

    :param cases_folder: Path to the folder containing all case subfolders (case_001, case_002, etc.).
    :param biospecimen_folder: Folder name where Biospecimen data resides.
    :param clinical_folder: Folder name where Clinical data resides.
    :param dna_methylation_folder: Folder name where DNA Methylation data resides.
    :return: None. Results are written to disk and progress is printed.
    """
    # Sorted so that processing order (and the printed log) is reproducible.
    for case_id in sorted(os.listdir(cases_folder)):
        case_folder = os.path.join(cases_folder, case_id)
        if not os.path.isdir(case_folder):
            continue

        print(f"Processing {case_id}...")

        # Every section starts as "no data" and is only upgraded when extraction succeeds.
        case_data = {
            "case_id": case_id,
            "biospecimen": {"has_data": False},
            "clinical": {"has_data": False},
            "methylation": {"has_data": False}
        }

        # isdir (not exists) so that a stray file with the folder's name is treated as missing
        # instead of making os.listdir fail inside the extractors.
        dna_methylation_path = os.path.join(case_folder, dna_methylation_folder)
        if os.path.isdir(dna_methylation_path):
            dna_methylation_filenames = extract_dna_methylation_filenames(dna_methylation_path)
            if dna_methylation_filenames:
                case_data["methylation"]["has_data"] = True
                case_data["methylation"]["dna_methylation_filename"] = dna_methylation_filenames

        clinical_path = os.path.join(case_folder, clinical_folder)
        if os.path.isdir(clinical_path):
            clinical_json = extract_clinical_data(clinical_path)
            if clinical_json:
                case_data["clinical"]["has_data"] = True
                case_data["clinical"]["clinical_patient_data"] = json.loads(clinical_json)

        biospecimen_path = os.path.join(case_folder, biospecimen_folder)
        if os.path.isdir(biospecimen_path):
            # Biospecimen data is only extracted when at least one .svs slide image is present.
            slide_files = [f for f in os.listdir(biospecimen_path) if f.endswith('.svs')]
            if slide_files:
                biospecimen_json = extract_biospecimen_data(biospecimen_path, slide_files)
                if biospecimen_json:
                    case_data["biospecimen"]["has_data"] = True
                    case_data["biospecimen"]["biospecimen_data"] = json.loads(biospecimen_json)

        output_folder = os.path.join(case_folder, "aggregated_data")
        os.makedirs(output_folder, exist_ok=True)

        output_file = os.path.join(output_folder, f"{case_id}_data.json")
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(case_data, json_file, indent=4)

        print(f"Saved JSON for {case_id} in {output_file}")

In [16]:
# Aggregate every case folder under ./cases_TEST_TRAIN_100; one JSON file is written per case.
aggregate_case_data("./cases_TEST_TRAIN_100")

Processing 13eff2e5-e33a-485f-9ba4-8a7ccb3c7528...
Saved JSON for 13eff2e5-e33a-485f-9ba4-8a7ccb3c7528 in ./cases_TEST_TRAIN_100\13eff2e5-e33a-485f-9ba4-8a7ccb3c7528\aggregated_data\13eff2e5-e33a-485f-9ba4-8a7ccb3c7528_data.json
Processing 401a37f3-6630-4400-a811-a262351e37de...
Saved JSON for 401a37f3-6630-4400-a811-a262351e37de in ./cases_TEST_TRAIN_100\401a37f3-6630-4400-a811-a262351e37de\aggregated_data\401a37f3-6630-4400-a811-a262351e37de_data.json
Processing 4f786107-3cf5-4ab3-bba4-f399dee23f0e...
Saved JSON for 4f786107-3cf5-4ab3-bba4-f399dee23f0e in ./cases_TEST_TRAIN_100\4f786107-3cf5-4ab3-bba4-f399dee23f0e\aggregated_data\4f786107-3cf5-4ab3-bba4-f399dee23f0e_data.json
Processing 65bb7520-f055-43a8-b735-1152fa2c9e04...
Saved JSON for 65bb7520-f055-43a8-b735-1152fa2c9e04 in ./cases_TEST_TRAIN_100\65bb7520-f055-43a8-b735-1152fa2c9e04\aggregated_data\65bb7520-f055-43a8-b735-1152fa2c9e04_data.json
Processing 733d8b6a-ca9d-4a69-8c9c-1f88733e8b68...
Saved JSON for 733d8b6a-ca9d-4a69