In [11]:
import json
import pandas as pd
import re
from typing import Dict, List, Any, Optional, Tuple

In [4]:


def load_clinical_data(filepath: str) -> List[Dict[str, Any]]:
    """Load the clinical JSON export from disk.

    Args:
        filepath: Path to the JSON file to read.

    Returns:
        Whatever ``json.load`` parses from the file; callers in this file
        treat it as a list of case dictionaries.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so parsing does not depend on the
    # platform's locale default (e.g. cp1252 on Windows).
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)


def normalize_string(value: Any) -> str:
    """Return a lower-cased, whitespace-trimmed string form of ``value``.

    ``None`` and values that pandas reports as missing (e.g. NaN) map to the
    empty string so callers can do plain substring checks.
    """
    is_missing = value is None or pd.isna(value)
    if not is_missing:
        text = str(value)
        return text.lower().strip()
    return ""


def extract_percentage(percent_str: Any) -> Optional[float]:
    """Convert a percentage description such as '<10%' or '50-59%' to a number.

    Known range labels map to a fixed representative value (the range
    midpoint; '>100%' is capped at 100). Any other text falls back to the
    first number found in it.

    Returns:
        The representative percentage, or ``None`` when the input is missing
        or contains no number (a warning is printed in the latter case).
    """
    if percent_str is None or pd.isna(percent_str):
        return None

    text = normalize_string(percent_str)

    # Ordered table: the first entry with a marker contained in the text wins,
    # so the order below is significant.
    range_table = (
        (("<10%",), 5.0),
        (("1-10%",), 5.5),
        (("10-19%",), 14.5),
        (("11-25%",), 18.0),
        (("20-29%",), 24.5),
        (("26-49%",), 37.5),
        (("30-39%",), 34.5),
        (("40-49%",), 44.5),
        (("50-59%",), 54.5),
        (("60-69%",), 64.5),
        (("70-79%",), 74.5),
        (("80-89%",), 84.5),
        (("90-100%", ">90%"), 95.0),
        ((">100%",), 100.0),
    )
    for markers, representative in range_table:
        for marker in markers:
            if marker in text:
                return representative

    # No known label matched: take the first numeric token, if any.
    import re

    first_number = re.search(r"\d+\.?\d*", text)
    if first_number is not None:
        return float(first_number.group(0))

    # Unparseable input is reported so it can be inspected later.
    print(f"Warning: Could not parse percentage string: '{percent_str}'")
    return None


def extract_molecular_tests(case: Dict[str, Any]) -> Dict[str, Any]:
    """Collect ER/PR/HER2 molecular test results from a case's follow-ups.

    Walks every ``molecular_tests`` entry under ``case["follow_ups"]`` and
    records results by gene symbol: ESR1 (ER), PGR (PR) and ERBB2 (HER2, split
    by IHC vs. FISH method). When several tests exist for the same receptor,
    later entries overwrite earlier ones.

    Args:
        case: A single case dictionary. Missing or ``None`` ``follow_ups`` /
            ``molecular_tests`` entries are treated as empty.

    Returns:
        A dictionary with the keys ``er_status``, ``er_percentage``,
        ``pr_status``, ``pr_percentage``, ``her2_ihc_status``,
        ``her2_ihc_intensity``, ``her2_fish_status`` and ``her2_fish_value``;
        each is ``None`` when no matching test supplied it.
    """
    results = {
        "er_status": None,
        "er_percentage": None,
        "pr_status": None,
        "pr_percentage": None,
        "her2_ihc_status": None,
        "her2_ihc_intensity": None,
        "her2_fish_status": None,
        "her2_fish_value": None,
    }

    # `or []` / `or ""` guard against keys that are present but null in the
    # JSON, which `.get(key, default)` alone does not cover.
    for followup in case.get("follow_ups") or []:
        for test in followup.get("molecular_tests") or []:
            gene_symbol = str(test.get("gene_symbol") or "").upper()
            test_method = str(test.get("molecular_analysis_method") or "").upper()
            test_result = test.get("test_result", "")
            test_value_range = test.get("test_value_range")
            test_value = test.get("test_value")
            staining_intensity = test.get("staining_intensity_value")

            if gene_symbol == "ESR1":  # Estrogen Receptor
                results["er_status"] = test_result
                if test_value_range:
                    results["er_percentage"] = test_value_range

            elif gene_symbol == "PGR":  # Progesterone Receptor
                results["pr_status"] = test_result
                if test_value_range:
                    results["pr_percentage"] = test_value_range

            elif gene_symbol == "ERBB2":  # HER2
                if test_method == "IHC":
                    results["her2_ihc_status"] = test_result
                    # Explicit check rather than truthiness: a numeric
                    # intensity of 0 is a real score and must be kept.
                    if staining_intensity is not None and staining_intensity != "":
                        results["her2_ihc_intensity"] = staining_intensity
                elif test_method == "FISH":
                    results["her2_fish_status"] = test_result
                    if test_value is not None:
                        results["her2_fish_value"] = test_value

    return results


def is_receptor_negative(
    status: Any, percentage: Any = None, threshold: float = 10.0
) -> Optional[bool]:
    """Classify a hormone receptor result as negative, positive or unknown.

    A parseable percentage takes precedence over the textual status: the
    receptor is negative when the percentage is at or below ``threshold``.
    Otherwise the status text is searched for 'negative' / 'positive'.

    Returns:
        ``True`` if negative, ``False`` if positive, ``None`` if neither the
        percentage nor the status is conclusive.
    """
    label = normalize_string(status)

    # Percentage, when present and parseable, decides on its own.
    parsed = None if percentage is None else extract_percentage(percentage)
    if parsed is not None:
        return parsed <= threshold

    # Otherwise rely on the reported status wording.
    if "negative" in label:
        return True
    if "positive" in label:
        return False
    return None


def _her2_fish_is_negative(fish_status: Any, fish_value: Any) -> Optional[bool]:
    """Interpret HER2 FISH data on its own.

    The textual status is consulted first ('negative' / 'positive'); if it is
    absent or says neither, the numeric value is used, with a value below 2
    counting as negative.

    Returns:
        ``True`` if negative, ``False`` if positive, ``None`` when neither the
        status nor the value is usable.
    """
    if fish_status is not None:
        fish_norm = normalize_string(fish_status)
        if "negative" in fish_norm:
            return True
        if "positive" in fish_norm:
            return False

    if fish_value is not None:
        try:
            ratio = float(fish_value)
        except (ValueError, TypeError):
            return None
        # NaN compares False against everything; treat it as "no value"
        # instead of letting `nan < 2.0` silently read as HER2 positive.
        if ratio == ratio:
            return ratio < 2.0

    return None


def is_her2_negative(
    ihc_status: Any,
    fish_status: Any = None,
    fish_value: Any = None,
    ihc_intensity: Any = None,
) -> Optional[bool]:
    """
    Check if HER2 is negative based on paper criteria:
    HER2 IHC 0-1+ or FISH < 2 if IHC 2+

    The IHC intensity score is used first when present: 0 / 1+ is negative,
    3+ is positive, and 2+ is decided by the FISH data. Any other intensity
    (or none) falls back to the IHC status text; a 'positive' status can
    still be overridden by FISH data, and is taken as positive without it.

    Args:
        ihc_status: Reported IHC result text.
        fish_status: Reported FISH result text, if any.
        fish_value: Reported FISH numeric value, if any.
        ihc_intensity: IHC staining intensity score, if any.

    Returns:
        ``True`` if HER2 negative, ``False`` if HER2 positive, ``None`` if
        indeterminate.
    """
    ihc_norm = normalize_string(ihc_status)
    # Normalize unconditionally so a numeric score of 0 is not skipped by a
    # truthiness check; None normalizes to "".
    intensity_norm = normalize_string(ihc_intensity)

    # Intensity score is the most specific signal.
    if intensity_norm in ("0", "1+"):
        return True  # HER2 negative
    if intensity_norm == "3+":
        return False  # HER2 positive
    if intensity_norm == "2+":
        # IHC 2+ requires FISH confirmation; without it the case stays
        # indeterminate.
        return _her2_fish_is_negative(fish_status, fish_value)

    # Fallback to the status text when the intensity score is not decisive.
    if "negative" in ihc_norm:
        return True
    if "positive" in ihc_norm:
        # FISH data (status or value) may still overrule the IHC status;
        # with no usable FISH data, assume positive.
        fish_negative = _her2_fish_is_negative(fish_status, fish_value)
        return False if fish_negative is None else fish_negative

    return None  # Indeterminate


def is_stage_ii_iii(stage: Any) -> bool:
    """Return True when the stage text names stage II or III.

    Matches 'stage' followed by ii / iii / 2 / 3 with an optional a, b or c
    sub-stage suffix, compared on the normalized (lower-cased) text.
    """
    normalized = normalize_string(stage)
    found = re.search(r"stage\s*(ii|iii|2|3)([abc]?)\b", normalized)
    return found is not None


def _is_primary_diagnosis(diagnosis: Dict[str, Any]) -> bool:
    """Return True when a diagnosis entry is flagged as the primary disease.

    Accepts both a real boolean (what ``json.load`` yields for JSON ``true``)
    and the string form 'true', so the check works with either encoding.
    """
    flag = diagnosis.get("diagnosis_is_primary_disease")
    if isinstance(flag, bool):
        return flag
    return isinstance(flag, str) and flag.strip().lower() == "true"


def filter_tnbc_cohort(
    data: List[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], Dict[str, int], List[Dict[str, Any]]]:
    """
    Filter cases to create Stage II-III TNBC cohort using molecular test data from JSON

    A case is kept when ER, PR and HER2 are all classified negative and its
    pathologic stage is II or III. ER/PR use a 10% threshold.

    Args:
        data: List of case dictionaries.

    Returns:
        A tuple ``(filtered_cases, stats, debug_cases)``:
        * ``filtered_cases``: one summary dict per case in the final cohort.
        * ``stats``: counters collected while filtering.
        * ``debug_cases``: per-case details for every Stage II-III case that
          has at least one molecular value, including an ``is_tnbc`` flag.
    """
    filtered_cases = []
    stats = {
        "total_cases": len(data),
        "stage_ii_iii": 0,
        "er_negative": 0,
        "pr_negative": 0,
        "her2_negative": 0,
        "tnbc_any_stage": 0,
        "final_tnbc_stage_ii_iii": 0,
        "missing_er": 0,
        "missing_pr": 0,
        "missing_her2": 0,
        "cases_with_molecular_data": 0,
        "er_positive_over_10": 0,
        "pr_positive_over_10": 0,
        "her2_positive": 0,
        "stage_ii_iii_with_molecular": 0,
    }

    debug_cases = []  # Store details for analysis

    for case in data:
        case_id = case.get("case_id")
        submitter_id = case.get("submitter_id")

        # Extract stage from diagnoses
        stage = None
        primary_diagnosis = None
        diagnoses = case.get("diagnoses")
        if diagnoses:
            # Use first primary diagnosis
            for diagnosis in diagnoses:
                if _is_primary_diagnosis(diagnosis):
                    stage = diagnosis.get("ajcc_pathologic_stage")
                    primary_diagnosis = diagnosis.get("primary_diagnosis")
                    break
            # Fallback to first diagnosis if no primary found (or the primary
            # entry carries no stage)
            if not stage:
                stage = diagnoses[0].get("ajcc_pathologic_stage")
                primary_diagnosis = diagnoses[0].get("primary_diagnosis")

        # Extract molecular test results
        molecular_data = extract_molecular_tests(case)

        # Check if we have any molecular data
        has_molecular_data = any(v is not None for v in molecular_data.values())
        if has_molecular_data:
            stats["cases_with_molecular_data"] += 1

        # Apply filters
        stage_ok = is_stage_ii_iii(stage) if stage else False
        er_neg = is_receptor_negative(
            molecular_data["er_status"], molecular_data["er_percentage"], threshold=10.0
        )
        pr_neg = is_receptor_negative(
            molecular_data["pr_status"], molecular_data["pr_percentage"], threshold=10.0
        )
        her2_neg = is_her2_negative(
            molecular_data["her2_ihc_status"],
            molecular_data["her2_fish_status"],
            molecular_data["her2_fish_value"],
            molecular_data["her2_ihc_intensity"],
        )
        # Triple negative requires all three to be definitively negative;
        # an indeterminate (None) receptor disqualifies the case.
        is_tnbc = er_neg is True and pr_neg is True and her2_neg is True

        # For Stage II-III cases with molecular data, track more details
        if stage_ok and has_molecular_data:
            stats["stage_ii_iii_with_molecular"] += 1

            # Track why cases are excluded
            if er_neg is False:
                stats["er_positive_over_10"] += 1
            if pr_neg is False:
                stats["pr_positive_over_10"] += 1
            if her2_neg is False:
                stats["her2_positive"] += 1

            debug_cases.append(
                {
                    "submitter_id": submitter_id,
                    "stage": stage,
                    "er_status": molecular_data["er_status"],
                    "er_percentage": molecular_data["er_percentage"],
                    "er_neg": er_neg,
                    "pr_status": molecular_data["pr_status"],
                    "pr_percentage": molecular_data["pr_percentage"],
                    "pr_neg": pr_neg,
                    "her2_ihc": molecular_data["her2_ihc_status"],
                    "her2_ihc_intensity": molecular_data["her2_ihc_intensity"],
                    "her2_fish": molecular_data["her2_fish_status"],
                    "her2_fish_value": molecular_data["her2_fish_value"],
                    "her2_neg": her2_neg,
                    "is_tnbc": is_tnbc,
                }
            )

        # Track missing data
        if er_neg is None:
            stats["missing_er"] += 1
        if pr_neg is None:
            stats["missing_pr"] += 1
        if her2_neg is None:
            stats["missing_her2"] += 1

        # Update stats
        if stage_ok:
            stats["stage_ii_iii"] += 1
        if er_neg is True:
            stats["er_negative"] += 1
        if pr_neg is True:
            stats["pr_negative"] += 1
        if her2_neg is True:
            stats["her2_negative"] += 1
        if is_tnbc:
            stats["tnbc_any_stage"] += 1
            if stage_ok:
                stats["final_tnbc_stage_ii_iii"] += 1
                filtered_cases.append(
                    {
                        "case_id": case_id,
                        "submitter_id": submitter_id,
                        "stage": stage,
                        "er_status": molecular_data["er_status"],
                        "er_percentage": molecular_data["er_percentage"],
                        "pr_status": molecular_data["pr_status"],
                        "pr_percentage": molecular_data["pr_percentage"],
                        "her2_ihc_status": molecular_data["her2_ihc_status"],
                        "her2_ihc_intensity": molecular_data["her2_ihc_intensity"],
                        "her2_fish_status": molecular_data["her2_fish_status"],
                        "her2_fish_value": molecular_data["her2_fish_value"],
                        "primary_diagnosis": primary_diagnosis,
                        "primary_site": case.get("primary_site"),
                    }
                )

    return filtered_cases, stats, debug_cases


def main():
    """Run the TNBC cohort selection end to end.

    Loads ``clinical-tcga-brca.json``, filters it with ``filter_tnbc_cohort``,
    prints the filtering statistics, and, when the cohort is non-empty,
    reports duplicate submitter IDs and writes the cohort, ID lists and debug
    details into the ``ids/`` directory.
    """
    import os
    from collections import Counter

    # Load clinical data
    print("Loading clinical JSON data...")
    data = load_clinical_data("clinical-tcga-brca.json")

    print(f"Loaded {len(data)} cases from TCGA-BRCA")

    # Apply TNBC filtering
    print("\n" + "=" * 60)
    print("APPLYING TNBC FILTERING USING JSON MOLECULAR TEST DATA")
    print("=" * 60)

    cohort, stats, debug_cases = filter_tnbc_cohort(data)

    # Print detailed results
    print("\nFiltering Statistics:")
    print(f"  Total cases: {stats['total_cases']}")
    print(f"  Cases with molecular data: {stats['cases_with_molecular_data']}")
    print(f"  Stage II-III cases: {stats['stage_ii_iii']}")
    print(f"  Stage II-III with molecular data: {stats['stage_ii_iii_with_molecular']}")
    print(f"  ER negative (≤10%): {stats['er_negative']}")
    print(f"  PR negative (≤10%): {stats['pr_negative']}")
    print(f"  HER2 negative: {stats['her2_negative']}")
    print(f"  TNBC (any stage): {stats['tnbc_any_stage']}")
    print(f"  Final TNBC Stage II-III cohort: {stats['final_tnbc_stage_ii_iii']}")

    print("\nExclusion breakdown for Stage II-III cases with molecular data:")
    print(f"  ER positive (>10%): {stats['er_positive_over_10']}")
    print(f"  PR positive (>10%): {stats['pr_positive_over_10']}")
    print(f"  HER2 positive: {stats['her2_positive']}")

    print("\nMissing data:")
    print(f"  Missing ER status: {stats['missing_er']}")
    print(f"  Missing PR status: {stats['missing_pr']}")
    print(f"  Missing HER2 status: {stats['missing_her2']}")

    # Analyze Stage II-III cases with molecular data
    tnbc_count = sum(1 for case in debug_cases if case["is_tnbc"])
    print("\nStage II-III cases with molecular data breakdown:")
    print(f"  Total: {len(debug_cases)}")
    print(f"  TNBC: {tnbc_count}")
    print(f"  Non-TNBC: {len(debug_cases) - tnbc_count}")

    # Show cases whose classification is limited by an indeterminate receptor
    print("\nBorderline cases analysis:")
    receptor_labels = (
        ("er_neg", "Missing ER"),
        ("pr_neg", "Missing PR"),
        ("her2_neg", "Missing HER2"),
    )
    borderline_cases = []
    for case in debug_cases:
        reasons = [label for key, label in receptor_labels if case[key] is None]
        if reasons:
            borderline_cases.append((case["submitter_id"], reasons))

    print(f"  Cases with missing receptor data: {len(borderline_cases)}")
    if borderline_cases:
        print("  Examples:")
        for case_id, reasons in borderline_cases[:5]:
            print(f"    {case_id}: {', '.join(reasons)}")

    # Save results
    if cohort:
        # Check for duplicate submitter IDs
        submitter_ids = [
            case["submitter_id"] for case in cohort if case["submitter_id"]
        ]
        unique_submitter_ids = set(submitter_ids)
        duplicates_count = len(submitter_ids) - len(unique_submitter_ids)

        print("\n" + "=" * 60)
        print("DUPLICATE CHECK")
        print("=" * 60)
        print(f"  Total submitter IDs: {len(submitter_ids)}")
        print(f"  Unique submitter IDs: {len(unique_submitter_ids)}")
        print(f"  Duplicates found: {duplicates_count}")

        if duplicates_count > 0:
            # Find and report actual duplicates
            id_counts = Counter(submitter_ids)
            duplicated_ids = [
                id_val for id_val, count in id_counts.items() if count > 1
            ]
            print(f"  Duplicated IDs: {duplicated_ids}")
            for dup_id in duplicated_ids:
                print(f"    {dup_id}: appears {id_counts[dup_id]} times")

        # All outputs go under ids/; create it so a fresh checkout does not
        # fail with FileNotFoundError on the first open().
        os.makedirs("ids", exist_ok=True)

        # Save complete cohort data
        with open("ids/tcga_brca_tnbc_stage2-3_cohort_json_final.json", "w") as f:
            json.dump(cohort, f, indent=2)

        # Save case IDs (all, including any duplicates for reference)
        case_ids = [case["case_id"] for case in cohort if case["case_id"]]
        with open("ids/tcga_brca_tnbc_stage2-3_case_ids_json_final.txt", "w") as f:
            for case_id in case_ids:
                f.write(f"{case_id}\n")

        # Save UNIQUE submitter IDs only
        with open("ids/tcga_brca_tnbc_stage2-3_submitter_ids_json_final.txt", "w") as f:
            for submitter_id in sorted(unique_submitter_ids):
                f.write(f"{submitter_id}\n")

        # Save debug data for analysis
        with open("ids/tcga_brca_stage2-3_debug_analysis.json", "w") as f:
            json.dump(debug_cases, f, indent=2)

        print(f"Final cohort size: {len(cohort)} Stage II-III TNBC cases")
        print(f"Unique patients: {len(unique_submitter_ids)} (submitter IDs)")
        print("\nFiles saved:")
        print("  - tcga_brca_tnbc_stage2-3_cohort_json_final.json")
        print(
            f"  - tcga_brca_tnbc_stage2-3_case_ids_json_final.txt ({len(case_ids)} case IDs)"
        )
        print(
            f"  - tcga_brca_tnbc_stage2-3_submitter_ids_json_final.txt ({len(unique_submitter_ids)} unique submitter IDs)"
        )
        print("  - tcga_brca_stage2-3_debug_analysis.json (for detailed analysis)")
    else:
        print("\nNo cases found matching TNBC criteria!")
        print("This might indicate:")
        print("- Very strict filtering criteria")
        print("- Missing molecular test data")
        print("- Need to check data extraction logic")

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Loading clinical JSON data...
Loaded 1098 cases from TCGA-BRCA

APPLYING TNBC FILTERING USING JSON MOLECULAR TEST DATA

Filtering Statistics:
  Total cases: 1098
  Cases with molecular data: 1049
  Stage II-III cases: 870
  Stage II-III with molecular data: 831
  ER negative (≤10%): 259
  PR negative (≤10%): 398
  HER2 negative: 725
  TNBC (any stage): 160
  Final TNBC Stage II-III cohort: 131

Exclusion breakdown for Stage II-III cases with molecular data:
  ER positive (>10%): 617
  PR positive (>10%): 498
  HER2 positive: 141

Missing data:
  Missing ER status: 51
  Missing PR status: 55
  Missing HER2 status: 211

Stage II-III cases with molecular data breakdown:
  Total: 831
  TNBC: 131
  Non-TNBC: 700

Borderline cases analysis:
  Cases with missing receptor data: 127
  Examples:
    TCGA-B6-A0IQ: Missing HER2
    TCGA-B6-A0I6: Missing HER2
    TCGA-B6-A0IK: Missing HER2
    TCGA-B6-A0IO: Missing PR, Missing HER2
    TCGA-E2-A572: Missing HER2

DUPLICATE CHECK
  Total submitter I

In [6]:
# Reload the cohort file written by main() as a DataFrame and show its
# (rows, columns) shape.
foo = pd.read_json("ids/tcga_brca_tnbc_stage2-3_cohort_json_final.json")
foo.shape

(131, 13)

In [12]:
# Load a cohort JSON file and preview its first rows.
# NOTE(review): this path differs from the cohort file main() writes
# (ids/tcga_brca_tnbc_stage2-3_cohort_json_final.json) — confirm which file is intended.
foo = pd.read_json("ids/tcga_cohort.json")
foo.head()

Unnamed: 0,case_id,submitter_id,stage,er_status,er_percentage,pr_status,pr_percentage,her2_ihc_status,her2_ihc_intensity,her2_fish_status,her2_fish_value,primary_diagnosis,primary_site
0,01674b2c-5cf2-478f-84a1-f69c39f47bd4,TCGA-EW-A1P7,Stage IIA,Negative,<10%,Negative,<10%,Equivocal,2+,Test Value Reported,1.8,"Infiltrating duct carcinoma, NOS",Breast
1,016caf42-4e19-4444-ab5d-6cf1e76c4afa,TCGA-AO-A128,Stage IIA,Negative,<10%,Negative,<10%,Negative,0,Test Value Reported,1.0,"Infiltrating duct carcinoma, NOS",Breast
2,029ce650-5e5a-4100-8596-cd94300e7ef5,TCGA-E2-A574,Stage IIA,Negative,,Negative,,Equivocal,2+,Copy Number Reported,1.0,"Infiltrating duct carcinoma, NOS",Breast
3,05506f4c-e701-4a9d-ae06-97f066aade43,TCGA-AN-A0AT,Stage IIA,Negative,,Negative,,Negative,,,,"Infiltrating duct carcinoma, NOS",Breast
4,0dca98b0-f43e-45b6-9a02-00092c78678c,TCGA-D8-A27H,Stage IIA,Negative,,Negative,,Negative,,,,"Infiltrating duct carcinoma, NOS",Breast


In [10]:
# Load the raw clinical JSON into a DataFrame indexed by case_id and preview
# the first rows.
clinical_data_raw = pd.read_json("./clinical-tcga-brca.json").set_index("case_id")
clinical_data_raw.head()

Unnamed: 0_level_0,disease_type,project,submitter_id,days_to_consent,diagnoses,consent_type,demographic,primary_site,updated_datetime,follow_ups,index_date,state,lost_to_followup,exposures
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
001cef41-ff86-4d3f-a140-a647ac4b10a1,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},TCGA-E2-A1IU,-34.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': 'd14426b2-e0a0-519a-bea6-4f...,Breast,2025-01-06T00:20:17.681998-06:00,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,
0045349c-69d9-4306-a403-c9c1fa836644,Adenomas and Adenocarcinomas,{'project_id': 'TCGA-BRCA'},TCGA-A1-A0SB,76.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': 'fa693617-eab9-502c-9a16-0e...,Breast,2025-01-05T21:27:11.301925-06:00,"[{'timepoint_category': 'Last Contact', 'follo...",Diagnosis,released,,
00807dae-9f4a-4fd1-aac2-82eb11bf2afb,Adnexal and Skin Appendage Neoplasms,{'project_id': 'TCGA-BRCA'},TCGA-A2-A04W,19.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '6562e53d-99fd-54ed-b8e1-51...,Breast,2025-01-06T07:31:06.617417-06:00,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,No,
00a2d166-78c9-4687-a195-3d6315c27574,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},TCGA-AN-A0AM,0.0,"[{'synchronous_malignancy': 'No', 'ajcc_pathol...",Informed Consent,{'demographic_id': '7366952a-e8e7-56ec-9867-23...,Breast,2025-01-05T22:52:03.042081-06:00,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,Yes,
00b11ca8-8540-4a3d-b602-ec754b00230b,Ductal and Lobular Neoplasms,{'project_id': 'TCGA-BRCA'},TCGA-LL-A440,11.0,"[{'ajcc_pathologic_t': 'Tis (DCIS)', 'morpholo...",Informed Consent,{'demographic_id': 'f5229922-62e2-51d5-ba4e-94...,Breast,2025-01-05T22:47:37.828549-06:00,"[{'timepoint_category': 'Follow-up', 'follow_u...",Diagnosis,released,No,
