# GDC Query Evaluation Framework

This notebook evaluates 30 queries across three complexity levels against the Genomic Data Commons (GDC) API:
- **Basic Discovery (Low Complexity)**: 10 queries (EV-L01 to EV-L10)
- **Entity Filtering (Medium Complexity)**: 10 queries (EV-M01 to EV-M10)  
- **Complex Cohorts (High Complexity)**: 10 queries (EV-H01 to EV-H10)

In [36]:
# Import Required Libraries

import requests
import json
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import time
from datetime import datetime

In [37]:

# GDC API Configuration
GDC_API_BASE = "https://api.gdc.cancer.gov"

results = {}

In [38]:
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def graphql_query(query, variables=None):
    """Execute GraphQL query against GDC"""
    url = f"{GDC_API_BASE}/v0/graphql"
    headers = {"Content-Type": "application/json"}
    payload = {"query": query}
    if variables:
        payload["variables"] = variables

    response = requests.post(url, json=payload, headers=headers)

    # Better error handling
    if response.status_code != 200:
        print(f"❌ GraphQL Error: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    result = response.json()
    if "errors" in result:
        print(f"❌ GraphQL Errors: {json.dumps(result['errors'], indent=2)}")
        return None

    return result


def rest_query(endpoint, params=None):
    """Execute REST API query against GDC"""
    url = f"{GDC_API_BASE}/{endpoint}"
    headers = {"Content-Type": "application/json"}

    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

In [39]:
# EV-L01: In the GDC database, list all available program names
def eval_L01():
    start = time.time()
    try:
        # Get programs through projects endpoint since programs endpoint doesn't exist
        result = rest_query("projects", {
            "size": "2000",
            "fields": "program.name"
        })
        
        # Extract unique program names
        programs = set()
        for project in result["data"]["hits"]:
            program_info = project.get("program", {})
            if isinstance(program_info, dict) and "name" in program_info:
                programs.add(program_info["name"])
            elif isinstance(program_info, list):
                for prog in program_info:
                    if isinstance(prog, dict) and "name" in prog:
                        programs.add(prog["name"])
        
        programs_list = sorted(list(programs))
        count = len(programs_list)
        
        print(f"✅ EV-L01: Found {count} programs")
        print(f"Programs: {', '.join(programs_list)}")
        
        results["EV-L01"] = {
            "status": "success",
            "result": f"{count} programs",
            "data": programs_list,
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L01 Failed: {e}")
        results["EV-L01"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L01()

✅ EV-L01: Found 22 programs
Programs: APOLLO, BEATAML1.0, CDDP_EAGLE, CGCI, CMI, CPTAC, CTSP, EXCEPTIONAL_RESPONDERS, FM, HCMI, MATCH, MMRF, MP2PRT, NCICCR, OHSU, ORGANOID, REBC, TARGET, TCGA, TRIO, VAREPOP, WCDT


In [40]:
# EV-L02: In the GDC database, count the total number of projects
def eval_L02():
    start = time.time()
    try:
        result = rest_query("projects", {"size": "0"})
        count = result["data"]["pagination"]["total"]
        
        print(f"✅ EV-L02: Found {count} projects")
        
        results["EV-L02"] = {
            "status": "success",
            "result": f"{count} projects",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L02 Failed: {e}")
        results["EV-L02"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L02()

✅ EV-L02: Found 86 projects


In [41]:
# EV-L03: In the GDC database, retrieve the primary sites represented across all projects
def eval_L03():
    start = time.time()
    try:
        result = rest_query("projects", {
            "size": "2000",
            "fields": "primary_site"
        })
        
        primary_sites = set()
        for project in result["data"]["hits"]:
            sites = project.get("primary_site", [])
            if isinstance(sites, list):
                for site in sites:
                    if site and site != "_missing":  # Exclude _missing values
                        primary_sites.add(site)
            elif sites and sites != "_missing":  # Exclude _missing values
                primary_sites.add(sites)
        
        count = len(primary_sites)
        sorted_sites = sorted(primary_sites)
        
        print(f"✅ EV-L03: Found {count} primary sites (excluding '_missing')")
        print(f"Sites: {'; '.join(sorted_sites[:10])}{'...' if count > 10 else ''}")
        
        results["EV-L03"] = {
            "status": "success",
            "result": f"{count} primary sites",
            "data": sorted_sites,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L03 Failed: {e}")
        results["EV-L03"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L03()

✅ EV-L03: Found 69 primary sites (excluding '_missing')
Sites: Accessory sinuses; Adrenal gland; Anus and anal canal; Base of tongue; Bladder; Bones, joints and articular cartilage of limbs; Bones, joints and articular cartilage of other and unspecified sites; Brain; Breast; Bronchus and lung...


In [42]:
# EV-L04: In the GDC database, list all data categories (e.g., Raw Sequencing Data, Transcriptome Profiling)
def eval_L04():
    start = time.time()
    try:
        result = rest_query("files", {
            "size": "0",
            "facets": "data_category"
        })
        
        categories = []
        for bucket in result["data"]["aggregations"]["data_category"]["buckets"]:
            categories.append(bucket["key"])
        
        count = len(categories)
        print(f"✅ EV-L04: Found {count} data categories")
        print(f"Categories: {'; '.join(sorted(categories))}")
        
        results["EV-L04"] = {
            "status": "success", 
            "result": f"{count} data categories",
            "data": sorted(categories),
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L04 Failed: {e}")
        results["EV-L04"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L04()

✅ EV-L04: Found 11 data categories
Categories: biospecimen; clinical; combined nucleotide variation; copy number variation; dna methylation; proteome profiling; sequencing reads; simple nucleotide variation; somatic structural variation; structural variation; transcriptome profiling


In [43]:
# EV-L05: In the GDC database, get all experimental strategies used
def eval_L05():
    start = time.time()
    try:
        result = rest_query("files", {
            "size": "0",
            "facets": "experimental_strategy"
        })
        
        strategies = []
        for bucket in result["data"]["aggregations"]["experimental_strategy"]["buckets"]:
            strategy = bucket["key"]
            if strategy and strategy != "_missing":  # Exclude _missing values
                strategies.append(strategy)
        
        count = len(strategies)
        print(f"✅ EV-L05: Found {count} experimental strategies (excluding '_missing')")
        print(f"Strategies: {'; '.join(sorted(strategies))}")
        
        results["EV-L05"] = {
            "status": "success",
            "result": f"{count} experimental strategies", 
            "data": sorted(strategies),
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L05 Failed: {e}")
        results["EV-L05"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L05()

✅ EV-L05: Found 13 experimental strategies (excluding '_missing')
Strategies: ATAC-Seq; Diagnostic Slide; Expression Array; Genotyping Array; Methylation Array; RNA-Seq; Reverse Phase Protein Array; Targeted Sequencing; Tissue Slide; WGS; WXS; miRNA-Seq; scRNA-Seq


In [44]:
# EV-L06: In the GDC database, list all file formats
def eval_L06():
    start = time.time()
    try:
        result = rest_query("files", {
            "size": "0", 
            "facets": "data_format"
        })
        
        formats = []
        for bucket in result["data"]["aggregations"]["data_format"]["buckets"]:
            formats.append(bucket["key"])
        
        count = len(formats)
        print(f"✅ EV-L06: Found {count} file formats")
        print(f"Formats: {'; '.join(sorted(formats))}")
        
        results["EV-L06"] = {
            "status": "success",
            "result": f"{count} file formats",
            "data": sorted(formats),
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L06 Failed: {e}")
        results["EV-L06"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L06()

✅ EV-L06: Found 22 file formats
Formats: bam; bcr auxiliary xml; bcr biotab; bcr omf xml; bcr pps xml; bcr ssf xml; bcr xml; bedpe; cdc json; cel; hdf5; idat; jpeg 2000; maf; mex; pdf; svs; tar; tsv; txt; vcf; xlsx


In [45]:
# EV-L07: In the GDC database, list annotation categories and classifications used in Annotations to flag QC issues
def eval_L07():
    start = time.time()
    try:
        result = rest_query("annotations", {
            "size": "0",
            "facets": "category,classification"
        })
        
        categories = []
        classifications = []
        
        for bucket in result["data"]["aggregations"]["category"]["buckets"]:
            categories.append(bucket["key"])
            
        for bucket in result["data"]["aggregations"]["classification"]["buckets"]:
            classifications.append(bucket["key"])
        
        print(f"✅ EV-L07: Found {len(categories)} annotation categories, {len(classifications)} classifications")
        print(f"Categories: {'; '.join(categories)}")
        print(f"Classifications: {'; '.join(classifications)}")
        
        results["EV-L07"] = {
            "status": "success",
            "result": f"{len(categories)} categories, {len(classifications)} classifications",
            "data": {"categories": categories, "classifications": classifications},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L07 Failed: {e}")
        results["EV-L07"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L07()

✅ EV-L07: Found 35 annotation categories, 4 classifications
Categories: general; item is noncanonical; item flagged dnu; prior malignancy; alternate sample pipeline; center qc failed; history of unacceptable prior treatment related to a prior/other malignancy; item in special subset; synchronous malignancy; neoadjuvant therapy; genotype mismatch; bcr notification; history of acceptable prior treatment related to a prior/other malignancy; item does not meet study protocol; item flagged low quality; item may not meet study protocol; case submitted is found to be a recurrence after submission; permanently missing item or object; duplicate case; subject identity unknown; molecular analysis outside specification; pathology outside specification; barcode incorrect; acceptable treatment for tcga tumor; biospecimen identity unknown; subject withdrew consent; qualification metrics changed; inadvertently shipped; qualified in error; normal tissue origin incorrect; normal class but appears diseas

In [46]:
# EV-L08: In the GDC database, list all the available disease types
def eval_L08():
    start = time.time()
    try:
        result = rest_query("cases", {
            "size": "0",
            "facets": "disease_type"
        })
        
        disease_types = []
        for bucket in result["data"]["aggregations"]["disease_type"]["buckets"]:
            disease_type = bucket["key"]
            if disease_type and disease_type != "_missing":  # Exclude _missing values
                disease_types.append(disease_type)
        
        count = len(disease_types)
        print(f"✅ EV-L08: Found {count} disease types")
        print(f"Disease types: {'; '.join(sorted(disease_types)[:10])}{'...' if count > 10 else ''}")
        
        results["EV-L08"] = {
            "status": "success",
            "result": f"{count} disease types",
            "data": sorted(disease_types),
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L08 Failed: {e}")
        results["EV-L08"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L08()

✅ EV-L08: Found 46 disease types
Disease types: acinar cell neoplasms; acute lymphoblastic leukemia; adenomas and adenocarcinomas; adnexal and skin appendage neoplasms; basal cell neoplasms; blood vessel tumors; chronic myeloproliferative disorders; complex epithelial neoplasms; complex mixed and stromal neoplasms; cystic, mucinous and serous neoplasms...


In [47]:
# EV-L09: In the GDC database, list the available ethnicity categories
def eval_L09():
    start = time.time()
    try:
        result = rest_query("cases", {
            "size": "0",
            "facets": "demographic.ethnicity"
        })
        
        ethnicities = []
        for bucket in result["data"]["aggregations"]["demographic.ethnicity"]["buckets"]:
            ethnicity = bucket["key"]
            if ethnicity and ethnicity != "_missing":  # Exclude _missing values
                ethnicities.append(ethnicity)
        
        count = len(ethnicities)
        print(f"✅ EV-L09: Found {count} ethnicity categories")
        print(f"Ethnicities: {'; '.join(sorted(ethnicities))}")
        
        results["EV-L09"] = {
            "status": "success",
            "result": f"{count} ethnicity categories",
            "data": sorted(ethnicities),
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-L09 Failed: {e}")
        results["EV-L09"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L09()

✅ EV-L09: Found 4 ethnicity categories
Ethnicities: hispanic or latino; not hispanic or latino; not reported; unknown


In [48]:
# EV-L10: In the GDC database, what are the available platform types used for sequencing
def eval_L10():
    start = time.time()
    try:
        result = rest_query("files", {
            "size": "0",
            "facets": "platform"
        })
        
        platforms = []
        platform_counts = {}
        for bucket in result["data"]["aggregations"]["platform"]["buckets"]:
            platform = bucket["key"]
            count = bucket["doc_count"]
            if platform and platform != "_missing":  # Exclude _missing values
                platforms.append(platform)
                platform_counts[platform] = count
        
        total_platforms = len(platforms)
        print(f"✅ EV-L10: Found {total_platforms} platform types (excluding '_missing')")
        
        # Show top platforms by count
        sorted_platforms = sorted(platform_counts.items(), key=lambda x: x[1], reverse=True)
        for platform, count in sorted_platforms[:10]:
            print(f"  {platform}: {count:,}")
        
        results["EV-L10"] = {
            "status": "success",
            "result": f"{total_platforms} platform types",
            "data": {"platforms": platforms, "counts": platform_counts},
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-L10 Failed: {e}")
        results["EV-L10"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_L10()

✅ EV-L10: Found 10 platform types (excluding '_missing')
  illumina: 780,755
  affymetrix snp 6.0: 147,734
  illumina human methylation 450: 31,776
  illumina methylation epic: 9,489
  illumina human methylation 27: 9,435
  rppa: 7,906
  genechip u133a: 1,243
  complete genomics: 581
  illumina methylation epic v2: 213
  genechip u133 plus 2.0: 183


### ENTITY FILTERING QUERIES (Medium Complexity)

These queries apply specific filtering criteria to narrow down results within one or two entity types.

In [49]:
# EV-M01: In the GDC database, count the total number of RNA-Seq files across all projects
def eval_M01():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "experimental_strategy",
                "value": "RNA-Seq"
            }
        }
        
        result = rest_query("files", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        count = result["data"]["pagination"]["total"]
        print(f"✅ EV-M01: Found {count:,} RNA-Seq files")
        
        results["EV-M01"] = {
            "status": "success",
            "result": f"{count} RNA-Seq files",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M01 Failed: {e}")
        results["EV-M01"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M01()

✅ EV-M01: Found 227,556 RNA-Seq files


In [50]:
# EV-M02: In the GDC database, count male vs. female cases in TCGA-LUAD
def eval_M02():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "project.project_id",
                "value": "TCGA-LUAD"
            }
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "0",
            "facets": "demographic.gender"
        })
        
        gender_counts = {}
        for bucket in result["data"]["aggregations"]["demographic.gender"]["buckets"]:
            gender_counts[bucket["key"]] = bucket["doc_count"]
        
        females = gender_counts.get("female", 0)
        males = gender_counts.get("male", 0)
        
        print(f"✅ EV-M02: TCGA-LUAD gender distribution:")
        print(f"  Females: {females}")
        print(f"  Males: {males}")
        
        results["EV-M02"] = {
            "status": "success",
            "result": f"{females} females, {males} males",
            "data": gender_counts,
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-M02 Failed: {e}")
        results["EV-M02"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M02()

✅ EV-M02: TCGA-LUAD gender distribution:
  Females: 280
  Males: 242


In [51]:
# EV-M03: In the GDC database, list the top 5 diseases by case count
def eval_M03():
    start = time.time()
    try:
        result = rest_query("cases", {
            "size": "0",
            "facets": "disease_type"
        })
        
        # Get disease counts and sort by count
        disease_counts = []
        for bucket in result["data"]["aggregations"]["disease_type"]["buckets"]:
            disease_counts.append((bucket["key"], bucket["doc_count"]))
        
        # Sort by count (descending) and get top 5
        top_5_diseases = sorted(disease_counts, key=lambda x: x[1], reverse=True)[:5]
        
        print(f"✅ EV-M03: Top 5 diseases by case count:")
        for i, (disease, count) in enumerate(top_5_diseases, 1):
            print(f"  {i}. {disease}: {count:,} cases")
        
        results["EV-M03"] = {
            "status": "success",
            "result": f"Top 5 diseases by case count",
            "data": top_5_diseases,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M03 Failed: {e}")
        results["EV-M03"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M03()

✅ EV-M03: Top 5 diseases by case count:
  1. adenomas and adenocarcinomas: 14,549 cases
  2. ductal and lobular neoplasms: 3,642 cases
  3. myeloid leukemias: 3,465 cases
  4. epithelial neoplasms, nos: 3,287 cases
  5. squamous cell neoplasms: 3,112 cases


In [52]:
# EV-M04: In the GDC database, get the number of files linked to TARGET-AML
def eval_M04():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "cases.project.project_id",
                "value": "TARGET-AML"
            }
        }
        
        result = rest_query("files", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        count = result["data"]["pagination"]["total"]
        print(f"✅ EV-M04: Found {count:,} files linked to TARGET-AML")
        
        results["EV-M04"] = {
            "status": "success",
            "result": f"{count} files linked to TARGET-AML",
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-M04 Failed: {e}")
        results["EV-M04"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M04()

✅ EV-M04: Found 52,123 files linked to TARGET-AML


In [53]:
# EV-M05: In the GDC database, retrieve TCGA-BRCA cases diagnosed at Stage II
def eval_M05():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "project.project_id", "value": "TCGA-BRCA"}},
                {"op": "=", "content": {"field": "diagnoses.ajcc_pathologic_stage", "value": "Stage II"}}
            ]
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        count = result["data"]["pagination"]["total"]
        print(f"✅ EV-M05: Found {count} TCGA-BRCA cases diagnosed at Stage II")
        
        results["EV-M05"] = {
            "status": "success",
            "result": f"{count} TCGA-BRCA Stage II cases",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M05 Failed: {e}")
        results["EV-M05"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M05()

✅ EV-M05: Found 8 TCGA-BRCA cases diagnosed at Stage II


In [54]:
# EV-M06: In the GDC database, show the mean age at diagnosis for TCGA-COAD
def eval_M06():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "project.project_id",
                "value": "TCGA-COAD"
            }
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "2000",
            "fields": "diagnoses.age_at_diagnosis"
        })
        
        # Extract ages and calculate mean
        ages = []
        for case in result["data"]["hits"]:
            diagnoses = case.get("diagnoses", [])
            for diagnosis in diagnoses:
                age = diagnosis.get("age_at_diagnosis")
                if age is not None:
                    ages.append(age / 365.25)  # Convert from days to years
        
        if ages:
            mean_age = sum(ages) / len(ages)
            print(f"✅ EV-M06: Mean age at diagnosis for TCGA-COAD: {mean_age:.1f} years")
            print(f"  Based on {len(ages)} cases with age data")
            
            results["EV-M06"] = {
                "status": "success",
                "result": f"{mean_age:.1f} years mean age",
                "data": {"mean_age": mean_age, "sample_size": len(ages)},
                "time": time.time() - start
            }
        else:
            print(f"❌ EV-M06: No age data found for TCGA-COAD")
            results["EV-M06"] = {"status": "error", "error": "No age data found", "time": time.time() - start}
            
    except Exception as e:
        print(f"❌ EV-M06 Failed: {e}")
        results["EV-M06"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M06()

✅ EV-M06: Mean age at diagnosis for TCGA-COAD: 67.2 years
  Based on 568 cases with age data


In [55]:
# EV-M07: In the GDC database, list all projects that used RNA-Seq
def eval_M07():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "experimental_strategy",
                "value": "RNA-Seq"
            }
        }
        
        result = rest_query("files", {
            "filters": json.dumps(filters),
            "size": "0",
            "facets": "cases.project.project_id"
        })
        
        # Extract unique project IDs
        projects = []
        for bucket in result["data"]["aggregations"]["cases.project.project_id"]["buckets"]:
            projects.append(bucket["key"])
        
        count = len(projects)
        print(f"✅ EV-M07: Found {count} projects that used RNA-Seq")
        print(f"Projects: {', '.join(sorted(projects)[:10])}{'...' if count > 10 else ''}")
        
        results["EV-M07"] = {
            "status": "success",
            "result": f"{count} projects used RNA-Seq",
            "data": sorted(projects),
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M07 Failed: {e}")
        results["EV-M07"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M07()

✅ EV-M07: Found 82 projects that used RNA-Seq
Projects: APOLLO-LUAD, BEATAML1.0-COHORT, CDDP_EAGLE-1, CGCI-BLGSP, CGCI-HTMCP-CC, CGCI-HTMCP-DLBCL, CGCI-HTMCP-LC, CMI-ASC, CMI-MBC, CMI-MPC...


In [56]:
# EV-M08: In the GDC database, show the race distribution for TCGA-LIHC
def eval_M08():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "project.project_id",
                "value": "TCGA-LIHC"
            }
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "0",
            "facets": "demographic.race"
        })
        
        # Get race distribution
        race_counts = {}
        total_cases = 0
        for bucket in result["data"]["aggregations"]["demographic.race"]["buckets"]:
            race = bucket["key"]
            count = bucket["doc_count"]
            race_counts[race] = count
            total_cases += count
        
        print(f"✅ EV-M08: Race distribution for TCGA-LIHC ({total_cases} total cases):")
        for race, count in sorted(race_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_cases) * 100 if total_cases > 0 else 0
            print(f"  {race}: {count} ({percentage:.2f}%)")
        
        results["EV-M08"] = {
            "status": "success",
            "result": f"Race distribution for TCGA-LIHC",
            "data": race_counts,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M08 Failed: {e}")
        results["EV-M08"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M08()

✅ EV-M08: Race distribution for TCGA-LIHC (377 total cases):
  white: 187 (49.60%)
  asian: 161 (42.71%)
  black or african american: 17 (4.51%)
  not reported: 6 (1.59%)
  unknown: 4 (1.06%)
  american indian or alaska native: 2 (0.53%)


In [57]:
# EV-M09: In the GDC database, count WGS files > 50 GB for TCGA-GBM
def eval_M09():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "cases.project.project_id", "value": "TCGA-GBM"}},
                {"op": "=", "content": {"field": "experimental_strategy", "value": "WGS"}},
                {"op": ">", "content": {"field": "file_size", "value": 53687091200}}  # 50 GB in bytes
            ]
        }

        result = rest_query("files", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        count = result["data"]["pagination"]["total"]
        print(f"✅ EV-M09: Found {count} WGS files > 50GB for TCGA-GBM")
        
        results["EV-M09"] = {
            "status": "success",
            "result": f"{count} WGS files > 50GB",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M09 Failed: {e}")
        results["EV-M09"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M09()

✅ EV-M09: Found 798 WGS files > 50GB for TCGA-GBM


In [58]:
# EV-M10: In the GDC database, count cases with days_to_death < 1000 in TCGA-OV
def eval_M10():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "project.project_id", "value": "TCGA-OV"}},
                {"op": "<", "content": {"field": "demographic.days_to_death", "value": 1000}}
            ]
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        count = result["data"]["pagination"]["total"]
        print(f"✅ EV-M10: Found {count} TCGA-OV cases with days_to_death < 1000")
        
        results["EV-M10"] = {
            "status": "success",
            "result": f"{count} TCGA-OV cases with days_to_death < 1000",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-M10 Failed: {e}")
        results["EV-M10"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_M10()

✅ EV-M10: Found 154 TCGA-OV cases with days_to_death < 1000


### COMPLEX COHORTS QUERIES (High Complexity)

These queries require multi-step reasoning, multiple entity relationships, or sophisticated filtering to define patient/sample cohorts.

In [59]:
# EV-H01: In the GDC database, list cases that have both WXS and RNA-Seq files
def eval_H01():
    start = time.time()
    try:
        # Find cases with WXS files - get actual case list instead of faceting
        wxs_filters = {
            "op": "=",
            "content": {
                "field": "files.experimental_strategy",
                "value": "WXS"
            }
        }
        
        # Function to get all case IDs for an experimental strategy with pagination
        def get_all_cases_for_strategy(strategy):
            filters = {
                "op": "=",
                "content": {
                    "field": "files.experimental_strategy",
                    "value": strategy
                }
            }
            
            # First get total count
            count_result = rest_query("cases", {
                "filters": json.dumps(filters),
                "size": "0"
            })
            total = count_result["data"]["pagination"]["total"]
            print(f"📊 Total {strategy} cases: {total:,}")
            
            # Get all cases (use pagination if needed)
            all_cases = set()
            size = 2000
            from_idx = 0
            
            while from_idx < total:
                result = rest_query("cases", {
                    "filters": json.dumps(filters),
                    "size": str(size),
                    "from": str(from_idx),
                    "fields": "submitter_id"
                })
                
                for case in result["data"]["hits"]:
                    all_cases.add(case["submitter_id"])
                
                from_idx += size
                
                # Break if we got fewer results than requested (end of data)
                if len(result["data"]["hits"]) < size:
                    break
            
            return all_cases, total
        
        # Get all WXS cases
        wxs_cases, wxs_total = get_all_cases_for_strategy("WXS")
        
        # Get all RNA-Seq cases
        rnaseq_cases, rnaseq_total = get_all_cases_for_strategy("RNA-Seq")
        
        # Find intersection
        both_cases = wxs_cases.intersection(rnaseq_cases)
        count = len(both_cases)
        
        print(f"✅ EV-H01: Found {count:,} cases with both WXS and RNA-Seq files")
        print(f"  WXS cases: {wxs_total:,}")
        print(f"  RNA-Seq cases: {rnaseq_total:,}")
        print(f"  Intersection (both): {count:,}")
        
        # Sample case IDs
        sample_cases = list(both_cases)[:10]
        if sample_cases:
            print(f"  Sample case IDs: {sample_cases}")
        
        results["EV-H01"] = {
            "status": "success",
            "result": f"{count} cases with both WXS and RNA-Seq",
            "data": {"both_count": count, "wxs_count": wxs_total, "rnaseq_count": rnaseq_total, "sample_cases": sample_cases},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H01 Failed: {e}")
        results["EV-H01"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H01()

📊 Total WXS cases: 18,809
📊 Total RNA-Seq cases: 21,640
📊 Total RNA-Seq cases: 21,640
✅ EV-H01: Found 16,943 cases with both WXS and RNA-Seq files
  WXS cases: 18,809
  RNA-Seq cases: 21,640
  Intersection (both): 16,943
  Sample case IDs: ['TCGA-G4-6295', 'TCGA-IG-A97H', 'TCGA-B6-A0I6', 'TCGA-AX-A3FS', 'TCGA-BH-A18M', 'TARGET-20-PARGVC', 'C3L-00603', '2552', 'TCGA-WC-A882', 'C3L-01061']
✅ EV-H01: Found 16,943 cases with both WXS and RNA-Seq files
  WXS cases: 18,809
  RNA-Seq cases: 21,640
  Intersection (both): 16,943
  Sample case IDs: ['TCGA-G4-6295', 'TCGA-IG-A97H', 'TCGA-B6-A0I6', 'TCGA-AX-A3FS', 'TCGA-BH-A18M', 'TARGET-20-PARGVC', 'C3L-00603', '2552', 'TCGA-WC-A882', 'C3L-01061']


In [60]:
# EV-H02: In the GDC database, show the distribution of years of smoking for TCGA-LUSC
def eval_H02():
    start = time.time()
    try:
        # Use GraphQL to get exposure data
        query = """
        query LUSCSmokingData($filters: FiltersArgument) {
          viewer {
            repository {
              cases {
                hits(first: 600, filters: $filters) {
                  edges {
                    node {
                      case_id
                      exposures {
                        hits {
                          edges {
                            node {
                              tobacco_smoking_onset_year
                              tobacco_smoking_quit_year
                            }
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
        """
        
        variables = {
            "filters": {
                "op": "=",
                "content": {
                    "field": "project.project_id",
                    "value": "TCGA-LUSC"
                }
            }
        }
        
        result = graphql_query(query, variables)
        if result:
            # Process exposure data
            smoking_years = []
            cases_processed = 0
            
            for case_edge in result["data"]["viewer"]["repository"]["cases"]["hits"]["edges"]:
                case_node = case_edge.get("node", {})
                exposures = case_node.get("exposures", {}).get("hits", {}).get("edges", [])
                
                for exp_edge in exposures:
                    exp = exp_edge.get("node", {})
                    onset = exp.get("tobacco_smoking_onset_year")
                    quit = exp.get("tobacco_smoking_quit_year")
                    
                    if onset and quit:
                        years = quit - onset
                        if years > 0:
                            smoking_years.append(years)
                
                cases_processed += 1
            
            print(f"✅ EV-H02: Processed {cases_processed} TCGA-LUSC cases")
            print(f"  Found {len(smoking_years)} valid smoking duration records")
            
            if smoking_years:
                # Create distribution bins
                import numpy as np
                bins = [0, 10, 20, 30, 40, 50, 100]
                hist, _ = np.histogram(smoking_years, bins=bins)
                
                print(f"  Smoking years distribution:")
                for i in range(len(bins)-1):
                    print(f"    {bins[i]}-{bins[i+1]} years: {hist[i]} cases")
            
            results["EV-H02"] = {
                "status": "success",
                "result": f"Years of smoking distribution for TCGA-LUSC",
                "data": {"smoking_years": smoking_years, "cases_processed": cases_processed},
                "time": time.time() - start
            }
        else:
            results["EV-H02"] = {"status": "error", "error": "GraphQL query failed", "time": time.time() - start}
            
    except Exception as e:
        print(f"❌ EV-H02 Failed: {e}")
        results["EV-H02"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H02()

✅ EV-H02: Processed 504 TCGA-LUSC cases
  Found 224 valid smoking duration records
  Smoking years distribution:
    0-10 years: 2 cases
    10-20 years: 7 cases
    20-30 years: 34 cases
    30-40 years: 46 cases
    40-50 years: 69 cases
    50-100 years: 66 cases


In [61]:
# EV-H03: In the GDC database, count breast cancer female cases under age 40 with RNA-Seq files
def eval_H03():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "project.project_id", "value": "TCGA-BRCA"}},
                {"op": "=", "content": {"field": "demographic.gender", "value": "female"}},
                {"op": "<", "content": {"field": "diagnoses.age_at_diagnosis", "value": 14600}},  # 40 years in days
                {"op": "=", "content": {"field": "files.experimental_strategy", "value": "RNA-Seq"}}
            ]
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        count = result["data"]["pagination"]["total"]
        print(f"✅ EV-H03: Found {count} breast cancer female cases under 40 with RNA-Seq files")
        
        results["EV-H03"] = {
            "status": "success",
            "result": f"{count} breast cancer female cases under 40 with RNA-Seq",
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H03 Failed: {e}")
        results["EV-H03"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H03()

✅ EV-H03: Found 74 breast cancer female cases under 40 with RNA-Seq files


In [62]:
# EV-H04: In the GDC database, show case IDs and file IDs for LUAD patients with Stage III disease
def eval_H04():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "project.project_id", "value": "TCGA-LUAD"}},
                {"op": "=", "content": {"field": "diagnoses.ajcc_pathologic_stage", "value": "Stage III"}}
            ]
        }
        
        # Get cases with Stage III LUAD
        cases_result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "1000",
            "fields": "submitter_id,case_id,files.file_id"
        })
        
        stage_iii_cases = []
        all_file_ids = []
        
        for case in cases_result["data"]["hits"]:
            case_files = []
            files = case.get("files", [])
            
            for file_info in files:
                file_id = file_info.get("file_id")
                if file_id:
                    case_files.append(file_id)
                    all_file_ids.append(file_id)
            
            case_info = {
                "case_id": case["case_id"],
                "submitter_id": case["submitter_id"],
                "file_ids": case_files,
                "file_count": len(case_files)
            }
            stage_iii_cases.append(case_info)
        
        cases_count = len(stage_iii_cases)
        total_files = len(all_file_ids)
        
        print(f"✅ EV-H04: Found {cases_count} LUAD Stage III cases with {total_files} files")
        
        if stage_iii_cases:
            print(f"  Sample cases:")
            for case in stage_iii_cases[:3]:
                print(f"    Case {case['submitter_id']}: {case['file_count']} files")
                if case['file_ids']:
                    print(f"      Sample file IDs: {case['file_ids'][:3]}")
        
        results["EV-H04"] = {
            "status": "success",
            "result": f"{cases_count} LUAD Stage III cases with {total_files} files",
            "data": {"cases": stage_iii_cases, "total_files": total_files},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H04 Failed: {e}")
        results["EV-H04"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H04()

✅ EV-H04: Found 1 LUAD Stage III cases with 76 files
  Sample cases:
    Case TCGA-95-7947: 76 files
      Sample file IDs: ['178c3efb-305d-41ce-8f8b-aab23a749d2a', '9ff005f2-ee8c-4a8a-bbb6-ccfea30a1cba', '076dbb19-20eb-48c8-ac5a-ba8184a275f7']


In [63]:
# EV-H05: In the GDC database, retrieve cases with both "Copy Number Variation" and "Simple Somatic Mutation"
def eval_H05():
    start = time.time()
    try:
        # Function to get all case IDs for a data category with pagination
        def get_all_cases_for_data_category(data_category):
            filters = {
                "op": "=",
                "content": {
                    "field": "files.data_category",
                    "value": data_category
                }
            }
            
            # First get total count
            count_result = rest_query("cases", {
                "filters": json.dumps(filters),
                "size": "0"
            })
            total = count_result["data"]["pagination"]["total"]
            print(f"📊 Total {data_category} cases: {total:,}")
            
            # Get all cases (use pagination if needed)
            all_cases = set()
            size = 2000
            from_idx = 0
            
            while from_idx < total:
                result = rest_query("cases", {
                    "filters": json.dumps(filters),
                    "size": str(size),
                    "from": str(from_idx),
                    "fields": "submitter_id"
                })
                
                for case in result["data"]["hits"]:
                    all_cases.add(case["submitter_id"])
                
                from_idx += size
                # Break if we got fewer results than requested (end of data)
                if len(result["data"]["hits"]) < size:
                    break
            
            return all_cases, total
        
        # Get all CNV cases
        cnv_cases, cnv_total = get_all_cases_for_data_category("Copy Number Variation")
        
        # Get all SSM cases
        ssm_cases, ssm_total = get_all_cases_for_data_category("Simple Nucleotide Variation")
        
        # Find intersection
        both_cases = cnv_cases.intersection(ssm_cases)
        both_count = len(both_cases)
        
        # Sample case IDs
        sample_cases = list(both_cases)[:10]
        
        print(f"✅ EV-H05: Found {both_count:,} cases with both CNV and SSM data")
        print(f"  CNV cases: {cnv_total:,}")
        print(f"  SSM cases: {ssm_total:,}")
        print(f"  Intersection (both): {both_count:,}")
        
        if sample_cases:
            print(f"  Sample case IDs: {sample_cases}")
        
        results["EV-H05"] = {
            "status": "success",
            "result": f"{both_count} cases with both CNV and SSM data",
            "data": {
                "both_count": both_count,
                "cnv_count": cnv_total,
                "ssm_count": ssm_total,
                "sample_cases": sample_cases
            },
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H05 Failed: {e}")
        results["EV-H05"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H05()

📊 Total Copy Number Variation cases: 17,377
📊 Total Simple Nucleotide Variation cases: 40,311
📊 Total Simple Nucleotide Variation cases: 40,311
✅ EV-H05: Found 17,263 cases with both CNV and SSM data
  CNV cases: 17,377
  SSM cases: 40,311
  Intersection (both): 17,263
  Sample case IDs: ['TCGA-G4-6295', 'TCGA-IG-A97H', 'TCGA-B6-A0I6', 'TCGA-AX-A3FS', 'TCGA-BH-A18M', 'TARGET-20-PARGVC', 'C3L-00603', 'TCGA-WC-A882', 'C3L-01061', 'TARGET-20-PAPZIZ']
✅ EV-H05: Found 17,263 cases with both CNV and SSM data
  CNV cases: 17,377
  SSM cases: 40,311
  Intersection (both): 17,263
  Sample case IDs: ['TCGA-G4-6295', 'TCGA-IG-A97H', 'TCGA-B6-A0I6', 'TCGA-AX-A3FS', 'TCGA-BH-A18M', 'TARGET-20-PARGVC', 'C3L-00603', 'TCGA-WC-A882', 'C3L-01061', 'TARGET-20-PAPZIZ']


In [64]:
# EV-H06: In the GDC database, find case IDs for patients with alcohol history AND AJCC Stage II
def eval_H06():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "exposures.alcohol_history", "value": "Yes"}},
                {"op": "=", "content": {"field": "diagnoses.ajcc_pathologic_stage", "value": "Stage II"}}
            ]
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "1000",
            "fields": "submitter_id,case_id"
        })
        
        case_ids = []
        for case in result["data"]["hits"]:
            case_ids.append({
                "case_id": case["case_id"],
                "submitter_id": case["submitter_id"]
            })
        
        count = len(case_ids)
        print(f"✅ EV-H06: Found {count} cases with alcohol history AND AJCC Stage II")
        if count > 0:
            print(f"  Sample case IDs: {[c['submitter_id'] for c in case_ids[:5]]}")
        
        results["EV-H06"] = {
            "status": "success",
            "result": f"{count} cases with alcohol history AND AJCC Stage II",
            "data": case_ids,
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H06 Failed: {e}")
        results["EV-H06"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H06()

✅ EV-H06: Found 56 cases with alcohol history AND AJCC Stage II
  Sample case IDs: ['TCGA-D6-8568', 'TCGA-CV-7254', 'TCGA-CV-7097', 'TCGA-HD-7917', 'TCGA-CN-5358']


In [65]:
# EV-H07: In the GDC database, count files for cases where cause_of_death is cancer related and patients were ≤ 50 years old at death
def eval_H07():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "demographic.cause_of_death", "value": "Cancer Related"}},
                {"op": "<=", "content": {"field": "demographic.days_to_death", "value": 18250}}  # 50 years in days
            ]
        }
        
        # First get the cases
        cases_result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "0"
        })
        
        cases_count = cases_result["data"]["pagination"]["total"]
        
        # Now get files for these cases
        files_result = rest_query("files", {
            "filters": json.dumps({
                "op": "and",
                "content": [
                    {"op": "=", "content": {"field": "cases.demographic.cause_of_death", "value": "Cancer Related"}},
                    {"op": "<=", "content": {"field": "cases.demographic.days_to_death", "value": 18250}}
                ]
            }),
            "size": "0"
        })
        
        files_count = files_result["data"]["pagination"]["total"]
        
        print(f"✅ EV-H07: Found {files_count} files for {cases_count} cases")
        print(f"  Cases: Cancer-related deaths ≤50 years old")
        
        results["EV-H07"] = {
            "status": "success",
            "result": f"{files_count} files for cancer-related deaths ≤50 years",
            "data": {"cases_count": cases_count, "files_count": files_count},
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H07 Failed: {e}")
        results["EV-H07"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H07()

✅ EV-H07: Found 58251 files for 1079 cases
  Cases: Cancer-related deaths ≤50 years old


In [66]:
# EV-H08: In the GDC database, show the joint distribution (gender × primary_site) for TCGA-COAD
def eval_H08():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "project.project_id",
                "value": "TCGA-COAD"
            }
        }
        
        # Get detailed case data with gender and primary site information
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "2000",
            "fields": "submitter_id,demographic.gender,project.primary_site"
        })
        
        # Process the data to create joint distribution
        joint_distribution = {}
        gender_totals = {}
        site_totals = {}
        total_cases = 0
        
        for case in result["data"]["hits"]:
            # Extract gender - handle both dict and list formats
            demographic = case.get("demographic", {})
            gender = "Unknown"
            
            if isinstance(demographic, list) and demographic:
                gender = demographic[0].get("gender", "Unknown")
            elif isinstance(demographic, dict):
                gender = demographic.get("gender", "Unknown")
            
            # Extract primary site
            project = case.get("project", {})
            primary_sites = project.get("primary_site", [])
            
            # Handle primary site (could be list or single value)
            if isinstance(primary_sites, list) and primary_sites:
                # Take the first primary site for simplicity
                primary_site = primary_sites[0]
            elif isinstance(primary_sites, str):
                primary_site = primary_sites
            else:
                primary_site = "Unknown"
            
            # Skip missing values
            if gender == "_missing":
                gender = "Unknown"
            if primary_site == "_missing":
                primary_site = "Unknown"
            
            # Update joint distribution
            key = (gender, primary_site)
            joint_distribution[key] = joint_distribution.get(key, 0) + 1
            
            # Update marginal totals
            gender_totals[gender] = gender_totals.get(gender, 0) + 1
            site_totals[primary_site] = site_totals.get(primary_site, 0) + 1
            total_cases += 1
        
        # Create a formatted cross-tabulation table
        print(f"✅ EV-H08: Joint distribution (gender × primary_site) for TCGA-COAD")
        print(f"  Total cases analyzed: {total_cases}")
        print()
        
        # Get unique genders and sites for table structure
        genders = sorted(gender_totals.keys())
        sites = sorted(site_totals.keys())
        
        # Print cross-tabulation table
        print("📊 CROSS-TABULATION TABLE:")
        
        # Header row
        header = "Primary Site \\ Gender".ljust(25)
        for gender in genders:
            header += f"{gender:>10}"
        header += f"{'Total':>10}"
        print(header)
        print("-" * len(header))
        
        # Data rows
        for site in sites:
            row = site.ljust(25)
            row_total = 0
            for gender in genders:
                count = joint_distribution.get((gender, site), 0)
                row += f"{count:>10}"
                row_total += count
            row += f"{row_total:>10}"
            print(row)
        
        # Total row
        total_row = "Total".ljust(25)
        for gender in genders:
            total_row += f"{gender_totals[gender]:>10}"
        total_row += f"{total_cases:>10}"
        print("-" * len(header))
        print(total_row)
        
        # Show percentages
        print(f"\n📈 PERCENTAGE BREAKDOWN:")
        for (gender, site), count in sorted(joint_distribution.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_cases) * 100 if total_cases > 0 else 0
            print(f"  {gender} × {site}: {count} cases ({percentage:.2f}%)")
        
        results["EV-H08"] = {
            "status": "success",
            "result": f"Joint distribution (gender × primary_site) for TCGA-COAD",
            "data": {
                "joint_distribution": joint_distribution,
                "gender_totals": gender_totals,
                "site_totals": site_totals,
                "total_cases": total_cases,
                "genders": genders,
                "sites": sites
            },
            "time": time.time() - start
        }
    except Exception as e:
        print(f"❌ EV-H08 Failed: {e}")
        results["EV-H08"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H08()

✅ EV-H08: Joint distribution (gender × primary_site) for TCGA-COAD
  Total cases analyzed: 461

📊 CROSS-TABULATION TABLE:
Primary Site \ Gender        female      malenot reported     Total
-------------------------------------------------------------------
Colon                           216       243         2       461
-------------------------------------------------------------------
Total                           216       243         2       461

📈 PERCENTAGE BREAKDOWN:
  male × Colon: 243 cases (52.71%)
  female × Colon: 216 cases (46.85%)
  not reported × Colon: 2 cases (0.43%)


In [67]:
# EV-H09: In the GDC database, show cases with family history of breast cancer AND RNA-Seq data
def eval_H09():
    start = time.time()
    try:
        filters = {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "family_histories.relationship_primary_diagnosis", "value": "Breast Cancer"}},
                {"op": "=", "content": {"field": "files.experimental_strategy", "value": "RNA-Seq"}}
            ]
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "1000",
            "fields": "submitter_id,case_id,project.project_id,files.experimental_strategy"
        })
        
        cases_with_history_and_rnaseq = []
        project_counts = {}
        
        for case in result["data"]["hits"]:
            # Verify RNA-Seq files exist
            has_rnaseq = False
            files = case.get("files", [])
            for file_info in files:
                if file_info.get("experimental_strategy") == "RNA-Seq":
                    has_rnaseq = True
                    break
            
            if has_rnaseq:
                case_info = {
                    "case_id": case["case_id"],
                    "submitter_id": case["submitter_id"],
                    "project": case.get("project", {}).get("project_id", "Unknown")
                }
                cases_with_history_and_rnaseq.append(case_info)
                
                # Count by project
                project = case_info["project"]
                project_counts[project] = project_counts.get(project, 0) + 1
        
        count = len(cases_with_history_and_rnaseq)
        total_count = result["data"]["pagination"]["total"]
        
        print(f"✅ EV-H09: Found {count} cases with family history of breast cancer AND RNA-Seq")
        print(f"  Total matching cases: {total_count}")
        
        if project_counts:
            sorted_projects = sorted(project_counts.items(), key=lambda x: x[1], reverse=True)
            print(f"  Cases by project:")
            for project, proj_count in sorted_projects:
                print(f"    {project}: {proj_count} cases")
        
        results["EV-H09"] = {
            "status": "success",
            "result": f"{count} cases with family history of breast cancer AND RNA-Seq",
            "data": {"count": count, "total_count": total_count, "cases": cases_with_history_and_rnaseq, "project_counts": project_counts},
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-H09 Failed: {e}")
        results["EV-H09"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H09()

✅ EV-H09: Found 144 cases with family history of breast cancer AND RNA-Seq
  Total matching cases: 144
  Cases by project:
    MMRF-COMMPASS: 71 cases
    TCGA-BLCA: 30 cases
    TCGA-TGCT: 15 cases
    TCGA-PAAD: 10 cases
    TCGA-MESO: 7 cases
    TCGA-CHOL: 6 cases
    HCMI-CMDC: 4 cases
    TCGA-UVM: 1 cases


In [68]:
# EV-H10: In the GDC database, count cases with multiple diagnoses records in TCGA-OV
def eval_H10():
    start = time.time()
    try:
        filters = {
            "op": "=",
            "content": {
                "field": "project.project_id",
                "value": "TCGA-OV"
            }
        }
        
        result = rest_query("cases", {
            "filters": json.dumps(filters),
            "size": "2000",
            "fields": "submitter_id,case_id,diagnoses.diagnosis_id"
        })
        
        cases_with_multiple_diagnoses = []
        single_diagnosis_count = 0
        multiple_diagnosis_count = 0
        
        for case in result["data"]["hits"]:
            diagnoses = case.get("diagnoses", [])
            diagnosis_count = len(diagnoses)
            
            case_info = {
                "case_id": case["case_id"],
                "submitter_id": case["submitter_id"],
                "diagnosis_count": diagnosis_count
            }
            
            if diagnosis_count > 1:
                cases_with_multiple_diagnoses.append(case_info)
                multiple_diagnosis_count += 1
            elif diagnosis_count == 1:
                single_diagnosis_count += 1
        
        total_cases = result["data"]["pagination"]["total"]
        
        print(f"✅ EV-H10: TCGA-OV diagnoses analysis:")
        print(f"  Total cases: {total_cases}")
        print(f"  Cases with single diagnosis: {single_diagnosis_count}")
        print(f"  Cases with multiple diagnoses: {multiple_diagnosis_count}")
        
        if cases_with_multiple_diagnoses:
            print(f"  Sample cases with multiple diagnoses:")
            for case in cases_with_multiple_diagnoses[:5]:
                print(f"    {case['submitter_id']}: {case['diagnosis_count']} diagnoses")
        
        results["EV-H10"] = {
            "status": "success",
            "result": f"{multiple_diagnosis_count} TCGA-OV cases with multiple diagnoses",
            "data": {
                "total_cases": total_cases,
                "single_diagnosis": single_diagnosis_count,
                "multiple_diagnoses": multiple_diagnosis_count,
                "multiple_diagnosis_cases": cases_with_multiple_diagnoses
            },
            "time": time.time() - start,
        }
    except Exception as e:
        print(f"❌ EV-H10 Failed: {e}")
        results["EV-H10"] = {"status": "error", "error": str(e), "time": time.time() - start}

eval_H10()

✅ EV-H10: TCGA-OV diagnoses analysis:
  Total cases: 608
  Cases with single diagnosis: 216
  Cases with multiple diagnoses: 371
  Sample cases with multiple diagnoses:
    TCGA-13-0920: 3 diagnoses
    TCGA-09-1662: 3 diagnoses
    TCGA-10-0925: 2 diagnoses
    TCGA-24-2033: 2 diagnoses
    TCGA-61-2110: 3 diagnoses
