In [None]:
# Notebook Setup
# Run this cell:
# The lines below will instruct jupyter to reload imported modules before
# executing code cells. This enables you to quickly iterate and test revisions
# to your code without having to restart the kernel and reload all of your
# modules each time you make a code change in a separate python file.

%load_ext autoreload
%autoreload 2

import os

# Change path to project root
if os.getcwd().endswith("notebooks"):
    os.chdir(os.path.dirname(os.getcwd()))
print(os.getcwd())

In [15]:
# Check for benchmark annotation files where there are grouped variants
import json
from pathlib import Path


def _has_comma(v) -> bool:
    return "," in (v or "")


def check_grouped_variants(file_path: str) -> bool:
    has_grouped = False
    """Check if a benchmark annotation file has grouped variants"""
    with open(file_path, "r") as f:
        data = json.load(f)

    print(f"Checking {file_path}")

    # Check var_drug_ann for grouped variants
    for item in data.get("var_drug_ann", []) or []:
        if _has_comma(item.get("Variant/Haplotypes")):
            print(f"Variant/Haplotypes: {item.get('Variant/Haplotypes')}")
            has_grouped = True
        if _has_comma(item.get("Drug(s)")):
            print(f"Found grouped drug in var_drug_ann: {item.get('Drug(s)')}")
            has_grouped = True

    # Check var_pheno_ann for grouped variants
    for item in data.get("var_pheno_ann", []) or []:
        if _has_comma(item.get("Variant/Haplotypes")):
            print(f"Variant/Haplotypes: {item.get('Variant/Haplotypes')}")
            has_grouped = True
        if _has_comma(item.get("Drug(s)")):
            print(f"Found grouped drug in var_pheno_ann: {item.get('Drug(s)')}")
            has_grouped = True

    # Check var_fa_ann for grouped variants
    for item in data.get("var_fa_ann", []) or []:
        if _has_comma(item.get("Variant/Haplotypes")):
            print(f"Variant/Haplotypes: {item.get('Variant/Haplotypes')}")
            has_grouped = True
        if _has_comma(item.get("Drug(s)")):
            print(f"Found grouped drug in var_fa_ann: {item.get('Drug(s)')}")
            has_grouped = True
        if _has_comma(item.get("Gene/gene product")):
            print(f"Found grouped gene in var_fa_ann: {item.get('Gene/gene product')}")
            has_grouped = True

    return has_grouped


# Find all benchmark annotation files with grouped variants
benchmark_dir = Path(
    "/Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations"
)
grouped_files = []

for file in benchmark_dir.glob("*.json"):
    if check_grouped_variants(str(file)):
        grouped_files.append(file.name)

print(f"Found {len(grouped_files)} files with grouped variants:")
for file in sorted(grouped_files):
    print(f"  - {file}")

Checking /Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations/PMC5508045.json
Checking /Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations/PMC4916189.json
Variant/Haplotypes: CYP2B6*1, CYP2B6*9
Checking /Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations/PMC12036300.json
Variant/Haplotypes: CYP2C19*1, CYP2C19*2, CYP2C19*17
Variant/Haplotypes: CYP2C19*1, CYP2C19*2, CYP2C19*17
Checking /Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations/PMC554812.json
Checking /Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations/PMC5561238.json
Checking /Users/shloknatarajan/stanford/research/daneshjou/autogkb-benchmark/data/benchmark_annotations/PMC10946077.json
Variant/Haplotypes: UGT1A1*1, UGT1A1*6, UGT1A1*28
Variant/Haplotypes: UGT1A1*1, UGT1A1*28
Variant/Haplotypes: UGT1A1*1, UGT1A1*6
Check