In [1]:
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict

# Load the original JSON
# The encoding is pinned so the read does not depend on the platform's
# locale default (which is not UTF-8 everywhere).
with open("biomarker_data_combined.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Step 1: Flatten and normalize all biomarker entries
biomarkers_flat = []

# A file holding a single top-level object is treated as a one-entry list;
# iterating the dict directly would walk its keys instead of its records.
entries = [raw_data] if isinstance(raw_data, dict) else raw_data

for entry in entries:
    # Case 1: nested record(s) under "Biomarkers"
    if isinstance(entry, dict) and "Biomarkers" in entry:
        nested = entry["Biomarkers"]
        if isinstance(nested, list):
            biomarkers_flat.extend(nested)
        else:
            # A single nested object: keep it as one record rather than
            # extending the list with its key strings.
            biomarkers_flat.append(nested)
    else:
        # Case 2: the entry is already a flat biomarker record
        biomarkers_flat.append(entry)

# Step 2: Normalize data into consistent structure
def normalize_entry(entry):
    """Return a copy of *entry* with every known field coerced to a list.

    Args:
        entry: A mapping (anything with ``.get``) describing one biomarker.

    Returns:
        A dict with exactly the six known field names as keys. Each value is
        a list of non-empty, whitespace-stripped strings. A field that is
        missing, or whose value is neither a string nor a list, becomes an
        empty list.
    """
    def as_list(val):
        # Strings are treated as comma-separated lists; lists are taken as-is.
        if isinstance(val, str):
            items = val.split(",")
        elif isinstance(val, list):
            items = val
        else:
            return []

        cleaned = []
        for item in items:
            if item is None:
                continue
            # Non-string items are stringified so later strip/join calls on
            # the values cannot fail on them.
            text = item.strip() if isinstance(item, str) else str(item).strip()
            # Drop blanks such as those produced by "" or "a, , b".
            if text:
                cleaned.append(text)
        return cleaned

    fields = (
        "Biomarker Name",
        "Associated Disease(s)",
        "Application",
        "Biological Source",
        "Mechanism / Pathway",
        "Other Relevant Terms",
    )
    return {field: as_list(entry.get(field)) for field in fields}

normalized = [normalize_entry(entry) for entry in biomarkers_flat]

# Step 3: Group by Disease and aggregate biomarkers
# Each disease maps to one set per aggregated field; sets de-duplicate values
# contributed by several biomarker entries.
disease_map = defaultdict(lambda: {
    "Associated Biomarkers": set(),
    "Applications": set(),
    "Biological Sources": set(),
    "Mechanisms / Pathways": set(),
    "Relevant Terms": set()
})

# Pairs of (field on a normalized entry, aggregate set it feeds).
field_to_aggregate = (
    ("Biomarker Name", "Associated Biomarkers"),
    ("Application", "Applications"),
    ("Biological Source", "Biological Sources"),
    ("Mechanism / Pathway", "Mechanisms / Pathways"),
    ("Other Relevant Terms", "Relevant Terms"),
)

for entry in normalized:
    biomarkers = entry["Biomarker Name"]
    diseases = entry["Associated Disease(s)"]

    for disease in diseases:
        # Case-insensitive grouping key: "sepsis" and "SEPSIS" both become
        # "Sepsis". NOTE: this also lowercases acronyms ("COPD" -> "Copd").
        disease_key = disease.strip().lower().capitalize()

        # A blank name would otherwise create an entry keyed by "" and
        # surface as a row with an empty "Disease" cell in the export.
        if not disease_key:
            continue

        bucket = disease_map[disease_key]
        for source_field, aggregate_field in field_to_aggregate:
            bucket[aggregate_field].update(entry[source_field])

# Step 4: Convert to DataFrame
# Aggregated fields, in output column order (after the leading "Disease").
aggregate_columns = [
    "Associated Biomarkers",
    "Applications",
    "Biological Sources",
    "Mechanisms / Pathways",
    "Relevant Terms",
]

flattened_rows = []

for disease, details in disease_map.items():
    row = {"Disease": disease}
    for column in aggregate_columns:
        # Sets are unordered; sorting makes the joined text reproducible.
        row[column] = ", ".join(sorted(details[column]))
    flattened_rows.append(row)

# Passing the columns explicitly keeps the header row in the exports even
# when no diseases were found (an empty list alone gives a column-less frame).
df_final = pd.DataFrame(flattened_rows, columns=["Disease", *aggregate_columns])

# Step 5: Save outputs
output_dir = Path("disease_grouped_output")
output_dir.mkdir(exist_ok=True)

# The same table is written in three formats, without the index column.
df_final.to_csv(output_dir / "disease_grouped.csv", index=False)
df_final.to_excel(output_dir / "disease_grouped.xlsx", index=False)
df_final.to_json(output_dir / "disease_grouped.json", orient="records", indent=2)

print("✅ Cleaned and grouped data exported to 'disease_grouped_output/'")


✅ Cleaned and grouped data exported to 'disease_grouped_output/'
