In [4]:
import networkx as nx
from collections import defaultdict
import itertools

# --- CONFIG: path to your normalized graph ---
GRAPH_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_normalized.graphml"

# --- Load graph ---
G = nx.read_graphml(GRAPH_PATH)

# --- Collect nodes grouped by "type" ---
sample_by_type = defaultdict(list)
for n, attrs in G.nodes(data=True):
    t = attrs.get("type", "unknown")
    v = attrs.get("value", "") or attrs.get("name", "")
    sample_by_type[t].append((n, v))

# --- Print 5 samples per node type ---
for t, nodes in itertools.islice(sample_by_type.items(), 20):  # first 20 types
    print(f"\n=== Type: {t} | {len(nodes)} nodes ===")
    for nid, val in nodes[:5]:
        print(f"  id={nid} | value={val}")



=== Type: drug | 10620 nodes ===
  id=(WHOLE PLANT) | value=
  id=.ALPHA.-KETOGLUTARIC ACID - ACONITIC ACID, CIS - ADENOSINE TRIPHOSPHATE - ALPHA LIPOIC ACID - ASCORBIC ACID - ASIAN GINSENG - BARIUM CATION - BEET - CEROUS OXALATE NONAHYDRATE - COENZYME A - CYSTEINE - FUMARIC ACID - LACTIC ACID, DL - MAGNESIUM CARBONATE - MALIC ACID - MANGANESE PHOSPHATE, DIBASIC - PORK - PYRIDOXINE HYDROCHLORIDE - RIBOFLAVIN - SODIUM DIETHYL OXALACETATE - SODIUM PYRUVATE - SUCCINIC ACID - SUS SCROFA ADRENAL GLAND - THIAMINE HYDROCHLORIDE - SUS SCROFA EMBRYO - | value=
  id=.ALPHA.-KETOGLUTARIC ACID - ALPHA LIPOIC ACID - ASCORBIC ACID - BARIUM OXALOSUCCINATE - BLACK COHOSH - BRYONIA ALBA ROOT - CALCITONIN HUMAN - CHLORINE - COLCHICUM AUTUMNALE BULB - NADIDE - | value=
  id=.ALPHA.-KETOGLUTARIC ACID - AMBERGRIS - ANAMIRTA COCCULUS SEED - CONIUM MACULATUM FLOWERING TOP - EPINEPHRINE - FUMARIC ACID - MALIC ACID - MELATONIN - MINERAL OIL - PTERIDIUM AQUILINUM ROOT - PYRIDOXINE HYDROCHLORIDE - QUINHYDRONE -

In [5]:
# --- STEP 1: Rename OpenFDA node types (edges unchanged) with refined mapping based on samples ---
import os, json
from collections import Counter
import networkx as nx

# ==========================
# CONFIG — edit these paths
# ==========================
INPUT_PATH  = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_normalized.graphml"   # your normalized OpenFDA graph
OUTPUT_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_nodes_normalized.graphml"           # output

NODE_TYPE_KEY_IN   = "type"        # node type key in your normalized OpenFDA graph
NODE_TYPE_KEY_OUT  = "node_type"   # PrimeKG-style node type key to write
EDGE_REL_KEY_IN    = "relation"    # current edge relation key in your graph (unchanged in Step 1)

# How to map INACTIVE ingredients?  "exposure" (default) or "drug"
INACTIVE_INGREDIENT_AS = "drug"   # set to "drug" if you prefer all substances under 'drug'

# ==========================
# PrimeKG classes (11)
# ==========================
PRIMEKG_NODE_TYPES = {
    "gene/protein","biological_process","effect/phenotype","disease","anatomy",
    "molecular_function","drug","cellular_component","pathway","exposure","unknown"
}

# ==========================
# Refined mapping: OpenFDA -> PrimeKG
# (rename-only, no entity linking)
# ==========================
NODE_TYPE_MAP_BASE = {
    "drug": "drug",
    "brand": "drug",

    # Substances/content
    "active_ingredient": "drug",                 # substances fit best as 'drug' for merging
    # 'inactive_ingredient' decided below via INACTIVE_INGREDIENT_AS

    # Instructional / narrative text → keep out of 'disease'
    "indications_and_usage": "effect/phenotype", # mostly sentences, not clean disease labels
    "purpose": "effect/phenotype",

    # Clinical/risk/safety buckets
    "ask_doctor": "disease",                     # your sample shows clean conditions here
    "warnings": "effect/phenotype",
    "stop_use": "effect/phenotype",
    "pregnancy_or_breast_feeding": "effect/phenotype",
    "keep_out_of_reach_of_children": "effect/phenotype",
    "overdose_warning": "effect/phenotype",
    "ask_doctor_or_pharmacist": "effect/phenotype",

    # Procedures/instructions → 'exposure' bucket
    "dosage_and_administration": "exposure",
    "storage_and_handling": "exposure",

    # Interaction targets are typically substances/herbs → treat as 'drug'
    "drug_interactions": "drug",
}

NODE_TYPE_MAP = dict(NODE_TYPE_MAP_BASE)
if INACTIVE_INGREDIENT_AS not in {"exposure","drug"}:
    raise ValueError("INACTIVE_INGREDIENT_AS must be 'exposure' or 'drug'")
NODE_TYPE_MAP["inactive_ingredient"] = INACTIVE_INGREDIENT_AS

def load_graph(path: str):
    ext = os.path.splitext(path)[1].lower()
    if ext in (".graphml", ".xml"):
        return nx.read_graphml(path)
    if ext == ".gexf":
        return nx.read_gexf(path)
    if ext == ".gml":
        return nx.read_gml(path)
    raise ValueError(f"Unsupported input format: {ext}")

def save_graph(G: nx.Graph, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    ext = os.path.splitext(path)[1].lower()
    if ext in (".graphml", ".xml"):
        nx.write_graphml(G, path)
    elif ext == ".gexf":
        nx.write_gexf(G, path)
    elif ext == ".gml":
        nx.write_gml(G, path)
    else:
        raise ValueError(f"Unsupported output format: {ext}")

def rename_node_types_only(input_path: str,
                           output_path: str,
                           node_type_key_in: str = "type",
                           node_type_key_out: str = "node_type",
                           edge_rel_key_in: str = "relation"):
    G = load_graph(input_path)

    # Build DiGraph result
    H = nx.DiGraph()

    # --- Nodes: rename types, preserve provenance ---
    for n, attrs in G.nodes(data=True):
        out = dict(attrs)  # copy all attrs
        orig_t = str(attrs.get(node_type_key_in, "")).strip()
        out["orig_type"] = orig_t  # provenance

        mapped = NODE_TYPE_MAP.get(orig_t.lower(), None)
        if mapped is None:
            # if already a PrimeKG class, keep; else set unknown
            mapped = orig_t if orig_t in PRIMEKG_NODE_TYPES else "unknown"
        out[node_type_key_out] = mapped

        if "node_source" not in out:
            out["node_source"] = "OpenFDA"

        # convenience: if you want a 'node_name', many datasets store text in the graphml 'id'
        if "node_name" not in out:
            # GraphML loader puts the node identifier in `n` (the key), not an attribute
            out["node_name"] = str(n)

        H.add_node(n, **out)

    # --- Edges (unchanged in Step 1) ---
    H.add_edges_from(G.edges(data=True))

    # --- Save ---
    save_graph(H, output_path)

    # --- Summary ---
    node_types_after = Counter(a.get(node_type_key_out, "unknown") for _, a in H.nodes(data=True))
    edge_types_now   = Counter(d.get(edge_rel_key_in, "unknown") for _, _, d in H.edges(data=True))

    summary = {
        "graph_kind": "DiGraph",
        "nodes": H.number_of_nodes(),
        "edges": H.number_of_edges(),

        "node_types_count": len(node_types_after),
        "node_types": [k for k, _ in sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))],

        "relation_types_count": len(edge_types_now),
        "relation_types": [k for k, _ in sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))],

        "keys_used": {
            "node_type_key_in": node_type_key_in,
            "node_type_key_out": node_type_key_out,
            "edge_rel_key_in": edge_rel_key_in
        },

        "node_type_counts_after_top": sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))[:20],
        "edge_type_counts_current_top": sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))[:20],

        "mapping_choices": {
            "inactive_ingredient_as": INACTIVE_INGREDIENT_AS
        },

        "saved_to": output_path
    }

    print(json.dumps(summary, indent=2))
    return H, summary

# ==========================
# RUN
# ==========================
H_nodes_renamed, summary = rename_node_types_only(
    INPUT_PATH,
    OUTPUT_PATH,
    node_type_key_in=NODE_TYPE_KEY_IN,
    node_type_key_out=NODE_TYPE_KEY_OUT,
    edge_rel_key_in=EDGE_REL_KEY_IN
)


{
  "graph_kind": "DiGraph",
  "nodes": 100495,
  "edges": 160229,
  "node_types_count": 5,
  "node_types": [
    "drug",
    "effect/phenotype",
    "exposure",
    "disease",
    "unknown"
  ],
  "relation_types_count": 15,
  "relation_types": [
    "dosage_and_administration",
    "active_ingredient",
    "indications_and_usage",
    "purpose",
    "inactive_ingredient",
    "keep_out_of_reach_of_children",
    "stop_use",
    "ask_doctor",
    "storage_and_handling",
    "do_not_use",
    "pregnancy_or_breast_feeding",
    "ask_doctor_or_pharmacist",
    "drug_interactions"
  ],
  "keys_used": {
    "node_type_key_in": "type",
    "node_type_key_out": "node_type",
    "edge_rel_key_in": "relation"
  },
  "node_type_counts_after_top": [
    [
      "drug",
      55430
    ],
    [
      "effect/phenotype",
      26406
    ],
    [
      "exposure",
      16513
    ],
    [
      "disease",
      1079
    ],
    [
      "unknown",
      1067
    ]
  ],
  "edge_type_counts_current_top

In [6]:
from collections import Counter

# Collect original -> mapped types
unknown_nodes = [(n, d.get("orig_type",""), d.get("node_name","")) 
                 for n,d in H_nodes_renamed.nodes(data=True) 
                 if d.get("node_type") == "unknown"]

print(f"Total unknown: {len(unknown_nodes)}\n")

# Count by original type
counts = Counter(orig for _, orig, _ in unknown_nodes)
for t, c in counts.most_common(20):
    print(f"{t:30s}  {c}")

# Optional: print a few examples for each unknown type
print("\n=== Sample unknowns ===")
seen = set()
for node_id, orig, name in unknown_nodes:
    if orig not in seen:
        print(f"orig_type={orig} | node_name={name}")
        seen.add(orig)
    if len(seen) >= 10:  # show up to 10 distinct samples
        break


Total unknown: 1067

do_not_use                      1067

=== Sample unknowns ===
orig_type=do_not_use | node_name=Stop use and ask a doctor if symptoms persist


In [13]:
# --- STEP 1: Rename OpenFDA node types to PrimeKG+Hetionet schema (edges unchanged)
# --- Hardened with robust type normalization + explicit do_not_use mapping
# --- Prints compact summary JSON (graph_kind, counts, node/edge type lists, keys used)

import os, json, re
from collections import Counter
import networkx as nx

# ==========================
# CONFIG — edit these paths
# ==========================
INPUT_PATH  = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_normalized.graphml"   # your normalized OpenFDA graph
OUTPUT_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_openfda_nodes_renamed.graphml"             # where to save the node-renamed graph

NODE_TYPE_KEY_IN   = "type"        # node type key in your normalized OpenFDA graph
NODE_TYPE_KEY_OUT  = "node_type"   # PrimeKG-style node type key to write
EDGE_REL_KEY_IN    = "relation"    # current edge relation key in your graph (unchanged in Step 1)

# How to map INACTIVE ingredients?  "exposure" (default) or "drug"
INACTIVE_INGREDIENT_AS = "drug"   # set to "drug" if you prefer all substances under 'drug'

# ==========================
# PrimeKG classes (11)
# ==========================
PRIMEKG_NODE_TYPES = {
    "gene/protein","biological_process","effect/phenotype","disease","anatomy",
    "molecular_function","drug","cellular_component","pathway","exposure","unknown"
}

# ==========================
# Normalization helpers
# ==========================
def normalize_openfda_type(t: str) -> str:
    """
    Normalize OpenFDA node type strings to a consistent key:
    - strip whitespace
    - lowercase
    - drop anything after a '|' (subfield decorations)
    - normalize '/', '-' to spaces; collapse to underscores
    - collapse multiple underscores
    """
    if t is None:
        return ""
    t = str(t).strip().lower()
    t = t.split("|", 1)[0]  # remove subfield suffixes like "do_not_use|children"
    t = t.replace("/", " ").replace("-", " ")
    t = re.sub(r"\s+", "_", t)
    t = re.sub(r"_+", "_", t)
    return t

# ==========================
# Refined mapping: OpenFDA -> PrimeKG
# (rename-only, no entity linking)
# ==========================
NODE_TYPE_MAP_BASE = {
    "drug": "drug",
    "brand": "drug",

    # Substances/content
    "active_ingredient": "drug",                 # actives as 'drug' for merging
    # 'inactive_ingredient' decided below via INACTIVE_INGREDIENT_AS

    # Instructional / narrative text (keep out of 'disease')
    "indications_and_usage": "effect/phenotype", # mostly sentences, not canonical diseases
    "purpose": "effect/phenotype",

    # Clinical/risk/safety buckets
    "ask_doctor": "disease",                     # your sample shows clean conditions here
    "warnings": "effect/phenotype",
    "stop_use": "effect/phenotype",
    "pregnancy_or_breast_feeding": "effect/phenotype",
    "keep_out_of_reach_of_children": "effect/phenotype",
    "overdose_warning": "effect/phenotype",
    "ask_doctor_or_pharmacist": "effect/phenotype",
    "do_not_use": "effect/phenotype",            # <-- explicit mapping added

    # Procedures/instructions → 'exposure'
    "dosage_and_administration": "exposure",
    "storage_and_handling": "exposure",

    # Interaction targets are typically substances/herbs → treat as 'drug'
    "drug_interactions": "drug",
}

# Apply inactive ingredient choice
if INACTIVE_INGREDIENT_AS not in {"exposure","drug"}:
    raise ValueError("INACTIVE_INGREDIENT_AS must be 'exposure' or 'drug'")
NODE_TYPE_MAP = dict(NODE_TYPE_MAP_BASE)
NODE_TYPE_MAP["inactive_ingredient"] = INACTIVE_INGREDIENT_AS

# ==========================
# IO helpers
# ==========================
def load_graph(path: str):
    ext = os.path.splitext(path)[1].lower()
    if ext in (".graphml", ".xml"):
        return nx.read_graphml(path)
    if ext == ".gexf":
        return nx.read_gexf(path)
    if ext == ".gml":
        return nx.read_gml(path)
    raise ValueError(f"Unsupported input format: {ext}")

def save_graph(G: nx.Graph, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    ext = os.path.splitext(path)[1].lower()
    if ext in (".graphml", ".xml"):
        nx.write_graphml(G, path)
    elif ext == ".gexf":
        nx.write_gexf(G, path)
    elif ext == ".gml":
        nx.write_gml(G, path)
    else:
        raise ValueError(f"Unsupported output format: {ext}")

# ==========================
# Main renamer (edges unchanged)
# ==========================
def rename_node_types_only(input_path: str,
                           output_path: str,
                           node_type_key_in: str = "type",
                           node_type_key_out: str = "node_type",
                           edge_rel_key_in: str = "relation"):
    G = load_graph(input_path)
    H = nx.DiGraph()

    # Nodes: rename types, preserve provenance and keep a human-friendly node_name
    for n, attrs in G.nodes(data=True):
        out = dict(attrs)
        orig_t_raw = attrs.get(node_type_key_in, "")
        norm_t = normalize_openfda_type(orig_t_raw)
        out["orig_type"] = str(orig_t_raw).strip()

        mapped = NODE_TYPE_MAP.get(norm_t)
        if mapped is None:
            # keep if already a PrimeKG class; else 'unknown'
            mapped = norm_t if norm_t in PRIMEKG_NODE_TYPES else "unknown"
        out[node_type_key_out] = mapped

        if "node_source" not in out:
            out["node_source"] = "OpenFDA"
        if "node_name" not in out:
            # GraphML loader uses node key `n` as the identifier/string you showed;
            # store it for convenience
            out["node_name"] = str(n)

        H.add_node(n, **out)

    # Edges: unchanged in Step 1
    H.add_edges_from(G.edges(data=True))

    # Save
    save_graph(H, output_path)

    # Summary
    node_types_after = Counter(a.get(node_type_key_out, "unknown") for _, a in H.nodes(data=True))
    edge_types_now   = Counter(d.get(edge_rel_key_in, "unknown") for _, _, d in H.edges(data=True))

    summary = {
        "graph_kind": "DiGraph",
        "nodes": H.number_of_nodes(),
        "edges": H.number_of_edges(),

        "node_types_count": len(node_types_after),
        "node_types": [k for k, _ in sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))],

        "relation_types_count": len(edge_types_now),
        "relation_types": [k for k, _ in sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))],

        "keys_used": {
            "node_type_key_in": node_type_key_in,
            "node_type_key_out": node_type_key_out,
            "edge_rel_key_in": edge_rel_key_in
        },

        "node_type_counts_after_top": sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))[:20],
        "edge_type_counts_current_top": sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))[:20],

        "mapping_choices": {
            "inactive_ingredient_as": INACTIVE_INGREDIENT_AS
        },

        "saved_to": output_path
    }

    print(json.dumps(summary, indent=2))
    return H, summary

# ==========================
# RUN
# ==========================
H_nodes_renamed, summary = rename_node_types_only(
    INPUT_PATH,
    OUTPUT_PATH,
    node_type_key_in=NODE_TYPE_KEY_IN,
    node_type_key_out=NODE_TYPE_KEY_OUT,
    edge_rel_key_in=EDGE_REL_KEY_IN
)

# Optional: quick peek at any remaining 'unknown' sources
from collections import Counter
unknown_nodes = [(n, d.get("orig_type",""), d.get("node_name",""))
                 for n,d in H_nodes_renamed.nodes(data=True)
                 if d.get(NODE_TYPE_KEY_OUT) == "unknown"]
if unknown_nodes:
    print(f"\nTotal 'unknown' nodes: {len(unknown_nodes)}")
    print("Top original types contributing to 'unknown':")
    cnt = Counter(orig for _, orig, _ in unknown_nodes)
    for t, c in cnt.most_common(20):
        print(f"  {t:40s}  {c}")


{
  "graph_kind": "DiGraph",
  "nodes": 100495,
  "edges": 160229,
  "node_types_count": 4,
  "node_types": [
    "drug",
    "effect/phenotype",
    "exposure",
    "disease"
  ],
  "relation_types_count": 15,
  "relation_types": [
    "dosage_and_administration",
    "active_ingredient",
    "indications_and_usage",
    "purpose",
    "inactive_ingredient",
    "keep_out_of_reach_of_children",
    "stop_use",
    "ask_doctor",
    "storage_and_handling",
    "do_not_use",
    "pregnancy_or_breast_feeding",
    "ask_doctor_or_pharmacist",
    "drug_interactions"
  ],
  "keys_used": {
    "node_type_key_in": "type",
    "node_type_key_out": "node_type",
    "edge_rel_key_in": "relation"
  },
  "node_type_counts_after_top": [
    [
      "drug",
      55430
    ],
    [
      "effect/phenotype",
      27473
    ],
    [
      "exposure",
      16513
    ],
    [
      "disease",
      1079
    ]
  ],
  "edge_type_counts_current_top": [
    [
      59029
    ],
    [
      "dosage_and_ad

In [9]:
import networkx as nx
from collections import defaultdict
import itertools

# --- CONFIG: path to your graph ---
GRAPH_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_normalized.graphml"

# --- Load graph ---
G = nx.read_graphml(GRAPH_PATH)

# --- Collect edges grouped by relation ---
sample_by_rel = defaultdict(list)
for u, v, attrs in G.edges(data=True):
    rel = attrs.get("relation", "unknown")
    sample_by_rel[rel].append((u, v, attrs))

# --- Print 5 samples per relation ---
for rel, edges in itertools.islice(sample_by_rel.items(), 20):  # first 20 relations
    print(f"\n=== Relation: {rel} | {len(edges)} edges ===")
    for u, v, attrs in edges[:5]:
        print(f"  {u} -> {v} | attrs={dict(attrs)}")



=== Relation: active_ingredient | 16527 edges ===
  (WHOLE PLANT) -> Lactuca virosa (Poisonous lettuce) Whole Plant 4X | attrs={'relationship': 'active_ingredient', 'orig_relationship': 'active_ingredient', 'relation': 'active_ingredient'}
  (WHOLE PLANT) -> Melissa officinalis (Lemon balm) Leaf/Young Shoot 4X | attrs={'relationship': 'active_ingredient', 'orig_relationship': 'active_ingredient', 'relation': 'active_ingredient'}
  (WHOLE PLANT) -> Millefolium (Yarrow) Aerial Parts 4X | attrs={'relationship': 'active_ingredient', 'orig_relationship': 'active_ingredient', 'relation': 'active_ingredient'}
  (WHOLE PLANT) -> Passiflora incarnata (Passion flower) Aerial Parts 4X | attrs={'relationship': 'active_ingredient', 'orig_relationship': 'active_ingredient', 'relation': 'active_ingredient'}
  (WHOLE PLANT) -> Thymus vulgaris Aerial Parts 4X | attrs={'relationship': 'active_ingredient', 'orig_relationship': 'active_ingredient', 'relation': 'active_ingredient'}

=== Relation: indicati

In [10]:
import networkx as nx
from collections import Counter, defaultdict
import itertools
from pprint import pprint

GRAPH_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_normalized.graphml"
REL_KEY   = "relation"      # normalized relation head
SUB_KEY   = "subfield"      # optional subfield (may not exist)
NT_KEY    = "type"          # node type BEFORE the PrimeKG renaming step (or "node_type" after)

G = nx.read_graphml(GRAPH_PATH)

# 1) Relation and (relation, subfield) frequencies
rel_cnt = Counter()
rel_sub_cnt = Counter()
for _, _, d in G.edges(data=True):
    r = d.get(REL_KEY, "unknown")
    rel_cnt[r] += 1
    sf = d.get(SUB_KEY, "")
    if sf:
        rel_sub_cnt[(r, sf)] += 1

print("=== Relation frequencies (top 50) ===")
for r, c in rel_cnt.most_common(50):
    print(f"{r:35s} {c}")

print("\n=== (Relation, subfield) pairs (top 50) ===")
for (r, sf), c in rel_sub_cnt.most_common(50):
    print(f"{r:35s} | {sf:30s} {c}")

# 2) For each relation: source/destination node-type breakdown (top few)
def top_nd(d, k=8):
    return sorted(d.items(), key=lambda kv: (-kv[1], kv[0]))[:k]

rel_sd_matrix = defaultdict(lambda: Counter())  # (src_nt, dst_nt) per relation
for u, v, d in G.edges(data=True):
    r = d.get(REL_KEY, "unknown")
    su = G.nodes[u].get(NT_KEY, "unknown")
    sv = G.nodes[v].get(NT_KEY, "unknown")
    rel_sd_matrix[r][(su, sv)] += 1

print("\n=== Per-relation source/dest node-type breakdown (top 8 each) ===")
for r, mat in itertools.islice(rel_sd_matrix.items(), 30):  # first 30 relations
    print(f"\n-- {r} --  total={rel_cnt[r]}")
    for (su, sv), c in top_nd(mat, 8):
        print(f"  ({su:22s} -> {sv:24s}) {c}")

# 3) Samples per relation (show 5)
print("\n=== Samples per relation (first 12 relations) ===")
rel_to_edges = defaultdict(list)
for u, v, d in G.edges(data=True):
    rel_to_edges[d.get(REL_KEY, "unknown")].append((u, v, d))

for r, edges in itertools.islice(rel_to_edges.items(), 12):
    print(f"\n### Relation: {r} | {len(edges)} edges")
    for u, v, d in edges[:5]:
        su = G.nodes[u].get(NT_KEY, "unknown"); sv = G.nodes[v].get(NT_KEY, "unknown")
        uname = G.nodes[u].get("value") or G.nodes[u].get("name") or u
        vname = G.nodes[v].get("value") or G.nodes[v].get("name") or v
        attrs = {k: d.get(k) for k in sorted(d.keys())}
        print(f"  [{su}] {uname}  ->  [{sv}] {vname}  | attrs={attrs}")


=== Relation frequencies (top 50) ===
dosage_and_administration           23949
active_ingredient                   16527
indications_and_usage               15600
purpose                             10227
inactive_ingredient                 8965
keep_out_of_reach_of_children       7726
stop_use                            5768
ask_doctor                          3584
storage_and_handling                3083
do_not_use                          2658
pregnancy_or_breast_feeding         2012
ask_doctor_or_pharmacist            942
drug_interactions                   48

=== (Relation, subfield) pairs (top 50) ===
dosage_and_administration           | dosage_and_administration      349
purpose                             | purpose                        253
indications_and_usage               | indications_and_usage          253
keep_out_of_reach_of_children       | keep_out_of_reach_of_children  158
dosage_and_administration           | table                          143
dosage_and_adminis

In [14]:
# JUPYTER CELL: OpenFDA -> PrimeKG/HETIONET edge remap + exact-format summary
import networkx as nx
from collections import Counter
from pathlib import Path
import json

# ---------- CONFIG ----------
GRAPH_IN  = "/Users/ganeshkumarboini/Downloads/drug_data_kg_rebuilt_openfda_nodes_renamed.graphml"
GRAPH_OUT = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed.graphml"

# Node typing keys (auto-detect which exist in your graph)
NODE_TYPE_KEY_IN_CANDIDATES  = ("type", "node_type")
NODE_TYPE_KEY_OUT_CANDIDATES = ("node_type", "type")

# Edge relation key (the normalized head we’ll overwrite)
EDGE_REL_KEY_IN = "relation"

# This mirrors your earlier node-mapping choice; kept for the requested summary shape
INACTIVE_INGREDIENT_AS = "drug"

# ---------- LOAD ----------
G = nx.read_graphml(GRAPH_IN)

# ---------- DETECT NODE TYPE KEYS ----------
def find_first_node_key(candidates):
    for k in candidates:
        # as soon as we see it on any node, we consider it present
        for n in G.nodes:
            if k in G.nodes[n]:
                return k
    return candidates[0]  # fallback to first candidate if none found

node_type_key_in  = find_first_node_key(NODE_TYPE_KEY_IN_CANDIDATES)
node_type_key_out = find_first_node_key(NODE_TYPE_KEY_OUT_CANDIDATES)

def get_node_type(n):
    # prefer node_type_key_out for the post-rename view
    if node_type_key_out in G.nodes[n] and G.nodes[n][node_type_key_out]:
        return G.nodes[n][node_type_key_out]
    # fallback to node_type_key_in
    if node_type_key_in in G.nodes[n] and G.nodes[n][node_type_key_in]:
        return G.nodes[n][node_type_key_in]
    return "unknown"

# ---------- RELATION MAPPING ----------
def map_relation(src_t, dst_t, rel_in):
    rel_in = (rel_in or "").strip()

    # Normalize a couple of stray aliases seen in samples
    alias = {
        "interactions": "drug_interactions",
        "ask_doctor_or_p": "ask_doctor_or_pharmacist",
    }
    rel = alias.get(rel_in, rel_in)

    is_disease_or_effect = dst_t in {"disease", "effect/phenotype"}
    is_drug = (dst_t == "drug")

    # Indication-ish
    if rel in {"indications_and_usage", "purpose"}:
        return "indication" if is_disease_or_effect else "metaedge"

    # Adverse/safety text
    if rel in {"warnings", "stop_use", "overdose_warning"}:
        return "drug_effect"

    # Contraindication-ish
    if rel in {"do_not_use", "ask_doctor", "ask_doctor_or_pharmacist", "pregnancy_or_breast_feeding"}:
        return "contraindication" if is_disease_or_effect else "metaedge"

    # Composition / handling / instructions
    if rel in {"active_ingredient", "inactive_ingredient", "dosage_and_administration",
               "storage_and_handling", "keep_out_of_reach_of_children"}:
        return "metaedge"

    # Interactions
    if rel == "drug_interactions":
        return "drug_drug" if is_drug else "metaedge"

    # Safe fallback
    return "metaedge"

# ---------- APPLY MAPPING (preserve originals) ----------
for u, v, d in G.edges(data=True):
    # backup originals once
    if "openfda_relation" not in d:
        d["openfda_relation"] = d.get(EDGE_REL_KEY_IN, "unknown")
    if "openfda_subfield" not in d and "subfield" in d:
        d["openfda_subfield"] = d.get("subfield")

    new_rel = map_relation(get_node_type(u), get_node_type(v), d.get(EDGE_REL_KEY_IN))
    d[EDGE_REL_KEY_IN] = new_rel  # overwrite to PrimeKG/HETIONET-style

# ---------- SAVE ----------
Path(GRAPH_OUT).parent.mkdir(parents=True, exist_ok=True)
nx.write_graphml(G, GRAPH_OUT)

# ---------- SUMMARY (exact structure you requested) ----------
H = G  # naming to match your template

# node type distribution (post-rename view)
node_types_after = Counter(get_node_type(n) for n in H.nodes())

# edge relation distribution (current)
edge_types_now = Counter(d.get(EDGE_REL_KEY_IN, "unknown") for _,_,d in H.edges(data=True))

summary = {
    "graph_kind": "DiGraph",
    "nodes": H.number_of_nodes(),
    "edges": H.number_of_edges(),

    "node_types_count": len(node_types_after),
    "node_types": [k for k, _ in sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))],

    "relation_types_count": len(edge_types_now),
    "relation_types": [k for k, _ in sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))],

    "keys_used": {
        "node_type_key_in": node_type_key_in,
        "node_type_key_out": node_type_key_out,
        "edge_rel_key_in": EDGE_REL_KEY_IN
    },

    "node_type_counts_after_top": sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))[:20],
    "edge_type_counts_current_top": sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))[:20],

    "mapping_choices": {
        "inactive_ingredient_as": INACTIVE_INGREDIENT_AS  # included to mirror your earlier summary schema
    },

    "saved_to": GRAPH_OUT
}

print(json.dumps(summary, indent=2))


{
  "graph_kind": "DiGraph",
  "nodes": 100495,
  "edges": 160229,
  "node_types_count": 4,
  "node_types": [
    "drug",
    "effect/phenotype",
    "exposure",
    "disease"
  ],
  "relation_types_count": 5,
  "relation_types": [
    "drug_effect",
    "metaedge",
    "indication",
    "contraindication",
    "drug_drug"
  ],
  "keys_used": {
    "node_type_key_in": "type",
    "node_type_key_out": "node_type",
    "edge_rel_key_in": "relation"
  },
  "node_type_counts_after_top": [
    [
      "drug",
      55430
    ],
    [
      "effect/phenotype",
      27473
    ],
    [
      "exposure",
      16513
    ],
    [
      "disease",
      1079
    ]
  ],
  "edge_type_counts_current_top": [
    [
      "drug_effect",
      64908
    ],
    [
      "metaedge",
      62359
    ],
    [
      "indication",
      23823
    ],
    [
      "contraindication",
      9094
    ],
    [
      "drug_drug",
      45
    ]
  ],
  "mapping_choices": {
    "inactive_ingredient_as": "drug"
  },
  

In [15]:
import networkx as nx
from collections import Counter

G = nx.read_graphml("/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed.graphml")

# 1) Ensure every node has node_type
missing_node_type = [n for n,a in G.nodes(data=True) if "node_type" not in a]
print("Nodes missing node_type:", len(missing_node_type))

# 2) Ensure every edge has relation
missing_relation = [(u,v) for u,v,a in G.edges(data=True) if "relation" not in a]
print("Edges missing relation:", len(missing_relation))

# 3) No legacy keys leaking through?
legacy_edge_keys = Counter(k for _,_,a in G.edges(data=True) for k in a.keys() if k in {"relationship","orig_relationship"})
print("Legacy edge keys present:", dict(legacy_edge_keys))

# 4) Relation whitelist (your 15)
REL_OK = {
    "active_ingredient","purpose","indications_and_usage","do_not_use","ask_doctor",
    "ask_doctor_or_pharmacist","warnings","stop_use","dosage_and_administration",
    "keep_out_of_reach_of_children","pregnancy_or_breast_feeding","storage_and_handling",
    "inactive_ingredient","drug_interactions","overdose_warning"
}
bad_rel = Counter(a.get("relation") for _,_,a in G.edges(data=True) if a.get("relation") not in REL_OK)
print("Non-whitelisted relations:", dict(bad_rel))


Nodes missing node_type: 0
Edges missing relation: 0
Legacy edge keys present: {'relationship': 160229, 'orig_relationship': 160229}
Non-whitelisted relations: {'metaedge': 62359, 'indication': 23823, 'drug_effect': 64908, 'contraindication': 9094, 'drug_drug': 45}


In [16]:
import networkx as nx
from collections import Counter

# --- paths ---
IN_PATH  = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed.graphml"
OUT_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed_clean.graphml"

# --- load ---
G = nx.read_graphml(IN_PATH)

# --- validator for PrimeKG buckets ---
PRIMEKG_REL_OK = {"drug_effect", "indication", "contraindication", "drug_drug", "metaedge"}

missing_node_type = [n for n,a in G.nodes(data=True) if "node_type" not in a]
missing_relation  = [(u,v) for u,v,a in G.edges(data=True) if "relation" not in a]
legacy_keys = Counter(k for _,_,a in G.edges(data=True) for k in a if k in {"relationship","orig_relationship"})

non_whitelisted = Counter(
    a.get("relation") for _,_,a in G.edges(data=True) if a.get("relation") not in PRIMEKG_REL_OK
)

print("=== BEFORE CLEAN ===")
print("Nodes missing node_type:", len(missing_node_type))
print("Edges missing relation:", len(missing_relation))
print("Legacy edge keys present:", dict(legacy_keys))
print("Non-whitelisted relations:", dict(non_whitelisted))

# --- strip legacy edge keys for a clean merge ---
for u,v,a in G.edges(data=True):
    a.pop("relationship", None)
    a.pop("orig_relationship", None)

# --- recompute counts after cleanup ---
edge_types = Counter(a.get("relation") for _,_,a in G.edges(data=True))
node_types = Counter(a.get("node_type") for _,a in G.nodes(data=True))

summary = {
    "graph_kind": "DiGraph",
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "node_types_count": len(node_types),
    "node_types": [k for k,_ in node_types.most_common()],
    "relation_types_count": len(edge_types),
    "relation_types": [k for k,_ in edge_types.most_common()],
    "keys_used": {
        "node_type_key_in": "type",          # from your normalization step
        "node_type_key_out": "node_type",
        "edge_rel_key_in": "relation"
    },
    "node_type_counts_after_top": node_types.most_common(20),
    "edge_type_counts_current_top": edge_types.most_common(20),
    "mapping_choices": {
        "inactive_ingredient_as": "drug"     # as previously chosen
    },
}

print("\n=== AFTER CLEAN ===")
print("Edge types:", edge_types.most_common())
print("\nSUMMARY:\n", summary)

# --- save cleaned graph ---
nx.write_graphml(G, OUT_PATH)
print("\nSaved cleaned graph to:", OUT_PATH)


=== BEFORE CLEAN ===
Nodes missing node_type: 0
Edges missing relation: 0
Legacy edge keys present: {'relationship': 160229, 'orig_relationship': 160229}
Non-whitelisted relations: {}

=== AFTER CLEAN ===
Edge types: [('drug_effect', 64908), ('metaedge', 62359), ('indication', 23823), ('contraindication', 9094), ('drug_drug', 45)]

SUMMARY:
 {'graph_kind': 'DiGraph', 'nodes': 100495, 'edges': 160229, 'node_types_count': 4, 'node_types': ['drug', 'effect/phenotype', 'exposure', 'disease'], 'relation_types_count': 5, 'relation_types': ['drug_effect', 'metaedge', 'indication', 'contraindication', 'drug_drug'], 'keys_used': {'node_type_key_in': 'type', 'node_type_key_out': 'node_type', 'edge_rel_key_in': 'relation'}, 'node_type_counts_after_top': [('drug', 55430), ('effect/phenotype', 27473), ('exposure', 16513), ('disease', 1079)], 'edge_type_counts_current_top': [('drug_effect', 64908), ('metaedge', 62359), ('indication', 23823), ('contraindication', 9094), ('drug_drug', 45)], 'mapping_c

In [17]:
import networkx as nx
from collections import Counter
import json

# --- Load GraphML ---
path = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed_clean.graphml"
G = nx.read_graphml(path)

# --- Identify keys ---
node_type_key = "node_type"          # from your earlier dump
edge_rel_key  = "relation"  # from your earlier dump

# --- Collect node types ---
node_types = list(nx.get_node_attributes(G, node_type_key).values())
node_counts = Counter(node_types)

# --- Collect relation types ---
edge_types = list(nx.get_edge_attributes(G, edge_rel_key).values())
edge_counts = Counter(edge_types)

# --- Build summary dict ---
summary = {
    "graph_kind": type(G).__name__,  # e.g. "DiGraph"
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "node_types_count": len(node_counts),
    "node_types": list(node_counts.keys()),
    "relation_types_count": len(edge_counts),
    "relation_types": list(edge_counts.keys())
}

# --- Pretty print ---
print(json.dumps(summary, indent=2))


{
  "graph_kind": "DiGraph",
  "nodes": 100495,
  "edges": 160229,
  "node_types_count": 4,
  "node_types": [
    "drug",
    "effect/phenotype",
    "exposure",
    "disease"
  ],
  "relation_types_count": 5,
  "relation_types": [
    "metaedge",
    "indication",
    "drug_effect",
    "contraindication",
    "drug_drug"
  ]
}


In [20]:
import networkx as nx
from collections import Counter

GRAPH_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed_clean.graphml"
G = nx.read_graphml(GRAPH_PATH)

# --- Node keys ---
node_keys = Counter()
for _, attrs in G.nodes(data=True):
    node_keys.update(attrs.keys())
print("=== Node attribute keys ===")
for k, c in node_keys.most_common():
    print(f"{k}: {c}")

# --- Edge keys ---
edge_keys = Counter()
for _, _, attrs in G.edges(data=True):
    edge_keys.update(attrs.keys())
print("\n=== Edge attribute keys ===")
for k, c in edge_keys.most_common():
    print(f"{k}: {c}")


=== Node attribute keys ===
type: 100495
orig_type: 100495
node_type: 100495
node_source: 100495
node_name: 100495
value: 1033

=== Edge attribute keys ===
relation: 160229
openfda_relation: 160229
subfield: 5989
openfda_subfield: 5989


In [18]:
# Peek at a few nodes and their attributes
for i, (n, data) in enumerate(G.nodes(data=True)):
    print(n, data)
    if i >= 5:
        break

# Check what attributes exist
all_node_keys = {k for _, d in G.nodes(data=True) for k in d.keys()}
print("Node attribute keys:", all_node_keys)


(WHOLE PLANT) {'type': 'drug', 'orig_type': 'drug', 'node_type': 'drug', 'node_source': 'OpenFDA', 'node_name': '(WHOLE PLANT)'}
Lactuca virosa (Poisonous lettuce) Whole Plant 4X {'type': 'active_ingredient', 'orig_type': 'active_ingredient', 'node_type': 'drug', 'node_source': 'OpenFDA', 'node_name': 'Lactuca virosa (Poisonous lettuce) Whole Plant 4X'}
Melissa officinalis (Lemon balm) Leaf/Young Shoot 4X {'type': 'active_ingredient', 'orig_type': 'active_ingredient', 'node_type': 'drug', 'node_source': 'OpenFDA', 'node_name': 'Melissa officinalis (Lemon balm) Leaf/Young Shoot 4X'}
Millefolium (Yarrow) Aerial Parts 4X {'type': 'active_ingredient', 'orig_type': 'active_ingredient', 'node_type': 'drug', 'node_source': 'OpenFDA', 'node_name': 'Millefolium (Yarrow) Aerial Parts 4X'}
Passiflora incarnata (Passion flower) Aerial Parts 4X {'type': 'active_ingredient', 'orig_type': 'active_ingredient', 'node_type': 'drug', 'node_source': 'OpenFDA', 'node_name': 'Passiflora incarnata (Passion f

In [19]:
for i, (u, v, data) in enumerate(G.edges(data=True)):
    print(u, "->", v, data)
    if i >= 5:
        break

all_edge_keys = {k for _, _, d in G.edges(data=True) for k in d.keys()}
print("Edge attribute keys:", all_edge_keys)


(WHOLE PLANT) -> Lactuca virosa (Poisonous lettuce) Whole Plant 4X {'relation': 'metaedge', 'openfda_relation': 'active_ingredient'}
(WHOLE PLANT) -> Melissa officinalis (Lemon balm) Leaf/Young Shoot 4X {'relation': 'metaedge', 'openfda_relation': 'active_ingredient'}
(WHOLE PLANT) -> Millefolium (Yarrow) Aerial Parts 4X {'relation': 'metaedge', 'openfda_relation': 'active_ingredient'}
(WHOLE PLANT) -> Passiflora incarnata (Passion flower) Aerial Parts 4X {'relation': 'metaedge', 'openfda_relation': 'active_ingredient'}
(WHOLE PLANT) -> Thymus vulgaris Aerial Parts 4X {'relation': 'metaedge', 'openfda_relation': 'active_ingredient'}
(WHOLE PLANT) -> Valeriana officinalis (Valerian) Root 4X {'relation': 'metaedge', 'openfda_relation': 'active_ingredient'}
Edge attribute keys: {'openfda_relation', 'relation', 'subfield', 'openfda_subfield'}


In [22]:
import networkx as nx
from pathlib import Path
from collections import Counter

# --- CONFIG ---
GRAPH_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_renamed_clean.graphml"
OUT_PATH   = Path("/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_minimal.graphml")
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# --- Load ---
G = nx.read_graphml(GRAPH_PATH)

# --- Node cleanup ---
keep_node_keys = {"node_type", "node_name", "node_source"}
for n, attrs in list(G.nodes(data=True)):
    cleaned = {k: v for k, v in attrs.items() if k in keep_node_keys}
    if not cleaned.get("node_name"):
        cleaned["node_name"] = str(n)
    G.nodes[n].clear()
    G.nodes[n].update(cleaned)

# --- Edge cleanup ---
keep_edge_keys = {"relation", "subfield"}  # you chose to retain subfield
for u, v, attrs in list(G.edges(data=True)):
    cleaned = {k: v for k, v in attrs.items() if k in keep_edge_keys}
    G.edges[u, v].clear()
    G.edges[u, v].update(cleaned)

# --- Save cleaned graph ---
nx.write_graphml(G, str(OUT_PATH))

# --- Build summary ---
node_types_after = Counter(nx.get_node_attributes(G, "node_type").values())
edge_types_now   = Counter(nx.get_edge_attributes(G, "relation").values())

summary = {
    "graph_kind": "DiGraph",
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "node_types_count": len(node_types_after),
    "node_types": [k for k, _ in sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))],
    "relation_types_count": len(edge_types_now),
    "relation_types": [k for k, _ in sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))],
    "keys_used": {
        "node_type_key_out": "node_type",
        "node_name_key": "node_name",
        "node_source_key": "node_source",
        "edge_rel_key": "relation",
        "edge_subfield_key": "subfield",
    },
    "node_type_counts_after_top": sorted(node_types_after.items(), key=lambda kv: (-kv[1], kv[0]))[:20],
    "edge_type_counts_current_top": sorted(edge_types_now.items(), key=lambda kv: (-kv[1], kv[0]))[:20],
    "saved_to": str(OUT_PATH),
}

# --- Inspect keys (FIXED UNPACKING HERE) ---
print("=== Node attribute keys (after cleanup) ===")
print({k for _, attrs in G.nodes(data=True) for k in attrs.keys()})

print("\n=== Edge attribute keys (after cleanup) ===")
print({k for _, _, attrs in G.edges(data=True) for k in attrs.keys()})

print("\nSUMMARY:\n", summary)


=== Node attribute keys (after cleanup) ===
{'node_type', 'node_source', 'node_name'}

=== Edge attribute keys (after cleanup) ===
{'relation', 'subfield'}

SUMMARY:
 {'graph_kind': 'DiGraph', 'nodes': 100495, 'edges': 160229, 'node_types_count': 4, 'node_types': ['drug', 'effect/phenotype', 'exposure', 'disease'], 'relation_types_count': 5, 'relation_types': ['drug_effect', 'metaedge', 'indication', 'contraindication', 'drug_drug'], 'keys_used': {'node_type_key_out': 'node_type', 'node_name_key': 'node_name', 'node_source_key': 'node_source', 'edge_rel_key': 'relation', 'edge_subfield_key': 'subfield'}, 'node_type_counts_after_top': [('drug', 55430), ('effect/phenotype', 27473), ('exposure', 16513), ('disease', 1079)], 'edge_type_counts_current_top': [('drug_effect', 64908), ('metaedge', 62359), ('indication', 23823), ('contraindication', 9094), ('drug_drug', 45)], 'saved_to': '/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_minimal.graphml'}


In [23]:
import networkx as nx
from collections import Counter

G = nx.read_graphml("/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_minimal.graphml")  # or IN_PATH if you kept metaedge
triplets = Counter(
    (G.nodes[u]["node_type"], a.get("relation"), G.nodes[v]["node_type"])
    for u, v, a in G.edges(data=True)
)

print("Unique triplet types:", len(triplets))
for (src, rel, dst), c in triplets.most_common(20):
    print(f"({src}, {rel}, {dst}): {c}")

isolates = [n for n in G.nodes if G.degree(n)==0]
print("Isolates:", len(isolates))


Unique triplet types: 21
(drug, drug_effect, drug): 32458
(drug, drug_effect, effect/phenotype): 31966
(drug, metaedge, drug): 26776
(drug, metaedge, exposure): 26775
(drug, indication, effect/phenotype): 23746
(drug, metaedge, effect/phenotype): 8585
(drug, contraindication, effect/phenotype): 5962
(drug, contraindication, disease): 3124
(drug, drug_effect, exposure): 285
(drug, drug_effect, disease): 140
(effect/phenotype, metaedge, drug): 135
(effect/phenotype, indication, effect/phenotype): 75
(effect/phenotype, drug_effect, effect/phenotype): 58
(effect/phenotype, metaedge, exposure): 58
(drug, drug_drug, drug): 45
(effect/phenotype, metaedge, effect/phenotype): 27
(effect/phenotype, contraindication, effect/phenotype): 6
(drug, metaedge, disease): 3
(effect/phenotype, contraindication, disease): 2
(drug, indication, disease): 2
Isolates: 0


In [24]:
import networkx as nx
from pathlib import Path
from collections import Counter

# === CONFIG ===
IN_PATH  = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_minimal.graphml"
OUT_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_no_drop.graphml"

# === Load ===
G = nx.read_graphml(IN_PATH)

node_type_key = "node_type"
edge_rel_key  = "relation"
edge_subfield_key = "subfield"

def ntype(n):
    return G.nodes[n].get(node_type_key, "unknown")

# Canonical set we prefer to end up with
CANON = {"drug_effect", "indication", "contraindication", "drug_drug", "drug_disease", "metaedge"}

def upgrade_metaedge(u, v):
    """Heuristic reclassification for 'metaedge' using node types; falls back to 'metaedge'."""
    su, sv = ntype(u), ntype(v)
    if su == "drug" and sv == "drug":
        return "drug_drug"
    if su == "drug" and sv == "disease":
        return "indication"          # neutral interpretation
    if su == "drug" and sv == "effect/phenotype":
        return "drug_effect"
    if su == "drug" and sv == "exposure":
        return "drug_effect"         # treat exposure outcomes as effects
    if su == "drug" and sv not in {"drug","disease","effect/phenotype","exposure"}:
        return "drug_effect"         # best-effort
    if su == "effect/phenotype" and sv == "drug":
        return "drug_effect"
    # No confident mapping → preserve metaedge
    return "metaedge"

def relabel(u, v, old_rel):
    su, sv = ntype(u), ntype(v)
    r = old_rel

    # Normalize whitespace/case
    if r is None:
        r = ""
    r = str(r).strip()

    # 1) Metaedge: try to upgrade; else keep metaedge
    if r == "metaedge":
        return upgrade_metaedge(u, v)

    # 2) drug_effect refinements (never drop)
    if r == "drug_effect":
        if su == "drug" and sv == "drug":
            return "drug_drug"
        if su == "drug" and sv == "disease":
            return "drug_disease"
        return "drug_effect"

    # 3) indication refinements (never drop)
    if r == "indication":
        if su == "drug" and sv == "disease":
            return "indication"
        if su == "drug" and sv == "effect/phenotype":
            return "drug_effect"
        # preserve if weird combo
        return r if r in CANON else "metaedge"

    # 4) contraindication refinements (never drop)
    if r == "contraindication":
        if su == "drug" and sv == "disease":
            return "contraindication"
        if su == "drug" and sv == "effect/phenotype":
            return "drug_effect"
        return r if r in CANON else "metaedge"

    # 5) drug_drug: keep when drug↔drug else keep original (or metaedge)
    if r == "drug_drug":
        if su == "drug" and sv == "drug":
            return "drug_drug"
        return r if r in CANON else "metaedge"

    # 6) drug_disease already canonical; keep
    if r == "drug_disease":
        return "drug_disease"

    # 7) Anything else: keep if already canonical; otherwise bucket to metaedge
    return r if r in CANON else "metaedge"

# === BEFORE snapshot ===
rels_before = Counter(nx.get_edge_attributes(G, edge_rel_key).values())
E_before = G.number_of_edges()
print("=== BEFORE ===")
print("Edges:", E_before)
print("Relation counts:", dict(sorted(rels_before.items(), key=lambda kv:(-kv[1],kv[0]))))

# === Relabel in place; NO DROPS ===
changed = 0
for u, v, attrs in G.edges(data=True):
    old_rel = attrs.get(edge_rel_key, "")
    new_rel = relabel(u, v, old_rel)
    if new_rel != old_rel:
        G.edges[u, v][edge_rel_key] = new_rel
        changed += 1

# === AFTER snapshot (no dedup, no removal) ===
rels_after = Counter(nx.get_edge_attributes(G, edge_rel_key).values())
E_after = G.number_of_edges()

print("\n=== AFTER (no drop) ===")
print("Edges:", E_after, "| changed:", changed, "| dropped:", 0)
print("Relation counts:", dict(sorted(rels_after.items(), key=lambda kv:(-kv[1],kv[0]))))

# Quick triplet schema peek
triplets = Counter()
for u, v, attrs in G.edges(data=True):
    triplets[(ntype(u), attrs.get(edge_rel_key), ntype(v))] += 1

print("\nUnique triplet types:", len(triplets))
for (s, r, d), c in sorted(triplets.items(), key=lambda kv:(-kv[1], kv[0]))[:20]:
    print(f"({s}, {r}, {d}): {c}")

# === Summary object ===
summary = {
    "graph_kind": "DiGraph",
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "node_types_count": len(set(nx.get_node_attributes(G, node_type_key).values())),
    "node_types": sorted(set(nx.get_node_attributes(G, node_type_key).values())),
    "relation_types_count": len(rels_after),
    "relation_types": [k for k,_ in sorted(rels_after.items(), key=lambda kv:(-kv[1],kv[0]))],
    "keys_used": {
        "node_type_key_out": node_type_key,
        "edge_rel_key_out": edge_rel_key,
        "edge_subfield_key": edge_subfield_key
    },
    "policy": {
        "no_drop": True,
        "metaedge_upgrade": "heuristic by node types; fallback to 'metaedge'",
        "preserve_subfield": True
    },
    "saved_to": str(OUT_PATH)
}

# === Save ===
nx.write_graphml(G, OUT_PATH)
print("\nSaved:", OUT_PATH)
print("\nSUMMARY:\n", summary)


=== BEFORE ===
Edges: 160229
Relation counts: {'drug_effect': 64908, 'metaedge': 62359, 'indication': 23823, 'contraindication': 9094, 'drug_drug': 45}

=== AFTER (no drop) ===
Edges: 160229 | changed: 124580 | dropped: 0
Relation counts: {'drug_effect': 97513, 'drug_drug': 59279, 'contraindication': 3132, 'drug_disease': 140, 'metaedge': 85, 'indication': 80}

Unique triplet types: 14
(drug, drug_effect, effect/phenotype): 70259
(drug, drug_drug, drug): 59279
(drug, drug_effect, exposure): 27060
(drug, contraindication, disease): 3124
(drug, drug_disease, disease): 140
(effect/phenotype, drug_effect, drug): 135
(effect/phenotype, indication, effect/phenotype): 75
(effect/phenotype, drug_effect, effect/phenotype): 58
(effect/phenotype, metaedge, exposure): 58
(effect/phenotype, metaedge, effect/phenotype): 27
(effect/phenotype, contraindication, effect/phenotype): 6
(drug, indication, disease): 5
(effect/phenotype, contraindication, disease): 2
(effect/phenotype, drug_effect, exposure)

In [26]:
import networkx as nx
from collections import Counter
import json

# --- CONFIG ---
GRAPH_PATH = "/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_no_drop.graphml"

# --- Load ---
G = nx.read_graphml(GRAPH_PATH)

node_type_key = "node_type"
edge_rel_key = "relation"

# --- Basic counts ---
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
kind = "DiGraph" if not G.is_multigraph() else "MultiDiGraph"

# --- Node & relation type inventories ---
node_types = Counter(nx.get_node_attributes(G, node_type_key).values())
relation_types = Counter(nx.get_edge_attributes(G, edge_rel_key).values())

node_types_list = sorted(node_types.items(), key=lambda kv: (-kv[1], kv[0]))
relation_types_list = sorted(relation_types.items(), key=lambda kv: (-kv[1], kv[0]))

# --- Connectivity ---
comp = {}
UG = G.to_undirected()
comps = list(nx.connected_components(UG))
comp["components"] = len(comps)
comp["largest_component_size"] = max(len(c) for c in comps)
comp["largest_component_fraction"] = comp["largest_component_size"] / n_nodes

# Orphan nodes
orphans = sum(1 for _, deg in G.degree() if deg == 0)

# --- Build summary ---
summary = {
    "graph_kind": kind,
    "nodes": int(n_nodes),
    "edges": int(n_edges),

    "node_types_count": int(len(node_types_list)),
    "node_types": node_types_list,

    "relation_types_count": int(len(relation_types_list)),
    "relation_types": relation_types_list,

    "connectivity": {
        "components": int(comp["components"]),
        "largest_component_size": int(comp["largest_component_size"]),
        "largest_component_fraction": float(comp["largest_component_fraction"]),
        "orphans": int(orphans),
    },

    "keys_used": {
        "node_type_key": node_type_key,
        "edge_rel_key": edge_rel_key,
    },
}

# --- Pretty print ---
print(json.dumps(summary, indent=2))


{
  "graph_kind": "DiGraph",
  "nodes": 100495,
  "edges": 160229,
  "node_types_count": 4,
  "node_types": [
    [
      "drug",
      55430
    ],
    [
      "effect/phenotype",
      27473
    ],
    [
      "exposure",
      16513
    ],
    [
      "disease",
      1079
    ]
  ],
  "relation_types_count": 6,
  "relation_types": [
    [
      "drug_effect",
      97513
    ],
    [
      "drug_drug",
      59279
    ],
    [
      "contraindication",
      3132
    ],
    [
      "drug_disease",
      140
    ],
    [
      "metaedge",
      85
    ],
    [
      "indication",
      80
    ]
  ],
  "connectivity": {
    "components": 1910,
    "largest_component_size": 87889,
    "largest_component_fraction": 0.8745609234290264,
    "orphans": 0
  },
  "keys_used": {
    "node_type_key": "node_type",
    "edge_rel_key": "relation"
  }
}


In [27]:
import networkx as nx
from collections import Counter

G = nx.read_graphml("/Users/ganeshkumarboini/Downloads/drug_data_kg_openfda_edges_no_drop.graphml")  # or IN_PATH if you kept metaedge
triplets = Counter(
    (G.nodes[u]["node_type"], a.get("relation"), G.nodes[v]["node_type"])
    for u, v, a in G.edges(data=True)
)

print("Unique triplet types:", len(triplets))
for (src, rel, dst), c in triplets.most_common(20):
    print(f"({src}, {rel}, {dst}): {c}")

isolates = [n for n in G.nodes if G.degree(n)==0]
print("Isolates:", len(isolates))


Unique triplet types: 14
(drug, drug_effect, effect/phenotype): 70259
(drug, drug_drug, drug): 59279
(drug, drug_effect, exposure): 27060
(drug, contraindication, disease): 3124
(drug, drug_disease, disease): 140
(effect/phenotype, drug_effect, drug): 135
(effect/phenotype, indication, effect/phenotype): 75
(effect/phenotype, drug_effect, effect/phenotype): 58
(effect/phenotype, metaedge, exposure): 58
(effect/phenotype, metaedge, effect/phenotype): 27
(effect/phenotype, contraindication, effect/phenotype): 6
(drug, indication, disease): 5
(effect/phenotype, contraindication, disease): 2
(effect/phenotype, drug_effect, exposure): 1
Isolates: 0
