In [2]:
import os
import pandas as pd
from pathlib import Path

# ✅ Set data and output paths relative to notebook
NOTEBOOK_DIR = Path.cwd()
DATA_DIR = NOTEBOOK_DIR.parents[2] / "data" / "data" / "primekg"
OUTPUT_DIR = NOTEBOOK_DIR / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)

# ✅ Confirm data path
print("🔧 Data path resolved to:", DATA_DIR)
try:
    print("📂 Files in data path:", list(DATA_DIR.iterdir()))
except Exception as e:
    print("❌ Could not list data path:", e)

# ✅ Files and Labels
files = {
    "README.txt": "📘 README",
    "nodes.csv": "🧠 Nodes",
    "kg.csv": "📊 KG",
    "kg_raw.csv": "📊 KG Raw",
    "kg_grouped.csv": "📦 KG Grouped",
    "kg_grouped_diseases.csv": "🧬 KG Grouped Diseases",
    "kg_grouped_diseases_bert_map.csv": "🧬 KG Diseases BERT Map",
    "kg.giant.csv": "🗺️ KG Giant",
    "edges.csv": "🔗 Edges",
    "drug_features.csv": "💊 Drug Features",
    "disease_features.csv": "🦠 Disease Features"
}

# ✅ Load and preview files
for file, label in files.items():
    full_path = DATA_DIR / file
    print(f"\n--- {label} ({file}) ---")
    try:
        if file.endswith(".txt"):
            print(full_path.read_text().strip()[:1000])
        else:
            df = pd.read_csv(full_path)
            print(df.head())
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")


🔧 Data path resolved to: /Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg
📂 Files in data path: [PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/disease_features.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/nodes.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/kg_grouped.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/kg_grouped_diseases.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/kg_grouped_diseases_bert_map.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/kg_giant.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/kg.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg/drug_features.csv'), PosixPath('/Users/ganeshkumarboini/Downloads/KgTxAgent/main/

  df = pd.read_csv(full_path)


          relation display_relation  x_index  x_id        x_type  x_name  \
0  protein_protein              ppi        0  9796  gene/protein  PHYHIP   
1  protein_protein              ppi        1  7918  gene/protein  GPANK1   
2  protein_protein              ppi        2  8233  gene/protein   ZRSR2   
3  protein_protein              ppi        3  4899  gene/protein    NRF1   
4  protein_protein              ppi        4  5297  gene/protein   PI4KA   

  x_source  y_index   y_id        y_type  y_name y_source  
0     NCBI     8889  56992  gene/protein   KIF15     NCBI  
1     NCBI     2798   9240  gene/protein   PNMA1     NCBI  
2     NCBI     5646  23548  gene/protein   TTC33     NCBI  
3     NCBI    11592  11253  gene/protein  MAN1B1     NCBI  
4     NCBI     2122   8601  gene/protein   RGS20     NCBI  

--- 📊 KG Raw (kg_raw.csv) ---


  df = pd.read_csv(full_path)


          relation display_relation  x_id        x_type  x_name x_source  \
0  protein_protein              ppi  9796  gene/protein  PHYHIP     NCBI   
1  protein_protein              ppi  7918  gene/protein  GPANK1     NCBI   
2  protein_protein              ppi  8233  gene/protein   ZRSR2     NCBI   
3  protein_protein              ppi  4899  gene/protein    NRF1     NCBI   
4  protein_protein              ppi  5297  gene/protein   PI4KA     NCBI   

    y_id        y_type  y_name y_source  
0  56992  gene/protein   KIF15     NCBI  
1   9240  gene/protein   PNMA1     NCBI  
2  23548  gene/protein   TTC33     NCBI  
3  11253  gene/protein  MAN1B1     NCBI  
4   8601  gene/protein   RGS20     NCBI  

--- 📦 KG Grouped (kg_grouped.csv) ---


  df = pd.read_csv(full_path)


          relation display_relation  x_id        x_type  x_name x_source  \
0  protein_protein              ppi  9796  gene/protein  PHYHIP     NCBI   
1  protein_protein              ppi  7918  gene/protein  GPANK1     NCBI   
2  protein_protein              ppi  8233  gene/protein   ZRSR2     NCBI   
3  protein_protein              ppi  4899  gene/protein    NRF1     NCBI   
4  protein_protein              ppi  5297  gene/protein   PI4KA     NCBI   

    y_id        y_type  y_name y_source  
0  56992  gene/protein   KIF15     NCBI  
1   9240  gene/protein   PNMA1     NCBI  
2  23548  gene/protein   TTC33     NCBI  
3  11253  gene/protein  MAN1B1     NCBI  
4   8601  gene/protein   RGS20     NCBI  

--- 🧬 KG Grouped Diseases (kg_grouped_diseases.csv) ---
   node_id node_type                                          node_name  \
0    13924   disease                    osteogenesis imperfecta type 13   
1    11160   disease       autosomal recessive nonsyndromic deafness 15   
2     809

In [4]:

import pandas as pd
from pathlib import Path

# === Set base path to current notebook directory ===
NOTEBOOK_DIR = Path.cwd()
DATA_DIR = NOTEBOOK_DIR.parents[2] / "data" / "data" / "primekg"


# === Required CSVs ===
required_files = [
    "disease_features.csv",
    "kg.csv",
    "nodes.csv",
    "drug_features.csv"
]

# === Check if files exist ===
missing = [f for f in required_files if not (DATA_DIR / f).exists()]
if missing:
    print(f"❌ Missing required files: {missing}")
    print(f"   ➤ Please place them inside: {DATA_DIR.resolve()}")
else:
    print(f"✅ All required files found in: {DATA_DIR.resolve()}")

# === Load Data ===
if not missing:
    disease_feat = pd.read_csv(DATA_DIR / "disease_features.csv")
    kg = pd.read_csv(DATA_DIR / "kg.csv")
    nodes = pd.read_csv(DATA_DIR / "nodes.csv")
    drug_feat = pd.read_csv(DATA_DIR / "drug_features.csv")
    print("📥 Files loaded successfully.")

# === 1. Search for 'osteogenesis imperfecta' ===
term = "osteogenesis imperfecta"

mask = (
    disease_feat['mondo_name'].str.contains(term, case=False, na=False) |
    disease_feat['umls_description'].str.contains(term, case=False, na=False) |
    disease_feat['orphanet_definition'].str.contains(term, case=False, na=False)
)
match = disease_feat[mask]

if match.empty:
    print(f"❌ No disease entry found for '{term}'.")
    exit()

print(f"✅ Found {len(match)} entries for '{term}'.")

# === 2. Get node index ===
indices = match['node_index'].unique().tolist()
print(f"Node indices for '{term}': {indices}")

# === 3. Find edges ===
assoc_edges = kg[(kg['x_index'].isin(indices)) | (kg['y_index'].isin(indices))]
print(f"✅ Found {len(assoc_edges)} edges connected to '{term}'.")

# === 4. Get all other node indices ===
assoc_node_indices = set(pd.concat([assoc_edges['x_index'], assoc_edges['y_index']])) - set(indices)

# === 5. Get associated node details ===
assoc_nodes = nodes[nodes['node_index'].isin(assoc_node_indices)]

# === 6. Show drugs ===
drugs = assoc_nodes[assoc_nodes['node_type'] == 'drug']
if not drugs.empty:
    drugs_with_desc = drugs.merge(drug_feat, on='node_index', how='left')
    print("\n💊 Drugs associated:")
    print(drugs_with_desc[['node_name', 'description', 'indication']])
else:
    print("\nℹ️ No drugs associated.")

# === 7. Show genes/proteins ===
genes = assoc_nodes[assoc_nodes['node_type'] == 'gene/protein']
if not genes.empty:
    print("\n🧬 Genes/Proteins associated:")
    print(genes[['node_name']])
else:
    print("\nℹ️ No genes/proteins associated.")

# === 8. Disease details ===
print("\n📝 Disease details:")
with pd.option_context('display.max_colwidth', 500):
    print(match.T)


✅ All required files found in: /Users/ganeshkumarboini/Downloads/KgTxAgent/main/src/data/data/primekg


  kg = pd.read_csv(DATA_DIR / "kg.csv")


📥 Files loaded successfully.
✅ Found 143 entries for 'osteogenesis imperfecta'.
Node indices for 'osteogenesis imperfecta': [27263, 28264, 28337, 29736, 30423, 31891, 32372, 33478, 35787, 35854, 39782, 84257, 27158, 27239, 27314, 27954, 30606, 33025]
✅ Found 1602 edges connected to 'osteogenesis imperfecta'.

💊 Drugs associated:
              node_name                                        description  \
0            Dicoumarol  Dicoumarol is an oral anticoagulant agent that...   
1             Diltiazem  Diltiazem is a benzothiazepine derivative with...   
2              Sulindac  Sulindac is a nonsteroidal anti-inflammatory d...   
3             Verapamil  Verapamil is a phenylalkylamine calcium channe...   
4              Warfarin  Warfarin is an anticoagulant drug normally use...   
5             Tretinoin  Tretinoin, also known as all-trans-retinoic ac...   
6             Quinapril  Quinapril is the ethyl ester prodrug of the no...   
7          Isotretinoin  Isotretinoin is a re

In [5]:
import pandas as pd
from pathlib import Path

def main():
    # === CONFIG ===
    NOTEBOOK_DIR = Path.cwd()
    DATA_DIR = NOTEBOOK_DIR.parents[2] / "data" / "data" / "primekg"

    # === Required files ===
    required_files = [
        "kg.csv",
        "nodes.csv",
        "drug_features.csv",
        "disease_features.csv"
    ]

    # === Check all files exist ===
    missing = [f for f in required_files if not (DATA_DIR / f).exists()]
    if missing:
        print(f"❌ Missing files: {missing}")
        print(f"👉 Please make sure these files are in: {DATA_DIR.resolve()}")
        return

    # === 1. Load data ===
    kg = pd.read_csv(DATA_DIR / "kg.csv")
    nodes = pd.read_csv(DATA_DIR / "nodes.csv")
    drug_feat = pd.read_csv(DATA_DIR / "drug_features.csv")
    disease_feat = pd.read_csv(DATA_DIR / "disease_features.csv")

    # === 2. Drug search term ===
    search_term = "aspirin"  # <<-- change to your drug name!

    # === 3. Find drug nodes ===
    drug_matches = nodes[
        (nodes['node_type'] == 'drug') &
        (nodes['node_name'].str.contains(search_term, case=False, na=False))
    ]

    if drug_matches.empty:
        print(f"❌ No drug entry for '{search_term}' found.")
        return

    drug_indices = drug_matches['node_index'].unique().tolist()
    print(f"\n✅ Found drug node indices: {drug_indices}")

    # === 4. Find edges connected to the drug ===
    assoc_edges = kg[
        (kg['x_index'].isin(drug_indices)) | (kg['y_index'].isin(drug_indices))
    ]
    print(f"✅ Found {len(assoc_edges)} associations for '{search_term}'.")

    # === 5. Get other connected nodes ===
    assoc_node_indices = set(pd.concat([assoc_edges['x_index'], assoc_edges['y_index']])) - set(drug_indices)
    assoc_nodes = nodes[nodes['node_index'].isin(assoc_node_indices)]

    # === 6. Connected diseases ===
    diseases = assoc_nodes[assoc_nodes['node_type'] == 'disease']
    if not diseases.empty:
        diseases_with_desc = diseases.merge(disease_feat, on='node_index', how='left')
        print("\n🦠 Diseases connected to this drug:")
        with pd.option_context('display.max_colwidth', 1000):
            print(diseases_with_desc[['node_name', 'mondo_definition', 'umls_description']])
    else:
        print("\nℹ️ No diseases connected.")

    # === 7. Connected genes/proteins ===
    genes = assoc_nodes[assoc_nodes['node_type'] == 'gene/protein']
    if not genes.empty:
        print("\n🧬 Genes/proteins connected to this drug:")
        print(genes[['node_name']])
    else:
        print("\nℹ️ No genes/proteins connected.")

    # === 8. Connected other drugs ===
    other_drugs = assoc_nodes[assoc_nodes['node_type'] == 'drug']
    if not other_drugs.empty:
        other_drugs_desc = other_drugs.merge(drug_feat, on='node_index', how='left')
        print("\n💊 Other drugs connected to this drug:")
        print(other_drugs_desc[['node_name', 'description', 'indication']])
    else:
        print("\nℹ️ No other drugs connected.")

    # === 9. Show details for the searched drug ===
    drug_details = drug_matches.merge(drug_feat, on='node_index', how='left')
    print("\n🔎 Details for the searched drug:")
    with pd.option_context('display.max_colwidth', 1000):
        print(drug_details.T)

if __name__ == "__main__":
    main()


  kg = pd.read_csv(DATA_DIR / "kg.csv")



✅ Found drug node indices: [16008, 20756]
✅ Found 6176 associations for 'aspirin'.

ℹ️ No diseases connected.

🧬 Genes/proteins connected to this drug:
     node_name
9031     PTGS1

💊 Other drugs connected to this drug:
                        node_name  \
0                     Flunisolide   
1                 Fluorometholone   
2     Beclomethasone dipropionate   
3                   Betamethasone   
4          Fluticasone propionate   
...                           ...   
1592  Potassium Guaiacolsulfonate   
1593                    Enprostil   
1594       Potassium permanganate   
1595          Potassium carbonate   
1596          Potassium triiodide   

                                            description  \
0     Flunisolide (marketed as AeroBid, Nasalide, Na...   
1     A glucocorticoid employed, usually as eye drop...   
2     Beclomethasone dipropionate is a second-genera...   
3     Betamethasone is a long-acting corticosteroid ...   
4     Fluticasone propionate is a synt

In [4]:
import pandas as pd
from pathlib import Path

def main():
    # === CONFIG ===
    NOTEBOOK_DIR = Path.cwd()
    DATA_DIR = NOTEBOOK_DIR.parents[2] / "Code" / "data" / "primekg"

    # === Required files ===
    required_files = [
        "kg.csv",
        "nodes.csv",
        "drug_features.csv",
        "disease_features.csv"
    ]

    # === Check all files exist ===
    missing = [f for f in required_files if not (DATA_DIR / f).exists()]
    if missing:
        print(f"❌ Missing files: {missing}")
        print(f"👉 Please make sure these files are in: {DATA_DIR.resolve()}")
        return

    # === 1. Load data ===
    kg = pd.read_csv(DATA_DIR / "kg.csv")
    nodes = pd.read_csv(DATA_DIR / "nodes.csv")
    drug_feat = pd.read_csv(DATA_DIR / "drug_features.csv")
    disease_feat = pd.read_csv(DATA_DIR / "disease_features.csv")

    # === 2. Set your search term ===
    search_term = "BRCA1"  # 👈 change this to your gene/protein name

    # === 3. Find gene/protein node(s) ===
    gene_matches = nodes[
        (nodes['node_type'] == 'gene/protein') &
        (nodes['node_name'].str.contains(search_term, case=False, na=False))
    ]

    if gene_matches.empty:
        print(f"❌ No gene/protein entry found for '{search_term}'.")
        return

    gene_indices = gene_matches['node_index'].unique().tolist()
    print(f"\n✅ Found gene/protein node indices: {gene_indices}")

    # === 4. Find edges ===
    assoc_edges = kg[
        (kg['x_index'].isin(gene_indices)) | (kg['y_index'].isin(gene_indices))
    ]
    print(f"✅ Found {len(assoc_edges)} associations for '{search_term}'.")

    # === 5. Get other connected nodes ===
    assoc_node_indices = set(pd.concat([assoc_edges['x_index'], assoc_edges['y_index']])) - set(gene_indices)
    assoc_nodes = nodes[nodes['node_index'].isin(assoc_node_indices)]

    # === 6. Connected diseases ===
    diseases = assoc_nodes[assoc_nodes['node_type'] == 'disease']
    if not diseases.empty:
        diseases_with_desc = diseases.merge(disease_feat, on='node_index', how='left')
        print("\n🦠 Diseases connected to this gene/protein:")
        with pd.option_context('display.max_colwidth', 1000):
            print(diseases_with_desc[['node_name', 'mondo_definition', 'umls_description']])
    else:
        print("\nℹ️ No diseases connected.")

    # === 7. Connected drugs ===
    drugs = assoc_nodes[assoc_nodes['node_type'] == 'drug']
    if not drugs.empty:
        drugs_with_desc = drugs.merge(drug_feat, on='node_index', how='left')
        print("\n💊 Drugs connected to this gene/protein:")
        print(drugs_with_desc[['node_name', 'description', 'indication']])
    else:
        print("\nℹ️ No drugs connected.")

    # === 8. Connected other genes/proteins ===
    other_genes = assoc_nodes[assoc_nodes['node_type'] == 'gene/protein']
    if not other_genes.empty:
        print("\n🧬 Other genes/proteins connected to this gene/protein:")
        print(other_genes[['node_name']])
    else:
        print("\nℹ️ No other genes/proteins connected.")

    # === 9. Show details for the searched gene/protein ===
    print("\n🔎 Details for the searched gene/protein:")
    with pd.option_context('display.max_colwidth', 1000):
        print(gene_matches.T)

if __name__ == "__main__":
    main()


  kg = pd.read_csv(DATA_DIR / "kg.csv")



✅ Found gene/protein node indices: [554]
✅ Found 1944 associations for 'BRCA1'.

🦠 Diseases connected to this gene/protein:
                                node_name  \
0    Fanconi anemia complementation group   
1    Fanconi anemia complementation group   
2    Fanconi anemia complementation group   
3    Fanconi anemia complementation group   
4    Fanconi anemia complementation group   
..                                    ...   
369              ovarian gynandroblastoma   
370         benign breast phyllodes tumor   
371                   adenoma of pancreas   
372                sporadic breast cancer   
373                sporadic breast cancer   

                                                                                                                                                                                                                                                                                                                                              