In [1]:
import os
dir_path = os.getcwd()
print("The directory of this script is:", dir_path)
root_path = os.path.dirname(dir_path)
print("The root directory is:", root_path)

The directory of this script is: c:\Users\HP\Desktop\Projects\NodeRAG\graphs
The root directory is: c:\Users\HP\Desktop\Projects\NodeRAG


In [4]:
import pandas as pd
medical_responses = pd.read_parquet(f"{root_path}\\text_decomposition/medical_responses_cleaned.parquet")
novel_responses = pd.read_parquet(f"{root_path}\\text_decomposition/novel_responses_cleaned.parquet")

In [5]:
medical_responses

Unnamed: 0,index,response,cleaned_response,is_valid
0,0,"[\n{\n""semantic_unit"": ""Basal cell skin cancer...","[\n{\n""semantic_unit"": ""Basal cell skin cancer...",A valid decomposition response
1,1,"[\n{\n""semantic_unit"": ""The hypodermis, the de...","[\n{\n""semantic_unit"": ""The hypodermis, the de...",A valid decomposition response
2,2,"```json\n[\n {\n ""semantic_unit"": ""Basal c...","[\n {\n ""semantic_unit"": ""Basal cell skin ...",A valid decomposition response
3,3,"[\n{\n""semantic_unit"": ""Recurrence of basal ce...","[\n{\n""semantic_unit"": ""Recurrence of basal ce...",A valid decomposition response
4,4,"[\n{\n""semantic_unit"": ""This section discusses...","[\n{\n""semantic_unit"": ""This section discusses...",A valid decomposition response
...,...,...,...,...
549,549,"[\n{\n""semantic_unit"": ""The patient is inquiri...","[\n{\n""semantic_unit"": ""The patient is inquiri...",A valid decomposition response
550,550,"[\n{\n""semantic_unit"": ""The text presents a se...","[\n{\n""semantic_unit"": ""The text presents a se...",A valid decomposition response
551,551,"[\n {\n ""semantic_unit"": ""The text poses q...","[\n {\n ""semantic_unit"": ""The text poses q...",A valid decomposition response
552,552,"[\n{\n""semantic_unit"": ""The patient is inquiri...","[\n{\n""semantic_unit"": ""The patient is inquiri...",A valid decomposition response


In [6]:
novel_responses

Unnamed: 0,index,response,cleaned_response,is_valid
0,0,"```json\n[\n {\n ""semantic_unit"": ""This do...","[\n {\n ""semantic_unit"": ""This document wa...",A valid decomposition response
1,1,"[\n{\n""semantic_unit"": ""In December 1881, Augu...","[\n{\n""semantic_unit"": ""In December 1881, Augu...",A valid decomposition response
2,2,"[\n{\n""semantic_unit"": ""The inhabitants of Yuc...","[\n{\n""semantic_unit"": ""The inhabitants of Yuc...",A valid decomposition response
3,3,"[\n{\n""semantic_unit"": ""The author describes v...","[\n{\n""semantic_unit"": ""The author describes v...",A valid decomposition response
4,4,"```json\n[\n {\n ""semantic_unit"": ""Aborigi...","[\n {\n ""semantic_unit"": ""Aborigines tell ...",A valid decomposition response
...,...,...,...,...
2624,2624,"[\n{\n""semantic_unit"": ""The exact origin of th...","[\n{\n""semantic_unit"": ""The exact origin of th...",A valid decomposition response
2625,2625,"```json\n[\n {\n ""semantic_unit"": ""Upon ro...","[\n {\n ""semantic_unit"": ""Upon rounding a ...",A valid decomposition response
2626,2626,"[\n{\n""semantic_unit"": ""The narrator and compa...","[\n{\n""semantic_unit"": ""The narrator and compa...",A valid decomposition response
2627,2627,"[\n{\n""semantic_unit"": ""The author describes a...","[\n{\n""semantic_unit"": ""The author describes a...",A valid decomposition response


In [7]:
import sys
sys.path.append(root_path)
from graphs.Node import Node

In [8]:
import re
def entities_in_relationship(rel, entities):
    rel_text = rel.content.lower()
    found = []
    for e in entities:
        pattern = r'\b' + re.escape(e.content.lower()) + r'\b'
        if re.search(pattern, rel_text):
            found.append(e)
    return found

In [9]:
import json
from tqdm import tqdm
def build_nodes(df, source_name):
    nodes = dict()
    entity_nodes = dict()
    for idx in tqdm(range(len(df))):
        row = df.iloc[idx]
        response = row["cleaned_response"]
        response = json.loads(response)
        for unit_idx,unit in enumerate(response):
            #data
            semantic_unit = unit["semantic_unit"]
            entities = unit["entities"]
            relationships = unit["relationships"]

            #ids
            source_id = f"{source_name}-{idx}"
            semantic_id = f"{source_id}-S-{unit_idx}"
            #entities_ids = [f"{semantic_id}-N-{e_idx}" for e_idx in range(len(entities))]
            relationships_ids = [f"{semantic_id}-R-{r_idx}" for r_idx in range(len(relationships))]

            #create semantic node
            semantic_node = Node(
                id=semantic_id,
                node_type = "S",
                source = source_id,
                content = semantic_unit
            )

            #create entity nodes
            current_entity_nodes = []
            for e_idx, entity in enumerate(entities):
                if entity not in entity_nodes:
                    entity_node = Node(
                        id = f"{source_name}-N-{len(entity_nodes)}",
                        node_type = "N",
                        source = "",
                        content = entity
                    )
                    entity_nodes[entity] = entity_node
                else:
                    entity_node = entity_nodes[entity]
                current_entity_nodes.append(entity_node)

            #create relationship nodes
            relationship_nodes = []
            for r_idx, relationship in enumerate(relationships):
                relationship_node = Node(
                    id = relationships_ids[r_idx],
                    node_type = "R",
                    source = source_id,
                    content = relationship
                )
                relationship_nodes.append(relationship_node)

            #link nodes
            for entity_node in current_entity_nodes:
                semantic_node.link(entity_node)
                entity_node.link(semantic_node)
            
            for relationship_node in relationship_nodes:
                ents = entities_in_relationship(relationship_node, current_entity_nodes)
                for ent in ents:
                    relationship_node.link(ent)
                    ent.link(relationship_node)
            
            current_nodes = [semantic_node] + relationship_nodes
            for n in current_nodes:
                nodes[n.id] = n
    nodes.update({v.id: v for v in entity_nodes.values()})
    entities_dict = {k: v.id for k, v in entity_nodes.items()}
    return nodes, entities_dict

In [10]:
medical_nodes, medical_entities = build_nodes(medical_responses, "medical")

100%|██████████| 554/554 [00:01<00:00, 432.71it/s]


In [11]:
novel_nodes, novel_entities = build_nodes(novel_responses, "novel")

100%|██████████| 2629/2629 [00:04<00:00, 611.68it/s]


In [None]:
import pickle
with open(f"{root_path}/graphs/data/G1_medical_primary_graph.pkl", "wb") as f:
    pickle.dump(medical_nodes, f)
with open(f"{root_path}/graphs/data/medical_entities.pkl", "wb") as f:
    pickle.dump(medical_entities, f)
with open(f"{root_path}/graphs/data/G1_novel_primary_graph.pkl", "wb") as f:
    pickle.dump(novel_nodes, f)
with open(f"{root_path}/graphs/data/novel_entities.pkl", "wb") as f:
    pickle.dump(novel_entities, f)

In [None]:
with open(f"{root_path}/graphs/data/G1_medical_primary_graph.pkl", "rb") as f:
    medical_nodes = pickle.load(f)
with open(f"{root_path}/graphs/data/medical_entities.pkl", "rb") as f:
    medical_entities = pickle.load(f)

with open(f"{root_path}/graphs/data/G1_novel_primary_graph.pkl", "rb") as f:
    novel_nodes = pickle.load(f)
with open(f"{root_path}/graphs/data/novel_entities.pkl", "rb") as f:
    novel_entities = pickle.load(f)

In [14]:
def nodes_to_dataframe(nodes):
    data = []
    for node in nodes.values():
        data.append({
            "id": node.id,
            "node_type": node.node_type,
            "source": node.source,
            "content": node.content,
            "edges": node.edges
        })
    return pd.DataFrame(data)


In [15]:
medical_nodes_df = nodes_to_dataframe(medical_nodes)
medical_nodes_df = medical_nodes_df.sort_values(by=["source", "node_type", "id"]).reset_index(drop=True)
medical_nodes_df

Unnamed: 0,id,node_type,source,content,edges
0,medical-N-0,N,,BASAL CELL SKIN CANCER,"{'medical-0-S-0': 1, 'medical-0-S-0-R-0': 1, '..."
1,medical-N-1,N,,BASAL CELL CARCINOMA (BCC),{'medical-0-S-0': 1}
2,medical-N-10,N,,ARMS,"{'medical-0-S-1': 1, 'medical-0-S-1-R-5': 1, '..."
3,medical-N-100,N,,METABOLISM,"{'medical-5-S-0': 1, 'medical-5-S-0-R-6': 1, '..."
4,medical-N-1000,N,,EARLY OR LOCALLY ADVANCED NSCLC,"{'medical-49-S-3': 1, 'medical-49-S-3-R-5': 1,..."
...,...,...,...,...,...
31436,medical-99-S-3-R-6,R,medical-99,"HIGHER LEVELS of CREATININE, mean the, KIDNEYS...","{'medical-N-262': 1, 'medical-N-1880': 1}"
31437,medical-99-S-0,S,medical-99,"Gathering family health history, including spe...","{'medical-N-208': 1, 'medical-N-212': 1, 'medi..."
31438,medical-99-S-1,S,medical-99,Blood tests are performed to detect disease an...,"{'medical-N-174': 1, 'medical-N-321': 1, 'medi..."
31439,medical-99-S-2,S,medical-99,CBC measurements of particular attention inclu...,"{'medical-N-1878': 1, 'medical-N-1875': 1, 'me..."


In [16]:
medical_entities

{'BASAL CELL SKIN CANCER': 'medical-N-0',
 'BASAL CELL CARCINOMA (BCC)': 'medical-N-1',
 '3 MILLION CASES': 'medical-N-2',
 'UNITED STATES': 'medical-N-3',
 'SURGERY': 'medical-N-4',
 'BASAL CELLS': 'medical-N-5',
 'EPIDERMIS': 'medical-N-6',
 'FACE': 'medical-N-7',
 'HEAD': 'medical-N-8',
 'NECK': 'medical-N-9',
 'ARMS': 'medical-N-10',
 'LEGS': 'medical-N-11',
 'TRUNK': 'medical-N-12',
 'SKIN': 'medical-N-13',
 'DERMIS': 'medical-N-14',
 'HYPODERMIS': 'medical-N-15',
 'SQUAMOUS CELLS': 'medical-N-16',
 'MELANOCYTES': 'medical-N-17',
 'FAT': 'medical-N-18',
 'CONNECTIVE TISSUE': 'medical-N-19',
 'ULTRAVIOLET (UV) RAYS': 'medical-N-20',
 'SQUAMOUS CELL SKIN CANCER': 'medical-N-21',
 'MELANOMA': 'medical-N-22',
 'NCCN GUIDELINES FOR PATIENTS': 'medical-N-23',
 'NCCN.ORG/PATIENTGUIDELINES': 'medical-N-24',
 'NCCN PATIENT GUIDES FOR CANCER APP': 'medical-N-25',
 'LIGHTER SKIN': 'medical-N-26',
 'LIGHTER HAIR': 'medical-N-27',
 'LIGHTER EYES': 'medical-N-28',
 'FLAT, PALE OR YELLOW AREAS':

In [17]:
total_edges = sum(len(node.edges) for node in medical_nodes.values())//2
print("Total edges:", total_edges)


Total edges: 67566


In [18]:
novel_nodes_df = nodes_to_dataframe(novel_nodes)
novel_nodes_df = novel_nodes_df.sort_values(by=["source", "node_type", "id"]).reset_index(drop=True)
novel_nodes_df

Unnamed: 0,id,node_type,source,content,edges
0,novel-N-0,N,,JULIA MILLER,"{'novel-0-S-0': 1, 'novel-0-S-0-R-0': 1, 'nove..."
1,novel-N-1,N,,ONLINE DISTRIBUTED PROOFREADING TEAM,"{'novel-0-S-0': 1, 'novel-0-S-0-R-1': 1, 'nove..."
2,novel-N-10,N,,MAYAB,"{'novel-0-S-1': 1, 'novel-24-S-1': 1, 'novel-2..."
3,novel-N-100,N,,STREAMS,"{'novel-3-S-1': 1, 'novel-3-S-1-R-1': 1, 'nove..."
4,novel-N-1000,N,,PRISON,"{'novel-39-S-2': 1, 'novel-39-S-2-R-4': 1, 'no..."
...,...,...,...,...,...
124473,novel-999-S-0,S,novel-999,"For hours, the rebels unsuccessfully attempted...","{'novel-N-15582': 1, 'novel-N-18096': 1, 'nove..."
124474,novel-999-S-1,S,novel-999,The day's casualties included three killed and...,"{'novel-N-15514': 1, 'novel-N-15607': 1, 'nove..."
124475,novel-999-S-2,S,novel-999,Capt. Munch was injured when his horse was sho...,"{'novel-N-18430': 1, 'novel-N-13310': 1, 'nove..."
124476,novel-999-S-3,S,novel-999,"On the morning of April 7, Gen. Buell arrived,...","{'novel-N-18434': 1, 'novel-N-17888': 1, 'nove..."


In [19]:
novel_entities

{'JULIA MILLER': 'novel-N-0',
 'ONLINE DISTRIBUTED PROOFREADING TEAM': 'novel-N-1',
 'HTTP://WWW.PGDP.NET': 'novel-N-2',
 'THE INTERNET ARCHIVE/AMERICAN LIBRARIES': 'novel-N-3',
 '[TN-#]': 'novel-N-4',
 'OE LIGATURES': 'novel-N-5',
 '[SUN]': 'novel-N-6',
 '[=A]': 'novel-N-7',
 '[C]': 'novel-N-8',
 'VESTIGES OF THE MAYAS': 'novel-N-9',
 'MAYAB': 'novel-N-10',
 'ASIA': 'novel-N-11',
 'AFRICA': 'novel-N-12',
 'AUGUSTUS LE PLONGEON, M. D.': 'novel-N-13',
 'AMERICAN ANTIQUARIAN SOCIETY OF WORCESTER, MASS.': 'novel-N-14',
 'CALIFORNIA ACADEMY OF SCIENCES': 'novel-N-15',
 'SCIENTIFIC SOCIETIES': 'novel-N-16',
 'ESSAYS AND SCIENTIFIC WORKS': 'novel-N-17',
 'NEW YORK': 'novel-N-18',
 'JOHN POLHEMUS': 'novel-N-19',
 'PRINTER AND STATIONER': 'novel-N-20',
 '1881': 'novel-N-21',
 'MR. PIERRE LORILLARD': 'novel-N-22',
 'AMERICAN ARCHAEOLOGY': 'novel-N-23',
 'CENTRAL AMERICA': 'novel-N-24',
 'FOREIGN EXPLORERS': 'novel-N-25',
 'AMERICAN EXPLORERS': 'novel-N-26',
 'INCAS': 'novel-N-27',
 'PERU': 'nov

In [20]:
total_edges = sum(len(node.edges) for node in novel_nodes.values())//2
print("Total edges:", total_edges)


Total edges: 222960


In [21]:
def shallow_ppr_local(nodes_dict, entry_id, alpha = 0.5, t = 2, k = 10):
    #simulate a random walk with restarts, number of steps t, probability to stop at each node after stepping is alpha
    pi = dict()   # PPR scores: probability that the walk ends at each node
    r = {entry_id: 1.0}  # probability that the next step move to the node. at step 0, probability is 1 to move to entry node

    for _ in range(t): 
        r_next = dict()
        for node_id, residual in r.items(): #step to next node if probability residual
            pi[node_id] = pi.get(node_id, 0) + alpha * residual #increase PPR score by the probability of stopping here after step
            push_val = (1 - alpha) * residual #probability to continue walking (the remaining probability)
            node = nodes_dict[node_id]
            total_weight = node.degree
            if total_weight == 0: #stop if no neighbors (won't happen in undirected graph)
                continue
            for nbr_id, w in node.edges.items(): 
                r_next[nbr_id] = r_next.get(nbr_id, 0) + push_val * (w / total_weight) #probability to move to the neighbor using edge weight
        r = r_next
    #add remaining residual probabilities to PPR scores
    for node_id, residual in r.items():
        pi[node_id] = pi.get(node_id, 0) + residual
    top_nodes = sorted(pi.items(), key=lambda x: x[1], reverse=True)[:k]
    return dict(top_nodes)

shallow_ppr_local(novel_nodes, entry_id="novel-N-0")


{'novel-N-0': 0.6333333333333333,
 'novel-0-S-0': 0.08333333333333333,
 'novel-0-S-0-R-0': 0.08333333333333333,
 'novel-0-S-0-R-2': 0.08333333333333333,
 'novel-N-1': 0.049999999999999996,
 'novel-N-8': 0.016666666666666666,
 'novel-N-2': 0.008333333333333333,
 'novel-N-3': 0.008333333333333333,
 'novel-N-4': 0.008333333333333333,
 'novel-N-5': 0.008333333333333333}

In [22]:
import networkx as nx

G = nx.Graph()
for node_id, node in medical_nodes.items():
    G.add_node(node_id, node_type=node.node_type, content=node.content, source=node.source)
    for target_id, weight in node.edges.items():
        if target_id in medical_nodes:
            G.add_edge(node_id, target_id, weight=weight)

nx.write_gml(G, "viz/medical_graph.gml")


In [23]:
G = nx.Graph()
for node_id, node in novel_nodes.items():
    G.add_node(node_id, node_type=node.node_type, content=node.content, source=node.source)
    for target_id, weight in node.edges.items():
        if target_id in novel_nodes:
            G.add_edge(node_id, target_id, weight=weight)

nx.write_gml(G, "viz/novel_graph.gml")