In [None]:

!pip install openai
!pip install langchain
!pip install -U langchain-community
!pip install unstructured
!pip install libmagic
!pip install unstructured[pdf]
!pip install yachalk
!pip install seaborn
!pip install pyvis
!pip isntall networkx

: 

In [None]:
# Installs the ipykernel package (presumably so this environment can be used as a Jupyter kernel).
!pip install ipykernel

: 

In [2]:
import pandas as pd
import numpy as np

import os
# Make sure the folder that receives the rendered HTML graph exists before anything writes to it.
os.makedirs('./docs', exist_ok=True)
from pyvis.network import Network
# Path of the HTML file (a file, despite the variable name) that display_graph() writes.
graph_output_directory = "./docs/index_LA.html"

# Log file listing the input files that have already been processed.
LOG_FILE = './processed_files.log'

import networkx as nx
# NOTE(review): newer LangChain releases expose these loaders from
# `langchain_community.document_loaders` - confirm against the installed version.
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader, TextLoader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

import seaborn as sns
# Name of the seaborn colour palette used by colors2Community().
palette = "hls"

In [3]:
"""    # Use this cell only if Chunking text

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages[5].page_content)
"""

'    # Use this cell only if Chunking text\n\nsplitter = RecursiveCharacterTextSplitter(\n    chunk_size=1500,\n    chunk_overlap=150,\n    length_function=len,\n    is_separator_regex=False,\n)\n\npages = splitter.split_documents(documents)\nprint("Number of chunks = ", len(pages))\nprint(pages[5].page_content)\n'

In [4]:
"""     Process to automate finding and incorporating new .txt files from a directory

GRAPH_CSV_PATH = outputdirectory / "graph.csv"
CHUNKS_CSV_PATH = outputdirectory / "chunks.csv"
PROCESSED_LOG_PATH = "processed_files.log"

## --- State Management: Identify New Files ---

# Read the list of already processed files
try:
    with open(PROCESSED_LOG_PATH, 'r') as f:
        processed_files = set(f.read().splitlines())
except FileNotFoundError:
    processed_files = set()

# Get the list of all current .txt files in the directory
current_files = {str(p) for p in inputdirectory.glob("**/*.txt")}

# Determine which files are new
new_files_to_process = list(current_files - processed_files)

print(f"Found {len(new_files_to_process)} new file(s) to process.")
print(new_files_to_process)
"""

'     Process to automate finding and incorporating new .txt files from a directory\n\nGRAPH_CSV_PATH = outputdirectory / "graph.csv"\nCHUNKS_CSV_PATH = outputdirectory / "chunks.csv"\nPROCESSED_LOG_PATH = "processed_files.log"\n\n## --- State Management: Identify New Files ---\n\n# Read the list of already processed files\ntry:\n    with open(PROCESSED_LOG_PATH, \'r\') as f:\n        processed_files = set(f.read().splitlines())\nexcept FileNotFoundError:\n    processed_files = set()\n\n# Get the list of all current .txt files in the directory\ncurrent_files = {str(p) for p in inputdirectory.glob("**/*.txt")}\n\n# Determine which files are new\nnew_files_to_process = list(current_files - processed_files)\n\nprint(f"Found {len(new_files_to_process)} new file(s) to process.")\nprint(new_files_to_process)\n'

In [5]:
import os
import json
from typing import List, Tuple, Dict, Set
import openai
# python-dotenv exposes `load_dotenv` from the `dotenv` module; this is the same
# import the OpenAI client-setup cell of this notebook uses.
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
# Key used by the module-level `openai.chat.completions` calls.
openai.api_key = os.getenv('OPENAI_API_KEY')
# Default chat model for LLMClient.
MODEL_NAME = 'gpt-4.1-nano'

class PromptEngine:
    """Creates the element-combination prompts and remembers answered pairs.

    ``cache`` maps an ``(element_1, element_2)`` tuple to the stored result
    text. Pairs that already have a cache entry are skipped by :meth:`build`.
    """

    def __init__(self):
        self.cache: Dict[Tuple[str, str], str] = dict()

    def build(self, pairs: List[Tuple[str, str]]) -> List[Tuple[Tuple[str, str], str]]:
        """Return ``((e1, e2), prompt)`` entries for every pair without a cached result."""
        pending = []
        for pair in pairs:
            e1, e2 = pair
            if (e1, e2) in self.cache:
                # This exact ordered pair was answered before - no prompt needed.
                continue
            prompt = f"""
                You are a creative alchemist. Your job is to invent new elements by combining two existing ones in imaginative yet plausible ways.

                When given two elements, combine their characteristics to create a new, unique element.
                Express each result as:
                `[Element1] and [Element2] gives me [ResultElement]`

                **Rules:**

                1. For each new element generated, attempt to combine it with each of the original input elements (the base set), but **do not** combine new elements with each other unless one is from the base set.
                2. Continue this process for several generations, always only pairing a newly created element with any element from the original base set.
                3. Each combination should produce a plausible new element, grounded in the properties or concepts of the ingredients.
                4. Do not repeat combinations or reverse orderings (e.g., if "Water and Fire" is done, skip "Fire and Water").

                **Template for output:**

                ```
                [Element1] and [Element2] gives me [ResultElement]
                ```

                **Input Elements:**

                * {e1}
                * {e2}

                **Step-by-step:**

                1. Combine {e1} and {e2} to create [O1].
                2. Combine [O1] with {e1} (if not already used).
                3. Combine [O1] with {e2} (if not already used).
                4. For each new element produced, only combine with the original input elements, not with other new elements.

                ---

                **Example Workflow:**
                Suppose Input Elements: **Fire** and **Water**

                ```
                Fire and Water gives me Steam
                Steam and Fire gives me Energy
                Steam and Water gives me Cloud
                ```
             """
            pending.append(((e1, e2), prompt))
        return pending

    def cache_result(self, pair: Tuple[str, str], result: str):
        """Store ``result`` for ``pair`` so later :meth:`build` calls skip that pair."""
        self.cache.update({pair: result})

class LLMClient:
    """Small wrapper around the OpenAI chat-completions call used for element prompts."""

    def __init__(self, model: str = MODEL_NAME):
        """Remember which chat model to query (defaults to the notebook-level MODEL_NAME)."""
        self.model = model

    def batch_query(self, prompts: List[str]) -> List[str]:
        """Send every prompt as its own single-turn chat request.

        Args:
            prompts: Prompt texts; each becomes one ``user`` message.

        Returns:
            The stripped reply text for each prompt, in the same order. A reply
            whose message carries no text content is returned as ``""``.
        """
        responses = []
        for prompt in prompts:
            resp = openai.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
            )
            # `content` may be None; guard so one empty reply cannot abort the
            # whole batch with an AttributeError on `.strip()`.
            content = resp.choices[0].message.content
            responses.append((content or "").strip())
        return responses



In [6]:
import uuid

def text2Dataframe(text_chunks) -> pd.DataFrame:
    """Turn text chunks into a DataFrame with one row per chunk.

    Args:
        text_chunks: Iterable of chunks. A chunk is either an object exposing
            ``page_content`` and ``metadata`` (the attributes read below) or a
            plain ``str``, which is treated as text without metadata.

    Returns:
        DataFrame with a ``text`` column, one column per metadata key and a
        freshly generated ``chunk_id`` (uuid4 hex) per row. An empty input
        produces an empty DataFrame.
    """
    rows = []
    for chunk in text_chunks:
        if isinstance(chunk, str):
            text, metadata = chunk, {}
        else:
            # A chunk whose metadata is None is handled like one without metadata.
            text, metadata = chunk.page_content, (chunk.metadata or {})
        # Key order matters: metadata may override "text", but never "chunk_id".
        rows.append({"text": text, **metadata, "chunk_id": uuid.uuid4().hex})

    return pd.DataFrame(rows)

In [7]:
import json
import openai
from openai import OpenAI

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()
API_KEY = os.getenv("OPENAI_API_KEY")
# Fail fast with a clear message instead of hitting an authentication error later.
if not API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set.")
# Explicit OpenAI client bound to the key read above.
client = OpenAI(api_key=API_KEY)

def generate(model, system, user):
    """Run one system + user exchange against the chat-completions API.

    Uses the notebook-level ``client`` (created from the validated
    OPENAI_API_KEY) rather than the implicit module-level OpenAI client, so the
    key check performed at client creation actually protects this call.

    Args:
        model: Name of the chat model to query.
        system: Content of the ``system`` message.
        user: Content of the ``user`` message.

    Returns:
        The text content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )
    return response.choices[0].message.content

def graphPrompt(input: str, metadata=None, model="gpt-4.1-mini"):
    """Ask the LLM for the term/relation pairs contained in one text chunk.

    Args:
        input: Text chunk to analyse (inserted between ``` delimiters).
        metadata: Optional mapping merged into every extracted relation
            (for example ``{"chunk_id": ...}``). ``None`` means no extra keys.
        model: Chat model name; ``None`` falls back to ``"gpt-4.1-mini"``.

    Returns:
        A list of dicts (the parsed relations with ``metadata`` merged in), or
        ``None`` when the reply could not be parsed as JSON relations.
    """
    if model is None:
        model = "gpt-4.1-mini"
    # Copy so the caller's mapping is never shared or mutated.
    metadata = dict(metadata or {})

    SYS_PROMPT = (
        "You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context text chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
            "\tTerms may include object, entity, location, organization, person, \n"
            "\tcondition, acronym, documents, service, concept, etc.\n"
            "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
            "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them, like the following: \n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        "]"
    )

    USER_PROMPT = f"context: ```{input}``` \n\n output: "
    response = generate(model=model, system=SYS_PROMPT, user=USER_PROMPT)

    payload = (response or "").strip()
    if payload.startswith("```"):
        # Replies are sometimes wrapped in a Markdown fence (```json ... ```);
        # keep only what is inside so json.loads can read it.
        payload = payload.strip("`").strip()
        if payload[:4].lower() == "json":
            payload = payload[4:]

    try:
        parsed = json.loads(payload)
        if isinstance(parsed, dict):
            # A single relation returned as a bare object instead of a list.
            parsed = [parsed]
        result = [dict(item, **metadata) for item in parsed]
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError/ValueError also cover
        # replies that are valid JSON but not a list of objects.
        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
        result = None
    return result

In [8]:
def colors2Community(communities) -> pd.DataFrame:
    """Assign one colour and one 1-based group number to every community.

    One hex colour per community is taken from the notebook-level seaborn
    ``palette`` and shuffled; each node of a community gets that community's
    colour and group number.

    Returns:
        DataFrame with one row per node and the columns ``node``, ``color``
        and ``group``.
    """
    shuffled_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled_colors)

    records = []
    for group, community in enumerate(communities, start=1):
        # Colours are consumed from the end of the shuffled list.
        color = shuffled_colors.pop()
        records.extend(
            {"node": node, "color": color, "group": group} for node in community
        )
    return pd.DataFrame(records)

def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    """Extract relations for every chunk in ``dataframe`` and flatten them.

    Args:
        dataframe: Frame with a ``text`` and a ``chunk_id`` column.
        model: Model name forwarded to ``graphPrompt``.

    Returns:
        One flat list with the relation dicts of all chunks. Chunks for which
        ``graphPrompt`` returned nothing (``None`` or an empty list) are
        skipped; an empty frame or a run without any usable chunk returns ``[]``
        instead of raising.
    """
    if dataframe.empty:
        return []

    concept_list = []
    for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"]):
        extracted = graphPrompt(text, {"chunk_id": chunk_id}, model)
        if extracted:
            concept_list.extend(extracted)
    return concept_list

def graph2Df(nodes_list) -> pd.DataFrame:
    """Convert extracted relation dicts into a cleaned edge DataFrame.

    Args:
        nodes_list: List of dicts carrying at least ``node_1`` and ``node_2``.

    Returns:
        DataFrame without rows whose ``node_1`` or ``node_2`` is missing or
        blank, with both node columns converted to lower-case strings. An
        empty input returns an empty frame with the columns ``node_1``,
        ``node_2``, ``edge`` and ``chunk_id``.
    """
    graph_dataframe = pd.DataFrame(nodes_list)
    if graph_dataframe.empty:
        # Without rows there are no columns either; hand back a frame with the
        # expected schema so callers do not fail on a missing column.
        return pd.DataFrame(columns=["node_1", "node_2", "edge", "chunk_id"])

    # Empty and whitespace-only strings count as missing values.
    graph_dataframe = graph_dataframe.replace(r"^\s*$", np.nan, regex=True)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])

    # Cast first: node values parsed from JSON are not guaranteed to be strings.
    return graph_dataframe.assign(
        node_1=graph_dataframe["node_1"].astype(str).str.lower(),
        node_2=graph_dataframe["node_2"].astype(str).str.lower(),
    )

def display_graph(G):
    """Load ``G`` into a pyvis ``Network`` and write it to ``graph_output_directory``.

    The network uses the force-atlas-2 based layout call below and exposes the
    "physics" option panel in the generated page.
    """
    network_options = dict(
        notebook=False,
        cdn_resources="remote",
        height="900px",
        width="100%",
        select_menu=True,
        filter_menu=False,
    )
    net = Network(**network_options)

    net.from_nx(G)
    net.force_atlas_2based(central_gravity=0.015, gravity=-31)
    net.show_buttons(filter_=["physics"])

    net.show(graph_output_directory, notebook=False)

In [9]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link terms that occur in the same text chunk.

    Args:
        df: Edge frame with the columns ``chunk_id``, ``node_1`` and ``node_2``.

    Returns:
        Frame with ``node_1``, ``node_2``, ``chunk_id`` (comma-joined chunk
        ids), ``count`` (number of co-occurrences) and ``edge`` (always
        ``"contextual proximity"``). Pairs that co-occur exactly once are
        removed.
    """
    # One row per (chunk, node), whichever side of an edge the node came from.
    node_per_chunk = df.melt(
        id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Self join on the chunk id: every pair of terms sharing a chunk becomes a row.
    pairs = node_per_chunk.merge(node_per_chunk, on="chunk_id", suffixes=("_1", "_2"))

    # A term paired with itself carries no information.
    loop_rows = pairs[pairs["node_1"] == pairs["node_2"]].index
    pairs = pairs.drop(index=loop_rows).reset_index(drop=True)

    # Collapse duplicates: remember the chunks involved and how often the pair occurred.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Keep only pairs seen more than once (this mirrors the original `!= 1` filter).
    edges = edges[edges["count"] != 1]
    return edges.assign(edge="contextual proximity")

In [10]:
def graph_viz_builder(dfg):
    """Build a NetworkX graph from an edge frame and render it with display_graph().

    Args:
        dfg: Frame with ``node_1``, ``node_2`` and ``edge`` columns and an
            optional ``count`` column. The edge weight is ``count / 4``. Rows
            without a usable count (column absent or value missing) use a count
            of 4, i.e. a weight of 1.0, so frames that come straight from the
            extraction step can be drawn as well.
    """
    default_count = 4  # chosen so that the fallback weight is 1.0
    has_count = "count" in dfg.columns

    G = nx.Graph()

    # Add nodes to the graph
    nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
    G.add_nodes_from(str(node) for node in nodes)

    # Add edges to the graph
    for _, row in dfg.iterrows():
        count = row["count"] if has_count else default_count
        if pd.isna(count):
            count = default_count
        G.add_edge(
            str(row["node_1"]),
            str(row["node_2"]),
            title=row["edge"],
            weight=count / 4,
        )

    # Disabled: Girvan-Newman community detection and colouring (could also
    # serve as a stopping condition). Kept for reference:
    #
    #   communities_generator = nx.community.girvan_newman(G)
    #   top_level_communities = next(communities_generator)
    #   next_level_communities = next(communities_generator)
    #   communities = sorted(map(sorted, next_level_communities))
    #   print("Number of Communities = ", len(communities))
    #   print(communities)
    #   df_colors = colors2Community(communities)
    #
    #   for index, row in df_colors.iterrows():
    #       G.nodes[row['node']]['group'] = row['group']
    #       G.nodes[row['node']]['color'] = row['color']
    #       G.nodes[row['node']]['size'] = G.degree[row['node']]

    display_graph(G)

In [11]:
class GraphBuilder:
    """Grows a NetworkX graph of elements by asking the LLM to combine them.

    Attributes set in ``__init__``:
        graph: The ``nx.Graph`` being built.
        prompt_engine: ``PromptEngine`` that produces the combination prompts.
        llm: ``LLMClient`` used to answer the prompts.
        seen_nodes: Nodes that have already been explored.
    """

    def __init__(self):
        self.graph = nx.Graph()
        self.prompt_engine = PromptEngine()
        self.llm = LLMClient()
        self.seen_nodes: Set[str] = set()

    def get_unexplored_nodes(self) -> List[str]:
        """Return the graph nodes that have not been explored yet."""
        return [n for n in self.graph.nodes() if n not in self.seen_nodes]

    def preprocess_response(self, raw: str) -> List[Tuple[str, str, str]]:
        """Parse ``A and B gives me C`` lines into ``(a, b, c)`` triples.

        This is the output format the PromptEngine prompt asks for. Markdown
        decoration (backticks, ``*``/``-`` bullets) is stripped, names are
        lower-cased like the base elements, and lines that do not match the
        pattern are ignored.

        Returns:
            The triples in order of appearance; ``[]`` if no line matched.
        """
        triples = []
        for line in (raw or "").splitlines():
            statement = line.strip().strip("`*- \t")
            ingredients, marker, result = statement.partition(" gives me ")
            first, joiner, second = ingredients.partition(" and ")
            if not (marker and joiner):
                continue
            triple = tuple(
                part.strip(" \t.`*").lower() for part in (first, second, result)
            )
            if all(triple):
                triples.append(triple)
        return triples

    def add_triples_to_graph(self, triples) -> None:
        """Connect both ingredients of every triple to the element they produce.

        The statement itself is stored as the edge ``title``, the same
        attribute graph_viz_builder() uses for edge labels.
        """
        for first, second, result in triples:
            statement = f"{first} and {second} gives me {result}"
            for ingredient in (first, second):
                if ingredient != result:  # no self loops
                    self.graph.add_edge(ingredient, result, title=statement)

    def _export_response_graph(self, raw: str) -> None:
        """Run the relation extraction on one response and write the CSV files.

        Writes ``graph.csv`` and ``chunks.csv`` (``|``-separated) into the
        notebook-level ``outputdirectory``, which therefore has to be defined
        before ``run_loop`` is called. Each call overwrites both files.
        """
        from types import SimpleNamespace  # local: not part of the notebook's imports

        lines = [line.strip() for line in raw.split('\n') if line.strip()]
        # text2Dataframe reads `.page_content` / `.metadata`, so wrap the plain lines.
        chunks = [SimpleNamespace(page_content=line, metadata={}) for line in lines]
        df = text2Dataframe(chunks)
        concepts_list = df2Graph(df, model='gpt-4.1-mini')
        dfg1 = graph2Df(concepts_list)

        export_dir = Path(outputdirectory)
        os.makedirs(export_dir, exist_ok=True)
        dfg1.to_csv(export_dir / "graph.csv", sep="|", index=False)
        df.to_csv(export_dir / "chunks.csv", sep="|", index=False)

    def run_loop(
        self,
        base_elements: List[str],
        max_iters: int = 5,
        batch_size: int = 2
    ):
        """Explore the graph for up to ``max_iters`` rounds and render it.

        Each round takes up to ``batch_size`` unexplored nodes, asks the LLM to
        combine each of them with every base element, parses the answers into
        triples and adds them to the graph. The loop stops early when no
        unexplored node is left. Finally the graph is written to
        ``kg_visualization_8iter.html`` with pyvis.

        Args:
            base_elements: Starting elements (stored lower-cased in ``self.base``).
            max_iters: Maximum number of exploration rounds.
            batch_size: Number of unexplored nodes handled per round.
        """
        import time  # local: the notebook's import cell does not load `time`

        self.base = [e.lower() for e in base_elements]
        self.graph.add_nodes_from(self.base)
        print(f"Seeded graph with base elements: {base_elements}")

        for iteration in range(1, max_iters + 1):
            to_explore = self.get_unexplored_nodes()[:batch_size]
            if not to_explore:
                print("No more new nodes — stopping early.")
                break

            print(f"\n--- Iteration {iteration}: exploring {to_explore} ---")

            pairs = [(new, base)
                     for new in to_explore
                     for base in self.base
                     if new != base]

            built = self.prompt_engine.build(pairs)
            if not built:
                # Nothing to ask (e.g. a single base element): mark the nodes as
                # seen, otherwise the same nodes would be selected again forever.
                self.seen_nodes.update(to_explore)
                continue

            pairs_batch, prompts = zip(*built)
            responses = self.llm.batch_query(list(prompts))

            for (node, _), raw in zip(pairs_batch, responses):
                print(f"\n🔄 Response for '{node}':\n{raw}")

                triples = self.preprocess_response(raw)
                if not triples:
                    print(f"  ⚠️ No 'A and B gives me C' lines found for '{node}'. Marking as seen.")
                    continue
                print(f"  Extracted triples: {triples}")

                try:
                    self._export_response_graph(raw)
                except Exception as e:
                    # The CSV export is best-effort: report the real error, but
                    # keep the triples that were parsed successfully.
                    print(f"  ⚠️ CSV export failed for '{node}': {e!r}")

                self.add_triples_to_graph(triples)

            self.seen_nodes.update(to_explore)
            time.sleep(1)

        print(
            f"\nDone! Graph has {self.graph.number_of_nodes()} nodes "
            f"and {self.graph.number_of_edges()} edges."
        )
        net = Network(notebook=True, height="800px", width="100%")
        net.from_nx(self.graph)
        net.show_buttons(filter_=['physics'])
        net.show("kg_visualization_8iter.html")


In [12]:
import os
import datetime

# Directory that holds the generated CSV files.
out_dir = "test_output1"
outputdirectory = Path(f"./data_output/{out_dir}")
# Create it up front so os.listdir() below does not fail when nothing has been written yet.
outputdirectory.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {outputdirectory}")
print(os.listdir(outputdirectory))

# List files and print their timestamps.
# NOTE: os.path.getctime() is the creation time on Windows; on Unix it is the
# time of the last metadata change.
for filename in os.listdir(outputdirectory):
    filepath = os.path.join(outputdirectory, filename)
    if os.path.isfile(filepath):
        creation_time = os.path.getctime(filepath)
        readable_time = datetime.datetime.fromtimestamp(creation_time).strftime('%Y-%m-%d %H:%M:%S')
        print(f"{filename} - Created on: {readable_time}")


Output directory: data_output\test_output1
Output directory: data_output\test_output1
['chunks.csv', 'context_prox_df.csv', 'graph.csv']
Output directory: data_output\test_output1
chunks.csv - Created on: 2025-07-27 20:51:56
context_prox_df.csv - Created on: 2025-07-27 20:51:56
graph.csv - Created on: 2025-07-27 20:51:56


In [13]:
def implement_KG(text_files, out_dir="test_output1", regenerate=True):
    """Build the knowledge graph for ``text_files`` and render it as HTML.

    Steps: load the text files, turn them into a chunk DataFrame, extract (or
    reload) the relation graph, add contextual-proximity edges, aggregate both
    edge sets and hand the result to ``graph_viz_builder``.

    Args:
        text_files: Paths of the input files; entries not ending in ``.txt``
            are ignored. Every remaining file is loaded.
        out_dir: Sub-folder of ``./data_output`` that receives ``graph.csv``,
            ``chunks.csv`` and ``context_prox_df.csv``.
        regenerate: ``True`` extracts the relations with the LLM and overwrites
            ``graph.csv`` / ``chunks.csv``. ``False`` reuses the stored
            ``graph.csv`` instead.

    Returns:
        None. Nothing is rendered when there is no input file or no relation.
    """
    # Count given to LLM-extracted relations. graph_viz_builder() divides the
    # count by 4, so a directly extracted relation ends up with weight 1.0.
    EXTRACTED_EDGE_COUNT = 4

    outputdirectory = Path(f"./data_output/{out_dir}")
    print(f"Output directory: {outputdirectory}")
    os.makedirs(outputdirectory, exist_ok=True)
    GRAPH_CSV_PATH = outputdirectory / "graph.csv"
    CHUNKS_CSV_PATH = outputdirectory / "chunks.csv"

    # Keep the paths exactly as given: stripping the directory part would make
    # TextLoader look for the files in the current working directory.
    text_files = [str(f) for f in text_files if str(f).endswith('.txt')]
    if not text_files:
        print("No .txt files to process.")
        return

    # Each file is loaded as-is (no splitting into overlapping chunks).
    text_chunks = []
    for text_file in text_files:
        text_chunks.extend(TextLoader(text_file).load())
    print(text_chunks)

    # Convert text to dataframe
    df = text2Dataframe(text_chunks)
    print(df.shape)

    if regenerate:
        ## Generate the knowledge graph with the LLM
        concepts_list = df2Graph(df, model='gpt-4.1-mini')
        dfg1 = graph2Df(concepts_list)
        dfg1.to_csv(GRAPH_CSV_PATH, sep="|", index=False)
        df.to_csv(CHUNKS_CSV_PATH, sep="|", index=False)
    else:
        ## Reuse the stored graph
        print("Loading existing graph...")
        dfg1 = pd.read_csv(GRAPH_CSV_PATH, sep="|")
        dfg1 = dfg1.replace("", np.nan).dropna(subset=["node_1", "node_2", "edge"])
        # TODO: merging newly extracted relations into the stored graph is still
        # open; the stored graph is loaded once (not concatenated with itself).

    if dfg1.empty:
        print("No relations available - nothing to visualise.")
        return

    if "count" not in dfg1.columns:
        dfg1 = dfg1.assign(count=EXTRACTED_EDGE_COUNT)

    ## Contextual proximity edges
    dfg2 = contextual_proximity(dfg1)
    dfg2.to_csv(outputdirectory/"context_prox_df.csv", index=False)

    def _join_present(values):
        # Comma-join the non-missing, non-empty values as text.
        return ",".join(str(v) for v in values if pd.notna(v) and str(v) != "")

    ## Graph Concatenation & Aggregation
    dfg = pd.concat([dfg1, dfg2], axis=0)
    dfg = (
        dfg.groupby(["node_1", "node_2"])
        .agg({"chunk_id": _join_present, "edge": _join_present, 'count': 'sum'})
        .reset_index()
    )

    ## Graph visualisation: draw the aggregated graph (relations + proximity).
    graph_viz_builder(dfg)

In [14]:
if __name__ == "__main__":
    # No split-and-overlap chunking is performed for now: the inputs are
    # independent lines of text separated by newline characters.
    import pathlib
    first_run = True       # Change to 'False' after the first graph creation
    INPUT_DIR = pathlib.Path('./data_input')
    LOG_FILE = pathlib.Path(LOG_FILE)

    # Files already recorded in the log (empty when the log does not exist yet).
    if LOG_FILE.exists():
        with open(LOG_FILE, 'r') as f:
            processed_files = set(f.read().splitlines())
    else:
        processed_files = set()

    # Sorted so that the processing order is the same on every run.
    all_files = sorted(str(p) for p in INPUT_DIR.glob("**/*.txt"))

    if first_run:
        # First run: build the graph from every text file that is present.
        files_to_process = all_files
    else:
        # Later runs: only files that are not in the log yet.
        files_to_process = [f for f in all_files if f not in processed_files]

    if not INPUT_DIR.is_dir() or not any(INPUT_DIR.iterdir()):
        print("Directory is empty")
    elif not files_to_process:
        print("No new files to process.")
    else:
        if first_run:
            print(files_to_process)
        else:
            print(f"Processing {len(files_to_process)} new files...\n")

        implement_KG(files_to_process)      # Generate the Knowledge Graph

        # Log only files that are not recorded yet, so the log has no duplicates.
        with open(LOG_FILE, 'a') as f:
            for file in files_to_process:
                if file not in processed_files:
                    f.write(file + '\n')

    print("\nDone.")


    

['data_input\\final.txt']
Output directory: data_output\test_output1
[Document(metadata={'source': 'data_input\\final.txt'}, page_content="Water and Earth gives me Mud\nMud and Water gives me Swamp\nMud and Earth gives me Clay\nSwamp and Earth gives me Bog\nSwamp and Water gives me Quagmire\n\nClay and Water gives me Sludge\nFire and Water gives me Steam\n\nSteam and Water gives me Cloud\n\nSteam and Fire gives me Energy\n\nCloud and Fire gives me Thundercloud\n\nCloud and Water gives me Mist\n\nEnergy and Water gives me Boil\n\nEnergy and Fire gives me Plasma\nWater and Air gives me Mist\n\nMist and Water gives me Dew\n\nMist and Air gives me Vapor\n\nDew and Water gives me Humidity\n\nDew and Air gives me Fog\nFire and Water gives me Steam  \nSteam and Earth gives me Geothermal Vapor  \nSteam and Water gives me Condensed Mist\nEarth and Fire gives me Lava\n\nLava and Earth gives me Obsidian\n\nLava and Fire gives me Plasma\n\nObsidian and Earth gives me Geode\n\nObsidian and Fire giv