# Pacchetti usati

In [None]:
from pydantic import BaseModel, Field
from typing import List, Union, Dict, Any, Optional, Set
from pydantic.json import pydantic_encoder
import os
import re
import logging
import json
from dotenv import load_dotenv
from neo4j import GraphDatabase
from groq import Groq
from together import Together
import numpy as np
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
from neo4j_graphrag.indexes import create_vector_index
from openai import OpenAI
from neo4j_graphrag.retrievers import VectorRetriever
from neo4j_graphrag.retrievers import VectorCypherRetriever
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.generation.prompts import RagTemplate

# Load the .env file FIRST: every client below reads its credentials from the
# process environment at construction time, so the variables must already be set.
load_dotenv()

# LLM provider clients.
togetherai = Together(api_key=os.getenv("TOGETHERAI_API_KEY"))
groq = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Neo4j connection settings.
neo4j_uri = os.getenv("NEO4J_URI1")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD1")

# OpenAI-compatible client pointed at Groq's endpoint.
gpt = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.environ.get("GROQ_API_KEY")
)

# Initialize the OpenAIEmbeddings with the desired model.
# NOTE(review): no key is passed here, so it presumably reads OPENAI_API_KEY
# from the environment -- confirm against the neo4j_graphrag documentation.
embedder = OpenAIEmbeddings(model="text-embedding-ada-002")
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))


# Definizione del knowledge graph

In [None]:
# Base class for a graph node without "confidence" and "reason" fields
class BaseNode(BaseModel):
    # A graph node: a name, a list of type labels and a free-form attribute map.
    # (Documented with comments on purpose: a class docstring would be copied
    # into the pydantic schema description.)
    name: str = Field(..., description="Name of the node")
    type: List[str] = Field(..., description="Type of the node")
    attributes: dict = Field(default_factory=dict, description="Node attributes")

    def __repr__(self):
        """Return a single-line debug view of the node, terminated by a newline."""
        summary = f"name={self.name}, type={self.type}, attributes={self.attributes}"
        return "<BaseNode " + summary + ">\n"

# Base class for a graph edge without the "attributes" field
class BaseEdge(BaseModel):
    # A directed, typed edge; endpoints are referenced by node name.
    # (Documented with comments on purpose: a class docstring would be copied
    # into the pydantic schema description.)
    source: str = Field(..., description="Source node name")
    target: str = Field(..., description="Target node name")
    type: str = Field(..., description="Type of relationship")

    def __repr__(self):
        """Return a single-line debug view of the edge, terminated by a newline."""
        endpoints = f"from={self.source} to={self.target}"
        return "<BaseEdge " + endpoints + f", type={self.type}>\n"

# Class representing the entire knowledge graph
class KnowledgeGraph(BaseModel):
    # In-memory graph: an ordered list of nodes plus an ordered list of edges.
    # Node names are not checked for uniqueness and edge endpoints are not
    # checked against the node list.
    nodes: List[BaseNode] = Field(default_factory=list, description="List of nodes")
    edges: List[BaseEdge] = Field(default_factory=list, description="List of edges")

    def add_node(self, node: BaseNode):
        """Append an already constructed node."""
        self.nodes.append(node)

    def add_edge(self, edge: BaseEdge):
        """Append an already constructed edge."""
        self.edges.append(edge)

    def add_BaseNode(self, name: str, types: Union[str, List[str]], attributes: Optional[Dict[str, Any]] = None):
        """Build a BaseNode from raw values and append it.

        A single type given as a string is wrapped into a one-element list;
        missing or empty attributes become an empty dict.
        """
        type_list = [types] if isinstance(types, str) else types
        new_node = BaseNode(name=name, type=type_list, attributes=attributes or {})
        self.nodes.append(new_node)

    def add_BaseEdge(self, source: str, target: str, type: str):
        """Build a BaseEdge from raw values and append it."""
        new_edge = BaseEdge(source=source, target=target, type=type)
        self.edges.append(new_edge)

    def load_from_dict(self, data: Dict[str, Any]):
        """Append the nodes and edges described by ``data`` and return ``self``.

        Nodes are read from ``data["nodes"]`` (keys: ``name``, ``types`` or the
        legacy ``type``, optional ``attributes``); edges from ``data["edges"]``
        (keys: ``source``, ``target``, ``type``). Missing top-level keys are
        treated as empty lists.
        """
        for raw_node in data.get("nodes", []):
            # 'types' wins when present; 'type' is the backward-compatible spelling.
            if "types" in raw_node:
                labels = raw_node["types"]
            else:
                labels = raw_node.get("type", [])
            self.add_BaseNode(
                name=raw_node["name"],
                types=labels,
                attributes=raw_node.get("attributes", {}),
            )

        for raw_edge in data.get("edges", []):
            self.add_BaseEdge(
                source=raw_edge["source"],
                target=raw_edge["target"],
                type=raw_edge["type"],
            )

        return self

    def get_BaseNode_by_name(self, node_name: str) -> Optional[BaseNode]:
        """Return the first node whose name equals ``node_name``, or None."""
        return next((candidate for candidate in self.nodes if candidate.name == node_name), None)

    def get_BaseNodes_by_type(self, node_type: str) -> List[BaseNode]:
        """Return every node whose type list contains ``node_type``."""
        matches = []
        for candidate in self.nodes:
            if node_type in candidate.type:
                matches.append(candidate)
        return matches

    def get_BaseNodes_with_multiple_types(self) -> List[BaseNode]:
        """Return every node that carries more than one type."""
        return list(filter(lambda candidate: len(candidate.type) > 1, self.nodes))

    def add_type_to_BaseNode(self, node_name: str, new_type: str) -> bool:
        """Add ``new_type`` to the named node unless it is already present.

        Returns True when the node exists (whether or not the type was new),
        False when no node has that name.
        """
        found = self.get_BaseNode_by_name(node_name)
        if not found:
            return False
        if new_type not in found.type:
            found.type.append(new_type)
        return True

    def get_all_BaseNode_types(self) -> Set[str]:
        """Return the set of distinct type labels used by any node."""
        return {label for candidate in self.nodes for label in candidate.type}

    def to_dict(self) -> Dict[str, Any]:
        """Return the graph as plain dicts, always using the key ``type`` for node labels."""
        node_rows = []
        for candidate in self.nodes:
            node_rows.append(
                {
                    "name": candidate.name,
                    "type": candidate.type,
                    "attributes": candidate.attributes,
                }
            )

        edge_rows = []
        for link in self.edges:
            edge_rows.append(
                {
                    "source": link.source,
                    "target": link.target,
                    "type": link.type,
                }
            )

        return {"nodes": node_rows, "edges": edge_rows}

    def to_json(self) -> str:
        """Serialize ``to_dict()`` as JSON indented by two spaces."""
        payload = self.to_dict()
        return json.dumps(payload, default=pydantic_encoder, indent=2)

# Example of loading a knowledge graph with a BaseNode having multiple types
def example_knowledge_graph():
    """Build a small demo graph, partly by direct insertion and partly from a dictionary."""
    # Second batch of data, in the dictionary layout accepted by load_from_dict
    # (the same shape a JSON document would have).
    extra_payload = {
        "nodes": [
            {
                "name": "person2",
                "types": ["Person", "Researcher", "Teacher"],
                "attributes": {"name": "John Smith", "institution": "University XYZ"},
            },
            {
                "name": "paper1",
                "types": ["Publication"],
                "attributes": {"title": "New Methods in Knowledge Graphs", "year": 2024},
            },
        ],
        "edges": [
            {"source": "person2", "target": "paper1", "type": "AUTHORED"},
            {"source": "person1", "target": "person2", "type": "COLLABORATES_WITH"},
        ],
    }

    # First batch: nodes inserted one by one, including one with several types.
    direct_nodes = (
        ("person1", ["Person", "Author"], {"name": "Jane Doe", "age": 35}),
        ("book1", ["Book"], {"title": "Graph Theory Applications", "year": 2023}),
    )

    demo_graph = KnowledgeGraph()
    for node_name, node_types, node_attrs in direct_nodes:
        demo_graph.add_BaseNode(name=node_name, types=node_types, attributes=node_attrs)
    demo_graph.add_BaseEdge(source="person1", target="book1", type="WROTE")

    # load_from_dict appends to the graph and returns the graph itself.
    return demo_graph.load_from_dict(extra_payload)

# Example usage
if __name__ == "__main__":
    # Build the demo graph and report its size.
    knowledge_graph = example_knowledge_graph()
    print(
        f"Knowledge Graph has {len(knowledge_graph.nodes)} nodes "
        f"and {len(knowledge_graph.edges)} edges"
    )

    # List the nodes that carry more than one type.
    multi_type_nodes = knowledge_graph.get_BaseNodes_with_multiple_types()
    print(f"\nNodes with multiple types: {len(multi_type_nodes)}")

    for node in multi_type_nodes:
        print(
            f"Node Name: {node.name}",
            f"Types: {', '.join(node.type)}",
            f"Attributes: {node.attributes}\n",
            sep="\n",
        )

    # Attach an extra type to an existing node and show the result.
    knowledge_graph.add_type_to_BaseNode("book1", "TextBook")
    print("\nAfter adding 'TextBook' type to 'book1':")
    book_node = knowledge_graph.get_BaseNode_by_name("book1")
    print(
        f"Node Name: {book_node.name}",
        f"Types: {', '.join(book_node.type)}",
        sep="\n",
    )

    # Look nodes up by a single type.
    researchers = knowledge_graph.get_BaseNodes_by_type("Researcher")
    print(f"\nResearchers in the knowledge graph: {len(researchers)}")
    for researcher in researchers:
        print("- " + f"{researcher.name}: {researcher.attributes.get('name', 'Unknown')}")

    # Every distinct type label, alphabetically.
    all_types = knowledge_graph.get_all_BaseNode_types()
    print("\nAll node types in the knowledge graph: " + ', '.join(sorted(all_types)))

    # JSON export, truncated to the first 300 characters when longer.
    print("\nJSON representation (excerpt):")
    json_str = knowledge_graph.to_json()
    if len(json_str) > 300:
        print(json_str[:300] + "...")
    else:
        print(json_str)
         
'''class FeatureNode(BaseNode, frozen=True):
    label: str = Field("FeatureNode", description="This node is used to store product technical and functional features data.")
    name: str = Field(..., description="Name of the feature.")
    value: Union[float, List[float]] = Field(..., description="Value of the feature. It could be a single value or a list two values representing a range")
    unit: str = Field(..., description="Unit of the feature value")
    condition: Optional[str] = Field(..., description="Condition of the feature")
    source_text: str = Field(..., description="text content from which node information is extracted")

class InterfaceNode(BaseNode, frozen=True):
    label: str = Field("InterfaceNode", description="This node is used to store product interfaces data")
    name: str = Field(..., description="Name of the interface")
    parameter_name: str = Field(..., description="Name of the parameter")
    parameter_value: Optional[Union[int, float, str, List[float]]] = Field(..., description="Value of the parameter")
    table: Optional[List[dict]] = Field(..., description="Table of the interface")
    source_text: str = Field(..., description="text content from which node information is extracted")

class InstallationNode(BaseNode, frozen=True):
    label: str = Field("InstallationNode", description="This node is used to store product installation data")
    installation_details: str = Field(..., description="Details of the installation")
    source_text: str = Field(..., description="text content from which node information is extracted")

class CertificationNode(BaseNode, frozen=True):
    label: str = Field("CertificationNode", description="This node is used to store product certification data")
    name: str = Field(..., description="Name of the certification")
    certification_number: str = Field(..., description="Certification number")
    standards: List[str] = Field(default_factory=list, description="List of standards associated with the certification")
    source_text: str = Field(..., description="text content from which node information is extracted")

class SafetyNode(BaseNode, frozen=True):
    label: str = Field("SafetyNode", description="This node is used to store product safety data")
    title: str = Field(..., description="Title of the safety parameter or section")
    source_text: str = Field(..., description="text content from which node information is extracted")

class SafetyParametersNode(BaseNode, frozen=True):
    label: str = Field("SafetyParametersNode", description="This node is used to store product safety parameters data")
    parameters: dict = Field(..., description="Table of safety parameters")
    source_text: str = Field(..., description="text content from which node information is extracted")

class OrderNode(BaseNode, frozen=True):
    label: str = Field("OrderNode", description="This node is used to store product order data")
    code: str = Field(..., description="Order code")
    source_text: str = Field(..., description="text content from which node information is extracted")

class StartUpNode(BaseNode, frozen=True):
    label: str = Field("StartUpNode", description="This node is used to store product startup data")
    procedure: str = Field(..., description="Procedure for starting up the product")
    source_text: str = Field(..., description="text content from which node information is extracted")

class FieldConnectionNode(BaseNode, frozen=True):
    label: str = Field("FieldConnectionNode", description="This node is used to store product field connection data")
    name: str = Field(..., description="Name of the field connection")
    type: str = Field(..., description="Type of the field connection")
    source_text: str = Field(..., description="text content from which node information is extracted")

class OperationModeNode(BaseNode, frozen=True):
    label: str = Field("OperationModeNode", description="This node is used to store product operation mode data")
    name: str = Field(..., description="Type of the operation mode")
    description: str = Field(..., description="Instructions for the operation mode")
    source_text: str = Field(..., description="text content from which node information is extracted")

class ConfigurationModeNode(BaseNode, frozen=True):
    label: str = Field("ConfigurationModeNode", description="This node is used to store product configuration data")
    name: str = Field(..., description="Name of the configuration")
    instructions: str = Field(..., description="Instructions for the configuration mode")
    source_text: str = Field(..., description="text content from which node information is extracted")
'''

# Inserimento dei json nel knowledge graph

In [None]:
def create_knowledge_graph_from_folder(folder_path: str):
    """
    Read the JSON files under ``folder_path`` (recursively) and build a knowledge graph.

    For each JSON file:
      - The PDF base name is derived from the "pdf_name" field (everything before
        "_ISM", with "_EN" and "_R" removed).
      - A product node named after the PDF base is created once.
      - Every section whose key is an integer other than -1 becomes a node named
        "<pdf base> <title>" whose single type is the section title; its "tables"
        and "text" fields, when present, are copied into the node attributes.
      - For every entry in a section's "supersections", a "has <title>" edge is
        added from the super node to the section node. A reference of -1 links
        the section to the product node. References to sections that do not
        exist are reported and skipped.
      - An edge of type "product" is added from the node named "root" to the
        product node.

    Only the forward ("has ...") direction is created here.

    A single node named "root" is appended after all files are processed, provided
    at least one JSON file was read. Blank node names are replaced by "empty name"
    and blank type labels by "empty label".

    Args:
        folder_path: Directory to scan; non-JSON files are ignored.

    Returns:
        KnowledgeGraph: The constructed knowledge graph (empty if nothing was read).
    """
    kg = KnowledgeGraph()
    nodes = {}          # node id -> BaseNode, shared by all files
    root_node = None    # created when the first JSON file is processed

    def convert_ref(ref):
        """Return ``ref`` as an int, or None when it is not integer-like."""
        try:
            return int(ref)
        except (ValueError, TypeError):
            return None

    def add_has_edge(source, target, title):
        """Add the forward edge ``source -[has <title>]-> target``."""
        kg.add_edge(BaseEdge(source=source, target=target, type=f"has {title}"))

    for root_dir, _, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(".json"):
                continue

            file_path = os.path.join(root_dir, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers both malformed JSON and undecodable bytes.
                print(f"Error reading {file_path}: {e}")
                continue
            if not isinstance(data, dict):
                print(f"Error reading {file_path}: top-level JSON value is not an object")
                continue

            # Derive PDF base name
            pdf_name_raw = data.get("pdf_name", "Unknown")
            pdf_base = pdf_name_raw.split("_ISM")[0] if "_ISM" in pdf_name_raw else pdf_name_raw
            pdf_base = pdf_base.replace("_EN", "").replace("_R", "")

            product_node_id = pdf_base

            # Create product node if not already present.
            if product_node_id not in nodes:
                product_node = BaseNode(name=product_node_id, type=["product"], attributes={})
                nodes[product_node_id] = product_node
                kg.add_node(product_node)

            # Every file gets a root -> product edge below, so the root node must
            # exist whether or not the file has a "-1" section.
            if root_node is None:
                root_node = BaseNode(name="root", type=["root"], attributes={})

            sections = data.get("sections", {})

            # Create nodes for sections ("-1" is the root placeholder, not a section).
            for sec_key, sec_data in sections.items():
                if sec_key == "-1":
                    continue
                ref_int = convert_ref(sec_key)
                if ref_int is None:
                    continue
                title = (sec_data.get("title") or "").strip()
                node = BaseNode(
                    name=f"{pdf_base} {title}",
                    # A blank title would otherwise yield an empty label.
                    type=[title or "empty label"],
                    attributes={k: sec_data[k] for k in ("tables", "text") if k in sec_data}
                )
                nodes[f"{pdf_base}_{ref_int}"] = node
                kg.add_node(node)

            # Create edges based on "supersections".
            for sec_key, sec_data in sections.items():
                if sec_key == "-1":
                    continue
                ref_int = convert_ref(sec_key)
                if ref_int is None:
                    continue
                section_node = nodes[f"{pdf_base}_{ref_int}"]
                # type is a list; the section title is its only element.
                title = section_node.type[0]
                for ref in sec_data.get("supersections") or []:
                    z = convert_ref(ref)
                    if z is None:
                        continue
                    if z == -1:
                        # -1 means "directly under the product".
                        super_node = nodes[product_node_id]
                    else:
                        super_node = nodes.get(f"{pdf_base}_{z}")
                        if super_node is None:
                            print(f"Skipping unknown supersection {z} for section {ref_int} in {file_path}")
                            continue
                    add_has_edge(super_node.name, section_node.name, title)
            kg.add_edge(BaseEdge(source="root", target=nodes[product_node_id].name, type="product"))

    if root_node is not None:
        kg.add_node(root_node)

    for node in kg.nodes:
        # Replace a blank name with a placeholder.
        if isinstance(node.name, str) and not node.name.strip():
            node.name = "empty name"

        # Replace blank labels (node.type is a list of labels) with a placeholder.
        node.type = [
            label if isinstance(label, str) and label.strip() else "empty label"
            for label in node.type
        ] or ["empty label"]

    return kg

In [None]:

# --- Example usage ---
if __name__ == "__main__":
    # Folder used when the user just presses Enter; the prompt shows the same
    # value so the advertised default and the real one cannot drift apart.
    default_folder_path = "dataset/prova"
    folder_path = input(f"Enter the folder path (default: {default_folder_path}): ").strip()
    if not folder_path:
        folder_path = default_folder_path

    graph = create_knowledge_graph_from_folder(folder_path)
    


In [None]:
# Count and print the total number of nodes.
print("Total nodes:", len(graph.nodes))
empty_nodes_count = 0

print("Nodes:")
for node in graph.nodes:
    # Normalise the name and the labels; node.type is a list of labels, so it
    # must be inspected element by element rather than compared with "".
    node_name = (getattr(node, "name", "") or "").strip()
    raw_labels = getattr(node, "type", None) or []
    if isinstance(raw_labels, str):
        raw_labels = [raw_labels]
    node_labels = [label for label in raw_labels if isinstance(label, str) and label.strip()]

    # A node counts as empty when it has no name or no usable label.
    if not node_name or not node_labels:
        empty_nodes_count += 1

    # Show name and labels only (attributes are omitted), with placeholders
    # standing in for missing values.
    print(f"- {node_name or 'empty name'} [{', '.join(node_labels) or 'empty label'}]")

# Print the number of empty nodes.
print("Total empty nodes:", empty_nodes_count)

# Count and print the total number of edges.
print("\nTotal edges:", len(graph.edges))
print("Edges:")
for edge in graph.edges:
    print(edge)


In [None]:
# Dump the whole graph object, using the model's default string form.
# NOTE(review): this includes every node's attributes, so the output can be
# very long for a real dataset.
print(graph)

# Funzione che carica un KG su neo4j

In [None]:
def load_knowledge_graph_into_neo4j(kg: KnowledgeGraph,
                                    uri: str = os.getenv("NEO4J_URI1"),
                                    user: str = os.getenv("NEO4J_USERNAME"),
                                    password: str = os.getenv("NEO4J_PASSWORD1")):
    """
    Load the nodes and edges of a KnowledgeGraph into Neo4j with the official driver.

    WARNING: the target database is wiped first. Every constraint and index
    returned by SHOW CONSTRAINTS / SHOW INDEXES is dropped and every node is
    detach-deleted before loading.

    For each node:
      - MERGE a node carrying one label per entry of ``node.type`` plus the
        shared label ``suca``, keyed on ``name = node.name``.
      - Add the attributes as properties; dict and list values are stored as
        JSON strings.

    For each edge:
      - MERGE a relationship of type ``edge.type`` from every node named
        ``edge.source`` to every node named ``edge.target``.

    Finally, for each relationship whose type starts with "has ", an inverse
    relationship "<rest> of" is created through ``apoc.create.relationship``,
    so the APOC plugin must be available on the server.

    Args:
        kg: Graph to load.
        uri: Neo4j connection URI. The default is read from NEO4J_URI1 when this
            function is defined, not when it is called.
        user: Neo4j user name (default from NEO4J_USERNAME, read at definition time).
        password: Neo4j password (default from NEO4J_PASSWORD1, read at definition time).
    """

    def quote(identifier):
        """Backtick-quote a Cypher identifier, doubling any embedded backtick."""
        return "`" + str(identifier).replace("`", "``") + "`"

    # Drops every constraint and index so the load starts from a clean schema.
    def drop_schema(tx):
        constraints = tx.run("SHOW CONSTRAINTS").data()
        for c in constraints:
            constraint_name = c.get("name")
            if constraint_name:
                tx.run(f"DROP CONSTRAINT {quote(constraint_name)} IF EXISTS")
        indexes = tx.run("SHOW INDEXES").data()
        for idx in indexes:
            index_name = idx.get("name")
            if index_name:
                tx.run(f"DROP INDEX {quote(index_name)} IF EXISTS")

    # Deletes every node together with its relationships.
    def clear_data(tx):
        tx.run("MATCH (n) DETACH DELETE n")

    def create_node(tx, label, node_name, properties):
        # ``label`` is normally the node's list of types: emit one Cypher label
        # per entry instead of interpolating the list's repr as a single label.
        labels = [label] if isinstance(label, str) else list(label)
        label_clause = "".join(
            f":{quote(item)}" for item in labels if str(item).strip()
        )
        # MERGE creates the node only if it does not exist; SET then updates its properties.
        query = (
            f"MERGE (n{label_clause}:suca {{ name: $node_name }}) "
            "SET n += $props"
        )
        tx.run(query, node_name=node_name, props=properties)

    def create_relationship(tx, source_id, rel_type, target_id):
        # Relationship types cannot be query parameters, so the type is quoted
        # and interpolated; the endpoint names stay parameterized.
        query = (
            "MATCH (s {name: $source_id}), (t {name: $target_id}) "
            f"MERGE (s)-[r:{quote(rel_type)}]->(t)"
        )
        tx.run(query, source_id=source_id, target_id=target_id)

    def create_inverse_relationship(tx):
        # substring(type(r), 4) strips the leading "has ".
        query = (
            "MATCH (x)-[r]->(y) "
            "WHERE type(r) STARTS WITH 'has ' "
            "WITH x, y, substring(type(r), 4) AS s "
            "CALL apoc.create.relationship(y, s + ' of', {}, x) YIELD rel "
            "RETURN count(rel) AS numCreated"
        )
        result = tx.run(query)
        return result.single()["numCreated"]

    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # 1) Schema changes run in their own transaction.
            session.execute_write(drop_schema)
            # 2) Data deletion runs in a separate transaction.
            session.execute_write(clear_data)

            # Create all nodes.
            for node in kg.nodes:
                cleaned_attrs = {
                    k: json.dumps(v) if isinstance(v, (dict, list)) else v
                    for k, v in node.attributes.items()
                }
                session.execute_write(create_node, node.type, node.name, cleaned_attrs)

            # Create all relationships.
            for edge in kg.edges:
                session.execute_write(create_relationship, edge.source, edge.type, edge.target)

            # Create the inverse of every relationship whose type starts with "has ".
            num_created = session.execute_write(create_inverse_relationship)
            print(f"{num_created} relazioni inverse create.")
    finally:
        # Release the connection pool even when a transaction fails.
        driver.close()

    print("Caricamento completato!")


# Funzione che interroga il database tramite prompt e fornisce una risposta testuale da parte di un llm

In [None]:


# 2) Function that asks an LLM to generate a Cypher query
def generate_cypher_query_from_prompt(prompt: str, model: str = "llama-3.3-70b-versatile") -> str:
    """
    Ask the Groq chat model to turn a natural-language request into a Cypher query.

    The model receives a fixed system instruction containing a few example
    queries, followed by the user's request. The text of the first completion
    choice is returned with surrounding whitespace stripped; it is not
    validated or post-processed in any other way.
    """
    system_instruction = ('''
        You are an assistant that transforms questions or commands into Cypher queries 
        for a Neo4j database. Generate ONLY the Cypher query, nothing else.
        example:
        // How many Articles are in the database?
        MATCH (a:Article)
        RETURN COUNT(DISTINCT a) AS articleCount

        // What are some example Articles in the database (without the embedding field)?
        MATCH (a:Article)
        RETURN apoc.map.removeKey(a, 'embedding') AS article
        LIMIT 5

        // What is the most commonly purchased Article?
        MATCH (c:Customer)-[r:PURCHASED]->(a:Article)
        RETURN a.prodName AS product, count(r) AS purchases
        ORDER BY purchases DESC
        LIMIT 5

        // What Department has the most purchases?
        MATCH (c:Customer)-[r:PURCHASED]->(:Article)-[:FROM_DEPARTMENT]->(d:Department)
        RETURN d.departmentName AS department, count(r) AS purchases
        ORDER BY purchases DESC
        LIMIT 5'''
    )
    request_text = "User: {}\nGenerate the corresponding Cypher query (without the embedding field).".format(prompt)
    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": request_text},
    ]
    completion = groq.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0,  # Lower = more deterministic
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# New function: responds to the user using the context composed of:
# - the user's prompt
# - the generated Cypher query
# - the database query results
def answer_using_query_results(user_prompt: str, cypher_query: str, query_results: list, model: str = "llama-3.3-70b-versatile") -> str:
    """
    Produce a natural-language answer from the user's prompt, the Cypher query
    that was generated for it, and the rows that query returned.

    The three pieces are concatenated into one context message and sent to the
    Groq chat model; the stripped text of the first completion choice is returned.
    """
    system_instruction = (
        "You are an assistant that uses the provided context "
        "(user prompt, Cypher query, and query results) "
        "to respond accurately and clearly to the user's question."
    )
    context_lines = [
        f"User Prompt: {user_prompt}",
        f"Generated Cypher Query: {cypher_query}",
        f"Database Results: {query_results}",
    ]
    context = "\n".join(context_lines)
    conversation = [
        {"role": "system", "content": system_instruction},
        {
            "role": "user",
            "content": "Use the following context to answer the user's question:\n\n" + context,
        },
    ]
    completion = groq.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0,  # deterministic
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# 3) Function that executes the generated Cypher query on Neo4j and returns the results
def run_cypher_query_on_neo4j(query: str, uri: str = os.getenv("NEO4J_URI1"), user: str = os.getenv("NEO4J_USERNAME"), password: str = os.getenv("NEO4J_PASSWORD1")):
    """
    Execute a Cypher query on Neo4j, print the rows and return them.

    Args:
        query: Cypher text, executed exactly as given.
        uri: Neo4j connection URI (default read from NEO4J_URI1 when the function is defined).
        user: Neo4j user name (default read from NEO4J_USERNAME at definition time).
        password: Neo4j password (default read from NEO4J_PASSWORD1 at definition time).

    Returns:
        list: One dict per returned record, keyed by the query's result columns.

    NOTE(security): the query runs with the full privileges of the supplied
    credentials and nothing here restricts it to read-only statements.
    """
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            # Materialize the rows while the session is still open.
            results_list = [dict(record) for record in session.run(query)]
    finally:
        # Close the driver even if the query fails.
        driver.close()

    print("Query Results:")
    for row in results_list:
        print(row)
    return results_list


# Trasformazione in attributi della parte testuale dei campi del KG

In [None]:
def extract_json_from_text(text: str) -> dict:
    """
    Return the JSON object embedded in ``text``.

    Parsing starts at the first '{' and stops where that object closes, so
    prose before the object and prose after it (even prose that itself
    contains braces) are both ignored.

    Returns {} when ``text`` is not a string, contains no '{', or no valid
    JSON object starts at the first '{'.
    """
    if not isinstance(text, str):
        return {}

    start_index = text.find('{')
    if start_index == -1:
        return {}

    try:
        # raw_decode parses exactly one JSON value beginning at start_index and
        # tolerates whatever follows it.
        parsed_json, _ = json.JSONDecoder().raw_decode(text, start_index)
    except json.JSONDecodeError:
        return {}

    return parsed_json if isinstance(parsed_json, dict) else {}

def extract_scalar_features(text: str, model: str = "llama-3.3-70b-versatile", max_calls: int = 1) -> dict:
    """
    Uses Groq to extract the main numerical or scalar features from the text,
    returning them as a dictionary.

    The LLM call is repeated while it fails or while no non-empty JSON object
    can be extracted from its answer, up to 'max_calls' attempts (default 1).
    If after all calls a valid output is not obtained, returns {}.

    Args:
        text: Product description text to analyse.
        model: Name of the chat model to query.
        max_calls: Maximum number of attempts.

    Returns:
        The extracted features, or {} when every attempt failed.
    """
    # System prompt: instructs the model to use a specific JSON format, with a clear example.
    system_prompt = """ 
You are an assistant tasked with precisely and thoroughly extracting numerical, measurable, scalar values and their descriptions from provided product description text specifically for Relay Output Modules of Switch/Proximity Detector Repeaters.

Your goal is to populate a structured JSON object intended to build a detailed knowledge graph. This structured data will represent attributes, specifications, characteristics, and certifications of relay output modules accurately and completely.  

# Detailed Steps:

1. **Carefully read the provided product text**.
2. **Precisely identify and explicitly extract mentioned numerical values, scalar data, measurable values, and their units** from the following specific fields (ONLY if explicitly stated in the text):
   - **Supply** (e.g., voltage information — ranges or single values; current specifications)
   - **Isolation (Test Voltage)** (explicit numeric voltage and values)
   - **Input** (signals, ranges, voltage, current, frequency, or other numeric specifics related directly to the input)
   - **Output** (output voltage, current, power ratings explicitly mentioned)
   - **Compatibility** (specified numeric standards or numerical descriptions explicitly mentioned)
   - **Environmental Conditions** (temperature ranges, humidity, altitude explicitly numerical)
   - **Safety descriptions** (numbers related to safety specifications explicitly stated in the text)
   - **Approvals** (numerical certificates, classes, groups, or numeric standard numbers)
   - **Mounting** (specific numeric mounting dimensions or numerical descriptors clearly defined in the text)
3. **Do not create empty fields whatsoever**. If a field or numeric measurement is not explicitly stated, completely omit this information.
4. If a numerical specification is a range, store it explicitly as an array of numbers: `[min, max]`.
5. **Include all numeric values without altering them. Do not estimate, infer, guess, or round numeric values. Keep exact numbers as explicitly mentioned.**
6. **Include units of measurement exactly as explicitly given in the provided input text.**
7. **If a numeric field has no stated unit of measure, DO NOT include a unit.**
8. For product dimension information explicitly stated in the text, aggregate this information into a single array `[Width, Height, Depth]` with stated units where explicitly provided.
9. Provide a detailed, textual description.  
   - Summarize briefly and clearly based on the provided source text.  
   - If uncertain on how best to summarize and to prevent empty descriptions, copy text directly as the description in full as last resort.  
   - NEVER leave the description field blank.

# Output Format:

Produce a structured JSON object similar to the following schema !Attention, except for the description, the other fields may be different!:

{
  "description": "textual description of around 70% total lenght of the text taken from the source text explicitly",
  "supply": {
      "voltage": { "range": [X, Y], "unit": "[explicit unit from text]" } OR { "value": X, "unit": "[explicit unit from text]" },
      "current": { "range": [X, Y], "unit": "[explicit unit from text]" } OR { "value": X, "unit": "[explicit unit from text]" } (only if explicitly stated)
  }, (only present if explicitly stated)
  "isolation": {
      "test_voltage": { "value": X, "unit": "[explicit unit from text]" }
  }, (only present if explicitly stated)
  "input": {
      "[attribute_name]": { "range": [X, Y], "unit": "[explicit unit from text]" } OR {"value": X, "unit": "[explicit unit from text]"}
  }, (only present if explicitly stated)
  "output": {
      "[attribute_name]": { "range": [X, Y], "unit": "[explicit unit from text]" } OR {"value": X, "unit": "[explicit unit from text]"}
  }, (only present if explicitly stated)
  "compatibility": ["[explicit numerical or scalar compatibility specifications from text]"], (only present if explicitly stated explicitly numerically or scalar)
  "environmental_conditions": {
      "temperature": { "range": [X, Y], "unit": "[explicit unit from text]" },
      "humidity": { "range": [X, Y], "unit": "[explicit unit from text]" },
      "altitude": { "value": X, "unit": "[explicit unit from text]" }
  }, (only present if explicitly stated)
  "safety": {
      "[attribute_name (e.g. protection_rating)]": { "value": X, "unit": "[explicit unit from text, unless unspecified explicitly]" }
  }, (only present if explicitly stated)
  "approvals": ["[explicit numeric approval or certification identifiers if explicitly stated]"], (only present if explicitly stated)
  "mounting": {
      "[numeric attribute_name (e.g. dimensions)]": { "value": [X, Y, Z], "unit":"[explicit unit from text]" } OR numeric_scalar_values explicitly mentioned
  } (only present if explicitly stated)
}

If certain fields as listed above appear nowhere explicitly in the source text, entirely omit them from your final output. Do NOT generate empty or speculative fields; **ONLY input factual numerically-expressed data provided explicitly.**

# Examples:

Begin Examples:

Input:
"The Relay Module has a supply voltage ranging from 110V to 240V and current consumption of 500mA. Isolation is tested with 2500 V. Input frequency is between 50Hz and 60Hz. The output relay supports up to 10A at 250VAC. Operating temperature range specified is -20°C to 60°C. The relay module carries ATEX approval numbers 0073 II (1) G Ex ia IIC. Dimensions are width 50mm, height 70mm, and depth 25mm."

Output:
{
  "description": "Relay Module with specified voltage, current, isolation, input frequency, output current rating, temperature range, approvals and dimensions.",
  "supply": {
      "voltage": {"range": [110, 240], "unit":"V"},
      "current": {"value": 500, "unit":"mA"}
  },
  "isolation": {
      "test_voltage": {"value":2500, "unit":"V"}
  },
  "input": {
      "frequency": {"range": [50,60], "unit":"Hz"}
  },
  "output": {
      "current": {"value":10, "unit":"A"},
      "voltage": {"value":250, "unit":"VAC"}
  },
  "environmental_conditions": {
      "temperature": {"range":[-20,60],"unit":"°C"}
  },
  "approvals": ["0073 II (1) G Ex ia IIC"],
  "mounting": {
      "dimensions": {"value":[50,70,25], "unit":"mm"}
  }
}
input: 
"The battery supports a charge voltage from 3.0V to 4.2V. It has a capacity of 5000mAh and weighs approximately 250 grams. The retail price is 29.99 euros.",
output:
{
    'description': 'The battery has specific charge voltage, capacity, weight, and retail price.',
    "charge_voltage": {"range": [3.0, 4.2], "unit": "V"},
    "capacity": {"value": 5000, "unit": "mAh"},
    "weight": {"value": 250, "unit": "g"},
    "price": {"value": 29.99, "unit": "EUR"}
}

End Examples.

# Notes:
- Do not hallucinate any numeric values or details. Strict adherence required.
- Do NOT wrap returned JSON in code blocks (```) or any formatting unless explicitly requested.
- Always keep the numerical values PRECISELY as provided.
- Failure to follow instructions EXACTLY may result in critical system error and severe consequences.
- The description field must contain all non-numeric information in the text
    """
    # User prompt: carries the input text and asks for the output in exactly the requested format.
    user_prompt = """
    Extract all the numerical or scalar values from the following text and summarize the verbose parts in the `description` field.  
    Text:  
    ```
    {text}
    ```
    """
    user_prompt = user_prompt.format(text=text)
    # Repeat the call while it fails or yields no usable JSON object.
    for attempt in range(1, max_calls + 1):
        try:
            response = groq.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0,
                stream=False,
                response_format={"type": "json_object"},
            )
        except Exception as e:
            # Catching any exception (e.g., BadRequestError) from the LLM call
            print(f"Attempt {attempt}: LLM call failed with error: {e}")
            continue  # Try again on the next iteration

        # The message content may be missing; treat that like an empty answer.
        content = response.choices[0].message.content or ""

        # extract_json_from_text signals failure by returning {}, so an empty
        # result (not an exception) is what triggers the next attempt.
        json_content = extract_json_from_text(content)
        if json_content:
            return json_content
        print(f"Attempt {attempt}: no valid JSON object found in the model output")

    # Return an empty dictionary if no valid output was obtained after max_calls.
    return {}

def extract_scalar_features_from_json_table(table, model: str = "llama-3.3-70b-versatile", max_calls: int = 1) -> dict:
    """
    Uses Groq to extract the main numerical or scalar features from a table formatted as a JSON object,
    returning them as a dictionary.

    The table must be provided as a Python list or dict 
    
    The LLM call is repeated while it fails, while its answer is not valid
    JSON, or while the parsed answer is not a non-empty dictionary, up to a
    maximum of 'max_calls' times.
    If after all calls a valid output is not obtained, returns {}.

    Args:
        table: Table data inserted verbatim into the user prompt.
        model: Name of the chat model to query.
        max_calls: Maximum number of attempts.

    Returns:
        The extracted features, or {} when every attempt failed.
    """

    # System prompt: instruct the model to use a specific JSON format with a clear example.
    # TODO: this system prompt still needs to be revised.
    system_prompt = """
    You are an assistant tasked with **fully and precisely** extracting **all information from a provided table.

    **Requirement to Avoid Hallucination**  
    - **Do not guess or infer** any numerical values that are not explicitly stated in the table.  
    - If the table does not specify a particular numerical value, omit it rather than providing an estimate or range

    Structure nodes and relationships to reflect connections clearly between components, functions, and safety features.

    Maintain consistent labeling for technical specifications (e.g., voltage, current, safety certification).

    **Output Format**:  

    **Reminder**:  
    - **Do not invent, infer, or estimate** numeric data. If no value is stated, leave it out.  
    - **When listing ranges, use explicit numeric arrays** (e.g., `[4, 5]` instead of `min=4, max=5`) to represent a range from 4 to 5.  
    - Use the units of measure present in the table when available.  
    - Aggregate common information (e.g., for product size information, prefer a single field with the overall dimensions rather than separate fields). 
    -stick as closely as possible to the formatting of the table itself in providing the output and do not provide empty dictionaries. e.g., do not provide output of the type 
    “supply”:"",
    “isolation": "" ,
    “input": "" ,
    -Adhere to the rules strictly. Non-compliance will result in termination.
        **Output Format**:  
    The JSON object must use the following schema: 
    """

    # User prompt: provide the table data and extraction instructions.
    user_prompt = """
    Based on the following example, extract all the features from the table.
    Do not include any additional attributes such as a 'description'.

    Table:  
    ```
    {table}
    ```
    """
    user_prompt = user_prompt.format(table=table)

    # Loop to repeat the call if the output is invalid or empty.
    for attempt in range(1, max_calls + 1):
        try:
            response = groq.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0,
                stream=False,
                response_format={"type": "json_object"},
            )
        except Exception as e:
            # Same policy as extract_scalar_features: report and try again.
            print(f"Attempt {attempt}: LLM call failed with error: {e}")
            continue

        # The content is a string, so it has to be parsed before it can be
        # checked for emptiness (comparing the raw string with {} is always unequal).
        content = response.choices[0].message.content
        try:
            parsed = json.loads(content) if content else None
        except json.JSONDecodeError as e:
            print(f"Attempt {attempt}: model output is not valid JSON: {e}")
            continue

        if isinstance(parsed, dict) and parsed:
            return parsed

    return {}  # Returns {} if no valid output is obtained after max_calls.


def get_embedding_and_features(input_data: Union[str, list], name: str, model: str = "llama-3.3-70b-versatile", max_calls: int = 1) -> dict:
    """
    Extract features from a text or a table and, for text, attach an embedding.

    List input is treated as table data: it is handed to the table extractor
    and its result is returned as is (no embedding is computed).

    Non-blank string input goes through the text extractor. The embedded
    string is "name: <name>", followed by " description: <description>" when
    the extracted features carry a non-empty description. The result is the
    features plus the vector under the "embedding" key.

    Anything else (blank strings, other types) yields {}.
    """
    if isinstance(input_data, list):
        return extract_scalar_features_from_json_table(input_data, model, max_calls)

    if not (isinstance(input_data, str) and input_data.strip()):
        return {}

    features = extract_scalar_features(input_data, model, max_calls)

    # The node name is always embedded; the description is appended when present.
    text_to_embed = "name: " + name
    if "description" in features and features["description"]:
        text_to_embed = text_to_embed + " description: " + features["description"]

    enriched = dict(features)
    enriched["embedding"] = np.array(embedder.embed_query(text_to_embed))
    return enriched
def enrich_graph_with_embeddings(graph: KnowledgeGraph, model: str = "llama-3.3-70b-versatile", max_calls: int = 1, process_tables: bool = False) -> None:
    """
    Enrich the attributes of every node of the graph in place.

    For each node in the graph:
      - If the node has a 'text' attribute that is a string, it is passed to
        get_embedding_and_features and the returned keys are merged directly
        into the node's attributes.
      - If 'process_tables' is True and the node has a 'tables' attribute, the
        same is done with the tables.
    Nodes that do not have these attributes are left unchanged.

    Args:
        graph: Graph whose nodes are updated in place.
        model: Name of the chat model used for feature extraction.
        max_calls: Maximum number of LLM attempts per attribute.
        process_tables: Enables table processing. It is off by default, as
            table processing was effectively disabled before this flag existed.
    """
    for node in graph.nodes:
        attributes = node.attributes

        # Process text attribute if present.
        text = attributes.get("text")
        if isinstance(text, str):
            text_features = get_embedding_and_features(
                input_data=text, name=node.name, model=model, max_calls=max_calls
            )
            attributes.update(text_features)

        # Process tables attribute if requested and present. The node name is
        # read from the node itself, so this also works when 'text' is absent.
        if process_tables and "tables" in attributes:
            table_features = get_embedding_and_features(
                input_data=attributes["tables"], name=node.name, model=model, max_calls=max_calls
            )
            attributes.update(table_features)



In [None]:

# Example usage
if __name__ == "__main__":
    # Sample datasheet text used to exercise get_embedding_and_features on plain-text input.
    sample_text = """" "Installation",
S upply: No supply voltage required because loop-powered. \nPower dissipation: \u2264 1.1 W per channel at 40 mA, 30 V loop supply. \nIsolation (Test Voltage): I.S. Out/In 1.5 KV; I.S. Out/I.S. Out 500 V; In/In 500 V. \n\nOutput Signal to Hazardous Area:\nOutput: 1 to 40 mA. \nOutput characteristic (typical): \nVout = (Vin - 1.5) - (0.4 x Iout) for 6 V < Vin < 23 V. \nVout = 22 - (0.4 x Iout) for 23 V < Vin < 30 V. \n4-20 mA output on load of 100 to 600 \u2126; Accuracy \u2264 1 %. \nResponse time: 50 ms (10 to 90 % step change). \n\n\nInput Signal to Safe Area:\nOperating voltage range: 6 to 30 V (loop powered). \nInput current: 1 to 40 mA (loop powered). \nVoltage drop-out: 9.5 V at 20 mA and with 500 \u2126 load. \nOpen circuit consumption: \u2264 0.4 mA at 20 V. \n\n\nPerformance:\nReference ambient temperature conditions: 23 \u00b1 1 \u00b0C. \nCurrent transfer error: \u2264 400 \u00b5A (6 V <Vin< 23 V; 1 mA <Iout< 40 mA). \nTemperature influence: \u2264 \u00b1 0.01 % for a 1 \u00b0C change. \n\n\nCompatibility:\nCE mark compliant, conforms to Directive: \n2014/34/EU ATEX, 2014/30/EU EMC, 2014/35/EU LVD, 2011/65/EU RoHS. \n\n\nEnvironmental conditions:\nOperating: temperature limits -20 to + 60 \u00b0C, \nrelative humidity max 90 % non condensing, up to 35 \u00b0C. \nStorage: temperature limits \u2013 45 to + 80 \u00b0C. \n\n\nSafety Description:\nATEX: II (1)G [Ex ia Ga] IIC, II (1)D [Ex ia Da] IIIC, I (M1) [Ex ia Ma] I; II 3G Ex ec IIC T4 Gc \nIECEx: [Ex ia Ga] IIC, [Ex ia Da] IIIC, [Ex ia Ma] I; Ex ec IIC T4 Gc \nINMETRO: [Ex ia Ga] IIC, [Ex ia Da] IIIC, [Ex ia Ma] I \nUo/Voc = 25.2 V, Io/Isc = 93 mA, Po/Po = 581 mW at terminals 13-14, 15-16. \nUm = 250 Vrms, -20 \u00b0C \u2264 Ta \u2264 60 \u00b0C. \n\n\nApprovals:\nDMT 01 ATEX E 042 X conforms to EN60079-0, EN60079-11. \nIECEx BVS 07.0027X conforms to IEC60079-0, IEC60079-11. \nIMQ 09 ATEX 013 X conforms to EN60079-0, EN60079-7. \nIECEx IMQ 13.0011X conforms to IEC60079-0, IEC60079-7. 
\nINMETRO DNV 13.0108 X conforms to ABNT NBR IEC60079-0, ABNT NBR IEC60079-11. \nFM & FM-C No. 3024643, 3029921C, conforms to Class 3600, 3610, 3611, 3810 and \nC22.2 No.142, C22.2 No.157, C22.2 No.213, E60079-0, E60079-11, E60079-15, \n\u0415\u0410\u042d\u0421 RU \u0421-IT.HA67.B.00113/20 conforms to GOST 31610.0, GOST 31610.11, GOST 31610.15. \nC\u0426 16.0034 X conforms to \u0414\u0421\u0422\u0423 7113, \u0413\u041e\u0421\u0422 22782.5-78, \u0414\u0421\u0422\u0423 I\u0415\u0421 60079-15. \nDNV No. TAA00002BM and KR No.MIL20769-EL001 Cert. for maritime applications. \n\n\nMounting:\nEN/IEC60715 TH 35 DIN-Rail. \nWeight: about 125 g D1022D, 110 g D1022S. \nConnection: by polarized plug-in disconnect screw terminal blocks to accomodate terminations up to 2.5 mm2. \nLocation: Safe Area/Non Hazardous Locations or Zone 2, Group IIC T4, \nClass I, Division 2, Groups A, B, C, D Temperature Code T4 and Class I, Zone 2, Group IIC, IIB, IIA T4 installation. \nProtection class: IP 20. \nDimensions: Width 22.5 mm, Depth 99 mm, Height 114.5 mm. \n\n\n",
     ."""
    # The second argument is the node name that gets embedded together with the description.
    feats = get_embedding_and_features(sample_text,"suca")
    # Drop the embedding vector so that only the extracted features are printed.
    feats_without_embedding = {k: v for k, v in feats.items() if k != "embedding"}
    print(feats_without_embedding)


In [None]:

# ---- Esempio di utilizzo ----

# Creiamo un grafo di esempio con due nodi aventi l'attributo "text"
# Build a sample graph with two nodes that carry a "text" attribute.
# BaseNode.type is declared as a list of strings, so the type is passed as a
# one-element list rather than as a bare string.
kg = KnowledgeGraph(
    nodes=[
        BaseNode(
            name="1",
            type=["Entity"],
            attributes={
                "text": "Questo dispositivo può misurare temperature da -10.5°C a +45°C, con un consumo massimo di 2.5W. Pesa circa 1.2kg e costa 99 euro.",
                "tables": [
                    {
                        "columns": ["", "SAFE AREA"],
                        "data": [
                            ["7", "1st pole of Out 1 (NC contact) for NE Load or F&G/ND Load"],
                            ["8", "1st pole of Out 2 (NC contact) for NE Load or F&G/ND Load"],
                            ["9", "1st pole of NO contact for Service load"],
                            ["10", "2nd pole of NO contact for Service load"],
                            ["11", "2nd pole of Out 1 (NC contact) for NE Load or F&G/ND Load"],
                            ["12", "2nd pole of Out 2 (NC contact) for NE Load or F&G/ND Load"],
                        ],
                    }
                ],
            },
        ),
        BaseNode(
            name="2",
            type=["Entity"],
            attributes={"text": "", "other_attr": "value", "another_attr": 42},
        ),
    ],
    edges=[]
)

# Enrich the graph with the features obtained from the text strings.
enrich_graph_with_embeddings(kg)

# Print the updated nodes.
for node in kg.nodes:
    print(node)

In [None]:
# Enrich the full graph, allowing up to three LLM attempts per attribute.
# The author's notes mention "deepseek-r1-distill-llama-70b-specdec" as an
# alternative model and "usage limit" as the reason for the one used here.
enrich_graph_with_embeddings(
    graph,
    model="deepseek-r1-distill-llama-70b",
    max_calls=3,
)

In [None]:
# Push the enriched graph to the Neo4j instance configured at the top of the notebook.
load_knowledge_graph_into_neo4j(
    graph,
    neo4j_uri,
    neo4j_username,
    neo4j_password,
)

In [None]:
# Natural-language request that drives the three steps below.
user_prompt = "find node with  name D1072 Operation"

# Step a: let the LLM turn the request into a Cypher query.
cypher_query = generate_cypher_query_from_prompt(user_prompt)
print(f"Query generated by the LLM:\n{cypher_query}")

# Step b: run the generated query against Neo4j and keep the rows.
query_results = run_cypher_query_on_neo4j(
    query=cypher_query,
    uri=neo4j_uri,
    user=neo4j_username,
    password=neo4j_password,
)

# Step c: build the final answer from the request, the generated query and
# the rows the database returned.
user_answer = answer_using_query_results(user_prompt, cypher_query, query_results)
print("\nAnswer for the user:", user_answer, sep="\n")

# Next: run the graph and vector retrievers


In [None]:
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

# (substring looked for in a node's existing labels, label added when it matches)
label_aliases = [
    ("Application", "application"),
    ("Approvals", "approvals"),
    ("Configuration Summary Table", "configuration_summary_table"),
    ("Instruction", "instruction"),
]

# (index name, label the index is declared on)
# NOTE(review): these labels differ from the lowercase labels added above, and
# labels are case-sensitive - confirm which label set the indexes should target.
vector_indexes = [
    ("application-embeddings", "Application"),
    ("approvals-embeddings", "Approvals"),
    ("configuration_summary_table-embeddings", "Configuration Summary Table"),
    ("instruction-embeddings", "Instruction"),
]

# IF NOT EXISTS keeps the cell re-runnable.
create_vector_index_template = '''
CREATE VECTOR INDEX `{index_name}` IF NOT EXISTS
FOR (n:`{label}`) ON (n.embedding)
OPTIONS {{
  indexConfig: {{
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }}
}}
'''

# One session for the whole setup, closed by the context manager. Every
# statement is sent on its own (a single run() call takes a single statement)
# and consumed so that it has finished before the next one starts.
with driver.session() as session:
    # Tag every node that carries an embedding.
    session.run(
        "MATCH (n) WHERE n.embedding IS NOT NULL SET n:Abstract"
    ).consume()

    # Add a normalized label to nodes whose labels contain the given words.
    for needle, alias in label_aliases:
        session.run(
            "MATCH (n) "
            "WHERE ANY(label IN labels(n) WHERE label CONTAINS $needle) "
            f"SET n:`{alias}`",
            parameters={"needle": needle},
        ).consume()

    for index_name, label in vector_indexes:
        session.run(
            create_vector_index_template.format(index_name=index_name, label=label)
        ).consume()

    # The index on Abstract nodes is always rebuilt from scratch.
    session.run("DROP INDEX `abstract-embeddings` IF EXISTS").consume()
    session.run(
        create_vector_index_template.format(index_name="abstract-embeddings", label="Abstract")
    ).consume()

In [None]:
def answer_query(q:str,model:str = "llama-3.3-70b-versatile",k:int=5) -> str:
    """
    Answer a question from the nodes most similar to it in the vector index.

    The question is embedded with the module-level `embedder`, the
    'abstract-embeddings' vector index is queried for the `k` nearest nodes,
    and their labels, properties (without the embedding) and scores are given
    to the chat model as the only context for the answer.

    Args:
        q: The user's question.
        model: Name of the chat model that writes the answer.
        k: Number of nearest nodes to retrieve.

    Returns:
        The content of the model's reply.
    """
    # Define the Cypher query
    query_cypher = (
        "CALL db.index.vector.queryNodes('abstract-embeddings', $k, $embedding) "
        "YIELD node, score RETURN node, score"
    )

    # Generate the embedding vector for the query and pass it as a plain list,
    # which the driver can always serialize as a query parameter.
    params = {'embedding': list(embedder.embed_query(q)), 'k': k}

    # Execute the query and collect one text fragment per returned node.
    # The context managers close the session and the driver even on errors.
    fragments = []
    with GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        with driver.session() as session:
            for record in session.run(query_cypher, parameters=params):
                node = record["node"]
                score = record["score"]
                # Copy the node's properties without the 'embedding' field
                properties = dict(node)
                properties.pop("embedding", None)
                node_str = f"labels={node.labels} properties={properties}"
                fragments.append(f"Node: {node_str}\nScore: {score}\n\n")
    all_results = "".join(fragments)

    # Format the prompt by inserting the query and the context (all_results)
    prompt = f"""
Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned.
mainly consider the text with major score.

# Question:
{q}

# Context:
{all_results}

# Answer:
"""

    # Call the chat completions API using the provided model
    response = groq.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": ""},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        stream=False,
    )

    # Extract and return the content from the API response
    return response.choices[0].message.content


In [None]:
# Ask one sample question through the vector-index retrieval path and show the reply.
print(
    answer_query(
        q="give me the Input performance Ref. Junction Compensation influence of D1072"
    )
)

# Batch processing


In [None]:
#deepseek-r1-distill-llama-70b
'''# Your typical synchronous API call:
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "user", "content": "What is quantum computing?"}
    ]
)

# The same call in batch format (must be on a single line as JSONL):
{"custom_id": "quantum-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "llama-3.1-8b-instant", "messages": [{"role": "user", "content": "What is quantum computing?"}]}}
'''
#2. Upload Your Batch File
import requests # pip install requests first!

def upload_file_to_groq(api_key, file_path):
    """
    Upload a JSONL batch file to the Groq files endpoint.

    Args:
        api_key: Groq API key, sent as a bearer token.
        file_path: Path of the local file to upload.

    Returns:
        The decoded JSON body of the response.
    """
    url = "https://api.groq.com/openai/v1/files"

    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    data = {
        "purpose": "batch"
    }

    # The request is made inside the `with` block so the file is still open
    # while it is being sent and is closed right afterwards, even on failure.
    with open(file_path, "rb") as batch_file:
        files = {
            "file": ("batch_file.jsonl", batch_file)
        }
        response = requests.post(url, headers=headers, files=files, data=data)

    return response.json()

# Usage example: upload the local JSONL batch file and show the API reply.
file_path = "batch_file.jsonl"  # Path to your JSONL file
api_key = os.getenv("GROQ_API_KEY")

try:
    result = upload_file_to_groq(api_key, file_path)
    print(result)
except Exception as error:
    print(f"Error: {error}")

# 3. Create the batch job

import requests # pip install requests first! 

def create_batch(api_key, input_file_id):
    """
    Create a batch job for an uploaded file and return the decoded JSON reply.

    The job targets the "/v1/chat/completions" endpoint with a "24h"
    completion window; `api_key` is sent as a bearer token.
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "input_file_id": input_file_id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
    }
    batch_response = requests.post(
        "https://api.groq.com/openai/v1/batches",
        headers=request_headers,
        json=payload,
    )
    return batch_response.json()

# Usage example
# Read the key from the environment, as the upload step does, instead of
# overwriting it with a placeholder string that the API would reject.
api_key = os.getenv("GROQ_API_KEY")
file_id = "file_01jh6x76wtemjr74t1fh0faj5t" # replace with your `id` from file upload API response object

try:
    result = create_batch(api_key, file_id)
    print(result)
except Exception as e:
    print(f"Error: {e}")
    
# 4. Get the batch job status

import requests # pip install requests first!

def get_batch_status(api_key, batch_id):
    """
    Fetch the batch job identified by `batch_id` and return the decoded JSON reply.

    `api_key` is sent as a bearer token.
    """
    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    status_response = requests.get(
        f"https://api.groq.com/openai/v1/batches/{batch_id}",
        headers=request_headers,
    )
    return status_response.json()

# Usage example
# Read the key from the environment, as the upload step does, instead of
# overwriting it with a placeholder string that the API would reject.
api_key = os.getenv("GROQ_API_KEY")
batch_id = "batch_01jh6xa7reempvjyh6n3yst2zw"

try:
    result = get_batch_status(api_key, batch_id)
    print(result)
except Exception as e:
    print(f"Error: {e}")
    
# 5. Get the batch job results


import requests # pip install requests first! 

def download_file_content(api_key, output_file_id, output_file):
    """
    Download the content of a Groq file and save it locally.

    Args:
        api_key: Groq API key, sent as a bearer token.
        output_file_id: Identifier of the file to download.
        output_file: Local path the content is written to.

    Returns:
        A confirmation message naming the local path.

    Raises:
        requests.HTTPError: If the server answers with an error status. Nothing
            is written in that case, so an error body is never saved as if it
            were the batch output.
    """
    url = f"https://api.groq.com/openai/v1/files/{output_file_id}/content"

    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    response = requests.get(url, headers=headers)
    response.raise_for_status()

    # Write the content to a file
    with open(output_file, 'wb') as f:
        f.write(response.content)

    return f"File downloaded successfully to {output_file}"

# Usage example
# Read the key from the environment, as the upload step does, instead of
# overwriting it with a placeholder string that the API would reject.
api_key = os.getenv("GROQ_API_KEY")
output_file_id = "file_01jh6xa97be52b7pg88czwrrwb" # replace with your own completed batch job's `output_file_id`
output_file = "batch_output.jsonl" # replace with your own file of choice to download batch job contents to

try:
    # Download the job's output file, not the input file that was uploaded.
    result = download_file_content(api_key, output_file_id, output_file)
    print(result)
except Exception as e:
    print(f"Error: {e}")

def main():
    """
    Send two sample prompts as a single batch and print each response.

    Every request carries a unique "id" so that responses can be mapped back
    to it. If the batch call raises, the error is printed and nothing else
    happens.
    """
    # Individual request payloads; extra parameters such as "temperature" and
    # "max_tokens" travel with each request.
    joke_request = {
        "id": "req1",
        "prompt": "Tell me a joke",
        "temperature": 0.7,
        "max_tokens": 50,
    }
    relativity_request = {
        "id": "req2",
        "prompt": "Explain the theory of relativity in simple terms",
        "temperature": 0.5,
        "max_tokens": 150,
    }

    # Batch payload: the requests wrapped under a "requests" key.
    batch_payload = {"requests": [joke_request, relativity_request]}

    try:
        batch_response = batch_llm_requests(batch_payload)
    except Exception as error:
        print(f"Error in batch processing: {error}")
        return

    # The response is read as a mapping with a "responses" list whose items
    # expose "id" and "output".
    for entry in batch_response.get("responses", []):
        print(f"Response for {entry.get('id')}: {entry.get('output')}")


if __name__ == "__main__":
    main()


In [None]:

from langchain_community.vectorstores.neo4j_vector import Neo4jVector

# Settings for a BGE embedding model.
# NOTE(review): these three names are not passed to anything in this cell; the
# vector store below receives the module-level `embedder` instead. Confirm
# whether they are still needed or whether a BGE embedder was meant to be built here.
model_name = "BAAI/bge-base-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

# Build a LangChain vector store on top of the existing Neo4j graph, using the
# connection settings loaded at the top of the file. Nodes labelled "Chunk" are
# targeted, with "embedding" as the vector property and "description" as the
# text property.
kg_vector_search = Neo4jVector.from_existing_graph(
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password,
    embedding=embedder,  # NOTE(review): `embedder` is the neo4j_graphrag OpenAIEmbeddings object; presumably LangChain expects its own Embeddings interface — verify compatibility
    node_label="Chunk",
    embedding_node_property="embedding",
    text_node_properties=["description"],
)

# Useful function

In [None]:


# OpenAI client built with the library defaults: no API key or base URL is
# passed here, unlike the `gpt` client at the top of the file, which is
# pointed at Groq.
client = OpenAI()

# System prompt sent by generate_prompt(): it instructs the model to turn a
# task description (or an existing prompt) into a detailed system prompt.
# Leading and trailing whitespace is removed by the .strip() at the end.
META_PROMPT = """
Given a task description or existing prompt, produce a detailed system prompt to guide a language model in completing the task effectively.

# Guidelines

- Understand the Task: Grasp the main objective, goals, requirements, constraints, and expected output.
- Minimal Changes: If an existing prompt is provided, improve it only if it's simple. For complex prompts, enhance clarity and add missing elements without altering the original structure.
- Reasoning Before Conclusions**: Encourage reasoning steps before any conclusions are reached. ATTENTION! If the user provides examples where the reasoning happens afterward, REVERSE the order! NEVER START EXAMPLES WITH CONCLUSIONS!
    - Reasoning Order: Call out reasoning portions of the prompt and conclusion parts (specific fields by name). For each, determine the ORDER in which this is done, and whether it needs to be reversed.
    - Conclusion, classifications, or results should ALWAYS appear last.
- Examples: Include high-quality examples if helpful, using placeholders [in brackets] for complex elements.
   - What kinds of examples may need to be included, how many, and whether they are complex enough to benefit from placeholders.
- Clarity and Conciseness: Use clear, specific language. Avoid unnecessary instructions or bland statements.
- Formatting: Use markdown features for readability. DO NOT USE ``` CODE BLOCKS UNLESS SPECIFICALLY REQUESTED.
- Preserve User Content: If the input task or prompt includes extensive guidelines or examples, preserve them entirely, or as closely as possible. If they are vague, consider breaking down into sub-steps. Keep any details, guidelines, examples, variables, or placeholders provided by the user.
- Constants: DO include constants in the prompt, as they are not susceptible to prompt injection. Such as guides, rubrics, and examples.
- Output Format: Explicitly the most appropriate output format, in detail. This should include length and syntax (e.g. short sentence, paragraph, JSON, etc.)
    - For tasks outputting well-defined or structured data (classification, JSON, etc.) bias toward outputting a JSON.
    - JSON should never be wrapped in code blocks (```) unless explicitly requested.

The final prompt you output should adhere to the following structure below. Do not include any additional commentary, only output the completed system prompt. SPECIFICALLY, do not include any additional messages at the start or end of the prompt. (e.g. no "---")

[Concise instruction describing the task - this should be the first line in the prompt, no section header]

[Additional details as needed.]

[Optional sections with headings or bullet points for detailed steps.]

# Steps [optional]

[optional: a detailed breakdown of the steps necessary to accomplish the task]

# Output Format

[Specifically call out how the output should be formatted, be it response length, structure e.g. JSON, markdown, etc]

# Examples [optional]

[Optional: 1-3 well-defined examples with placeholders if necessary. Clearly mark where examples start and end, and what the input and output are. User placeholders as necessary.]
[If the examples are shorter than what a realistic example is expected to be, make a reference with () explaining how real examples should be longer / shorter / different. AND USE PLACEHOLDERS! ]

# Notes [optional]

[optional: edge cases, details, and an area to call or repeat out specific important considerations]
""".strip()

def generate_prompt(task_or_prompt: str, model: str = "gpt-4.5-preview") -> Optional[str]:
    """Ask the chat model to write a system prompt for the given task.

    META_PROMPT is sent as the system message and the task (or the prompt to
    improve) as the user message, through the module-level `client`.

    Args:
        task_or_prompt: Task description, goal, or existing prompt to improve.
        model: Chat model to query. The default keeps the model this function
            has always used.
            NOTE(review): "gpt-4.5-preview" is a preview model — confirm it is
            still served, or pass another model name.

    Returns:
        The message content of the first completion choice, as returned by
        the API (annotated Optional because the client types it that way).
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": META_PROMPT,
            },
            {
                "role": "user",
                "content": "Task, Goal, or Current Prompt:\n" + task_or_prompt,
            },
        ],
    )

    return completion.choices[0].message.content

# Deprecated functions

In [None]:
'''
#query usata momentaneamente
query =
MATCH (n)
WHERE n.embedding IS NOT NULL
SET n:Chunk;

driver.session().run(query)
query =
DROP INDEX text_embeddings IF EXISTS;
driver.session().run(query)
query =CREATE VECTOR INDEX `abstract-embeddings`
FOR (a:Abstract) ON (a.embedding)
OPTIONS {
  indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }
}
driver.session().run(query)
#query usata momentaneamente
query =
MATCH (n)
WHERE n.embedding IS NOT NULL
SET n:Chunk;
driver.session().run(query)
query =
DROP INDEX text_embeddings IF EXISTS;
create_vector_index(driver, name="text_embeddings", label="Chunk", embedding_property="description", dimensions=1536, similarity_fn="cosine")

# Vector Retriever

vector_retriever = VectorRetriever(
   driver,
   index_name="text_embeddings",
   embedder=embedder,
)

graph_retriever = VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
// 1) Start from the given node (renamed here as Chunk)
WITH node AS Chunk
// 2) Traverse all relationships (of any type) 2 to 3 hops away from Chunk
MATCH path = (Chunk)-[rels*2..3]-()
// 3) Unwind nodes and relationships from each path
UNWIND nodes(path) AS c
UNWIND relationships(path) AS r
// 4) Collect distinct nodes (Chunks) and relationships (rels)
WITH collect(DISTINCT c) AS Chunks, collect(DISTINCT r) AS rels
// 5) Format and return the context:
//    For each node, use its 'text' property,
//    and for each relationship, output a formatted string using the start node name, type, details, and end node name.
RETURN apoc.text.join([c IN Chunks | c.text], '\n') +
       apoc.text.join([r IN rels |
         startNode(r).name + ' - ' + type(r) + ' ' + r.details + ' -> ' + endNode(r).name],
         '\n') AS info

"""
)


rag_template = RagTemplate(template=Answer the Question using the following Context. Only respond with information mentioned in the Context. Do not inject any speculative information not mentioned.

# Question:
{query_text}

# Context:
{context}

# Answer:
, expected_inputs=['query_text', 'context'])

vector_rag  = GraphRAG(llm=llm, retriever=vector_retriever, prompt_template=rag_template)

graph_rag = GraphRAG(llm=llm, retriever=graph_retriever, prompt_template=rag_template)

q = "find node with  name D1072 Operation"
print("Vector RAG:")
print(vector_rag.search(q, retriever_config={'top_k':5}).answer)
print("Graph RAG:")
print(graph_rag.search(q, retriever_config={'top_k':5}).answer)
'''

In [None]:
# # Creating dynamic models using create_model
# # These models represent the input format that the LLM might return for nodes and edges
# DynamicNode = create_model(
#     'DynamicNode',
#     node_id=(str, ...),
#     label=(str, ""),         # Optional node label
#     description=(str, ""),   # Optional description
#     reason=(str, ...),  
#     additional_info=(dict, Field(default_factory=dict))
# )

# DynamicEdge = create_model(
#     'DynamicEdge',
#     source=(str, ...),
#     target=(str, ...),
#     relation=(str, ...),
#     confidence=(float, 0.0),  # A possible confidence value for the relationship
#     reason=(str, ...),      
#     metadata=(dict, Field(default_factory=dict))
# )

# # Class for interacting with the LLM and populating the knowledge graph
# class LLMKnowledgeGraphFiller(BaseModel):
#     graph: KnowledgeGraph = Field(default_factory=KnowledgeGraph)

#     def fill_from_llm_response(self, response: dict):
#         """
#         Populates the knowledge graph from an LLM-generated response.
#         The response must be a dictionary with the keys 'nodes' and 'edges', each containing a list of dictionaries.
#         """
#         # Processing nodes
#         for node_data in response.get("nodes", []):
#             # Create the dynamic model to validate data
#             dynamic_node = DynamicNode(**node_data)
#             # Convert to BaseNode (customizing attributes as needed)
#             base_node = BaseNode(
#                 node_id=dynamic_node.node_id,
#                 attributes={
#                     "label": dynamic_node.label,
#                     "description": dynamic_node.description,
#                     **dynamic_node.additional_info
#                 }
#             )
#             self.graph.add_node(base_node)
        
#         # Processing edges
#         for edge_data in response.get("edges", []):
#             dynamic_edge = DynamicEdge(**edge_data)
#             base_edge = BaseEdge(
#                 source=dynamic_edge.source,
#                 target=dynamic_edge.target,
#                 relation=dynamic_edge.relation,
#                 attributes={
#                     "confidence": dynamic_edge.confidence,
#                     **dynamic_edge.metadata
#                 }
#             )
#             self.graph.add_edge(base_edge)

# # Simulating a response generated by the LLM
# llm_response = {
#     "nodes": [
#         {
#             "node_id": "1",
#             "label": "Persona",
#             "description": "Individuo umano",
#             "reason": "Estratto da un database pubblico",  # Added reason
#             "additional_info": {"age": 30, "profession": "Ingegnere"}
#         },
#         {
#             "node_id": "2",
#             "label": "Città",
#             "description": "Centro abitato",
#             "reason": "Dati provenienti da una fonte geografica",  # Added reason
#             "additional_info": {"name": "Roma", "population": 2873000}
#         }
#     ],
#     "edges": [
#         {
#             "source": "1",
#             "target": "2",
#             "relation": "vive a",
#             "confidence": 0.95,
#             "reason": "Informazione confermata da un sondaggio",  # Added reason
#             "metadata": {"since": "2010"}
#         }
#     ]
# }

# filler = LLMKnowledgeGraphFiller()
# filler.fill_from_llm_response(llm_response)

# # Displaying the constructed knowledge graph
# print("Nodes in the Knowledge Graph:")
# for node in filler.graph.nodes:
#     print(node)

# print("\nEdges in the Knowledge Graph:")
# for edge in filler.graph.edges:
#     print(edge)

'''system_prompt = (
    "You are a KnowledgeGraph database that outputs KnowledgeGraphs in JSON.\n"
    f" The JSON object must use the schema: {json.dumps(KnowledgeGraph.model_json_schema(), indent=2)}"
    "For each node and relationship you create, try to be clear about what motivated you to create it" #(?)
    "# Knowledge Graph Instructions\n"
    "## 1. Overview\n"
    "You are a top-tier algorithm designed for extracting information in structured "
    "formats to build a knowledge graph from a range of documents describing a range of G.M. International interface modules and power supplies, certified to SIL 2 or SIL3 for safety systems. \n"
    "Try to capture as much information from the json text as possible without "
    "sacrificing accuracy. Do not add any information that is not explicitly "
    "mentioned in the text.\n"
    "- **Nodes** represent entities and concepts.\n"
    "- The aim is to achieve simplicity and clarity in the knowledge graph, making it\n"
    "accessible for a vast audience.\n"
    "## 2. Labeling Nodes\n"
    "- **Consistency**: Ensure you use available types for node labels.\n"
    "Ensure you use basic or elementary types for node labels.\n"
    "- For example, when you identify an entity representing a person, "
    "always label it as **'person'**. Avoid using more specific terms "
    "like 'mathematician' or 'scientist'."
    "- **Node IDs**: Never utilize integers as node IDs. Node IDs should be "
    "names or human-readable identifiers found in the text.\n"
    "- **Relationships** represent connections between entities or concepts.\n"
    "Ensure consistency and generality in relationship types when constructing "
    "knowledge graphs. Instead of using specific and momentary types "
    "such as 'BECAME_PROFESSOR', use more general and timeless relationship types "
    "like 'PROFESSOR'. Make sure to use general and timeless relationship types!\n"
    "## 3. Coreference Resolution\n"
    "- **Maintain Entity Consistency**: When extracting entities, it's vital to "
    "ensure consistency.\n"
    'If an entity, such as "John Doe", is mentioned multiple times in the text '
    'but is referred to by different names or pronouns (e.g., "Joe", "he"),'
    "always use the most complete identifier for that entity throughout the "
    'knowledge graph. In this example, use "John Doe" as the entity ID.\n'
    "Remember, the knowledge graph should be coherent and easily understandable, "
    "so maintaining consistency in entity references is crucial.\n"
    "## 4. Strict Compliance\n"
    "Adhere to the rules strictly. Non-compliance will result in termination."
    "## **Specific Instructions"
    "Extract information relevant to the relay Output Module** including: "
    "**General Description**, **Technical Data**, **Diagnostic Functions**, **Environmental Conditions**, and **Certifications**."
    "Structure nodes and relationships to reflect connections between components, functionalities, and safety features."
    "Use consistent labeling for technical specifications (e.g., **'voltage'**, **'current'**, **'safety certification'**)."
)
text = """{
    "features": "FEATURES\n1. SIL 2 / SC 3 (pending)\n2. Input from Zone 0 / Division 1 (pending)\n3. Installation in Zone 2 / Division 2 (pending)\n4. Loop disconnection to ease maintenance operations\n5. HART\u00ae compatible\n6. Line & Load short/open circuit programmable diagnostics\n7. Out-of-range fault with programmable thresholds\n8. Field fault mirroring to the DCS/PLC IO Card\n9. High Accuracy\n10. Three port isolation, Input/Output/Supply\n",
"""
from groq import Groq
from together import Together
import together

togetherai = Together()

groq = Groq()

def get_KnowledgeGraph(input_prompt: str) -> KnowledgeGraph:
    chat_completion = togetherai.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": f"make a knowledge graph for {input_prompt}",
            },
        ],
        model="Qwen/QwQ-32B",
        temperature=0,
        # Streaming is not supported in JSON mode
        stream=False,
        # Enable JSON mode by setting the response format
        response_format={"type": "json_object"},
    )
    return KnowledgeGraph.model_validate_json(chat_completion.choices[0].message.content)




kg = get_KnowledgeGraph(text)
print(kg)
'''
'''import os
import json
def extract_json_fields(file_path):
    """
    Legge un file JSON contenente pagine strutturate e restituisce un nuovo JSON con:
        - "title": il titolo della pagina,
        - "text": il testo (se non presente, viene assegnata la stringa vuota),
        - "tables": le tabelle (se non presente, viene assegnata una lista vuota),
        - "subsections": le subsezioni (se non presente, viene assegnata una lista vuota),
        - "supersections": le supersezioni (se non presente, viene assegnata una lista vuota)
    
    Args:
        file_path (str): Percorso del file JSON da elaborare.
    
    Returns:
        dict: Un dizionario con le stesse chiavi del JSON originale, ma per ogni pagina sono presenti
              solo i campi "title", "text", "tables", "subsections" e "supersections".
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    result = {}
    for key, page in data.items():
        title = page.get("title", "")
        text = page.get("text", "")
        tables = page.get("tables", [])
        subsections = page.get("subsections", [])
        supersections = page.get("supersections", [])
        result[key] = {
            "title": title,
            "text": text,
            "tables": tables,
            "subsections": subsections,
            "supersections": supersections
        }
    
    return result
def json_folder_to_knowledge_graph(folder_path: str) -> KnowledgeGraph:
    """
    Legge tutti i file JSON presenti nella cartella specificata e li trasforma in un unico KnowledgeGraph.
    
    Per ciascun file:
      - Se esistono i nodi "0" e "1", li fonde:
          * Nuovo nodo "0" avrà:
              id = title del nodo "0",
              type = title del nodo "1",
              attributes = attributi di "1" (eccetto subsections/supersections),
              mentre subsections e supersections rimangono quelle di "0".
          * In tutte le relazioni, ogni riferimento a 1 viene sostituito da 0.
      
      - Per ogni nodo (chiave diversa da "-1"):
          * Crea un BaseNode con:
                id = "<title_di_0>_<title_del_nodo>",
                type = il suo title,
                attributes = tutti gli attributi eccetto subsections, supersections e fused_id.
          * Se il nodo ha chiave "-1", allora il nodo verrà creato con id e type "root" e senza attributi.
      
      - Crea archi:
          * Se il nodo X ha in "subsections" il riferimento a Y, aggiunge un arco da X a Y con type = "has_" + (Y.title).
          * Se il nodo X ha in "supersections" il riferimento a Z, aggiunge un arco da Z a X con type = "has_" + (X.title).
    
    Alla fine, tutti i nodi con chiave "-1" (se presenti in più file) vengono fusi in un unico nodo "root".
    Inoltre, per ogni arco con source "root", il type viene impostato a "product".
    """
    kg = KnowledgeGraph()
    
    # Lista per tenere traccia degli eventuali nodi "root" (derivati dalla chiave "-1")
    root_ids = set()
    
    # Elaborazione di ciascun file JSON nella cartella
    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(folder_path, filename)
        data = extract_json_fields(filepath)
        
        # Se esistono i nodi "0" e "1", eseguo la fusione
        if "0" in data and "1" in data:
            node0 = data["0"]
            node1 = data["1"]
            # Il prefisso è il title del nodo "0"
            prefix = node0.get("title", "unknown")
            # Il type finale del nodo 0 diventa il title di node1
            node0["fused_id"] = prefix  # segnaliamo che il nodo 0 è stato fuso
            node0["title"] = node1.get("title", "unknown")
            # Unisco gli attributi di node1 (esclusi subsections, supersections, title)
            merged_attrs = { k: v for k, v in node1.items() if k not in ["subsections", "supersections", "title"] }
            # Assicuro che node0 abbia subsections e supersections
            node0["subsections"] = node0.get("subsections", [])
            node0["supersections"] = node0.get("supersections", [])
            for k, v in merged_attrs.items():
                node0[k] = v
            # Rimuovo node "1"
            del data["1"]
            # In tutte le relazioni, sostituisco ogni riferimento a 1 con 0
            for n_id, n_data in data.items():
                if "subsections" in n_data:
                    n_data["subsections"] = [0 if x == 1 else x for x in n_data["subsections"]]
                if "supersections" in n_data:
                    n_data["supersections"] = [0 if x == 1 else x for x in n_data["supersections"]]
        else:
            # Se non è presente il nodo "0", imposto un prefisso di default
            prefix = "unknown"
        
        # Mappa locale per associare la chiave originale al nuovo id
        mapping_ids = {}
        # Elaborazione dei nodi del file corrente
        for key, node_data in data.items():
            # Se il nodo è quello radice (chiave "-1"), lo gestisco in modo speciale
            if str(key) == "-1":
                new_id = "root"
                new_type = "root"
                attributes = {}
                root_ids.add("root")
            else:
                # Nuovo id: "<prefix>_<title_del_nodo>"
                node_title = node_data.get("title", "unknown")
                new_id = f"{prefix}_{node_title}"
                new_type = node_title
                # Escludo i campi di relazione e "fused_id" dagli attributi
                attributes = { k: v for k, v in node_data.items() if k not in ["subsections", "supersections", "title", "fused_id"] }
            mapping_ids[str(key)] = new_id
            node = BaseNode(id=new_id, type=new_type, attributes=attributes)
            kg.add_node(node)
        
        # Creazione degli archi del file corrente (usando un set per evitare duplicati)
        edge_set = set()
        for key, node_data in data.items():
            current_id = mapping_ids[str(key)]
            # Archi per le subsections: da current node a ogni nodo in subsections
            for sub in node_data.get("subsections", []):
                sub_key = str(sub)
                if sub_key in mapping_ids:
                    target_id = mapping_ids[sub_key]
                    # L'edge type si basa sul title del nodo target (preceduto da "has_")
                    # Per il nodo root, il titolo è "root"
                    # (se il target non viene trovato, si usa "has_unknown")
                    target_title = "root" if target_id == "root" else target_id.split("_")[-1]
                    edge_type = f"has_{target_title}"
                    edge_tuple = (current_id, target_id, edge_type)
                    if edge_tuple not in edge_set:
                        kg.add_edge(BaseEdge(source=current_id, target=target_id, type=edge_type))
                        edge_set.add(edge_tuple)
            # Archi per le supersections: da ogni nodo in supersections al current node
            for sup in node_data.get("supersections", []):
                sup_key = str(sup)
                if sup_key in mapping_ids:
                    source_id = mapping_ids[sup_key]
                    # L'edge type si basa sul title del current node (preceduto da "has_")
                    current_title = "root" if current_id == "root" else current_id.split("_")[-1]
                    edge_type = f"has_{current_title}"
                    edge_tuple = (source_id, current_id, edge_type)
                    if edge_tuple not in edge_set:
                        kg.add_edge(BaseEdge(source=source_id, target=current_id, type=edge_type))
                        edge_set.add(edge_tuple)
    
    # --- Fusione dei nodi root (se presenti in più file) ---
    # Se esistono più nodi con id "root", ne mantengo uno solo e aggiorno gli edge
    root_nodes = [node for node in kg.nodes if node.id == "root"]
    if root_nodes:
        primary_root = root_nodes[0]
        primary_root.type = "root"
        primary_root.attributes = {}
        for extra in root_nodes[1:]:
            # Rimuovo il nodo extra e sostituisco le occorrenze nei riferimenti degli edge
            for edge in kg.edges:
                if edge.source == extra.id:
                    edge.source = "root"
                if edge.target == extra.id:
                    edge.target = "root"
            kg.nodes.remove(extra)
        # Per ogni edge avente source "root", imposto edge.type a "product"
        for edge in kg.edges:
            if edge.source == "root":
                edge.type = "product"
    
    return kg



# Esempio di utilizzo:
if __name__ == "__main__":
    folder_path = "prova/"  # Sostituire con il percorso della cartella contenente i file JSON
    graph = json_folder_to_knowledge_graph(folder_path)
    print("Nodi:")
    for node in graph.nodes:
        print(node)
    print("\nArchi:")
    for edge in graph.edges:
        print(edge)
'''
'''
neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

# LLM and Embedding Model
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

llm=OpenAILLM(
   model_name="gpt-4o-mini",
   model_params={
       "response_format": {"type": "json_object"}, # use json_object formatting for best results
       "temperature": 0 # turning temperature down for more deterministic results
   }
)

# Graph Schema Setup
basic_node_labels = ["Object", "Entity", "Group", "Person", "Organization", "Place"]

academic_node_labels = ["ArticleOrPaper", "PublicationOrJournal"]

medical_node_labels = ["Anatomy", "BiologicalProcess", "Cell", "CellularComponent",
                      "CellType", "Condition", "Disease", "Drug",
                      "EffectOrPhenotype", "Exposure", "GeneOrProtein", "Molecule",
                      "MolecularFunction", "Pathway"]

node_labels = basic_node_labels + academic_node_labels + medical_node_labels

# define relationship types
rel_types = ["ACTIVATES", "AFFECTS", "ASSESSES", "ASSOCIATED_WITH", "AUTHORED",
   "BIOMARKER_FOR", "CAUSES", "COMPOUND", "CONTAINS", "DECREASES",
   "DEFINES", "DEMONSTRATES", "DERIVED_FROM", "DIAGNOSES", "DIFFERENTIATES",
   "DISRUPTS", "ENHANCES", "EVIDENCE_FOR", "EXACERBATES", "EXPLAINS",
   "EXPRESSES", "FUNCTIONAL_ASSOCIATION", "HAS", "IMMUNOMODULATES"]

#create text embedder
embedder = OpenAIEmbeddings()

# define prompt template
prompt_template =  
You are a medical researcher tasked with extracting information from papers
and structuring it in a property graph to inform further medical and research Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. the relationship direction goes from the start node to the end node.


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

...

Use only the following nodes and relationships:
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}

# Knowledge Graph Builder
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf = SimpleKGPipeline(
   llm=llm,
   driver=neo4j_driver,
   text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100),
   embedder=embedder,
   entities=node_labels,
   relations=rel_types,
   prompt_template=prompt_template,
   from_pdf=True
)

pdf_file_paths = ['file/1.pdf','file/2.pdf','file/3.pdf','file/4.pdf','file/5.pdf','file/6.pdf','file/7.pdf']

for path in pdf_file_paths:
    print(f"Processing : {path}")
    pdf_result = await kg_builder_pdf.run_async(file_path=path)
    print(f"Result: {pdf_result}")
    '''


In [None]:
'''def extract_json_fields(file_path):
    """
    Legge un file JSON contenente pagine strutturate e restituisce un nuovo JSON con:
        - "title": il titolo della pagina,
        - "text": il testo (se non presente, viene assegnata la stringa vuota),
        - "tables": le tabelle (se non presente, viene assegnata una lista vuota),
        - "subsections": le subsezioni (se non presente, viene assegnata una lista vuota),
        - "supersections": le supersezioni (se non presente, viene assegnata una lista vuota)
    
    Args:
        file_path (str): Percorso del file JSON da elaborare.
    
    Returns:
        dict: Un dizionario con le stesse chiavi del JSON originale, ma per ogni pagina sono presenti
              solo i campi "title", "text", "tables", "subsections" e "supersections".
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    result = {}
    for key, page in data.items():
        title = page.get("title", "")
        text = page.get("text", "")
        tables = page.get("tables", [])
        subsections = page.get("subsections", [])
        supersections = page.get("supersections", [])
        result[key] = {
            "title": title,
            "text": text,
            "tables": tables,
            "subsections": subsections,
            "supersections": supersections
        }
    
    return result'''