In [2]:

from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field, field_validator, ValidationError
from neo4j import GraphDatabase, basic_auth

# --- 1. Conceptual Ontology & Pydantic Models ---
# (Same Pydantic models as before)

class BaseNode(BaseModel):
    """Common base for every graph node: an identifier, a type label, and free-form attributes."""

    id: str = Field(..., description="Unique identifier for the node (e.g., name, official ID).")
    type: str = Field(..., description="Type of the node (e.g., 'Well', 'Formation').")
    attributes: Dict[str, Any] = Field(default_factory=dict, description="Key-value properties of the node.")

    @field_validator('id', mode='before')
    @classmethod
    def sanitize_id(cls, v):
        """Coerce the raw id to ``str`` and replace each space, "/" and ":" with "_".

        Runs before pydantic's own ``str`` validation, so non-string ids are
        stringified rather than rejected.
        """
        # Single pass over the text; every one of the three characters maps to "_".
        return str(v).translate(str.maketrans(" /:", "___"))

class Well(BaseNode):
    """Node model for a well.

    Extends BaseNode with optional well-specific fields and gives ``type`` a
    default of "Well". All extra fields default to ``None`` when not supplied.
    """
    # Default only: the annotation is plain `str`, so a caller can still pass another value.
    type: str = "Well"
    wellbore_name: Optional[str] = None
    purpose: Optional[str] = None
    # Kept as a plain string; no date parsing or format check is applied in this model.
    completion_date: Optional[str] = None
    # Presumably metres, going by the `_m` suffix — confirm with the data source.
    total_depth_m: Optional[float] = None
    water_depth_m: Optional[float] = None

class Formation(BaseNode):
    """Node model for a formation.

    Extends BaseNode with optional descriptive fields and gives ``type`` a
    default of "Formation".
    """
    # Default only: the annotation is plain `str`, so a caller can still pass another value.
    type: str = "Formation"
    # Both fields are free text; no vocabulary is enforced by this model.
    geologic_age: Optional[str] = None
    lithology_description: Optional[str] = None

# NOTE(review): this class rebinds the name `Field`, which was imported from pydantic at the
# top of the file. After this definition, `Field(...)` constructs this node model rather than
# a pydantic field definition, so models declared later cannot use pydantic's `Field(...)`
# helper under that name.
class Field(BaseNode):
    """Node model for a field.

    Extends BaseNode with optional discovery year and status, and gives
    ``type`` a default of "Field".
    """
    # Default only: the annotation is plain `str`, so a caller can still pass another value.
    type: str = "Field"
    discovery_year: Optional[int] = None
    # Free text; no set of allowed statuses is enforced by this model.
    status: Optional[str] = None

class License(BaseNode):
    """Node model for a license.

    Extends BaseNode with optional award and expiry dates, and gives ``type``
    a default of "License".
    """
    # Default only: the annotation is plain `str`, so a caller can still pass another value.
    type: str = "License"
    # Dates are kept as plain strings; no parsing or format check is applied in this model.
    awarded_date: Optional[str] = None
    valid_until_date: Optional[str] = None

class Company(BaseNode):
    """Node model for a company.

    Extends BaseNode with an optional country of registration and gives
    ``type`` a default of "Company".
    """
    # Default only: the annotation is plain `str`, so a caller can still pass another value.
    type: str = "Company"
    # Free text; no country-code format is enforced by this model.
    country_of_registration: Optional[str] = None

class Relationship(BaseModel):
    """Directed, typed edge between two nodes, each referenced by an id and a type label."""

    source_id: str
    source_type: str
    target_id: str
    target_type: str
    relationship_type: str
    # Optional edge properties; empty when the caller supplies none.
    properties: Dict[str, Any] = {}

    @field_validator('source_id', 'target_id', mode='before')
    @classmethod
    def sanitize_ids_in_relationship(cls, v):
        """Coerce an endpoint id to ``str`` and replace each space, "/" and ":" with "_".

        Runs before pydantic's own ``str`` validation for both ``source_id``
        and ``target_id``.
        """
        # Single pass over the text; every one of the three characters maps to "_".
        return str(v).translate(str.maketrans(" /:", "___"))

class KnowledgeGraphData(BaseModel):
    """Container for a validated graph: a list of nodes and a list of relationships."""
    # Annotated with the base class; instances of BaseNode subclasses are stored here as well.
    # NOTE(review): pydantic v2 serializes by the annotated type by default, so subclass-only
    # fields may be left out of model_dump() output — confirm whether SerializeAsAny is needed.
    nodes: List[BaseNode]
    # This model does not check that source_id/target_id refer to ids present in `nodes`.
    relationships: List[Relationship]



In [4]:
# --- 2. LLM-supported NER (Simulated Output) ---
# Sample passage that the mock extraction below is modelled on. It is not referenced
# elsewhere in the visible code; it documents where the mock entities come from.
EXAMPLE_INPUT_TEXT = """
The Statfjord field, discovered in 1974, is a major oil and gas field in the Norwegian sector of the North Sea.
Well 33/9-A-12, completed on 1980-05-15, targets the Brent Formation within production license PL037.
PL037 is operated by Equinor ASA. The Brent Formation is of Middle Jurassic age and primarily consists of sandstone.
"""

# Hand-written stand-in for an LLM extraction result, shaped as
# {"nodes": [...], "relationships": [...]}.
# - Each node carries "id", "type", and an "attributes" dict.
# - Each relationship carries the source/target id and type, a "relationship_type",
#   and a "properties" dict.
# Ids are given in raw form (with spaces and "/"), and relationships refer to nodes by
# those same raw ids.
# Some values (e.g. "status", "total_depth_m", "awarded_date", "operator_share_percentage")
# do not appear in EXAMPLE_INPUT_TEXT; they are mock values.
MOCK_LLM_OUTPUT = {
    "nodes": [
        {"id": "Statfjord Field", "type": "Field", "attributes": {"discovery_year": 1974, "status": "Producing", "location": "Norwegian North Sea"}},
        {"id": "33/9-A-12", "type": "Well", "attributes": {"wellbore_name": "33/9-A-12 H", "purpose": "Production", "completion_date": "1980-05-15", "total_depth_m": 3000.0}},
        {"id": "Brent Formation", "type": "Formation", "attributes": {"geologic_age": "Middle Jurassic", "lithology_description": "Primarily sandstone"}},
        {"id": "PL037", "type": "License", "attributes": {"awarded_date": "1975-01-01"}},
        {"id": "Equinor ASA", "type": "Company", "attributes": {"country_of_registration": "Norway"}}
    ],
    "relationships": [
        {"source_id": "33/9-A-12", "source_type": "Well", "target_id": "Brent Formation", "target_type": "Formation", "relationship_type": "TARGETS_FORMATION", "properties": {"confidence_score": 0.95}},
        {"source_id": "33/9-A-12", "source_type": "Well", "target_id": "Statfjord Field", "target_type": "Field", "relationship_type": "IS_IN_FIELD", "properties": {}},
        {"source_id": "33/9-A-12", "source_type": "Well", "target_id": "PL037", "target_type": "License", "relationship_type": "DRILLED_IN_LICENSE", "properties": {}},
        {"source_id": "PL037", "source_type": "License", "target_id": "Equinor ASA", "target_type": "Company", "relationship_type": "OPERATED_BY", "properties": {"operator_share_percentage": 45.0}},
        {"source_id": "Statfjord Field", "source_type": "Field", "target_id": "PL037", "target_type": "License", "relationship_type": "ASSOCIATED_WITH_LICENSE", "properties": {}}
    ]
}

# --- 3. Pydantic Validation Function ---
# Validates MOCK_LLM_OUTPUT into typed Pydantic models at module (cell) level and leaves
# `parsed_nodes`, `parsed_relationships`, and `kg_data` defined for later use.
#
# Each node's "attributes" dict is split in two: keys that match a declared field of the
# node's specific model are promoted to that typed field, and everything else stays in the
# free-form `attributes` dict. Any pydantic ValidationError propagates to the caller.

# Maps the node type reported in the input to its specific model. Types not listed here
# fall back to BaseNode. Built once rather than on every loop iteration.
specific_model_map = {
    "Well": Well, "Formation": Formation, "Field": Field,
    "License": License, "Company": Company
}

parsed_nodes = []
for node_data in MOCK_LLM_OUTPUT.get("nodes", []):
    node_type = node_data.get("type")
    node_id = node_data.get("id")
    # Work on a shallow copy: keys are popped below, and popping from the original dict
    # would strip MOCK_LLM_OUTPUT itself, so re-running this code would produce models
    # whose typed fields are all None. `or {}` also tolerates an explicit null.
    attributes = dict(node_data.get("attributes") or {})

    model_class = specific_model_map.get(node_type)
    if model_class is None:
        # Unknown or missing type: keep the node as a plain BaseNode instead of dropping it.
        print(f"Unknown node type '{node_type}' for id '{node_id}'. Using BaseNode.")
        parsed_nodes.append(BaseNode(id=node_id, type=node_type or "Unknown", attributes=attributes))
        continue

    # Promote attributes that the specific model declares as fields. Fields inherited from
    # BaseNode (id, type, attributes) are skipped so they cannot be overridden from here.
    specific_fields_data = {}
    for field_name in model_class.model_fields:
        if field_name in attributes and field_name not in BaseNode.model_fields:
            specific_fields_data[field_name] = attributes.pop(field_name)

    # `type` is not passed: each specific model supplies its own default.
    parsed_nodes.append(model_class(id=node_id, attributes=attributes, **specific_fields_data))

# Built once, after all nodes are parsed, so `kg_data` is defined even when the node list
# is empty or contains only unknown types.
parsed_relationships = [Relationship(**rel_data) for rel_data in MOCK_LLM_OUTPUT.get("relationships", [])]
kg_data = KnowledgeGraphData(nodes=parsed_nodes, relationships=parsed_relationships)
print(kg_data)

nodes=[Field(id='Statfjord_Field', type='Field', attributes={'location': 'Norwegian North Sea'}, discovery_year=1974, status='Producing'), Well(id='33_9-A-12', type='Well', attributes={}, wellbore_name='33/9-A-12 H', purpose='Production', completion_date='1980-05-15', total_depth_m=3000.0, water_depth_m=None), Formation(id='Brent_Formation', type='Formation', attributes={}, geologic_age='Middle Jurassic', lithology_description='Primarily sandstone'), License(id='PL037', type='License', attributes={}, awarded_date='1975-01-01', valid_until_date=None), Company(id='Equinor_ASA', type='Company', attributes={}, country_of_registration='Norway')] relationships=[Relationship(source_id='33_9-A-12', source_type='Well', target_id='Brent_Formation', target_type='Formation', relationship_type='TARGETS_FORMATION', properties={'confidence_score': 0.95}), Relationship(source_id='33_9-A-12', source_type='Well', target_id='Statfjord_Field', target_type='Field', relationship_type='IS_IN_FIELD', properti