In [2]:

from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field, field_validator, ValidationError
from neo4j import GraphDatabase, basic_auth

# --- 1. Conceptual Ontology & Pydantic Models ---
# (Same Pydantic models as before)

class BaseNode(BaseModel):
    id: str = Field(..., description="Unique identifier for the node (e.g., name, official ID).")
    type: str = Field(..., description="Type of the node (e.g., 'Well', 'Formation').")
    attributes: Dict[str, Any] = Field(default_factory=dict, description="Key-value properties of the node.")

    @field_validator('id', mode='before')
    def sanitize_id(cls, v):
        return str(v).replace(" ", "_").replace("/", "_").replace(":", "_")

class Well(BaseNode):
    type: str = "Well"
    wellbore_name: Optional[str] = None
    purpose: Optional[str] = None
    completion_date: Optional[str] = None
    total_depth_m: Optional[float] = None
    water_depth_m: Optional[float] = None

class Formation(BaseNode):
    type: str = "Formation"
    geologic_age: Optional[str] = None
    lithology_description: Optional[str] = None

class Field(BaseNode):
    type: str = "Field"
    discovery_year: Optional[int] = None
    status: Optional[str] = None

class License(BaseNode):
    type: str = "License"
    awarded_date: Optional[str] = None
    valid_until_date: Optional[str] = None

class Company(BaseNode):
    type: str = "Company"
    country_of_registration: Optional[str] = None

class Relationship(BaseModel):
    source_id: str
    source_type: str
    target_id: str
    target_type: str
    relationship_type: str
    properties: Dict[str, Any] = {}

    @field_validator('source_id', 'target_id', mode='before')
    def sanitize_ids_in_relationship(cls, v):
        return str(v).replace(" ", "_").replace("/", "_").replace(":", "_")

class KnowledgeGraphData(BaseModel):
    nodes: List[BaseNode]
    relationships: List[Relationship]



In [6]:
# --- 2. LLM-supported NER (Simulated Output) ---
EXAMPLE_INPUT_TEXT = """
The Statfjord field, discovered in 1974, is a major oil and gas field in the Norwegian sector of the North Sea.
Well 33/9-A-12, completed on 1980-05-15, targets the Brent Formation within production license PL037.
PL037 is operated by Equinor ASA. The Brent Formation is of Middle Jurassic age and primarily consists of sandstone.
"""

MOCK_LLM_OUTPUT = {
    "nodes": [
        {"id": "Statfjord Field", "type": "Field", "attributes": {"discovery_year": 1974, "status": "Producing", "location": "Norwegian North Sea"}},
        {"id": "33/9-A-12", "type": "Well", "attributes": {"wellbore_name": "33/9-A-12 H", "purpose": "Production", "completion_date": "1980-05-15", "total_depth_m": 3000.0}},
        {"id": "Brent Formation", "type": "Formation", "attributes": {"geologic_age": "Middle Jurassic", "lithology_description": "Primarily sandstone"}},
        {"id": "PL037", "type": "License", "attributes": {"awarded_date": "1975-01-01"}},
        {"id": "Equinor ASA", "type": "Company", "attributes": {"country_of_registration": "Norway"}}
    ],
    "relationships": [
        {"source_id": "33/9-A-12", "source_type": "Well", "target_id": "Brent Formation", "target_type": "Formation", "relationship_type": "TARGETS_FORMATION", "properties": {"confidence_score": 0.95}},
        {"source_id": "33/9-A-12", "source_type": "Well", "target_id": "Statfjord Field", "target_type": "Field", "relationship_type": "IS_IN_FIELD", "properties": {}},
        {"source_id": "33/9-A-12", "source_type": "Well", "target_id": "PL037", "target_type": "License", "relationship_type": "DRILLED_IN_LICENSE", "properties": {}},
        {"source_id": "PL037", "source_type": "License", "target_id": "Equinor ASA", "target_type": "Company", "relationship_type": "OPERATED_BY", "properties": {"operator_share_percentage": 45.0}},
        {"source_id": "Statfjord Field", "source_type": "Field", "target_id": "PL037", "target_type": "License", "relationship_type": "ASSOCIATED_WITH_LICENSE", "properties": {}}
    ]
}

# --- 3. Pydantic Validation Function ---
def parse_and_validate_llm_output(llm_json_output: Dict) -> Optional[KnowledgeGraphData]:
    #try:
    parsed_nodes = []
    for node_data in llm_json_output.get("nodes", []):
        node_type = node_data.get("type")
        attributes = node_data.get("attributes", {})
        node_id = node_data.get("id")
        common_data = {"id": node_id, "attributes": attributes}

        # Dynamically assign specific fields if they exist in Pydantic model and attributes
        specific_model_map = {
            "Well": Well, "Formation": Formation, "Field": Field,
            "License": License, "Company": Company
        }
        model_class = specific_model_map.get(node_type, BaseNode)
        specific_fields_data = {}
        # Iterate through fields of the specific model (e.g., Well)
        # and pop them from 'attributes' if they are present, to avoid duplication.
        if model_class != BaseNode:
            for field_name in model_class.model_fields:
                if field_name in attributes and field_name not in BaseNode.model_fields:
                    specific_fields_data[field_name] = attributes.pop(field_name)
                
            # Create the node instance
            if model_class == BaseNode and node_type not in specific_model_map:
                print(f"Unknown node type '{node_type}' for id '{node_id}'. Using BaseNode.")
                parsed_nodes.append(BaseNode(id=node_id, type=node_type or "Unknown", attributes=attributes))
            else:
                # Pass common_data, specific_fields_data, and remaining attributes
                # Ensure 'type' is explicitly passed if model_class is BaseNode but type is known
                if model_class == BaseNode:
                    parsed_nodes.append(model_class(**common_data, type=node_type, **specific_fields_data))
                else:
                    parsed_nodes.append(model_class(**common_data, **specific_fields_data))
            parsed_relationships = [Relationship(**rel_data) for rel_data in llm_json_output.get("relationships", [])]
            kg_data = KnowledgeGraphData(nodes=parsed_nodes, relationships=parsed_relationships)
            return kg_data
#print(kg_data)

In [8]:
LLM_OUTPUT= {
  "nodes": [
    {
      "id": "1/2-1",
      "type": "Well",
      "attributes": {
        "wellbore_name": "1/2-1",
        "coordinates_latitude": "56°53'16.07\"N",
        "coordinates_longitude": "02°28'35.70\"E",
        "utm_coordinates_north": "630515919 N",
        "utm_coordinates_east": "46810652 E",
        "permit_no": "604",
        "elevation_kb_m": 24.0,
        "water_depth_m": 70.0,
        "total_depth_m": 3576.0,
        "bottom_hole_temp_c": 123.0,
        "spud_date": "1989-03-20",
        "completion_date": "1989-06-04",
        "spud_class": "WILDCAT",
        "completion_class": "P&A. OIL/GAS DISC.",
        "purpose": "Exploration",
        "formation_at_td_name": "CRETACEOUS",
        "producing_formation_name": "Forties Fm",
        "reservoir_properties": {"porosity_avg_percent": 18.5, "permeability_md": 49.0},
        "sidetracked_from_depth_m_rkb": 3078.5,
        "shallow_gas_detected": False
      }
    },
    {
      "id": "143",
      "type": "License",
      "attributes": {}
    },
    {
      "id": "PHILLIPS",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "TRANSNOR RIG AS",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "ROSS ISLE",
      "type": "Rig",
      "attributes": {"rig_type": "SEMI-SUB."}
    },
    {
      "id": "CRETACEOUS",
      "type": "Formation",
      "attributes": {"geologic_age": "Cretaceous", "lithology_description": "General rocks of Cretaceous age"}
    },
    {
      "id": "BP PETROLEUM DEV. OF NORWAY AS",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "CONOCO NORWAY INC.",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "NORSKE MOECO A/S",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "KS PELICAN & CO A/S",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "DEN NORSKE STATS OLJESELSKAP A.S",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "NORSKE AEDC A/S",
      "type": "Company",
      "attributes": {"country_of_registration": "Norway"}
    },
    {
      "id": "1/2-1_C1",
      "type": "Core",
      "attributes": {"core_number": "1", "interval_start_m": 3111.4, "interval_end_m": 3112.3, "recovery_m": 0.3, "recovery_percentage": 33.3}
    },
    {
      "id": "1/2-1_C2",
      "type": "Core",
      "attributes": {"core_number": "2", "interval_start_m": 3113.8, "interval_end_m": 3119.6, "recovery_m": 5.2, "recovery_percentage": 89.7}
    },
    {
      "id": "1/2-1_C4",
      "type": "Core",
      "attributes": {"core_number": "4", "interval_start_m": 3126.0, "interval_end_m": 3134.9, "recovery_m": 9.1, "recovery_percentage": 102.2}
    },
    {
      "id": "1/2-1_C5",
      "type": "Core",
      "attributes": {"core_number": "5", "interval_start_m": 3135.2, "interval_end_m": 3153.5, "recovery_m": 18.3, "recovery_percentage": 100.0}
    },
    {
      "id": "1/2-1_C7",
      "type": "Core",
      "attributes": {"core_number": "7", "interval_start_m": 3160.2, "interval_end_m": 3163.8, "recovery_m": 3.0, "recovery_percentage": 83.3}
    },
    {
      "id": "1/2-1_C8",
      "type": "Core",
      "attributes": {"core_number": "8", "interval_start_m": 3160.2, "interval_end_m": 3168.7, "recovery_m": 8.2, "recovery_percentage": 96.5}
    },
    {
      "id": "1/2-1_DST1",
      "type": "DST",
      "attributes": {
        "test_number": "1.0",
        "interval_start_m": 3122.3,
        "interval_end_m": 3137.0,
        "choke_size_mm": 25.4,
        "oil_rate_sm3_d": 859.0,
        "gas_rate_sm3_d": 57000.0,
        "oil_gravity_g_cm3": 0.81,
        "gor_m3_m3": 66.36
      }
    },
    {
      "id": "Forties Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Paleocene", "top_depth_m_rkb": 3069.0, "lithology_description": "Sandstone"}
    },
    {
      "id": "Ekofisk Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Late Cretaceous", "top_depth_m_rkb": 3406.0, "lithology_description": "Chalk"}
    },
    {
      "id": "Tor Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Late Cretaceous", "top_depth_m_rkb": 3513.0, "lithology_description": "Chalk"}
    },
    {
      "id": "Nordland Group",
      "type": "GeologicalGroup",
      "attributes": {"top_depth_m_rkb": 94.0}
    },
    {
      "id": "Hordaland Group",
      "type": "GeologicalGroup",
      "attributes": {"top_depth_m_rkb": 1777.0}
    },
    {
      "id": "Rogaland Group",
      "type": "GeologicalGroup",
      "attributes": {"geologic_age": "Paleocene", "top_depth_m_rkb": 3058.0}
    },
    {
      "id": "Balder Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Paleocene", "top_depth_m_rkb": 3058.0}
    },
    {
      "id": "Sele Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Paleocene", "top_depth_m_rkb": 3066.0}
    },
    {
      "id": "Lista Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Paleocene", "top_depth_m_rkb": 3274.0}
    },
    {
      "id": "Maureen Fm",
      "type": "Formation",
      "attributes": {"geologic_age": "Paleocene", "top_depth_m_rkb": 3334.0}
    },
    {
      "id": "Shetland Group",
      "type": "GeologicalGroup",
      "attributes": {"geologic_age": "Late Cretaceous", "top_depth_m_rkb": 3406.0}
    }
  ],
  "relationships": [
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "143", "target_type": "License", "relationship_type": "DRILLED_IN_LICENSE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "ROSS ISLE", "target_type": "Rig", "relationship_type": "DRILLED_BY_RIG", "properties": {}
    },
    {
      "source_id": "ROSS ISLE", "source_type": "Rig", "target_id": "TRANSNOR RIG AS", "target_type": "Company", "relationship_type": "OPERATED_BY_CONTRACTOR", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "PHILLIPS", "target_type": "Company", "relationship_type": "OPERATED_BY", "properties": {}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "PHILLIPS", "target_type": "Company", "relationship_type": "OPERATED_BY", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "CRETACEOUS", "target_type": "Formation", "relationship_type": "ENDS_IN_FORMATION", "properties": {"depth_m": 3576.0}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "BP PETROLEUM DEV. OF NORWAY AS", "target_type": "Company", "relationship_type": "HELD_BY_LICENSEE", "properties": {"share_percentage": 26.625}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "CONOCO NORWAY INC.", "target_type": "Company", "relationship_type": "HELD_BY_LICENSEE", "properties": {"share_percentage": 9.375}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "NORSKE MOECO A/S", "target_type": "Company", "relationship_type": "HELD_BY_LICENSEE", "properties": {"share_percentage": 5.0}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "KS PELICAN & CO A/S", "target_type": "Company", "relationship_type": "HELD_BY_LICENSEE", "properties": {"share_percentage": 4.0}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "DEN NORSKE STATS OLJESELSKAP A.S", "target_type": "Company", "relationship_type": "HELD_BY_LICENSEE", "properties": {"share_percentage": 50.0}
    },
    {
      "source_id": "143", "source_type": "License", "target_id": "NORSKE AEDC A/S", "target_type": "Company", "relationship_type": "HELD_BY_LICENSEE", "properties": {"share_percentage": 5.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_C1", "target_type": "Core", "relationship_type": "HAS_CORE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_C2", "target_type": "Core", "relationship_type": "HAS_CORE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_C4", "target_type": "Core", "relationship_type": "HAS_CORE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_C5", "target_type": "Core", "relationship_type": "HAS_CORE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_C7", "target_type": "Core", "relationship_type": "HAS_CORE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_C8", "target_type": "Core", "relationship_type": "HAS_CORE", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "1/2-1_DST1", "target_type": "DST", "relationship_type": "HAS_DST", "properties": {}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Forties Fm", "target_type": "Formation", "relationship_type": "TARGETS_FORMATION", "properties": {"status": "Producing"}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Ekofisk Fm", "target_type": "Formation", "relationship_type": "TARGETS_FORMATION", "properties": {"status": "Secondary Target"}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Tor Fm", "target_type": "Formation", "relationship_type": "TARGETS_FORMATION", "properties": {"status": "Secondary Target"}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Nordland Group", "target_type": "GeologicalGroup", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 94.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Hordaland Group", "target_type": "GeologicalGroup", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 1777.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Rogaland Group", "target_type": "GeologicalGroup", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3058.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Balder Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3058.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Sele Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3066.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Forties Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3069.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Lista Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3274.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Maureen Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3334.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Shetland Group", "target_type": "GeologicalGroup", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3406.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Ekofisk Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3406.0}
    },
    {
      "source_id": "1/2-1", "source_type": "Well", "target_id": "Tor Fm", "target_type": "Formation", "relationship_type": "HAS_FORMATION_TOP", "properties": {"depth_m_rkb": 3513.0}
    },
    {
      "source_id": "Balder Fm", "source_type": "Formation", "target_id": "Rogaland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    },
    {
      "source_id": "Sele Fm", "source_type": "Formation", "target_id": "Rogaland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    },
    {
      "source_id": "Forties Fm", "source_type": "Formation", "target_id": "Rogaland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    },
    {
      "source_id": "Lista Fm", "source_type": "Formation", "target_id": "Rogaland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    },
    {
      "source_id": "Maureen Fm", "source_type": "Formation", "target_id": "Rogaland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    },
    {
      "source_id": "Ekofisk Fm", "source_type": "Formation", "target_id": "Shetland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    },
    {
      "source_id": "Tor Fm", "source_type": "Formation", "target_id": "Shetland Group", "target_type": "GeologicalGroup", "relationship_type": "PART_OF_GROUP", "properties": {}
    }
  ]
}

In [9]:
print(parse_and_validate_llm_output(LLM_OUTPUT))

nodes=[Well(id='1_2-1', type='Well', attributes={'coordinates_latitude': '56°53\'16.07"N', 'coordinates_longitude': '02°28\'35.70"E', 'utm_coordinates_north': '630515919 N', 'utm_coordinates_east': '46810652 E', 'permit_no': '604', 'elevation_kb_m': 24.0, 'bottom_hole_temp_c': 123.0, 'spud_date': '1989-03-20', 'spud_class': 'WILDCAT', 'completion_class': 'P&A. OIL/GAS DISC.', 'formation_at_td_name': 'CRETACEOUS', 'producing_formation_name': 'Forties Fm', 'reservoir_properties': {'porosity_avg_percent': 18.5, 'permeability_md': 49.0}, 'sidetracked_from_depth_m_rkb': 3078.5, 'shallow_gas_detected': False}, wellbore_name='1/2-1', purpose='Exploration', completion_date='1989-06-04', total_depth_m=3576.0, water_depth_m=70.0)] relationships=[Relationship(source_id='1_2-1', source_type='Well', target_id='143', target_type='License', relationship_type='DRILLED_IN_LICENSE', properties={}), Relationship(source_id='1_2-1', source_type='Well', target_id='ROSS_ISLE', target_type='Rig', relationship