In [None]:
pip install langchain==0.1.10 langchain-community==0.0.25 pgvector==0.3.6 sqlalchemy==2.0.26 SPARQLWrapper==2.0.0

In [None]:
import re
from rdflib import Graph, RDF, RDFS, Namespace, URIRef
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores.pgvector import PGVector
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain.agents import Tool

In [None]:
import os
import configparser

# Secret read through dbutils.secrets; it is not referenced again in this cell.
key = dbutils.secrets.get(scope="scope_name", key="key_name")
config = configparser.ConfigParser()


path = "your_secrets_path"
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of letting the
# lookups below raise an opaque KeyError on the missing 'model' section.
if not config.read(path):
    raise FileNotFoundError(f"Could not read configuration file: {path}")

# Expose the model credentials/settings from the [model] section as environment variables.
os.environ['YOUR_API_KEY'] = config['model']['MODEL_API_KEY']
os.environ['YOUR_API_TYPE'] ='your_api_type'
os.environ['YOUR_API_VERSION'] = config['model']['MODEL_API_VERSION']
os.environ["YOUR_AI_ENDPOINT"] = config['model']['MODEL_API_BASE']
openai_emb_model  = "text-embedding-3-large"
openai_emb_deployment= "text-embedding-3-large"

# Embedding client used later as the vector store's embedding function.
embeddings = AzureOpenAIEmbeddings(deployment=openai_emb_deployment, chunk_size=500,
                                   retry_max_seconds=120, retry_min_seconds=2, max_retries=6)

In [None]:
import xml.etree.ElementTree as ET
# Parse the ontology file as plain XML; `root` is the document's top-level element
# and is what the extraction cells below search with findall()/find().
tree = ET.parse('your_ontology_file')
root = tree.getroot()

In [None]:
# Prefix -> namespace IRI map. It is passed to ElementTree find()/findall() to resolve
# prefixed tag names and is used to abbreviate full IRIs to `prefix:localName`.
# Each prefix is listed exactly once ('brt' used to appear twice with the same value).
kkg_ns = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'nen3610': 'https://data.kkg.kadaster.nl/nen3610/model/def/',
    'nen3610-shp': 'https://data.kkg.kadaster.nl/nen3610/model/shp/',
    'skos': 'http://www.w3.org/2004/02/skos/core#',
    'bag': 'http://bag.basisregistraties.overheid.nl/def/bag#',
    'bag-begrip': 'http://bag.basisregistraties.overheid.nl/id/begrip/',
    'brt': 'http://brt.basisregistraties.overheid.nl/def/top10nl#',
    'wbk': 'https://data.labs.kadaster.nl/cbs/wbk/vocab/',
    'geo': 'http://www.opengis.net/ont/geosparql#',
    'kad': 'https://data.kkg.kadaster.nl/kad/model/def/',
    'kad-con': 'https://data.kkg.kadaster.nl/kad/model/con/',
    'kad-shp': 'https://data.kkg.kadaster.nl/kad/model/shp/',
    'sor': 'https://data.kkg.kadaster.nl/sor/model/def/',
    'sor-con': 'https://data.kkg.kadaster.nl/sor/model/con/',
    'sor-shp': 'https://data.kkg.kadaster.nl/sor/model/shp/',
    "bgt": "http://bgt.basisregistraties.overheid.nl/def/bgt#",
    "bgt-pand": "http://bgt.basisregistraties.overheid.nl/id/pand/",
    "bnode": "https://data.kkg.kadaster.nl/well-known/genid/",
    "brt-gebouw": "http://brt.basisregistraties.overheid.nl/id/gebouw/",
    "brt-scheme": "http://brt.basisregistraties.overheid.nl/top10nl/id/scheme/",
    "brt-shp": "http://brt.basisregistraties.overheid.nl/top10nl/id/shape/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "gebouw": "https://data.kkg.kadaster.nl/id/gebouw/",
    "bouwzone": "https://data.kkg.kadaster.nl/id/gebouwzone/",
    "gemeente": "https://data.kkg.kadaster.nl/id/gemeente/",
    "mim": "http://bp4mc2.org/def/mim#",
    "purl": "http://purl.org/linked-data/cube#",
    "prov": "http://www.w3.org/ns/prov#",
    "gml": "http://www.opengis.net/ont/gml#",
    "time": "http://www.w3.org/2006/time#",
    "shacl": "http://www.w3.org/ns/shacl#",
    "schema": "https://schema.org/",
    "triplydb": "https://triplydb.com/Triply/value/"
}

def map_iri_to_concise(iri, element):
    """Abbreviate a full IRI to ``prefix:localName`` using the prefixes in ``kkg_ns``.

    Args:
        iri (str | None): The full IRI to abbreviate.
        element: Not used by this function; kept so existing call sites keep working.

    Returns:
        str | None: The abbreviated identifier for the first namespace in ``kkg_ns``
        that the IRI starts with, or None when ``iri`` is None or no namespace matches.
    """
    if iri is None:
        return None
    for key, val in kkg_ns.items():
        if iri.startswith(val):
            # Strip only the leading namespace. str.replace() would also rewrite any
            # later occurrence of the namespace text inside the IRI.
            return f"{key}:{iri[len(val):]}"
    return None

def get_all_info_from_element(element):
    """Flatten an ontology element and its descendants into one descriptive string.

    Walks ``element.iter()`` and collects, in document order: stripped text content
    (tagged as "class annotation" or "class label" for the known annotation/label
    tags), abbreviated attribute values, and "subclass of" entries.

    Args:
        element: The XML element to describe.

    Returns:
        tuple[str, list[str]]: The collected fragments joined with ' | ', and the
        list of abbreviated superclass identifiers found in rdfs:subClassOf elements.
    """
    annotation_tags = (
        '{http://www.w3.org/2004/02/skos/core#}definition',
        '{http://www.w3.org/2000/01/rdf-schema#}comment',
        '{http://www.w3.org/ns/shacl#}description',
        '{http://www.w3.org/2004/02/skos/core#}scopeNote',
        '{http://bp4mc2.org/def/mim#}toelichting',
    )
    label_tags = (
        '{http://www.w3.org/ns/shacl#}name',
        '{http://www.w3.org/2000/01/rdf-schema#}label',
        '{http://www.w3.org/2004/02/skos/core#}prefLabel',
    )
    subclass_tag = '{http://www.w3.org/2000/01/rdf-schema#}subClassOf'
    resource_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'

    fragments = []
    parents = []

    for node in element.iter():
        # Whitespace-only text (indentation between tags) is ignored.
        text = node.text.strip() if node.text else ''
        if text:
            if node.tag in annotation_tags:
                fragments.append(f"class annotation: {text}")
            elif node.tag in label_tags:
                fragments.append(f"class label: {text}")
            else:
                fragments.append(text)

        is_subclass_node = node.tag == subclass_tag

        # Attribute values that map to a known namespace are listed as-is, except on
        # rdfs:subClassOf, which is reported once below with its own wording.
        for attr_value in node.attrib.values():
            concise_value = map_iri_to_concise(attr_value, node)
            if concise_value is not None and not is_subclass_node:
                fragments.append(f"{concise_value}")

        if is_subclass_node:
            parent = map_iri_to_concise(node.get(resource_attr), node)
            if parent:
                fragments.append(f"subclass of: {parent}")
                parents.append(parent)

    return ' | '.join(fragments), parents


In [None]:
# Turn a camelCase / PascalCase identifier into space-separated words.
def camel_to_normal(s):
    """Split a run-together identifier into words, keeping acronyms in upper case.

    Args:
        s (str): The identifier or label to split.

    Returns:
        str: The words joined by single spaces; every word is lower-cased except
        all-uppercase words longer than one character, which are kept unchanged.
    """
    # Boundary 1: a lowercase letter directly followed by an uppercase letter.
    separated = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', s)
    # Boundary 2: an uppercase letter followed by a lowercase one (not at the very
    # start). This detaches a trailing word from a preceding acronym.
    separated = re.sub(r'(?<!^)(?=[A-Z][a-z])', ' ', separated)

    # split() collapses the double spaces the two passes can produce.
    return ' '.join(
        token if (len(token) > 1 and token.isupper()) else token.lower()
        for token in separated.split()
    )

# Example usage
# print(camel_to_normal('DataComponentNASA'))  # Output: 'data component NASA'
# print(camel_to_normal('WTOTradeCommission'))  # Output: 'WTO trade commission'

def remove_namespace(element_id):
    """
    Strip the namespace prefix from a prefixed ontology identifier.

    Args:
        element_id (str): Identifier that may carry a prefix (e.g., 'bag:Gebruiksdoel')

    Returns:
        str: The text after the last ':' (e.g., 'Gebruiksdoel'); the input itself
        when it contains no ':'.
    """
    # rpartition() yields ('', '', element_id) when there is no colon, so an
    # identifier without a prefix comes back unchanged.
    return element_id.rpartition(':')[2]

def get_label_or_id(element, default_id):
    """Return a human-readable label for an ontology element, or fall back to its ID.

    The label is the text of the element's rdfs:label, or else of its
    skos:prefLabel. When no label text is available - including when ``element`` is
    None because a lookup found nothing - the namespace-stripped ``default_id`` is
    returned instead.

    Args:
        element: The XML element to read labels from, or None.
        default_id (str | None): Prefixed identifier used as fallback
            (e.g. 'sor:Gebouw').

    Returns:
        str | None: The label text, otherwise ``default_id`` without its prefix;
        None only when there is no label and ``default_id`` is None.
    """
    label = None
    if element is not None:
        rdfs_label = element.find('rdfs:label', kkg_ns)
        skos_label = element.find('skos:prefLabel', kkg_ns)

        # Compare against None explicitly. `rdfs_label or skos_label` does not work
        # because an ElementTree Element with no child elements is falsy, and label
        # elements carry text but no children.
        label = rdfs_label if rdfs_label is not None else skos_label

    if label is not None and label.text:
        return label.text
    if default_id is None:
        # Nothing to fall back to; avoid calling remove_namespace(None).
        return None
    return remove_namespace(default_id)

In [None]:
classRela_strings = []
classRela_metadata_list = []
classRela_documents = []

rdf_about_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about'
rdf_resource_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'

# Build one "X is a subclass of Y" document per named superclass of each top-level class.
for class_element in root.findall('owl:Class', kkg_ns):
    class_iri = class_element.get(rdf_about_attr)
    if class_iri is None:
        # No rdf:about: there is no identifier to describe this class by.
        continue
    # Keep the full IRI when its namespace is not listed in kkg_ns, rather than None.
    class_id = map_iri_to_concise(class_iri, class_element) or class_iri
    class_label = camel_to_normal(get_label_or_id(class_element, class_id))

    # Each parent's id and label are resolved in the same iteration so they cannot
    # get out of step (previously ids skipped missing IRIs while labels did not).
    for parentClass in class_element.findall('rdfs:subClassOf', kkg_ns):
        parentClass_iri = parentClass.get(rdf_resource_attr)
        if parentClass_iri is None:
            # rdfs:subClassOf without rdf:resource: no named parent to report.
            continue
        parentClass_id = map_iri_to_concise(parentClass_iri, parentClass) or parentClass_iri

        # root.find returns a single Element (or None); root.findall would return a list.
        parentClass_element = root.find(f".//owl:Class[@rdf:about='{parentClass_iri}']", kkg_ns)
        if parentClass_element is None:
            print(f"Warning: No element found for parent class IRI {parentClass_iri}")
        parentClass_label = get_label_or_id(parentClass_element, parentClass_id)
        if parentClass_label is None:
            # Parent is not declared in this file: fall back to its local name.
            parentClass_label = remove_namespace(parentClass_id)

        # The prefixed ids are used in the text because the subclass relation must be
        # unambiguous about which namespace each class belongs to.
        classRela_string = f"{class_id} is a subclass of {parentClass_id}"
        print(classRela_string)
        classRela_strings.append(classRela_string)

        classRela_metadata = {
            'class_id': class_id,
            'class_label': class_label,
            'parentClass_id': parentClass_id,
            'parentClass_label': parentClass_label
        }

        classRela_metadata_list.append(classRela_metadata)

        classRela_document = Document(
                    page_content=classRela_string,
                    metadata=classRela_metadata
            )

        classRela_documents.append(classRela_document)
        print(classRela_document)

In [None]:
# Extract object properties: one semantic string, metadata dict and Document per
# (domain, property, range) combination.
obj_strings = []
obj_metadata_list = []
obj_documents = []

rdf_about_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about'
rdf_resource_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'

for object_prop_element in root.findall('owl:ObjectProperty', kkg_ns):
    object_prop_iri = object_prop_element.get(rdf_about_attr)
    if object_prop_iri is None:
        # No rdf:about: there is no identifier to describe this property by.
        continue
    # Keep the full IRI when its namespace is not listed in kkg_ns, rather than None.
    object_prop_id = map_iri_to_concise(object_prop_iri, object_prop_element) or object_prop_iri
    object_prop_label = camel_to_normal(get_label_or_id(object_prop_element, object_prop_id))

    # Ids and labels are appended in the same iteration so the two lists always
    # line up (previously ids skipped missing IRIs while labels did not).
    domain_ids = []
    domain_labels = []
    for domain in object_prop_element.findall('rdfs:domain', kkg_ns):
        domain_iri = domain.get(rdf_resource_attr)
        if domain_iri is None:
            # rdfs:domain without rdf:resource: no named class to report.
            continue
        domain_id = map_iri_to_concise(domain_iri, domain) or domain_iri

        # root.find returns a single Element (or None); root.findall would return a list.
        domain_element = root.find(f".//owl:Class[@rdf:about='{domain_iri}']", kkg_ns)
        if domain_element is None:
            print(f"Warning: No element found for domain IRI {domain_iri}")
        domain_label = get_label_or_id(domain_element, domain_id)
        if domain_label is None:
            # Class is not declared in this file: fall back to its local name.
            domain_label = remove_namespace(domain_id)
        print(domain_label)
        domain_ids.append(domain_id)
        domain_labels.append(domain_label)

    range_ids = []
    range_labels = []
    for range_node in object_prop_element.findall('rdfs:range', kkg_ns):
        range_iri = range_node.get(rdf_resource_attr)
        if range_iri is None:
            continue
        range_id = map_iri_to_concise(range_iri, range_node) or range_iri

        range_element = root.find(f'.//owl:Class[@rdf:about="{range_iri}"]', kkg_ns)
        if range_element is None:
            print(f"Warning: No element found for range IRI {range_iri}")
        range_label = get_label_or_id(range_element, range_id)
        if range_label is None:
            range_label = remove_namespace(range_id)
        range_ids.append(range_id)
        range_labels.append(range_label)

    # Handle cases with no usable domain or range. The placeholder is used for the
    # label as well; with an empty label list the zip() below would drop it.
    if not domain_ids:
        domain_ids = ['No rdfs:domain']
        domain_labels = ['No rdfs:domain']
    if not range_ids:
        range_ids = ['No rdfs:range']
        range_labels = ['No rdfs:range']

    # Construct semantic string structure and metadata dictionary
    for domain_id, domain_label in zip(domain_ids, domain_labels):
        for range_id, range_label in zip(range_ids, range_labels):
            obj_string = f"{domain_label} {object_prop_label} {range_label}, subject:<{domain_id}>predicate:<{object_prop_id}>object:<{range_id}>"
            obj_strings.append(obj_string)

            # Metadata carries the identifiers of the triple pattern.
            obj_metadata = {
                "subject": domain_id,
                "predicate": object_prop_id,
                "object": range_id,
                "id":  f"{domain_id} {object_prop_id} {range_id};"
            }
            obj_metadata_list.append(obj_metadata)

            obj_document = Document(
                    page_content=obj_string,
                    metadata=obj_metadata
            )
            obj_documents.append(obj_document)
            print(obj_document)

In [None]:
# Extract datatype properties: one semantic string, metadata dict and Document per
# (domain, property, range) combination.
datatype_strings = []
datatype_metadata_list = []
datatype_documents = []

rdf_about_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about'
rdf_resource_attr = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource'

for datatype_prop_element in root.findall('owl:DatatypeProperty', kkg_ns):
    datatype_prop_iri = datatype_prop_element.get(rdf_about_attr)
    if datatype_prop_iri is None:
        # No rdf:about: there is no identifier to describe this property by.
        continue
    # Keep the full IRI when its namespace is not listed in kkg_ns, rather than None.
    datatype_prop_id = map_iri_to_concise(datatype_prop_iri, datatype_prop_element) or datatype_prop_iri
    datatype_prop_label = camel_to_normal(get_label_or_id(datatype_prop_element, datatype_prop_id))

    # Ids and labels are appended in the same iteration so the two lists always
    # line up (previously ids skipped missing IRIs while labels did not).
    domain_ids = []
    domain_labels = []
    for domain in datatype_prop_element.findall('rdfs:domain', kkg_ns):
        domain_iri = domain.get(rdf_resource_attr)
        if domain_iri is None:
            # rdfs:domain without rdf:resource: no named class to report.
            continue
        domain_id = map_iri_to_concise(domain_iri, domain) or domain_iri

        # root.find returns a single Element (or None); root.findall would return a list.
        domain_element = root.find(f".//owl:Class[@rdf:about='{domain_iri}']", kkg_ns)
        if domain_element is None:
            print(f"Warning: No element found for domain IRI {domain_iri}")
        domain_label = get_label_or_id(domain_element, domain_id)
        if domain_label is None:
            # Class is not declared in this file: fall back to its local name.
            domain_label = remove_namespace(domain_id)
        print(domain_label)
        domain_ids.append(domain_id)
        domain_labels.append(domain_label)

    range_ids = []
    range_labels = []
    for range_node in datatype_prop_element.findall('rdfs:range', kkg_ns):
        range_iri = range_node.get(rdf_resource_attr)
        if range_iri is None:
            continue
        range_id = map_iri_to_concise(range_iri, range_node) or range_iri

        range_element = root.find(f'.//owl:Class[@rdf:about="{range_iri}"]', kkg_ns)
        if range_element is None:
            print(f"Warning: No element found for range IRI {range_iri}")
        range_label = get_label_or_id(range_element, range_id)
        if range_label is None:
            # Range is not declared as an owl:Class here: fall back to its local name.
            range_label = remove_namespace(range_id)
        range_ids.append(range_id)
        range_labels.append(range_label)

    # Handle cases with no usable domain or range. The placeholder is used for the
    # label as well; with an empty label list the zip() below would drop it.
    if not domain_ids:
        domain_ids = ['No rdfs:domain']
        domain_labels = ['No rdfs:domain']
    if not range_ids:
        range_ids = ['No rdfs:range']
        range_labels = ['No rdfs:range']

    # Construct semantic string structure and metadata dictionary
    for domain_id, domain_label in zip(domain_ids, domain_labels):
        for range_id, range_label in zip(range_ids, range_labels):
            # The predicate bracket is now closed with '>' like subject and object.
            datatype_string = f"{domain_label} {datatype_prop_label} {range_label}, subject:<{domain_id}>predicate:<{datatype_prop_id}>object:<{range_id}>"
            datatype_strings.append(datatype_string)

            # Metadata carries the identifiers of the triple pattern.
            datatype_metadata = {
                "subject": domain_id,
                "predicate": datatype_prop_id,
                "object": range_id,
                "id":  f"{domain_id} {datatype_prop_id} {range_id};"
            }
            datatype_metadata_list.append(datatype_metadata)

            datatype_document = Document(
                    page_content=datatype_string,
                    metadata=datatype_metadata
            )
            datatype_documents.append(datatype_document)
            print(datatype_document)
print(datatype_documents)

In [None]:
# Hand-written documents for class-level triple patterns.
additional_documents = []

string1 = "Please bear in mind that the equivalence applies to the individual level of class sor:Gemeente and class wbk:Gemeente in the knowledge graph (owl:sameAs)"
string1_metadata = {"description": "instance of sor:Gemeente owl:sameAs instance of wbk:Gemeente"}

string2 = "Perceel within Gemeente, subject:<sor:Perceel>predicate:<geo:sfWithin>object:<sor:Gemeente>"
string2_metadata = dict(
    subject="sor:Perceel",
    predicate="geo:sfWithin",
    object="sor:Gemeente",
    id="sor:Perceel geo:sfWithin sor:Gemeente;",
)

string3 = "Gebouw within Buurt, subject:<sor:Gebouw>predicate:<geo:sfWithin>object:<wbk:Buurt>"
string3_metadata = dict(
    subject="sor:Gebouw",
    predicate="geo:sfWithin",
    object="wbk:Buurt",
    id="sor:Gebouw geo:sfWithin wbk:Buurt;",
)

string4 = "Buurt within Wijk, subject:<wbk:Buurt>predicate:<geo:sfWithin>object:<wbk:Wijk>"
string4_metadata = dict(
    subject="wbk:Buurt",
    predicate="geo:sfWithin",
    object="wbk:Wijk",
    id="wbk:Buurt geo:sfWithin wbk:Wijk;",
)

string5 = "Wijk within Gemeente, subject:<wbk:Wijk>predicate:<geo:sfWithin>object:<wbk:Gemeente>"
string5_metadata = dict(
    subject="wbk:Wijk",
    predicate="geo:sfWithin",
    object="wbk:Gemeente",
    id="wbk:Wijk geo:sfWithin wbk:Gemeente;",
)

string6 = "Gemeente within Provincie, subject:<sor:Gemeente>predicate:<geo:sfWithin>object:<sor:Provincie>"
string6_metadata = dict(
    subject="sor:Gemeente",
    predicate="geo:sfWithin",
    object="sor:Provincie",
    id="sor:Gemeente geo:sfWithin sor:Provincie;",
)

# Pair each text with its metadata explicitly, in the order string1..string6.
text_metadata_pairs = [
    (string1, string1_metadata),
    (string2, string2_metadata),
    (string3, string3_metadata),
    (string4, string4_metadata),
    (string5, string5_metadata),
    (string6, string6_metadata),
]

for pair_text, pair_metadata in text_metadata_pairs:
    additional_document = Document(page_content=pair_text, metadata=pair_metadata)
    print(additional_document)
    additional_documents.append(additional_document)

In [None]:
# Embed the selected individuals
# Define the namespaces used below to look up sameAs links, labels and definitions
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
SHACL = Namespace("http://www.w3.org/ns/shacl#")
OWL = Namespace("http://www.w3.org/2002/07/owl#")

# Load the ontology into an rdflib graph
# NOTE(review): this file name differs from the one given to ET.parse() in the XML
# cell - confirm both refer to the same ontology.
g = Graph()
g.parse("kkg_complete_edition1.owl")

# Define the target classes: only individuals typed with one of these are extracted
target_classes = [
    URIRef("https://data.kkg.kadaster.nl/sor/model/def/Gebruiksdoel"),
    URIRef("https://data.kkg.kadaster.nl/kad/model/def/Gebouwtype"),
    URIRef("http://bag.basisregistraties.overheid.nl/def/bag#Gebruiksdoel")
]

# Abbreviate a full IRI to `prefix:localName`, leaving unknown IRIs untouched
def to_concise_format(iri):
    """Return ``iri`` abbreviated with the first matching prefix from ``kkg_ns``.

    Args:
        iri (str): The full IRI.

    Returns:
        str: ``prefix:rest`` for the first namespace in ``kkg_ns`` that ``iri``
        starts with; ``iri`` unchanged when none matches.
    """
    hit = next(
        ((prefix, namespace) for prefix, namespace in kkg_ns.items() if iri.startswith(namespace)),
        None,
    )
    if hit is None:
        return iri
    prefix, namespace = hit
    return f"{prefix}:{iri[len(namespace):]}"

# Function to extract information of named individuals that are instances of the target classes
def extract_named_individuals_info(graph, target_classes):
    """Collect descriptive records for individuals typed with any of ``target_classes``.

    Args:
        graph: Graph object queried through ``subjects``, ``objects`` and ``value``.
        target_classes: Iterable of class identifiers; an individual is selected
            when at least one of its rdf:type values is in this collection.

    Returns:
        list[dict]: One dict per selected individual, in first-seen order, with keys
        'individual', optionally 'sameAs', 'rdf:type' (without
        'owl:NamedIndividual'), 'label' and 'definition' (each None when not found).
    """
    wanted_classes = set(target_classes)
    individuals_info = []
    seen_subjects = set()

    for s in graph.subjects(RDF.type, None):
        # A subject is yielded once per rdf:type triple; handle each only once so
        # multi-typed individuals do not produce duplicate records.
        if s in seen_subjects:
            continue
        seen_subjects.add(s)

        subject_types = list(graph.objects(s, RDF.type))
        if wanted_classes.isdisjoint(subject_types):
            continue

        individual_info = {}
        individual_info['individual'] = to_concise_format(str(s))

        # Extract sameAs relationships; the key is only added when there are any.
        same_as_list = [to_concise_format(str(o)) for o in graph.objects(s, OWL.sameAs)]
        if same_as_list:
            individual_info['sameAs'] = same_as_list

        # 'owl:NamedIndividual' carries no information, so it is filtered out.
        # Filtering (instead of list.remove) also works when the individual was
        # never declared as owl:NamedIndividual.
        individual_info['rdf:type'] = [
            concise_type
            for concise_type in (to_concise_format(str(o)) for o in subject_types)
            if concise_type != 'owl:NamedIndividual'
        ]

        individual_info['label'] = None
        individual_info['definition'] = None

        # The first predicate that yields a non-empty value wins.
        for label_predicate in [SKOS.prefLabel, RDFS.label, SHACL.name]:
            label = graph.value(s, label_predicate)
            if label:
                individual_info['label'] = str(label)
                break

        for definition_predicate in [SKOS.definition, RDFS.comment, SHACL.description]:
            definition = graph.value(s, definition_predicate)
            if definition:
                individual_info['definition'] = str(definition)
                break

        individuals_info.append(individual_info)

    return individuals_info

# Build one Document per extracted individual; the info dict doubles as metadata.
named_individuals_info = extract_named_individuals_info(g, target_classes)

individual_strings = []
individual_documents = []
for info in named_individuals_info:
    # Text form: "<label> is an individual of class <types>, <full info dict>"
    individual_string = f"{info['label']} is an individual of class {info['rdf:type']}, {info}"
    individual_document = Document(page_content=individual_string, metadata=info)

    individual_strings.append(individual_string)
    print(individual_document)
    individual_documents.append(individual_document)

In [None]:
# Set up the vector database for retrieval data
from urllib.parse import quote_plus
from sqlalchemy import create_engine
import psycopg2
import io

# Connection settings for the vector database (placeholder values to be filled in).
vectorhost= "vectorhost_name"
vectorport= "vectorport_name"
vectordbname= "vector_db_name"
vectorusername=  "vector_user_name"

# The password comes from the secret store and is URL-encoded with quote_plus so
# special characters cannot break the connection URL it is embedded in.
vectorpassword = quote_plus(dbutils.secrets.get(scope="scope_name", key="key_name"))

In [None]:
vectorconnectstr=  f"postgresql+psycopg2://{vectorusername}:{vectorpassword}@{vectorhost}:{vectorport}/{vectordbname}?sslmode=require"
# Never echo the real connection string: it embeds the database password, and
# notebook output is saved and shared with the notebook. Show a redacted form.
print(f"postgresql+psycopg2://{vectorusername}:***@{vectorhost}:{vectorport}/{vectordbname}?sslmode=require")
vectorengine = create_engine(vectorconnectstr)
# Raw DBAPI connection obtained from the engine.
vectorconn = vectorengine.raw_connection()

In [None]:
# Vector store handle: documents added through `db` are embedded with `embeddings`
# and stored under this collection name.
collection_name= "your_collection_name"
db = PGVector(
    collection_name=collection_name,
    connection_string=vectorconnectstr,
    embedding_function=embeddings,
)

In [None]:
# Add the object-property documents to the vector database in batches
batch_size = 50
for i in range(0, len(obj_documents), batch_size):
    # Slicing past the end of a list truncates, so the last batch may be shorter.
    obj_batch = obj_documents[i:i + batch_size]
    size = len(obj_batch)
    print(size)
    print(i)
    print(len(obj_batch))
    db.add_documents(obj_batch)
    print(f"Added {size} documents")

In [None]:
# Add the subclass-relation documents to the vector database in batches
batch_size = 50
for i in range(0, len(classRela_documents), batch_size):
    # Slicing past the end of a list truncates, so the last batch may be shorter.
    classRela_batch = classRela_documents[i:i + batch_size]
    size = len(classRela_batch)
    print(size)
    print(i)
    print(len(classRela_batch))
    db.add_documents(classRela_batch)
    print(f"Added {size} documents")

In [None]:
# Add the datatype-property documents to the vector database in batches
batch_size = 50
for i in range(0, len(datatype_documents), batch_size):
    # Slicing past the end of a list truncates, so the last batch may be shorter.
    datatype_batch = datatype_documents[i:i + batch_size]
    size = len(datatype_batch)
    print(size)
    print(i)
    print(len(datatype_batch))
    db.add_documents(datatype_batch)
    print(f"Added {size} documents")

In [None]:
# Add the hand-written triple-pattern documents to the vector database in batches
batch_size = 50
for i in range(0, len(additional_documents), batch_size):
    # Slicing past the end of a list truncates, so the last batch may be shorter.
    additional_batch = additional_documents[i:i + batch_size]
    size = len(additional_batch)
    print(size)
    print(i)
    print(len(additional_batch))
    db.add_documents(additional_batch)
    print(f"Added {size} documents")

In [None]:
# Add the named-individual documents to the vector database in batches
batch_size = 50
for i in range(0, len(individual_documents), batch_size):
    # Slicing past the end of a list truncates, so the last batch may be shorter.
    individual_batch = individual_documents[i:i + batch_size]
    size = len(individual_batch)
    print(size)
    print(i)
    print(len(individual_batch))
    db.add_documents(individual_batch)
    print(f"Added {size} documents")

In [None]:
# Set up the few-shot learning examples
def example_solution1():
    """Return the few-shot SPARQL example that counts hospital buildings in Gelderland.

    Returns:
        str: The query text. Braces are written doubled (``{{`` / ``}}``),
        presumably so the text can later pass through format-style templating -
        confirm against the code that consumes it.
    """
    # Natural-language question this example answers; it is not part of the return value.
    question = "Hoeveel ziekenhuizen zijn er in de provincie Gelderland?"
    return """
PREFIX bag: <http://bag.basisregistraties.overheid.nl/def/bag#>
PREFIX kad: <https://data.kkg.kadaster.nl/kad/model/def/>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX sor: <https://data.kkg.kadaster.nl/sor/model/def/>
PREFIX sor-con: <https://data.kkg.kadaster.nl/sor/model/con/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX provincie: <https://data.kkg.kadaster.nl/id/provincie/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
prefix wbk: <https://data.labs.kadaster.nl/cbs/wbk/vocab/>

SELECT (COUNT(DISTINCT ?geb) as ?aantal)
WHERE {{
    ?vbo a sor:Verblijfsobject;
        sor:oppervlakte ?wo;
        sor:hoofdadres ?na;
        sor:maaktDeelUitVan ?geb.

    ?geb a sor:Gebouw;
        sor:oorspronkelijkBouwjaar ?bo;
        geo:hasGeometry [
            geo:asWKT ?geo_wgs84;
            rdfs:isDefinedBy bag:
        ].

    OPTIONAL {{
        ?per a sor:Perceel;
            sor:hoortBij ?na;
            sor:oppervlakte ?po;
    }}

    OPTIONAL {{
        ?gebz sor:hoortBij ?vbo;
            kad:gebouwtype/skos:prefLabel ?tg.
    }}

    ?wbk_buurt a wbk:Buurt;
              rdfs:label ?buurt_naam;
              geo:hasGeometry [
                  geo:asWKT ?buurt_geo_wgs84;
              ];
              ^geo:sfWithin ?geb;
              geo:sfWithin ?wbk_wijk.

    ?wbk_wijk a wbk:Wijk;
              rdfs:label ?wijk_naam;
              geo:hasGeometry [
                  geo:asWKT ?wijk_geo_wgs84;
              ];
              geo:sfWithin ?wbk_gemeente.

    ?wbk_gemeente a wbk:Gemeente;
              rdfs:label ?wbk_gemeente_naam;
              ^owl:sameAs ?gemeente.

    ?gemeente geo:sfWithin ?provincie;
              skos:prefLabel ?gemeente_naam.

    ?provincie a sor:Provincie;
               skos:prefLabel "Gelderland"@nl.

    FILTER (?tg = "ziekenhuis"@nl)
}}
LIMIT 9999
"""

def example_solution2():
    """Return the few-shot SPARQL example that counts pre-1980 buildings on Zandweg in Maasdriel.

    Returns:
        str: The query text. Braces are written doubled (``{{`` / ``}}``),
        presumably so the text can later pass through format-style templating -
        confirm against the code that consumes it.
    """
    # Natural-language question this example answers; it is not part of the return value.
    question = "Hoeveel panden staan er aan de straat zandweg in de gemeente Maasdriel die voor 1980 zijn gebouwd?"
    return """
PREFIX bag: <http://bag.basisregistraties.overheid.nl/def/bag#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sor: <https://data.kkg.kadaster.nl/sor/model/def/>
PREFIX sor-con: <https://data.kkg.kadaster.nl/sor/model/con/>
PREFIX kad: <https://data.kkg.kadaster.nl/kad/model/def/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix wbk: <https://data.labs.kadaster.nl/cbs/wbk/vocab/>

SELECT (COUNT(DISTINCT ?geb) as ?aantal)
WHERE {{
    ?openbareruimte a sor:OpenbareRuimte;
                    skos:prefLabel "Zandweg"@nl;
                    ^sor:ligtAan ?na.

    ?na a sor:Nummeraanduiding;
         ^sor:hoofdadres ?vbo.

    ?vbo a sor:Verblijfsobject;
         sor:oppervlakte ?wo;
         sor:maaktDeelUitVan ?geb.

    ?geb a sor:Gebouw;
        sor:oorspronkelijkBouwjaar ?bo;
         geo:hasGeometry [
             geo:asWKT ?geo_wgs84;
             rdfs:isDefinedBy bag:
         ].

    ?wbk_buurt a wbk:Buurt;
              ^geo:sfWithin ?geb;
               geo:sfWithin ?wbk_wijk.

    ?wbk_wijk a wbk:Wijk;
              geo:sfWithin ?wbk_gemeente.

    ?wbk_gemeente a wbk:Gemeente;
              ^owl:sameAs ?gemeente.

    ?gemeente a sor:Gemeente;
              owl:sameAs ?wbk_gemeente;
              skos:prefLabel "Maasdriel"@nl.

    FILTER (?bo < "1980"^^xsd:gYear)
}}
LIMIT 9999
"""

def example_solution3():
    """Return the few-shot SPARQL example that counts residential buildings on Richtersweg in Ugchelen.

    Returns:
        str: The query text. Braces are written doubled (``{{`` / ``}}``),
        presumably so the text can later pass through format-style templating -
        confirm against the code that consumes it.
    """
    # Natural-language question this example answers; it is not part of the return value.
    question = "Hoeveel woningen staan er aan de richtersweg in de woonplaats Ugchelen?"
    return """
PREFIX bag: <http://bag.basisregistraties.overheid.nl/def/bag#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sor: <https://data.kkg.kadaster.nl/sor/model/def/>
PREFIX kad: <https://data.kkg.kadaster.nl/kad/model/def/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX sor-con: <https://data.kkg.kadaster.nl/sor/model/con/>

SELECT (COUNT(DISTINCT ?geb) as ?aantal)
WHERE {{
        ?woonplaats a sor:Woonplaats;
                skos:prefLabel "Ugchelen"@nl;
                ^sor:ligtIn ?openbareruimte.

        ?openbareruimte a sor:OpenbareRuimte;
                skos:prefLabel "Richtersweg"@nl;
                ^sor:ligtAan ?na.

        ?na a sor:Nummeraanduiding;
         sor:huisnummer ?huisnummers;
         ^sor:hoofdadres ?vbo.

        ?vbo a sor:Verblijfsobject;
         sor:oppervlakte ?wo;
         sor:maaktDeelUitVan ?geb;
         sor:gebruiksdoel sor-con:woonfunctie.

        ?geb a sor:Gebouw;
         sor:oorspronkelijkBouwjaar ?bo;
         geo:hasGeometry [
             geo:asWKT ?geo_wgs84;
             rdfs:isDefinedBy bag:
         ].

    OPTIONAL {{
        ?per a sor:Perceel;
             sor:hoortBij ?na;
             sor:oppervlakte ?po;
    }}

    OPTIONAL {{
        ?gebz sor:hoortBij ?vbo;
               kad:gebouwtype/skos:prefLabel ?tg.
    }}
}}
LIMIT 9999
"""

def example_solution4():
    """Return the few-shot example SPARQL query that ranks gemeenten by openbare ruimte.

    The query groups by ``?gemeente``, counts the distinct ``?openbareruimte`` per
    group for gemeenten within the provincie "Zuid-Holland", orders the groups by
    that count in descending order and keeps only the first row.

    Braces are doubled on purpose: the text is later pasted into the prompt template
    built in ``create_qa_system``, where the template formatter is expected to turn
    each doubled brace back into a single literal brace.

    Returns:
        str: The SPARQL query text, including its PREFIX declarations.
    """
    # Natural-language question this query answers; kept for reference, it is not returned.
    example_question = "Welke gemeente in de provincie Zuid-Holland heeft de meest openbare ruimte?"
    sparql_text = """
PREFIX bag: <http://bag.basisregistraties.overheid.nl/def/bag#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sor: <https://data.kkg.kadaster.nl/sor/model/def/>
PREFIX sor-con: <https://data.kkg.kadaster.nl/sor/model/con/>
PREFIX kad: <https://data.kkg.kadaster.nl/kad/model/def/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
prefix wbk: <https://data.labs.kadaster.nl/cbs/wbk/vocab/>

SELECT ?gemeente (COUNT(DISTINCT ?openbareruimte) as ?aantal)
WHERE {{
    ?openbareruimte a sor:OpenbareRuimte;
                    ^sor:ligtAan ?na.

    ?na a sor:Nummeraanduiding;
         ^sor:hoofdadres ?vbo.

    ?vbo a sor:Verblijfsobject;
         sor:oppervlakte ?wo;
         sor:maaktDeelUitVan ?geb.

    ?geb a sor:Gebouw;
         sor:oorspronkelijkBouwjaar ?bo;
         geo:hasGeometry [
             geo:asWKT ?gebouw_geo_wgs84;
             rdfs:isDefinedBy bag:
         ].

    OPTIONAL {{
        ?per a sor:Perceel;
             sor:hoortBij ?na;
             sor:oppervlakte ?po.
    }}

    OPTIONAL {{
        ?gebz sor:hoortBij ?vbo;
               kad:gebouwtype/skos:prefLabel ?tg.
    }}

    ?wbk_buurt a wbk:Buurt;
              rdfs:label ?buurt_naam;
              geo:hasGeometry [
                  geo:asWKT ?buurt_geo_wgs84;
              ];
              ^geo:sfWithin ?geb;
              geo:sfWithin ?wbk_wijk.

    ?wbk_wijk a wbk:Wijk;
              rdfs:label ?wijk_naam;
              geo:hasGeometry [
                  geo:asWKT ?wijk_geo_wgs84;
              ];
              geo:sfWithin ?wbk_gemeente.

    ?wbk_gemeente a wbk:Gemeente;
              rdfs:label ?wbk_gemeente_naam;
              ^owl:sameAs ?gemeente.

    ?gemeente skos:prefLabel ?gemeente_naam;
              geo:sfWithin ?provincie.

    ?provincie a sor:Provincie;
              skos:prefLabel "Zuid-Holland"@nl.
}}
GROUP BY ?gemeente
ORDER BY DESC(COUNT(DISTINCT ?openbareruimte))
LIMIT 1
"""
    return sparql_text

def example_solution5():
    """Return the few-shot example SPARQL query that finds the nearest gemeentehuis.

    The query locates the building for house number 19 on "Bisschopstraat" in the
    woonplaats "Rotterdam" (huisletter "A" and huisnummertoevoeging "02" appear in
    OPTIONAL patterns), binds the ``geof:distance`` in metres between that building
    and every ``sor:Gebouwzone`` whose gebouwtype is labelled "gemeentehuis", orders
    by that distance and keeps only the first row.

    Braces are doubled on purpose: the text is later pasted into the prompt template
    built in ``create_qa_system``, where the template formatter is expected to turn
    each doubled brace back into a single literal brace.

    Returns:
        str: The SPARQL query text, including its PREFIX declarations.
    """
    # Natural-language question this query answers; kept for reference, it is not returned.
    example_question = "Mag ik het dichtstbijzijnde gemeentehuis vanaf het adres Bisschopstraat 19A-02 Rotterdam zien?"
    sparql_text = """
PREFIX bag: <http://bag.basisregistraties.overheid.nl/def/bag#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sor: <https://data.kkg.kadaster.nl/sor/model/def/>
PREFIX sor-con: <https://data.kkg.kadaster.nl/sor/model/con/>
PREFIX kad: <https://data.kkg.kadaster.nl/kad/model/def/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX geo: <http://www.opengis.net/ont/geosparql#>
PREFIX nen3610: <https://data.kkg.kadaster.nl/nen3610/model/def/>
prefix brt: <http://brt.basisregistraties.overheid.nl/def/top10nl#>
prefix uom: <http://www.opengis.net/def/uom/OGC/1.0/>
prefix geof: <http://www.opengis.net/def/function/geosparql/>

SELECT ?gebouwShape ?gemeentehuis_geometrie_wgs84 ?gemeentehuis ?afstand
WHERE {{
    ?woonplaats a sor:Woonplaats;
                skos:prefLabel "Rotterdam"@nl;
                ^sor:ligtIn ?openbareruimte.

    ?openbareruimte a sor:OpenbareRuimte;
                    skos:prefLabel "Bisschopstraat"@nl;
                    ^sor:ligtAan ?na.

    ?na a sor:Nummeraanduiding;
         sor:huisnummer 19;
         ^sor:hoofdadres ?vbo.
         Optional{{?na sor:huisletter "A"}}
         Optional{{?na sor:huisnummertoevoeging "02"}}

    ?vbo a sor:Verblijfsobject;
         sor:maaktDeelUitVan ?geb.

    ?geb a sor:Gebouw;
         sor:oorspronkelijkBouwjaar ?bo;
         geo:hasGeometry [
             geo:asWKT ?gebouwShape;
             rdfs:isDefinedBy bag:
         ].

    ?gemeentehuis a sor:Gebouwzone;
                  kad:gebouwtype ?gebouwtype;
                  geo:hasGeometry/geo:asWKT ?gemeentehuis_geometrie_wgs84.
    ?gebouwtype skos:prefLabel "gemeentehuis"@nl.
    BIND(geof:distance(?gebouwShape, ?gemeentehuis_geometrie_wgs84, uom:metre) as ?afstand)
}}
ORDER BY ?afstand
LIMIT 1
"""
    return sparql_text

In [None]:
# Few-shot learning examples for the prompt: every Tool pairs an example question
# (name) with the function that returns its reference SPARQL query (func).
# All entries share the same description text.
examples = [
    Tool(
        name=example_question,
        func=solution_builder,
        description="This function returns an example sparql query for an example question. 'name' denotes to the question and 'func' return the example query solution to the question",
    )
    for example_question, solution_builder in (
        ("Hoeveel ziekenhuizen zijn er in de provincie Gelderland?", example_solution1),
        ("Hoeveel panden staan er aan de straat zandweg in de gemeente Maasdriel die voor 1980 zijn gebouwd?", example_solution2),
        ("Hoeveel woningen staan er aan de richtersweg in ugchelen?", example_solution3),
        ("Welke gemeente in de provincie Zuid-Holland heeft de meest openbare ruimte?", example_solution4),
        ("Mag ik het dichtstbijzijnde gemeentehuis vanaf het adres Bisschopstraat 19A-02 Rotterdam zien?", example_solution5),
    )
]

In [None]:
# format_examples renders the few-shot examples as one text block, so the prompt can
# carry any number of examples without changing the template itself.
def format_examples(examples):
    """Render example question/query pairs as a single prompt-ready string.

    Args:
        examples: Iterable of objects exposing ``name`` (the question text) and
            ``func`` (a zero-argument callable returning the SPARQL query string).

    Returns:
        str: One "Question: ...\\nSPARQL Query: ...\\n\\n" section per example, in
        input order; an empty string when there are no examples.
    """
    return "".join(
        f"Question: {entry.name}\nSPARQL Query: {entry.func()}\n\n"
        for entry in examples
    )

def create_qa_system(vector_store, examples, model_name='gpt-4-32k', temperature=0, k=15):
    """
    Create a question answering system based on RAG vector store.

    The few-shot examples are rendered with ``format_examples`` and baked into the
    prompt text when the f-string below is evaluated. Only ``context`` and
    ``question`` remain as placeholders for the chain to fill in at query time.

    Args:
        vector_store: The PGVector store containing the documents
        examples: Few-shot examples; objects exposing ``name`` (question) and
            ``func`` (callable returning the example SPARQL query)
        model_name: Name of the LLM model to use
        temperature: Temperature parameter for the LLM
        k: Number of most similar documents the retriever returns per question

    Returns:
        qa: A RetrievalQA chain object
    """
    # Create the LLM
    llm = AzureChatOpenAI(
        model_name=model_name,
        temperature=temperature
    )

    # Format the examples
    formatted_examples = format_examples(examples)

    # Create a custom prompt template.
    # This is an f-string: {formatted_examples} is substituted right here, while
    # {{context}} and {{question}} come out as single-brace placeholders.
    prompt_template = f"""You are a sematic web expert and you master advance Dutch language. You are provided with a vector database as an external data source to use RAG to solve a sparql generation problem. The data source consists of ontology data of Dutch cadastral system, including triple relations between classes and properties(object property or data property), subclass relations, and individual-class relations. You receive a user input question regarding Dutch cadastral information. Please only use the RAG data source, context of this prompt, as well as the question, to generate a SPARQL query that can extract the relevant data to return the answer to the question.
    Let's think step by step:
1. Analyze the question carefully. If you find any name of places of Netherlands in the question, it is the pertinent location to this question. The location can be a house address, a town(woonplaats/gemeente), a province(provincie) or a street(straat). If you figure out that a specific house address is needed but not provided in the question, ask "Wat is het adres waar u in geïnteresseerd bent?".

2. The question is in Dutch. Use only Dutch to name the variables in your sparql query. While creating the SPARQL query, you only use the triple relations, subclass relations, and individual-class relations stored in the vector data source. With these ontology relations, please traverse the property paths to navigate through a sequence of relationships, which is useful for finding connections between entities that are several steps apart in the ontology. You can also use recursive query to connect the difference entities in the queryif necessary. Sometimes you need to use an inverse relation to trace back to a subject entity if necessary. For example, "^sor:ligtAan" or "^sor:maaktDeelUitVan".

3. Always use "DISTINCT" on the variable if you need to select or count this certain variable. We don't need duplications in the result.

4. Don't forget any prefix declaration of the sparql query. Check the prefix declaration you need in the query line by line and write them at the top. You should only use the following prefixes:
'owl': 'http://www.w3.org/2002/07/owl#',
'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
'xsd': 'http://www.w3.org/2001/XMLSchema#',
'nen3610': 'https://data.kkg.kadaster.nl/nen3610/model/def/',
'nen3610-shp': 'https://data.kkg.kadaster.nl/nen3610/model/shp/',
'skos': 'http://www.w3.org/2004/02/skos/core#',
'bag': 'http://bag.basisregistraties.overheid.nl/def/bag#',
'bag-begrip': 'http://bag.basisregistraties.overheid.nl/id/begrip/',
'brt': 'http://brt.basisregistraties.overheid.nl/def/top10nl#',
'wbk': 'https://data.labs.kadaster.nl/cbs/wbk/vocab/',
'geo': 'http://www.opengis.net/ont/geosparql#',
'kad': 'https://data.kkg.kadaster.nl/kad/model/def/',
'kad-con': 'https://data.kkg.kadaster.nl/kad/model/con/',
'kad-shp': 'https://data.kkg.kadaster.nl/kad/model/shp/',
'sor': 'https://data.kkg.kadaster.nl/sor/model/def/',
'sor-con': 'https://data.kkg.kadaster.nl/sor/model/con/',
'sor-shp': 'https://data.kkg.kadaster.nl/sor/model/shp/',
"bgt": "http://bgt.basisregistraties.overheid.nl/def/bgt#",
"bgt-pand": "http://bgt.basisregistraties.overheid.nl/id/pand/",
"bnode": "https://data.kkg.kadaster.nl/well-known/genid/",
"brt": "http://brt.basisregistraties.overheid.nl/def/top10nl#",
"brt-gebouw": "http://brt.basisregistraties.overheid.nl/id/gebouw/",
"brt-scheme": "http://brt.basisregistraties.overheid.nl/top10nl/id/scheme/",
"brt-shp": "http://brt.basisregistraties.overheid.nl/top10nl/id/shape/",
"foaf": "http://xmlns.com/foaf/0.1/",
"gebouw": "https://data.kkg.kadaster.nl/id/gebouw/",
"bouwzone": "https://data.kkg.kadaster.nl/id/gebouwzone/",
"gemeente": "https://data.kkg.kadaster.nl/id/gemeente/",
"mim": "http://bp4mc2.org/def/mim#",
"purl": "http://purl.org/linked-data/cube#",
"prov": "http://www.w3.org/ns/prov#",
"gml": "http://www.opengis.net/ont/gml#",
"time": "http://www.w3.org/2006/time#",
"shacl": "http://www.w3.org/ns/shacl#",
"schema": "https://schema.org/",
"triplydb": "https://triplydb.com/Triply/value/"

5. You should not include irrelevant ontology that are not in the data source or in the prompt, in the generated SPARQL query. If you have a location name of Netherlands in the label of filter, add @nl after the name. Sometimes the user input location may have a typo and you should correct it in the query. Make sure all the location names are written in the complete form of the Dutch way. For example, you should use 's-Gravenhage instead of Den Haag or The Hague, use 's-Hertogenbosch instead of Den Bosch.

6.The concepts of Woonplaats and Gemeente are different. They can have the same name, such as woonplaats 'Utrecht' and gemeente 'Utrecht', but mean different things. In the question, if the user gives a city name without specifying it is woonplaats or gemeente, consider it as woonplaats. For example, if the user asks "Hoeveel kerken zijn er in Utrecht?", the user means Woonplaats Utrecht. But if the user asks "Hoeveel kerken zijn er in gemeente Utrecht?", that means Gemeente Utrecht.

7. For the concept "perceel", there are synonyms such as "tuin" and "kavel" to be used interchangeably in the question. For the concept "bouwjaar", which is a property of "gebouw", synonyms such as "oorspronkelijk bouwjaar" and "leeftijd" are used interchangeably in the question. For the concept "status", synonyms such as "bouw status" is used interchangeably in the question.

8. "huis" and "woning" refer to a verblijfsobject(vbo) with woonfunctie as gebruiksdoel.

9. If you are asked to show an administrative entity, like "Laat me mijn wijk/gemeente zien, adres Gustav Mahlerlaan 10, Amsterdam.", besides the data iri and geometry of the entity, you should also show the name of the administrative entity.

10. Following this chain-of-thought process, you can learn from some example questions and their corresponding SPARQL query solutions {formatted_examples}.

11.If you need a specific address to answer the question but is not provided by the question, or the question is not cadastral related, don't generate a query. Say you can't help with the question.

12. Check again if you make sure all necessary prefix declarations are added properly to the query.

13. In the answer, only give the sparql query. Don't output any context or description.
    Context: {{context}}
    Question: {{question}}
    Answer:"""

    # Declare only the placeholders that actually exist in the template text. The
    # examples were already substituted above, so listing "examples" here would
    # advertise an input that nothing supplies.
    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create the RetrievalQA chain
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": k},  # Return the top-k most relevant documents
            # Alternative kept for reference (score-threshold retrieval):
            # search_type="similarity_score_threshold",
            # search_kwargs={"score_threshold": 0.6, "k": 10}
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
        verbose=True
    )

    return qa

def ask_question(qa_chain, question):
    """
    Run one question through the QA chain and repackage its output.

    Args:
        qa_chain: The RetrievalQA chain; it is called directly with the question
        question: String containing the question

    Returns:
        dict: ``"answer"`` holds the chain's ``"result"`` value and
        ``"source_documents"`` holds the chain's ``"source_documents"`` value
    """
    chain_output = qa_chain(question)

    # Expose the chain's "result" under the friendlier key "answer".
    packaged = {}
    packaged["answer"] = chain_output["result"]
    packaged["source_documents"] = chain_output["source_documents"]
    return packaged

In [None]:
# Initialize the QA system on top of the vector store and the few-shot examples.
# NOTE(review): `db` is not defined in this cell; it must already exist in the kernel.
qa_system = create_qa_system(db, examples)  # db is my existing PGVector store

# Ask one question. `response` is a module-level name that the following cells
# read again to obtain the generated SPARQL query.
question = "Hoeveel gebouwen zijn in 1960 in de woonplaats Rotterdam gebouwd?"
response = ask_question(qa_system, question)

# Print the generated answer, then the page content of every retrieved source document.
print("Answer:", response["answer"])
print("\nSources used:")
for doc in response["source_documents"]:
    print(f"- {doc.page_content}")

In [None]:
# The answer of the QA system is expected to be the bare SPARQL query text
# (the prompt asks the model to output only the query).
sparql_query = response["answer"]

# Print only the SPARQL query
print(sparql_query)

In [None]:
# Use HTTP requests to test the SPARQL query remotely, without using SPARQLWrapper.
import requests

def execute_sparql_query(endpoint_url, sparql_query, timeout=60):
    """Send a SPARQL query to an endpoint with an HTTP POST and return the JSON reply.

    The query is posted as the raw request body with content type
    ``application/sparql-query`` and a JSON response is requested.

    Note: a later cell of this notebook defines a function with the same name
    (based on SPARQLWrapper); whichever cell ran last wins.

    Args:
        endpoint_url: URL of the SPARQL endpoint
        sparql_query: The SPARQL query, as ``str`` or already-encoded ``bytes``
        timeout: Seconds to wait for the server before giving up; ``None``
            waits without limit

    Returns:
        The decoded JSON body of the response

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error status
        requests.Timeout: If the endpoint does not answer within ``timeout``
    """
    headers = {
        'Content-Type': 'application/sparql-query',
        'Accept': 'application/json'
    }
    # Encode explicitly so non-ASCII characters (e.g. in Dutch place names) are
    # sent as UTF-8 instead of whatever default the HTTP stack would pick.
    body = sparql_query.encode("utf-8") if isinstance(sparql_query, str) else sparql_query
    # Without a timeout a stalled endpoint would block the notebook cell forever.
    http_response = requests.post(endpoint_url, data=body, headers=headers, timeout=timeout)
    http_response.raise_for_status()  # Raise an exception for HTTP errors
    return http_response.json()

def test_sparql_query(endpoint_url, sparql_query):
    """Execute the query and print the full JSON reply, or the error that occurred.

    Any exception raised while executing the query or printing the reply is
    caught and reported on stdout; nothing is returned or re-raised.

    Args:
        endpoint_url: URL of the SPARQL endpoint
        sparql_query: The SPARQL query text to try out
    """
    try:
        payload = execute_sparql_query(endpoint_url, sparql_query)
        # Dump the entire reply for debugging; no single value is extracted here.
        print("JSON response:", payload, sep="\n")
    except Exception as err:
        print(f"An error occurred: {err}")


def main():
    """Print the generated SPARQL query and try it against the Kadaster KKG endpoint.

    Reads the module-level ``response`` produced by the QA system in an earlier
    cell; its ``"answer"`` entry is used as the query text.
    """
    generated_query = response["answer"]
    kkg_endpoint = "https://api.labs.kadaster.nl/datasets/dst/kkg/services/default/sparql"

    print(f"Testing query:\n{generated_query}")
    test_sparql_query(kkg_endpoint, generated_query)


# Run the check right away when this cell (or the file as a script) is executed.
if __name__ == "__main__":
    main()

In [None]:
# Using SPARQLWrapper to test the validity of the generated sparql query.

from SPARQLWrapper import SPARQLWrapper, JSON

def execute_sparql_query(endpoint_url, sparql_query, timeout=60):
    """Run a SPARQL query through SPARQLWrapper and return the converted JSON result.

    Note: this redefines the function of the same name from the earlier
    requests-based cell; whichever cell ran last wins.

    Args:
        endpoint_url: URL of the SPARQL endpoint
        sparql_query: The SPARQL query text
        timeout: Seconds to wait for the endpoint before giving up; ``None``
            leaves SPARQLWrapper's own default in place

    Returns:
        The query result as converted by SPARQLWrapper for the JSON return format
    """
    client = SPARQLWrapper(endpoint_url)
    client.setQuery(sparql_query)
    client.setReturnFormat(JSON)
    # Bound the wait so a stalled endpoint cannot block the notebook cell forever.
    if timeout is not None:
        client.setTimeout(timeout)
    return client.query().convert()

def test_sparql_query(endpoint_url, sparql_query):
    """Execute the query, print the full JSON reply and, if present, the ``?aantal`` count.

    Execution problems and result-shape mismatches are reported separately. A query
    that ran fine but returned no rows, or did not bind a variable called ``aantal``,
    is reported as a successful run rather than as an error.

    Args:
        endpoint_url: URL of the SPARQL endpoint
        sparql_query: The SPARQL query text to try out
    """
    try:
        results = execute_sparql_query(endpoint_url, sparql_query)
    except Exception as e:
        # Best-effort check in a notebook: report the failure instead of raising.
        print(f"An error occurred: {e}")
        return

    print("JSON response:")
    print(results)  # Print the entire JSON response for debugging

    # SELECT results carry their rows under results -> bindings; other reply
    # shapes simply have no rows to inspect.
    bindings = []
    if isinstance(results, dict):
        bindings = (results.get("results") or {}).get("bindings") or []

    if not bindings:
        print("Query executed successfully, but returned no result bindings.")
        return

    aantal_binding = bindings[0].get("aantal")
    if aantal_binding is None:
        # The generated query is free to pick another variable name than ?aantal.
        print(f"Query executed successfully. Rows returned: {len(bindings)} (no ?aantal variable in the result).")
    else:
        count = aantal_binding["value"]
        print(f"Query executed successfully. Count: {count}")

In [None]:
# Try the generated SPARQL query once more, this time through the SPARQLWrapper-based helpers.
def main():
    """Print the generated SPARQL query and try it against the Kadaster KKG endpoint.

    Reads the module-level ``response`` produced by the QA system in an earlier
    cell; its ``"answer"`` entry is used as the query text.
    """
    generated_query = response["answer"]
    kkg_endpoint = "https://api.labs.kadaster.nl/datasets/dst/kkg/services/default/sparql"

    print(f"Testing query:\n{generated_query}")
    test_sparql_query(kkg_endpoint, generated_query)


# Run the check right away when this cell (or the file as a script) is executed.
if __name__ == "__main__":
    main()

In [None]:
# Define a function to delete the content of a collection in the vector store
def clear_collection(collection_name):
    """Delete every embedding row that belongs to the named collection.

    Uses the module-level connection ``vectorconn`` (defined elsewhere in the
    notebook; presumably a psycopg2-style PostgreSQL connection, TODO confirm).
    The collection name is passed as a bound parameter, so quotes or SQL
    fragments in the name cannot alter the statement.

    Args:
        collection_name: Name of the collection as stored in
            ``langchain_pg_collection.name``

    Raises:
        Exception: Whatever the database driver raises; the transaction is
            rolled back before the error is re-raised.
    """
    delete_sql = """delete from langchain_pg_embedding e
                    where collection_id = (select uuid from langchain_pg_collection where name = %s)"""
    try:
        with vectorconn.cursor() as cur:
            cur.execute(delete_sql, (collection_name,))
        vectorconn.commit()
    except Exception:
        # Leave the connection usable for later cells after a failed statement.
        vectorconn.rollback()
        raise
    # Report success only once the delete has actually been committed.
    print(f"Records with collection_name = {collection_name} have been deleted.")

In [None]:
# Destructive: deletes all stored embeddings of this collection and commits the change.
# Replace the placeholder with the real collection name before running this cell.
clear_collection("your_collection_name")