In [1]:
from rdflib import Graph
from typing import Any, List, Optional
from operator import itemgetter
import getpass
import os

from langchain.globals import set_debug
from langchain.memory import ConversationBufferMemory
from langchain.prompts import ChatPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import format_document
from langchain_openai import OpenAI
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import get_buffer_string
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document

# Prompt for the OpenAI API key only when the environment does not already define it,
# so the secret never has to be written into the notebook.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Provide your OpenAI API Key")



In [2]:
# SPARQL query run by OntologyLoader.load().
# It matches every resource that has an rdf:type (`a ?type`) and at least one
# property value, then keeps only the rows whose property is schema:name.
# Each result row exposes ?uri, ?predicate, ?label and ?type; a resource with
# several types or several names therefore yields one row per combination.
extract_classes_query = """
PREFIX schema: <http://schema.org/>

SELECT ?uri ?predicate ?label ?type
WHERE {
    ?uri a ?type;
        ?predicate ?label.
    FILTER (
        ?predicate = schema:name
    )
}"""

class OntologyLoader(BaseLoader):
    """Document loader that turns labelled resources of an RDF graph into documents.

    The source is parsed eagerly in the constructor into an rdflib graph backed
    by the "Oxigraph" store. `load` then runs the module-level
    `extract_classes_query` against that graph and emits one document per
    result row.
    """

    def __init__(self, ontology_url: str, format: Optional[str] = None):
        """Parse the RDF source into `self.graph`.

        Args:
            ontology_url: Location handed to `Graph.parse` as `source`; also
                recorded in every document's metadata.
            format: Serialization format handed to `Graph.parse` (for example
                "ttl"). `None` is passed through unchanged.
        """
        self.ontology_url = ontology_url
        self.format = format
        self.graph = Graph(store="Oxigraph")
        # Parsing happens here, so construction fails early on an unreadable source.
        self.graph.parse(source=self.ontology_url, format=self.format)

    def load(self) -> List[Document]:
        """Run `extract_classes_query` and return one document per result row."""
        return [
            self._create_document(row)
            for row in self.graph.query(extract_classes_query)
        ]

    def _create_document(self, result_row: Any) -> Document:
        """Build a `Document` from one SPARQL result row.

        The row's label becomes the page content; label, URI, type, predicate
        and the source ontology location are stored as string metadata.
        """
        label = str(result_row.label)
        # NOTE: you can include more metadata retrieved by the SPARQL query here
        metadata = {
            "label": label,
            "uri": str(result_row.uri),
            "type": str(result_row.type),
            "predicate": str(result_row.predicate),
            "ontology": self.ontology_url,
        }
        return Document(page_content=label, metadata=metadata)

In [3]:
# Settings a reader may want to tune for the indexing step
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
ONTOLOGY_FILE = "football_data_small.ttl"
QDRANT_COLLECTION = "ontologies"
RETRIEVER_TOP_K = 15

# Embedding model used to vectorise the concept labels
flag_embeddings = FastEmbedEmbeddings(model_name=EMBEDDING_MODEL_NAME, max_length=512)

# Parse the ontology file and build one document per query result row
loader = OntologyLoader(ONTOLOGY_FILE, format="ttl")
docs = loader.load()

# Split the documents into chunks before indexing
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Index the chunks in an in-memory Qdrant collection
vectorstore = Qdrant.from_documents(
    splits,
    flag_embeddings,
    collection_name=QDRANT_COLLECTION,
    location=":memory:",
)

# Retriever returning the top-k matches, and the LLM shared by the chain steps
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})
llm = OpenAI(temperature=0)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [65]:
# Create the memory object that is used to add messages.
# input_key / output_key name the entries ("question" / "answer") that the
# memory is configured to read when an exchange is saved.
memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)
# Add a "chat_history" key to the input object: the memory variables are loaded
# and their "history" entry is attached alongside the original input keys.
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)

# Prompt to reformulate the question using the chat history.
# Placeholders: {chat_history} and {question}.
reform_template = """
You are a football analyst. Reformulate the question such that it contains all the contextual information needed. Do not answer anything, only reformulate the question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
REFORM_QUESTION_PROMPT = PromptTemplate.from_template(reform_template)

# Prompt to ask to answer the reformulated question.
# Placeholders: {context} (the formatted retrieved concepts) and {question}.
answer_template = """
Answer the question as a list, as succinctly as you can. 
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(answer_template)

# Format how the ontology concepts are passed as context to the LLM.
# The placeholders besides {page_content} use the same names as the metadata
# keys set in OntologyLoader._create_document (uri, type, predicate, ontology).
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
    template="Concept label: {page_content} | URI: {uri} | Type: {type} | Predicate: {predicate} | Ontology: {ontology}"
)
def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    """Render each document with `document_prompt` and join the results.

    Args:
        docs: Iterable of documents to render.
        document_prompt: Prompt template applied to every document through
            `format_document`.
        document_separator: String placed between the rendered documents.

    Returns:
        One string holding all rendered documents, in input order.
    """
    return document_separator.join(
        format_document(entry, document_prompt) for entry in docs
    )

# Step 1: reformulate the question using chat history.
# Output: {"reformulated_question": <str>}. The question and the chat history
# (flattened to text by get_buffer_string) fill REFORM_QUESTION_PROMPT, and the
# LLM output is passed through StrOutputParser.
reformulated_question = {
    "reformulated_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    | REFORM_QUESTION_PROMPT
    | llm
    | StrOutputParser(),
}
# Step 2: retrieve the documents using the reformulated question.
# Output: {"docs": <retriever result>, "question": <reformulated question>}.
retrieved_documents = {
    "docs": itemgetter("reformulated_question") | retriever,
    # print() returns None, so `or` falls through to the reformulated question;
    # the print is only there to show the reformulation in the cell output.
    "question": lambda x: print("💭 Reformulated question:", x["reformulated_question"]) or x["reformulated_question"],
    # Silent alternative without the logging side effect:
    # "question": lambda x: x["reformulated_question"],
}
# Step 3: construct the inputs for the final prompt using retrieved documents.
# "context" is the retrieved documents rendered by _combine_documents.
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question"),
}
# Step 4: generate the answer using the retrieved documents and answer prompt.
# "docs" is passed through unchanged so callers can show the sources.
answer = {
    "answer": final_inputs | ANSWER_PROMPT | llm,
    "docs": itemgetter("docs"),
}
# Put the chain together: load memory -> reformulate -> retrieve -> answer
final_chain = loaded_memory | reformulated_question | retrieved_documents | answer

def stream_chain(final_chain, memory: ConversationBufferMemory, inputs: dict[str, str]) -> dict[str, Any]:
    """Ask a question, stream the answer to stdout, and return it with its source documents.

    Once the stream is exhausted, the exchange is recorded in `memory` so that
    the next call sees it as chat history.

    Args:
        final_chain: Object whose `stream(inputs)` yields dict chunks. A chunk
            may carry a "docs" key (the retrieved documents) and/or an
            "answer" key (a piece of the answer text).
        memory: Conversation memory that receives the finished exchange through
            `save_context`. Pass `None` to skip recording.
        inputs: Chain input, e.g. `{"question": "..."}`.

    Returns:
        A dict with the complete "answer" string and, when the chain emitted
        them, "docs" as a list of plain dicts.
    """
    output: dict[str, Any] = {"answer": ""}
    for chunk in final_chain.stream(inputs):
        if "docs" in chunk:
            # Convert the documents to plain dicts so the result is easy to inspect.
            output["docs"] = [doc.dict() for doc in chunk["docs"]]
            print("📚 Documents retrieved:")
            for doc in output["docs"]:
                print(f"· {doc['page_content']} ({doc['metadata']['uri']})")
        if "answer" in chunk:
            # Answer text arrives piecewise: accumulate it and echo it as it streams.
            output["answer"] += chunk["answer"]
            print(chunk["answer"], end="", flush=True)
    # Record the exchange; without this the chat history stays empty and the
    # question-reformulation step never sees earlier turns.
    if memory is not None:
        memory.save_context(inputs, {"answer": output["answer"]})
    return output

In [69]:
# set_debug(True)   # Uncomment to enable detailed LangChain debugging
# Name the question first so it is easy to edit, then stream the chain's answer.
user_question = "Give me one Brazilian player that has played for a Premier League team and also won a trophy? Which team did he play for when he won?"
output = stream_chain(final_chain, memory, {"question": user_question})

💭 Reformulated question:  Can you name a Brazilian player who has played for a Premier League team and won a trophy? Which team did they play for when they won the trophy?
📚 Documents retrieved:
· Premier League (http://www.wikidata.org/entity/Q206073)
· Premier League (http://www.wikidata.org/entity/Q9448)
· Campeonato Brasileiro (http://www.wikidata.org/entity/Q1199581)
· Campeonato Brasileiro Série A (http://www.wikidata.org/entity/Q206813)
· Brazil national football team (http://www.wikidata.org/entity/Q83459)
· Russian Premier League (http://www.wikidata.org/entity/Q182165)
· São Paulo FC (http://www.wikidata.org/entity/Q38568)
· Santos F.C. (http://www.wikidata.org/entity/Q80955)
· 2014 FIFA World Cup Match 1, Brazil v Croatia (http://www.wikidata.org/entity/Q17486314)
· Cristiano Ronaldo (http://www.wikidata.org/entity/Q11571)
· Réunion Premier League (http://www.wikidata.org/entity/Q2261921)
· S.L. Benfica (http://www.wikidata.org/entity/Q131499)
· Brazil Olympic football team 

In [59]:
# Display the full answer text accumulated by stream_chain
output["answer"]

'\n1. Casemiro\n2. Christian Cueva\n3. Denis Cheryshev\n4. Eden Hazard\n5. Emil Forsberg'