In [118]:
!pip install langchain-openai fastembed langchain oxrdflib qdrant-client

^C


In [2]:
from rdflib import Graph
from typing import Any, List, Optional
import getpass
import os

from langchain.prompts.prompt import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import format_document
from langchain_openai import OpenAI
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Provide your OpenAI API Key")

In [3]:
extract_classes_query = """
PREFIX schema: <http://schema.org/>

SELECT ?uri ?predicate ?label ?type
WHERE {
    ?uri a ?type;
        ?predicate ?label.
    FILTER (
        ?predicate = schema:name
    )
}"""

class OntologyLoader(BaseLoader):
    """Load an OWL ontology and extract classes and properties as documents."""

    def __init__(self, ontology_url: str, format: Optional[str] = None):
        self.ontology_url = ontology_url
        self.format = format
        self.graph = Graph(store="Oxigraph")
        self.graph.parse(source=self.ontology_url, format=self.format)

    def load(self) -> List[Document]:
        """Load and return documents (classes and properties) from the OWL ontology."""
        docs: List[Document] = []
        for cls in self.graph.query(extract_classes_query):
            docs.append(self._create_document(cls))
        return docs

    def _create_document(self, result_row: Any) -> Document:
        """Create a Document object from a query result row."""
        label = str(result_row.label)
        return Document(
            page_content=label,
            # NOTE: you can include more metadata retrieved by the SPARQL query here
            metadata={
                "label": label,
                "uri": str(result_row.uri),
                "type": str(result_row.type),
                "predicate": str(result_row.predicate),
                "ontology": self.ontology_url,
            },
        )

In [7]:
flag_embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5", max_length=512)
loader = OntologyLoader("football_data.ttl", format="ttl")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

vectorstore = Qdrant.from_documents(
    splits,
    flag_embeddings,
    collection_name="ontologies",
    location=":memory:",
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
llm = OpenAI(temperature=0)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [31]:
def reformulate_question(question: str, llm) -> str:
    """
    Reformulate the question to include necessary contextual information.
    """
    reformulation_prompt = f"""
    Reformulate the question such that it explicitly mentions additionally required information. Only reformulate the question if the original question is not specific enough to answer.
    Question: {question}
    Standalone question:"""
    reformulated_question = llm.generate([reformulation_prompt], max_tokens=200)
    return reformulated_question.generations[0][0].text

def generate_answer(context: str, question: str, llm) -> str:
    """
    Generate an answer using the retrieved documents as context.
    """
    answer_prompt = f"""
    Answer the question as a list by giving all that is asked for in csv format. Do not give information simply because it is related to the question. If you cannot answer the question, say so. Do not use any information outside this context:
    {context}

    Question: {question}
    """
    answer = llm.generate([answer_prompt], max_tokens=100)
    return answer.generations[0][0].text

# Format how the ontology concepts are passed as context to the LLM
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
    template="Concept label: {page_content} | URI: {uri} | Type: {type} | Predicate: {predicate} | Ontology: {ontology}"
)
def combine_documents(docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    # print("Formatted docs:", doc_strings)
    return document_separator.join(doc_strings)

def ask_question(question: str, retriever, llm) -> str:
    """
    Ask a question to the RAG system and return the generated answer.
    """
    reformulated_question = reformulate_question(question, llm)
    print("Reformulated question:", reformulated_question)
    docs = retriever.get_relevant_documents(reformulated_question)
    context = combine_documents(docs)
    answer = generate_answer(context, reformulated_question, llm)
    return answer


# Question Answering

In [32]:
question = "Names of Croatian players who won a UEFA Champions League, played for Real Madrid, and scored more than 10 goals in the tournament."
answer = ask_question(question, retriever, llm)
answer

Reformulated question:  What are the names of Croatian players who have won a UEFA Champions League, played for Real Madrid, and scored more than 10 goals in the tournament?


'\nLuka Modrić, Mateo Kovačić, and Ivan Rakitić'