In [118]:
!pip install langchain-openai fastembed langchain oxrdflib qdrant-client

^C


In [2]:
from rdflib import Graph
from typing import Any, List, Optional
import getpass
import os

from langchain.prompts.prompt import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import format_document
from langchain_openai import OpenAI
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
# Ask for the key interactively only when the environment does not already hold a
# usable one. An empty or whitespace-only value is treated as missing, because it
# would otherwise be accepted here and only fail later when the API is called.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Provide your OpenAI API Key")

In [3]:
# SPARQL query executed by OntologyLoader.load().
# Despite the name, it is not limited to classes: it matches every resource that has
# an rdf:type (?type) and a schema:name value (?label). ?predicate is always
# schema:name because of the FILTER. A resource with several types or several names
# yields one row per (type, name) combination.
extract_classes_query = """
PREFIX schema: <http://schema.org/>

SELECT ?uri ?predicate ?label ?type
WHERE {
    ?uri a ?type;
        ?predicate ?label.
    FILTER (
        ?predicate = schema:name
    )
}"""

class OntologyLoader(BaseLoader):
    """Expose the rows of ``extract_classes_query`` over an RDF source as documents.

    The source is parsed once, at construction time, into an rdflib ``Graph``
    that uses the Oxigraph store. ``load`` then queries that graph and wraps
    every result row in a ``Document``.
    """

    def __init__(self, ontology_url: str, format: Optional[str] = None):
        """Parse ``ontology_url`` (optionally forcing ``format``) into ``self.graph``."""
        self.ontology_url = ontology_url
        self.format = format
        # Parsing happens eagerly here, so a bad path or format fails at construction.
        rdf_graph = Graph(store="Oxigraph")
        rdf_graph.parse(source=ontology_url, format=format)
        self.graph = rdf_graph

    def load(self) -> List[Document]:
        """Run ``extract_classes_query`` and return one document per result row."""
        return [
            self._create_document(row)
            for row in self.graph.query(extract_classes_query)
        ]

    def _create_document(self, result_row: Any) -> Document:
        """Build a ``Document`` whose text is the row's label.

        The remaining query variables, plus the ontology source, are stored as
        string metadata.
        """
        label = str(result_row.label)
        # NOTE: you can include more metadata retrieved by the SPARQL query here
        metadata = {
            "label": label,
            "uri": str(result_row.uri),
            "type": str(result_row.type),
            "predicate": str(result_row.predicate),
            "ontology": self.ontology_url,
        }
        return Document(page_content=label, metadata=metadata)

In [4]:
# Settings a reader may want to tune, gathered in one place.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBEDDING_MAX_LENGTH = 512
ONTOLOGY_PATH = "football_data.ttl"
ONTOLOGY_FORMAT = "ttl"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
COLLECTION_NAME = "ontologies"
QDRANT_LOCATION = ":memory:"
RETRIEVER_TOP_K = 20

# Embedding model used to index the ontology labels and to embed queries.
flag_embeddings = FastEmbedEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    max_length=EMBEDDING_MAX_LENGTH,
)

# Parse the Turtle file and turn every query result row into a document.
loader = OntologyLoader(ONTOLOGY_PATH, format=ONTOLOGY_FORMAT)
docs = loader.load()

# Split documents whose text exceeds the chunk size before indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

# Index the chunks in the Qdrant collection.
vectorstore = Qdrant.from_documents(
    splits,
    flag_embeddings,
    collection_name=COLLECTION_NAME,
    location=QDRANT_LOCATION,
)

# The retriever returns the top-k most similar concepts for a query.
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})
llm = OpenAI(temperature=0)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [12]:
def reformulate_question(question: str, llm, max_tokens: int = 200) -> str:
    """
    Reformulate the question to include necessary contextual information.

    Args:
        question: The user's original question.
        llm: Object exposing ``generate(prompts, max_tokens=...)`` whose result
            holds the completion at ``.generations[0][0].text``.
        max_tokens: Completion length limit forwarded to ``llm.generate``.

    Returns:
        The reformulated question without surrounding whitespace, or the
        original ``question`` if the model returned a blank completion.
    """
    reformulation_prompt = f"""
    Reformulate the question such that it is clearer. Only reformulate the question if the original question is not specific enough to answer.
    Question: {question}
    Standalone question:"""
    result = llm.generate([reformulation_prompt], max_tokens=max_tokens)
    # Completions typically start with a space or newline; strip it so the text
    # used as the retrieval query (and printed to the user) is clean.
    reformulated_question = result.generations[0][0].text.strip()
    # Never hand an empty query to the retriever: fall back to the original wording.
    return reformulated_question or question

def generate_answer(context: str, question: str, llm, max_tokens: int = 200) -> str:
    """
    Generate an answer using the retrieved documents as context.

    Args:
        context: Text of the retrieved documents, already formatted for the prompt.
        question: The question to answer from ``context``.
        llm: Object exposing ``generate(prompts, max_tokens=...)`` whose result
            holds the completion at ``.generations[0][0].text``.
        max_tokens: Completion length limit forwarded to ``llm.generate``.

    Returns:
        The model's completion with surrounding whitespace removed.
    """
    answer_prompt = f"""
    Answer the question by only giving the requested information. If you cannot answer the question, say so. Do not use any information outside this context:
    {context}

    Question: {question}
    """
    result = llm.generate([answer_prompt], max_tokens=max_tokens)
    # The completion usually begins with a newline and indentation echoing the
    # prompt layout; strip it so callers get just the answer text.
    return result.generations[0][0].text.strip()

# Format how the ontology concepts are passed as context to the LLM.
# {page_content} is the document text; {uri}, {type}, {predicate} and {ontology}
# are the metadata keys written by OntologyLoader._create_document, so documents
# from other sources must provide the same metadata keys to be formatted with it.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
    template="Concept label: {page_content} | URI: {uri} | Type: {type} | Predicate: {predicate} | Ontology: {ontology}"
)
def combine_documents(docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
    """Render each document with ``document_prompt`` and join the results.

    Args:
        docs: Documents to render, in the order they should appear.
        document_prompt: Prompt template applied to every document.
        document_separator: String placed between consecutive rendered documents.

    Returns:
        A single string holding all rendered documents.
    """
    return document_separator.join(
        format_document(document, document_prompt) for document in docs
    )

def ask_question(question: str, retriever, llm) -> str:
    """
    Ask a question to the RAG system and return the generated answer.

    The question is first reformulated by the LLM, the reformulated text is used
    to retrieve documents, and those documents form the only context the LLM may
    use to answer. The reformulated question is printed as a side effect.

    Args:
        question: The user's question.
        retriever: LangChain retriever queried with the reformulated question.
        llm: Language model used for both reformulation and answering.

    Returns:
        The answer text produced by ``generate_answer``.
    """
    reformulated_question = reformulate_question(question, llm)
    print("Reformulated question:", reformulated_question)
    # `invoke` is the Runnable entry point of retrievers; it replaces the
    # deprecated `get_relevant_documents` and returns the same list of documents.
    docs = retriever.invoke(reformulated_question)
    context = combine_documents(docs)
    return generate_answer(context, reformulated_question, llm)


# Question Answering

#### Which Brazilian players have played for a Premier League team and also won a trophy?

In [13]:
# Run the full pipeline (reformulate -> retrieve -> answer) and show the answer text.
question = "Which Brazilian players have played for a Premier League team and also won a trophy?"
answer = ask_question(question, retriever, llm)
print(answer)

Reformulated question:  Can you provide a list of Brazilian players who have both played for a Premier League team and won a trophy?

    Answer: I cannot answer this question as it requires information outside of the given context.


#### How many goals were scored by each Argentinian player, individually?

In [14]:
# Aggregation question: the retriever only supplies concept labels and metadata as
# context, so the LLM has to answer from those alone.
question = "How many goals were scored by each argentinian player, individually?"
answer = ask_question(question, retriever, llm)
print(answer)

Reformulated question:  Can you provide a breakdown of the number of goals scored by each individual Argentinian player?

No, this information is not provided in the given context.


In [15]:
# Query the parsed graph directly: total ex:goals per player name, restricted to
# schema:Person resources whose schema:nationality is wd:Q414.
# NOTE(review): wd:Q414 is presumably the Wikidata entity for Argentina - confirm.
r = loader.graph.query('''
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX schema: <http://schema.org/>
PREFIX ex: <http://example.org/>

SELECT ?playerName (SUM(?goals) AS ?totalGoals)
WHERE {
  ?player a schema:Person ;
          schema:name ?playerName ;
          schema:nationality wd:Q414.
  
  ?goal ex:player ?player ;
        ex:goals ?goals .
}
GROUP BY ?playerName
''')
# Each result row holds the two projected values, in SELECT order.
for player_name, total_goals in r:
    print(player_name, total_goals)

Sergio Agüero 1
Javier Zanetti 1
Esteban Cambiasso 1
Mario Kempes 3
Pablo Aimar 1
Roberto Ayala 1
David Trezeguet 11
Maxi Rodriguez 2
Mauro Camoranesi 2
Omar Sívori 1


#### Identify Argentine players who have won the FIFA World Cup, played for FC Barcelona, and have been awarded the Ballon d'Or.


In [16]:
# Multi-condition question sent through the same reformulate -> retrieve -> answer pipeline.
question = "Identify Argentine players who have won the FIFA World Cup, played for FC Barcelona, and have been awarded the Ballon d'Or."
answer = ask_question(question, retriever, llm)
print(answer)

Reformulated question:  Which Argentine players have achieved the trifecta of winning the FIFA World Cup, playing for FC Barcelona, and being awarded the Ballon d'Or?

    Answer: Lionel Messi


#### List English goalkeepers who have represented their national team in the UEFA European Championship and have also played for clubs in the Serie A.

In [17]:
# Multi-condition question sent through the same reformulate -> retrieve -> answer pipeline.
question = "List English goalkeepers who have represented their national team in the UEFA European Championship and have also played for clubs in the Serie A."
answer = ask_question(question, retriever, llm)
print(answer)

Reformulated question:  Which English goalkeepers have played for both their national team in the UEFA European Championship and clubs in the Serie A?

    Answer: Joe Hart, David James, and Paul Robinson.


#### Find French midfielders who have won the UEFA European Championship, played alongside Lionel Messi at club level, and have won the UEFA Champions League.

In [18]:
# Multi-condition question sent through the same reformulate -> retrieve -> answer pipeline.
question = "Find French midfielders who have won the UEFA European Championship, played alongside Lionel Messi at club level, and have won the UEFA Champions League."
answer = ask_question(question, retriever, llm)
print(answer)

Reformulated question:  Which French midfielders have won both the UEFA European Championship and the UEFA Champions League, and have also played alongside Lionel Messi at the club level?

Answer: Thierry Henry
