# BMKG - Assignment 3

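This notebook aims to automatically generate a schema for any given knowledge graph (KG). In its current state it reads a schema description from a text file and uses an LLM to translate natural-language questions into SPARQL queries, which are then executed against the RDF graph.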

In [6]:
from operator import itemgetter
import getpass
import os

from typing import Any

from rdflib import Graph

from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain.memory import ConversationBufferMemory
from langchain_core.prompts import PromptTemplate, format_document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import get_buffer_string
from langchain_openai import OpenAI



In [7]:
# Prompt for the OpenAI API key only when the environment does not already define it.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Provide your OpenAI API Key")

1) Get the RDF graph

In [189]:
# Read the textual description of our vocabulary; it is later passed to the LLM as the schema.
with open('general_representation_of_our_vocabulary.txt', 'r') as schema_file:
    schema = schema_file.read()

# Parse the Turtle file from the Assignment 1 sources into an in-memory RDF graph.
graph = Graph().parse("../Assignment 1/src/graph_from_data_sources.ttl")

print("Number of triples: ", len(graph))

Number of triples:  178548


In [190]:
# LLM used by both steps of the chain below (question reformulation and SPARQL generation).
# temperature=0 requests the least random sampling, presumably so generated queries are repeatable.
llm = OpenAI(temperature=0)

In [194]:
from langchain_core.prompts import ChatPromptTemplate

# Conversation memory: each question/answer pair is saved here so that
# follow-up questions can be rephrased using the earlier turns.
memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)
# Add a "chat_history" key to the chain input, filled from the memory's "history" variable
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)

# Prompt to reformulate the question using the chat history
reform_template = """Given the following chat history and a follow up question,
rephrase the follow up question to be a standalone straightforward question, in its original language.
Do not answer the question! Just rephrase reusing information from the chat history.
Make it short and straight to the point.

Chat History:
{chat_history}
Follow up input:
{question}

Standalone question:
"""
REFORM_QUESTION_PROMPT = PromptTemplate.from_template(reform_template)

# Prompt asking the LLM to write a SPARQL query that answers the reformulated
# question, using only the vocabulary described in {schema}.
answer_template = """Write a valid SPARQL query to answer the question based only on the
following schema, do not use any information outside this schema:
{schema}

Only output the query, without the prefixes. You should not use any URI in the query, 
instead add relations to exploit information you are sure about.

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(answer_template)

In [195]:
# TODO: ask it to generate a schema as well?

def _echo_reformulated(state):
    """Print the standalone question produced by the first step and pass it on unchanged."""
    print("💭 Reformulated question:", state["reformulated_question"])
    return state["reformulated_question"]

# Step 1: rewrite the user question as a standalone question using the chat history
reformulated_question = {
    "reformulated_question": (
        {
            "question": itemgetter("question"),
            "chat_history": lambda state: get_buffer_string(state["chat_history"]),
        }
        | REFORM_QUESTION_PROMPT
        | llm
        | StrOutputParser()
    ),
}

# Step 2: feed the schema and the standalone question to the SPARQL-generation prompt
final_inputs = {
    "schema": lambda _state: schema,
    "question": _echo_reformulated,
}
answer = {
    "answer": final_inputs | ANSWER_PROMPT | llm,
}

# Full pipeline: load memory -> reformulate question -> generate the query
final_chain = loaded_memory | reformulated_question | answer

def stream_chain(final_chain, memory: ConversationBufferMemory, inputs: dict[str, str]) -> dict[str, Any]:
    """Stream the chain's answer to stdout, record the exchange in memory, and return it.

    Args:
        final_chain: Object whose ``stream(inputs)`` yields dict chunks; only
            chunks containing an ``"answer"`` key contribute to the result.
        memory: Conversation memory that receives the question/answer pair
            through ``save_context`` once streaming has finished.
        inputs: Chain input, e.g. ``{"question": "..."}``.

    Returns:
        A dict ``{"answer": <concatenation of all streamed answer pieces>}``.
    """
    answer_text = ""
    for piece in final_chain.stream(inputs):
        # Chunks from other steps (e.g. the reformulated question) carry no answer text
        if "answer" not in piece:
            continue
        token = piece["answer"]
        answer_text += token
        # Echo immediately so the answer appears progressively in the cell output
        print(token, end="", flush=True)

    # Persist the exchange so later questions can refer back to it
    memory.save_context(inputs, {"answer": answer_text})

    return {"answer": answer_text}

In [199]:
# Ask a first question; the generated SPARQL query is streamed below and kept in `output`.
output = stream_chain(
    final_chain,
    memory,
    {"question": "In which year was the highest CO2 concentration in Belgium?"},
)

💭 Reformulated question: What was the highest CO2 concentration in Belgium and in which year was it recorded?

SELECT ?year ?value
WHERE {
    ?observation ex:isAbout ?country .
    ?observation ex:measured gas:CO2 .
    ?observation ex:value ?value .
    ?observation ex:year ?year .
    ?country rdfs:label "Belgium"^^xsd:string .
}
ORDER BY DESC(?value)
LIMIT 1

In [200]:
import pandas as pd
from IPython.display import display, HTML
from pygments import highlight
from pygments.lexers import SparqlLexer
from pygments.formatters import HtmlFormatter

def run_query(graph, query):
    """Run a SPARQL query on ``graph`` and render the query and its results in the notebook.

    The syntax-highlighted query is displayed first, so it stays visible even
    when execution raises (handy for LLM-generated queries). The results are
    then rendered as an HTML table:

    * SELECT results: one column per query variable, one row per solution.
      The column headers are kept when there are no solutions, and unbound
      variables are shown as empty cells.
    * ASK results: a single ``ASK`` column holding the boolean answer.
    * Results without variables (``results.vars`` is None): rows are shown
      with pandas' default numbered columns.

    Args:
        graph: Object exposing ``query(query)``; in this notebook an rdflib ``Graph``.
        query: SPARQL query text.

    Returns:
        None. All output is produced through ``display``.

    Raises:
        Any exception raised by ``graph.query`` (e.g. for a malformed query).
    """
    # Display the SPARQL query before running it
    formatted_query = highlight(query, SparqlLexer(), HtmlFormatter(style='solarized-dark', full=True, nobackground=True))
    display(HTML(formatted_query))

    # Execute the SPARQL query
    results = graph.query(query)

    if getattr(results, "type", None) == "ASK":
        # ASK queries carry a single boolean instead of rows of bindings
        df = pd.DataFrame({"ASK": [bool(results.askAnswer)]})
    else:
        # Without variables there are no headers to show; let pandas number the columns
        columns = [str(var) for var in results.vars] if results.vars is not None else None
        # Unbound variables come back as None; render them as empty cells, not the text "None"
        rows = [["" if item is None else str(item) for item in row] for row in results]
        df = pd.DataFrame(rows, columns=columns)

    # Display the DataFrame as a table in Jupyter Notebook
    display(HTML(df.to_html()))

In [201]:
# Show the raw generated query, then execute it against the graph and render the results.
generated_query = output["answer"]
print(generated_query)
run_query(graph, generated_query)


SELECT ?year ?value
WHERE {
    ?observation ex:isAbout ?country .
    ?observation ex:measured gas:CO2 .
    ?observation ex:value ?value .
    ?observation ex:year ?year .
    ?country rdfs:label "Belgium"^^xsd:string .
}
ORDER BY DESC(?value)
LIMIT 1


Unnamed: 0,year,value
0,1998,488.067140001
