In [1]:
import sys
import os 

sys.path.append(os.path.dirname(os.getcwd()))
from Augur.rag import GraphRAG, SparQLRAG
from Augur.agent_templates import StanzaTaggingAgent, GenerativeTaggingAgent
import pandas as pd

EMB_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"

df_train = pd.read_json("../datafiles/test-data.json")

ontology_files = ['../datafiles/dbpedia_2016-10_extended.owl']

ontology_db = GraphRAG(EMB_MODEL_ID, ontology_files)   
queries_db = SparQLRAG(
    EMB_MODEL_ID, 
    df_train["corrected_question"].to_list(), 
    df_train["sparql_query"].to_list()
)              

In [2]:
from rdflib import Graph

dbpedia_graph = Graph().parse('../datafiles/dbpedia_2016-10_extended.owl', format='xml')
dbpedia_ids = {str(element) for triple in dbpedia_graph for element in triple if 'http://dbpedia.org/' in element}

In [3]:
# model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
model_id = "deepseek-ai/deepseek-coder-6.7b-instruct"
#model_id = 'codellama/CodeLlama-13b-Instruct-hf'
#agent = StanzaTaggingAgent(model_id)
agent = GenerativeTaggingAgent(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
agent(' Name the river with source as Columbia Lake and river mouth is located in Clatsop Country ?')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] ###You are a helpful, respectful and honest assistant expert coding,  ontologies and semantic web. ONLY perform the instruction requested, in a single JSON. DO enclose the code in a code block:
```json
code
```
### EXAMPLE
# Extract possible classes, properties, and relations. Write a JSON with the names and succint generic descriptions of them: What color is the cup?

Let's think step by step to map POS tags to ontology elements, let's start by identifying the roles of nouns, verbs, adjectives, and compound names within a sentence.

1. Nouns usually represent classes or instances in an ontology. For example, if we see the noun "cup," we can think of it as either an instance of a class Cup or the class itself in our ontology. This helps us understand what entities we're dealing with.
2. Verbs and Adjectives play a crucial role in identifying properties. Verbs often indicate actions or relations between entities, suggesting object properties in an ontology. For instance, the 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] ###You are a helpful, respectful and honest assistant expert coding,  ontologies and semantic web. ONLY perform the instruction requested, in a single JSON. DO enclose the code in a code block:
```json
code
```
### EXAMPLE
# Extract possible classes, properties, and relations. Write a JSON with the names and succint generic descriptions of them: What color is the cup?

Let's think step by step to map POS tags to ontology elements, let's start by identifying the roles of nouns, verbs, adjectives, and compound names within a sentence.

1. Nouns usually represent classes or instances in an ontology. For example, if we see the noun "cup," we can think of it as either an instance of a class Cup or the class itself in our ontology. This helps us understand what entities we're dealing with.
2. Verbs and Adjectives play a crucial role in identifying properties. Verbs often indicate actions or relations between entities, suggesting object properties in an ontology. For instance, the 

{'predicted_ids': {'River': "A 'River' is a class within the ontology representing a natural body of water that flows through a valley or between hills.",
  'Source': "The 'Source' property is a data property within the ontology that links instances of the 'River' class to their source, which is represented as a string value describing the source of the river.",
  'Columbia Lake': "The 'Columbia Lake' is a class within the ontology representing a lake that is the source of the river.",
  'Clatsop Country': "The 'Clatsop Country' is a class within the ontology representing a region where the river mouth is located."},
 'predicted_names': ['Columbia Lake', 'Clatsop Country']}

In [4]:
from Augur import db_endpoint
from rdflib import URIRef, Graph, RDFS, Literal

def search_resource(name):

    query = (
        "SELECT ?s ?o  "
        "WHERE {"
        " ?s <http://www.w3.org/2000/01/rdf-schema#label> ?o ."
        f" ?o bif:contains '\"{name}\"'@en"
        "} LIMIT 5" 
        )

    _output = db_endpoint.send_consult_json(query)
    _output = _output["results"]["bindings"]

    ids= set()
    g = Graph()
    for result in _output:
        if 'Category:' in result["s"]["value"]: continue
        resource_uri = URIRef(result["s"]["value"])
        label = Literal(result["o"]["value"])
        g.add([resource_uri, RDFS.label, label])
        ids.add(str(resource_uri))
    
    return ids
    



In [11]:
search_resource('Clatsop')

{'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Clatsop_County,_Oregon'}, 'o': {'type': 'literal', 'xml:lang': 'en', 'value': 'Clatsop County, Oregon'}}


{'http://dbpedia.org/resource/Clatsop_County,_Oregon'}

In [4]:
test_df = df_train.sample(100, random_state=42)

In [5]:
test_df = test_df.reset_index()

In [6]:
input_text = agent(test_df.corrected_question[4])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [7]:
input_text

{'Karakuri Dji Ultimo': ' Karakuri Dji Ultimo is a type of robotic arm that is capable of performing a variety of tasks, including picking up and moving objects, and is often used in industrial and manufacturing settings.',
 'author': ' "The author of a work, such as a book, article, or piece of music.'}

In [8]:
df_train.corrected_question[20]

'Who built the stadium which was rented for the 2013 Copa Centroamericana ?'

In [9]:
query = list(input_text.values())
output = ontology_db.process_query(query, max_k=2)
graph  = Graph().parse(data=output, format='turtle')

In [10]:
print(graph.serialize(format='turtle'))

@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://dbpedia.org/ontology/author> rdfs:comment "An author is a person who creates literature or other artistic works. In DBpedia, this identifier represents the RDF type for authors."@en .

<http://dbpedia.org/ontology/lunarRover> rdfs:comment "lunarRover\": A type of robotic vehicle used for exploration of the Moon's surface, enabling scientists to gather data and conduct experiments in hazardous or inaccessible environments."@en .




In [6]:
test = {str(x) for triplet in graph for x in triplet if x.startswith('http://dbpedia.org')}

NameError: name 'graph' is not defined

In [5]:
import re

pattern = r'<(http[s]?://(?!dbpedia\.org/resource/)[^ >]*dbpedia\.org/[^ >]*)>'
pattern_with_resources =  r'<(http[s]?://dbpedia\.org/[^ >]*)>'
pattern_resources =  r'<(http[s]?://dbpedia\.org/resource/[^ >]*)>'

df_train['identifiers'] = df_train.sparql_query.apply( lambda x : set(re.findall(pattern, x)))
df_train['identifiers_resource'] = df_train.sparql_query.apply( lambda x : set(re.findall(pattern_with_resources, x)))
df_train['resources'] = df_train.sparql_query.apply( lambda x : set(re.findall(pattern_resources, x)))

In [12]:
df_train['resources']

0      {http://dbpedia.org/resource/Marine_Corps_Air_...
1       {http://dbpedia.org/resource/Muslim_Brotherhood}
2      {http://dbpedia.org/resource/Google_Web_Toolki...
3      {http://dbpedia.org/resource/Sam_Loyd, http://...
4      {http://dbpedia.org/resource/Pizza, http://dbp...
                             ...                        
995    {http://dbpedia.org/resource/Judaism, http://d...
996    {http://dbpedia.org/resource/American_Hockey_L...
997    {http://dbpedia.org/resource/Easy_Street_(film...
998       {http://dbpedia.org/resource/Pietermaritzburg}
999           {http://dbpedia.org/resource/Richard_Coke}
Name: resources, Length: 1000, dtype: object

In [6]:
def ids_resource_matches(text, agent, gold_std):
    predictions = agent(text)
    ids = set()
    for match in predictions['predicted_names']:
        ids |= search_resource(match)
    matches = gold_std['resources'] & ids
    return (len(matches)/len(gold_std['resources']))


In [15]:
def ids_matches_count(text, agent, rag_db, gold_std):
    predictions = agent(text)
    query = list(predictions.values())
    query.append(text)
    output = rag_db.process_query(query, max_k=30)
    retrieved_graph  = Graph().parse(data=output, format='turtle')
    retrieved_ids = {str(x) for triplet in retrieved_graph for x in triplet if x.startswith('http://dbpedia.org')}
    return len(retrieved_ids & gold_std)

In [16]:
def ids_matches_count_noagent(text, rag_db, gold_std):
    output = rag_db.process_query(text, max_k=30)
    retrieved_graph  = Graph().parse(data=output, format='turtle')
    retrieved_ids = {str(x) for triplet in retrieved_graph for x in triplet if x.startswith('http://dbpedia.org')}
    return len(retrieved_ids & gold_std)

In [17]:
def ids_matches_count_with_fs(text, agent, rag_db, rag_fs, gold_std):
    predictions = agent(text)
    query = list(predictions.values())
    output = rag_db.process_query(query, max_k=30)
    retrieved_graph  = Graph().parse(data=output, format='turtle')
    retrieved_ids = {str(x) for triplet in retrieved_graph for x in triplet if x.startswith('http://dbpedia.org')}

    fs_rag_output = rag_fs.process_query(text)
    retrieved_ids_fs = {x for ids in fs_rag_output for x in set(re.findall(pattern, ids['metadata']['consult']))}
    
    retrieved_ids |= retrieved_ids_fs
    return len(retrieved_ids & gold_std)

In [18]:
def ids_matches_count_fs(text,rag_fs, gold_std):
    fs_rag_output = rag_fs.process_query(text)
    retrieved_ids = {x for ids in fs_rag_output for x in set(re.findall(pattern, ids['metadata']['consult']))}
    return len(retrieved_ids & gold_std)

In [19]:
test_df = df_train.sample(100, random_state=42).reset_index()

In [20]:
dbpedia_ids

{'http://dbpedia.org/property/currentRegimentCommander',
 'http://dbpedia.org/ontology/cannonNumber',
 'http://dbpedia.org/property/row142Color',
 'http://dbpedia.org/ontology/signName',
 'http://dbpedia.org/ontology/electionDateLeader',
 'http://dbpedia.org/ontology/MouseGene',
 'http://dbpedia.org/ontology/Relationship',
 'http://dbpedia.org/property/judge1Score8Boxa',
 'http://dbpedia.org/property/displayAuthors',
 'Represents the portion or part of a larger whole."\n\nExplanation: The <http://dbpedia.org/property/portion> property is used to describe the relationship between a part and the whole. It indicates the size or extent of the part within the larger context. For example, "The Statue of Liberty is a portion of New York City." or "The engine is a portion of a car.',
 'http://dbpedia.org/property/majorHighways',
 'http://dbpedia.org/ontology/literaryGenre',
 'http://dbpedia.org/property/dns',
 'http://dbpedia.org/property/last189',
 'http://dbpedia.org/property/rd1Score07secon

In [21]:
sum(len(dbpedia_ids & x) for x in test_df['identifiers'].to_list())

198

In [22]:
ids_matches_count_with_fs(test_df.corrected_question[18], agent, ontology_db, queries_db, test_df.identifiers[18])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

1

In [35]:
from tqdm.notebook import tqdm
tqdm.pandas()
result = df_train.sample(100, random_state=42).progress_apply(lambda x: ids_matches_count_fs(x['corrected_question'], queries_db, x['identifiers']), axis=1)

  0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
from tqdm.notebook import tqdm
tqdm.pandas()
result = df_train.sample(100, random_state=42).progress_apply(lambda x: ids_matches_count(x['corrected_question'],agent, ontology_db, x['identifiers']), axis=1)

  0%|          | 0/100 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attentio

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

result = df_train.sample(10, random_state=42).progress_apply(lambda x: ids_resource_matches(x['corrected_question'], agent, x), axis=1)

In [8]:
result

521    1.0
737    0.0
740    0.0
660    0.0
411    1.0
678    1.0
626    1.0
513    1.0
859    1.0
136    0.0
dtype: float64

In [17]:
df_train

Unnamed: 0,_id,corrected_question,intermediary_question,sparql_query,sparql_template_id,identifiers,identifiers_resource,resources
0,1701,Which architect of Marine Corps Air Station Ka...,What is the <architect> of the <Marine Corps A...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,16,"{http://dbpedia.org/property/architect, http:/...",{http://dbpedia.org/resource/Marine_Corps_Air_...,{http://dbpedia.org/resource/Marine_Corps_Air_...
1,3293,Which sects people are part of local political...,What is the <faith> of the <political parties>...,SELECT DISTINCT ?uri WHERE { ?x <http://dbpedi...,305,"{http://dbpedia.org/property/international, ht...","{http://dbpedia.org/property/international, ht...",{http://dbpedia.org/resource/Muslim_Brotherhood}
2,2161,List common systems are developed by of the Go...,What is the <developed by> of the <Google Vide...,SELECT DISTINCT ?uri WHERE { <http://dbpedia....,16,"{http://dbpedia.org/ontology/author, http://db...","{http://dbpedia.org/ontology/author, http://db...",{http://dbpedia.org/resource/Google_Web_Toolki...
3,1136,For which games are Sam Loyd and Eric Schiller...,What is the <known for> of the <Sam Loyd> and ...,SELECT DISTINCT ?uri WHERE { <http://dbpedia.o...,15,{http://dbpedia.org/ontology/knownFor},"{http://dbpedia.org/ontology/knownFor, http://...","{http://dbpedia.org/resource/Sam_Loyd, http://..."
4,987,Is Peter Piper Pizza in the pizza industry?,Is <Peter Piper Pizza> the <field of exercise>...,ASK WHERE { <http://dbpedia.org/resource/Peter...,152,{http://dbpedia.org/ontology/industry},"{http://dbpedia.org/resource/Pizza, http://dbp...","{http://dbpedia.org/resource/Pizza, http://dbp..."
...,...,...,...,...,...,...,...,...
995,4426,Who is venerated in Judaism and Islam?,What are the <saints> whose <venerateds in> ar...,SELECT DISTINCT ?uri WHERE { ?uri <http://dbpe...,7,{http://dbpedia.org/ontology/veneratedIn},"{http://dbpedia.org/resource/Judaism, http://d...","{http://dbpedia.org/resource/Judaism, http://d..."
996,466,Count the number teams which have former ice ...,What is the total number of other <former team...,SELECT DISTINCT COUNT(?uri) WHERE { ?x <http:...,111,{http://dbpedia.org/ontology/formerTeam},"{http://dbpedia.org/ontology/formerTeam, http:...",{http://dbpedia.org/resource/American_Hockey_L...
997,3092,Name the common editor of Easy Street (film) a...,Who is the <edited by> of the <Easy Street (fi...,SELECT DISTINCT ?uri WHERE { <http://dbpedia.o...,15,{http://dbpedia.org/ontology/editing},"{http://dbpedia.org/ontology/editing, http://d...",{http://dbpedia.org/resource/Easy_Street_(film...
998,3772,List all the schools of the rugby player whose...,What are the other <school> of the <rugby play...,SELECT DISTINCT ?uri WHERE { ?x <http://dbpedi...,11,{http://dbpedia.org/property/school},"{http://dbpedia.org/resource/Pietermaritzburg,...",{http://dbpedia.org/resource/Pietermaritzburg}


In [24]:
sum(result.values)

80

In [20]:
sum(result.to_list())

83

In [22]:
sum(result.to_list())

56

In [22]:
sum(result.to_list())

86

In [18]:
df_train['identifiers'][20]

{'http://dbpedia.org/ontology/Stadium',
 'http://dbpedia.org/ontology/builder',
 'http://dbpedia.org/ontology/tenant',
 'http://dbpedia.org/resource/2013_Copa_Centroamericana'}

In [19]:
df_train.iloc[20]

_id                                                                    926
corrected_question       Who built the stadium which was rented for the...
intermediary_question    What is the <builder> of the <stadium> whose <...
sparql_query             SELECT DISTINCT ?uri WHERE { ?x <http://dbpedi...
sparql_template_id                                                     305
identifiers              {http://dbpedia.org/ontology/tenant, http://db...
Name: 20, dtype: object

In [20]:
test & df_train.identifiers[20]

{'http://dbpedia.org/ontology/Stadium', 'http://dbpedia.org/ontology/builder'}

In [48]:
for s,p,o in dbpedia_graph:
    if 'resource' in p:
        print(o)

In [26]:
gold_standard.output[0]

"{'head': {'link': [], 'vars': ['uri']}, 'results': {'distinct': False, 'ordered': True, 'bindings': [{'uri': {'type': 'uri', 'value': 'http://dbpedia.org/resource/United_States_Navy'}}]}}"

In [51]:
gold_standard.sparql_query[1]

'SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/international> <http://dbpedia.org/resource/Muslim_Brotherhood> . ?x <http://dbpedia.org/ontology/religion> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/PoliticalParty>}'

In [55]:
df_fs = df_train.iloc[:6][['corrected_question', 'sparql_query']]

In [59]:
df_ask = df_train[['corrected_question', 'sparql_query']][df_train.apply(lambda x: 'ASK' in x['sparql_query'], axis=1)].iloc[:2]

In [67]:
FEW_SHOT = (
        "### Some example user requests and corresponding SparQL queries are provided "
        "based on similar problems:\n"
    )
FEW_SHOT_TEMPLATE = "# Write the SparQL code that retrieves the answer to this request: {question}\n\n{consult}\n<|EOT|>\n\n"

In [68]:
fs_static = pd.concat([df_fs, df_ask]).reset_index(drop=True)

In [75]:
cadena = [FEW_SHOT]
for a , b in fs_static.iterrows():
    cadena.append(FEW_SHOT_TEMPLATE.format(question=b['corrected_question'], consult=b['sparql_query']))

In [79]:
''.join(cadena)

"### Some example user requests and corresponding SparQL queries are provided based on similar problems:\n# Write the SparQL code that retrieves the answer to this request: How many movies did Stanley Kubrick direct?\n\nSELECT DISTINCT COUNT(?uri) WHERE {?uri <http://dbpedia.org/ontology/director> <http://dbpedia.org/resource/Stanley_Kubrick>  . }\n<|EOT|>\n\n# Write the SparQL code that retrieves the answer to this request: Which city's foundeer is John Forbes?\n\nSELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/ontology/founder> <http://dbpedia.org/resource/John_Forbes_(British_Army_officer)>  . ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/City>}\n<|EOT|>\n\n# Write the SparQL code that retrieves the answer to this request: What is the river whose mouth is in deadsea?\n\nSELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/ontology/riverMouth> <http://dbpedia.org/resource/Dead_Sea>  . ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <h