In [2]:
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append('../Augur')
from transformers import (AutoConfig, 
                          AutoModelForCausalLM, 
                          AutoTokenizer, 
                          TextIteratorStreamer, 
                          BitsAndBytesConfig, 
                          TextStreamer,
                          Conversation, 
                          pipeline,
                          LlamaForCausalLM)
import torch

from Augur import model

In [3]:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [3]:
def load_quant_model(model_id):
    # This requires the `bitsandbytes` library
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model_config = AutoConfig.from_pretrained(model_id)

    _model = model.AugurLlamaModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    return _model, tokenizer

In [4]:
models, tokenizer = load_quant_model('codellama/CodeLlama-13b-Instruct-hf')

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Beam search

In [5]:
output = models.generate(do_sample=True, num_beams=3, num_beam_groups=1, num_return_sequences=3, diversity_penalty=0.0)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [7]:
pipe = pipeline('text-generation', model=models, tokenizer=tokenizer)

The model 'AugurLlamaModel' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalL

In [5]:
answer = tokenizer.decode(tokenizer.apply_chat_template([{'role':'user', 'content':'generate the dbpedia sparql code that answer the following question: retrieve all the authors of the book harry potter'}]))

# Sampling

In [14]:
streamer = TextStreamer(tokenizer=tokenizer)

In [17]:
pipe = pipeline('conversational', model=models, tokenizer=tokenizer, streamer=streamer)

In [None]:
from rdflib import Graph
import re

In [220]:
def get_queries(prompt, temperature=1.0, n_times=4):
    pipe_output = []
    for _ in range(n_times):
        conver = Conversation()
        system = ("You are a helpful, respectful and honest assistant expert coding"
                  " in SparQL, ontologies and semantic web. Always DO answer the code in SparQL that retrieves the "
                  "information asked. DO ALWAYS return a query. DO enclose the code in a code block:\n```\ncode\n```\n).\n"
                 )  
        conver.add_message({'role':'system', 'content': system})
        conver.add_user_input(f"# Write the SparQL code that retrieves the answer to ONLY this request: {prompt}.\n\n")
        pipe_output.append(pipe(conver, max_length=1000, do_sample=True, temperature=temperature).generated_responses[-1])
    return pipe_output

In [73]:
def capture_code(text): 
    code_pattern = r"```(?:sparql|SPARQL)?(.*?)```"
    code = re.findall(code_pattern, text, re.DOTALL)
    return code[0] if code else "None"

In [128]:
# Function to replace prefixed names with their full URIs
def replace_prefixed_names_with_uris(query):
    if not re.search("PREFIX", query):
        return query
    # Extract prefixes and their URIs
    prefix_pattern = r"\s*PREFIX\s*?([a-zA-Z0-9_\-]+):\s*?<([^>]+)>"
    prefixes = dict(re.findall(prefix_pattern, query))
    
    # Pattern to match prefixed names (simple version, might need refinement)
    prefixed_name_pattern = r'\b(' + '|'.join(prefixes.keys()) + r'):([a-zA-Z0-9_\-]+)'

    # Function to replace match with full URI
    def repl(match):
        prefix, local_part = match.groups()
        return f"<{prefixes[prefix]}{local_part}>"

    # Replace all occurrences of prefixed names in the query
    query = re.sub(".*PREFIX.*\n+", "", query)
    return re.sub(prefixed_name_pattern, repl, query)

In [114]:
def extract_identifiers(resolved_query):
    uri_pattern = r'<(http[s]?://[^ >]*dbpedia\.org[^ >]*)>'
    
    # Use regex to find all URIs in the SPARQL query
    uris = re.findall(uri_pattern, resolved_query, re.I)

    return set(uris)

In [115]:
def parse_ontology(ontology_path):
    name = ontology_path.split("/")[-1].split("_")[0]
    return name

In [226]:
def count_common_elements(list_of_sets):
    num_sets = len(list_of_sets)
    common_elements_count = [0] * num_sets
    
    for i in range(num_sets):
        for j in range(num_sets):
            if i != j:  # Skip comparing the set with itself
                common_elements_count[i] += len(list_of_sets[i].intersection(list_of_sets[j]))
    return common_elements_count

In [266]:
def get_max_indexes(lst):
    if not lst:
        return []  # Return an empty list if input list is empty
    max_val = max(lst)
    return [i for i, val in enumerate(lst) if val == max_val]

In [289]:
def self_consistency(queries_list, ontology_path):
    ontology = Graph().parse(ontology_path)
    ontology = {str(element) for triple in ontology for element in triple if 'http://dbpedia.org/' in element}
    
    captured_code = [capture_code(query) for query in queries_list]
    parsed_code = [replace_prefixed_names_with_uris(query) for query in captured_code]
    extracted_identifiers = [extract_identifiers(code) for code in parsed_code]

    agreement = count_common_elements(extracted_identifiers)
    max_index = get_max_indexes(agreement)
    if max_index != 1:
        candidates = [extracted_identifiers[idx] for idx in max_index]
        consistency = [len(query_uri & ontology) for query_uri in candidates]
        max_index = get_max_indexes(consistency)
        if max_index != 1:
            return captured_code[random.choice(max_index)]
    return captured_code[max_index]

In [116]:
ontology_path = '../datafiles/dbpedia_2016-10.owl'

In [195]:
user_prompt = 'Which sects people are part of local political parties which fall under the International Muslim Brotherhood?'

In [239]:
user_prompt = 'Give me all the names of the authors of the Harry Potter books on dbpedia'

In [240]:
pipe_outputs = get_queries(user_prompt, temperature=1, n_times=4)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant expert coding in SparQL, ontologies and semantic web. Always DO answer the code in SparQL that retrieves the information asked. DO ALWAYS return a query. DO enclose the code in a code block:
```
code
```
).

<</SYS>>

# Write the SparQL code that retrieves the answer to ONLY this request: Give me all the names of the authors of the Harry Potter books on dbpedia. [/INST]  ```
PREFIX dbpedia: <http://dbpedia.org/ontology/>

SELECT ?author_name
WHERE {
 ?book dbpedia:author ?author .
 ?author dbpedia:name ?author_name .
 ?book dbpedia:title "Harry Potter" .
}
```
This SPARQL query uses the DBpedia ontology to retrieve the information about Harry Potter books. The query searches for books that have "Harry Potter" in their title, and then retrieves the name of each author as an output.</s>


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant expert coding in SparQL, ontologies and semantic web. Always DO answer the code in SparQL that retrieves the information asked. DO ALWAYS return a query. DO enclose the code in a code block:
```
code
```
).

<</SYS>>

# Write the SparQL code that retrieves the answer to ONLY this request: Give me all the names of the authors of the Harry Potter books on dbpedia. [/INST]  ```sparql
SELECT ?authorName
WHERE {
 ?book dbpedia-owl:author ?author .
 ?author rdfs:label ?authorName .
 FILTER (?book = dbpedia:Harry_Potter)
}
```
This SparQL query retrieves the names of the authors of the Harry Potter books from DBpedia using the dbpedia-owl ontology. The query filters the results to only include authors of the Harry Potter books, which are denoted by the URI `dbpedia:Harry_Potter`. The `rdfs:label` property is used to retrieve the name of each author. The resulting query returns a list of the names of the authors of the Harry

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant expert coding in SparQL, ontologies and semantic web. Always DO answer the code in SparQL that retrieves the information asked. DO ALWAYS return a query. DO enclose the code in a code block:
```
code
```
).

<</SYS>>

# Write the SparQL code that retrieves the answer to ONLY this request: Give me all the names of the authors of the Harry Potter books on dbpedia. [/INST]  ```
PREFIX dbpediaOnt: <http://dbpedia.org/ontology/>
SELECT ?authorName
WHERE {
 ?book rdf:type dbpediaOnt:Book ;
       dbpediaOnt:author ?author .
 ?author rdf:type dbpediaOnt:Person ;
         foaf:name ?authorName .
 FILTER (str(?book) = "http://dbpedia.org/ontology/Book")
}
```
This code retrieves all the names of the authors of the Harry Potter books on dbpedia. It uses the `dbpediaOnt:Book` and `dbpediaOnt:author` properties to filter the results, and the `foaf:name` property to retrieve the author names. The `FILTER` function is used to ensu

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] <<SYS>>
You are a helpful, respectful and honest assistant expert coding in SparQL, ontologies and semantic web. Always DO answer the code in SparQL that retrieves the information asked. DO ALWAYS return a query. DO enclose the code in a code block:
```
code
```
).

<</SYS>>

# Write the SparQL code that retrieves the answer to ONLY this request: Give me all the names of the authors of the Harry Potter books on dbpedia. [/INST]  ```
PREFIX dbpedia: <http://dbpedia.org/ontology/>
SELECT DISTINCT ?author
WHERE {
 ?book dbpedia:bookAuthor ?author .
 VALUES ?title { "Harry Potter and the Philosopher's Stone"@en "Harry Potter and the Sorcerer's Stone"@en "Harry Potter and the Prisoner of Azkaban"@en "Harry Potter and the Goblet of Fire"@en "Harry Potter and the Order of the Phoenix"@en "Harry Potter and the Half-Blood Prince"@en "Harry Potter and the Deathly Hallows"@en "Harry Potter and the Cursed Child"@en "Harry Potter and the Chamber of Secrets"@en "Harry Potter and the Sorce

# Parse SPARQL

In [288]:
self_consistency(pipe_outputs, ontology_path)

('\nSELECT?authorName\nWHERE {\n ?book dbpedia-owl:author?author.\n ?author rdfs:label?authorName.\n  FILTER (?book = dbpedia:Harry_Potter)\n}\n',
 [0, 1])

In [268]:
import random

2

In [283]:
random.choice(get_max_indexes(test))

0

# Tests

In [217]:
print(pipe_outputs[3])

 Here is the SparQL code that retrieves the information you requested:
```
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
PREFIX dbpprop: <http://dbpedia.org/property/>

SELECT?party?participant
WHERE {
 ?party a dbpedia-owl:PoliticalParty.
 ?party dbpprop:parentOrganization?organization.
 ?organization a dbpedia-owl:InternationalMuslimBrotherhood.
 ?participant a dbpedia-owl:Person.
 ?participant dbpprop:memberOf?party.
}
```
This query retrieves the following information:

* The name of the political party (`?party`)
* The name of the person who is a member of the party (`?participant`)

Note that this query assumes that the names of the political parties and the people are represented in the DBpedia ontology using the `dbpedia-owl:PoliticalParty` and `dbpedia-owl:Person` classes

In [172]:
[print(q) for q in pipe_outputs[-2:]]

 ```
PREFIX wikidata: <https://www.wikidata.org/wiki/Special:EntitySchemaText/Wikidata-Query>

SELECT DISTINCT?architect?tenant
WHERE {
 ?architect wdt:P569?tenant ; a wdt:P31/wdt:P279* wd:Q1401770.
 ?tenant wdt:P31 wd:Q2521085.
  FILTER(?architect!=?tenant)
}
```
This SparQL code retrieves the architect of Marine Corps Air Station Kaneohe Bay who was also a tenant of the New Sanno hotel. The `?architect` variable is bound to the architect of the air station, and the `?tenant` variable is bound to the tenant of the hotel. The query uses the `wdt:P569` property to get the architect of the air station, and the `wdt:P31/wdt:P279*` property to get the tenants of the hotel. The `FILTER` statement is used to exclude the architect from the results, as it is the same as the value of the `?architect` variable.
 This question can be answered using the DBpedia dataset, which has information about the architects and tenants of various buildings. Here's an example SparQL query that retrieves the in

[None, None]

In [163]:
ontology = Graph().parse(ontology_path)
ontology = {str(element) for triple in ontology for element in triple if 'http://dbpedia.org/' in element}

In [167]:
"http://dbpedia.org/ontology/architect" in ontology

True

In [170]:
"http://dbpedia.org/property/tenant" in ontology

False

In [171]:
"SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Marine_Corps_Air_Station_Kaneohe_Bay> <http://dbpedia.org/property/architect> ?uri. <http://dbpedia.org/resource/New_Sanno_Hotel> <http://dbpedia.org/ontology/tenant> ?uri}"

'SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Marine_Corps_Air_Station_Kaneohe_Bay> <http://dbpedia.org/property/architect> ?uri. <http://dbpedia.org/resource/New_Sanno_Hotel> <http://dbpedia.org/ontology/tenant> ?uri}'