In [7]:
from openai import OpenAI

import os

# Client for the Qwen model served over an OpenAI-compatible HTTP endpoint.
# NOTE(review): 'key' looks like a placeholder the server presumably ignores — confirm.
client_qwen = OpenAI(
    base_url='http://222.29.156.145:8000/v1/',
    api_key='key',
    timeout=30,
)

# Client for the hosted DeepSeek API.
# The secret is read from the environment so it never lands in the notebook or
# in version control; a missing variable fails fast with KeyError.
# SECURITY: the key that was previously hardcoded here must be treated as
# leaked and rotated.
client_deepseek = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=os.environ["DEEPSEEK_API_KEY"],
    timeout=60,
)

In [8]:
import json
# Load the dialogue items to process (each item carries a 'question' string)
# and the mapping from Wikidata ids to readable names.
# Context managers close the handles deterministically; JSON is read as UTF-8
# regardless of the platform's default locale encoding.
with open('data/processed_spice_data/dev_each_type_50.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
with open('data/wikidata_mid_to_fn.json', 'r', encoding='utf-8') as f:
    wikidata_mid_to_fn = json.load(f)

In [4]:
from retriever.semantic_retriever import SemanticRetriever
# Retriever that get_type() queries to turn an entity name into its best-matching id.
# NOTE(review): the 'entity' argument presumably selects which index to search —
# confirm against SemanticRetriever.
entity_retriever = SemanticRetriever('entity')

In [5]:
from utils.execute_query import execute_query
def get_type(entity_fn):
    """Return the readable type names of the entity best matching ``entity_fn``.

    The name (spaces replaced by underscores) is looked up with
    ``entity_retriever``; the id of the top hit is then used in a SPARQL query
    for its ``wdt:P31`` ("instance of") values, which are translated to names
    through ``wikidata_mid_to_fn``.

    Args:
        entity_fn: Entity name as it appears in the text.

    Returns:
        A list of type names, excluding names that start with ``'Wikimedia'``.
        The list is empty when the retriever finds nothing or when none of the
        returned ids has a known name.
    """
    hits = entity_retriever.semantic_search(entity_fn.replace(' ', '_'))
    # No candidate at all: report "no types" instead of raising IndexError and
    # aborting the caller's whole processing loop.
    if len(hits) == 0:
        return []
    # Index 1 of a hit is used as the entity id (as in the original lookup).
    entity_mid = hits[0][1]
    result = execute_query(f'SELECT ?x WHERE {{ wd:{entity_mid} wdt:P31 ?x }}')
    # Ids missing from the name mapping are skipped rather than raising KeyError.
    types = [wikidata_mid_to_fn[mid] for mid in result if mid in wikidata_mid_to_fn]
    return [typ for typ in types if not typ.startswith('Wikimedia')]

In [None]:
# Per-run state for the processing loop below:
#   memory        -- entity name -> (type names, clock value when last mentioned)
#   clock         -- turn counter, advanced modulo 10 for every processed item
#   last_question -- full 'question' string of the previously processed item
memory, clock, last_question = dict(), 0, ''

# System prompt for the entity-extraction call (sent to the Qwen client).
# The string is not raw, so \u001F is the actual U+001F unit-separator character;
# the loop splits the model's reply on that same character.
prompt1 = 'Extract entities from the input text. Entities are unique, distinguishable noun phrases with actual meaning, including: person names, locations, organizations, time points, product names, events, abstract concepts, etc. Exclude pure numbers. Output can be an empty string. Return entities joined by \u001F without any explanatory text.'
# System prompt for the coreference-resolution call (sent to the DeepSeek client).
# The user message built in the loop follows the format described here:
# dialogue turns separated by [SEP], then an [ENTITIES] list of "name: type" pairs.
# The string is not raw, so the \u00e2 escapes below are decoded by Python to 'â'.
prompt2 = '''
You are a coreference resolution system for knowledge graph QA dialogues. Process inputs as follows:

1. Input Specifications:
- Multi-turn dialogues are separated by [SEP]
- Entity list follows [ENTITIES] marker (format: Entity: entity_type)

2. Processing Rules:
A) For multi-turn dialogues:
    a. Extract the last question between the last [SEP] and [ENTITIES] as the target question
    b. Resolve coreferences (pronouns/ellipsis) using preceding context
    c. Generate complete standalone question

B) For single-turn dialogues:
    Preserve the question exactly

Examples:
Input: "When was Google founded? [ENTITIES] Google: organization"
Output: When was Google founded?

Input: "Which sex does Lev Mayorov possess ? [SEP] male [SEP] And also tell me about Pál Jávor? [ENTITIES] Lev Mayorov: common_name; Pál Jávor: common_name"
Output: Which sex does Pál Jávor possess?
                
Input: "Where is Ch\u00e2telperronian located on ? [SEP] Iberian Peninsula [SEP] And also tell me about Qarwarasu (Huancavelica)? [ENTITIES] Ch\u00e2telperronian: concept; Iberian Peninsula: landform; Qarwarasu (Huancavelica): terrain"
Output: Where is Qarwarasu (Huancavelica) located on?
                
Input: "Which people emerged victorious in La Madrid Challenge by La Vuelta 2016 and La Madrid Challenge by La Vuelta 2015 ? [SEP] Giorgia Bronzini, Shelley Olds, Kirsten Wild [SEP] Which television programs are that person a screenwriter of ? [SEP] Did you mean Giorgia Bronzini ? [SEP] No, I meant Shelley Olds. Could you tell me the answer for that? [ENTITIES] La Madrid Challenge by La Vuelta 2016: cycling_race_class_defined_by_the_International_Cycling_Union; La Madrid Challenge by La Vuelta 2015: cycling_race_class_defined_by_the_International_Cycling_Union; Giorgia Bronzini: common_name; Shelley Olds: common_name; Kirsten Wild: common_name"
Output: Which television programs is Shelley Olds a screenwriter of?
'''

# The clock wraps at this value; a remembered entity that has not been
# mentioned again for MEMORY_WINDOW - 1 turns is evicted.
MEMORY_WINDOW = 10
# Number of trailing characters cut from a 'question' string.
# NOTE(review): presumably len(' [SEP]'), the terminator every question ends with — confirm.
SEP_SUFFIX_LEN = 6


def _ask(client, model, system_prompt, user_text):
    """Send one system+user exchange and return the reply text.

    Decoding is deterministic (temperature 0) and capped at 256 tokens.
    Returns '' when the API reports no message content, so callers can always
    treat the result as a string.
    """
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_text},
        ],
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return resp.choices[0].message.content or ''


for qa in data:
    clock = (clock + 1) % MEMORY_WINDOW

    # --- extract entities -------------------------------------------------
    # When this item continues the previous one, only the newly added text is
    # sent for extraction; otherwise the whole question is used.
    if qa['question'].startswith(last_question[:-SEP_SUFFIX_LEN]):
        question = qa['question'][len(last_question):]
    else:
        question = qa['question']
    entities = _ask(client_qwen, 'Qwen/Qwen2.5-32B-Instruct', prompt1, question).split('\u001F')

    # --- update memory ----------------------------------------------------
    for entity in entities:
        # Models often pad items with spaces/newlines; without stripping,
        # 'Google\n' and 'Google' would become two different memory keys.
        entity = entity.strip()
        if not entity:
            continue
        types = memory[entity][0] if entity in memory else get_type(entity)
        if types:
            # (Re)stamp the entity with the current tick so it stays alive.
            memory[entity] = (types, clock)

    # Entries stamped with the tick the clock will take next are the oldest
    # ones in the window; drop them before the tick is reused.
    for entity in list(memory.keys()):
        if memory[entity][1] == (clock + 1) % MEMORY_WINDOW:
            del memory[entity]

    last_question = qa['question']

    # --- coreference resolution -------------------------------------------
    entity_listing = '; '.join(
        f'{entity}: {typ}' for entity in memory for typ in memory[entity][0]
    )
    question = qa['question'][:-SEP_SUFFIX_LEN] + ' [ENTITIES] ' + entity_listing
    coreference_resolved_question = _ask(client_deepseek, 'deepseek-chat', prompt2, question)
    # The prompt's examples use an 'Output: ' label; drop it if the model echoes it.
    if coreference_resolved_question.startswith('Output: '):
        coreference_resolved_question = coreference_resolved_question[8:]
    qa['coreference_resolved_question'] = coreference_resolved_question

    # Show the dialogue (one turn per line) followed by the rewritten question.
    print(question.rsplit('[ENTITIES]', 1)[0].replace(' [SEP] ', '\n'))
    print()
    print(coreference_resolved_question)
    print()
    print('--------------------')
    print()


Which male person was the parent of Ludovico II, Marquess of Saluzzo ? 

Which male person was the parent of Ludovico II, Marquess of Saluzzo?

--------------------

Who are the children of Ludovico II, Marquess of Saluzzo ? 

Who are the children of Ludovico II, Marquess of Saluzzo?

--------------------

Who are the children of Ludovico II, Marquess of Saluzzo ?
Gian Gabriele I of Saluzzo, Francesco of Saluzzo, Giovanni Ludovico, Marquess of Saluzzo
Who are siblings of that one ?
Did you mean Francesco of Saluzzo ?
No, I meant Giovanni Ludovico, Marquess of Saluzzo. Could you tell me the answer for that? 

Who are the siblings of Giovanni Ludovico, Marquess of Saluzzo?

--------------------

Who are the children of Ludovico II, Marquess of Saluzzo ?
Gian Gabriele I of Saluzzo, Francesco of Saluzzo, Giovanni Ludovico, Marquess of Saluzzo
Who are siblings of that one ?
Did you mean Francesco of Saluzzo ?
No, I meant Giovanni Ludovico, Marquess of Saluzzo. Could you tell me the answer f

In [None]:
# Write the items (now carrying 'coreference_resolved_question') back to disk.
# NOTE: this overwrites the file the items were loaded from.
# The context manager guarantees the file is flushed and closed even if
# serialization fails part-way.
with open('data/processed_spice_data/dev_each_type_50.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)