In [17]:
import json
import pickle

wikidata_mid_to_fn_dict = pickle.load(open('data/wikidata_mid_to_fn_dict.pickle', 'rb'))
data = json.load(open('data/processed_spice_data/dev_each_type_5.json', 'r'))

In [None]:
def sub_mid_to_fn(expression):
    seg_list = expression.split()
    for i in range(len(seg_list)):
        token = seg_list[i].strip(')(')
        if token.startswith('P') or token.startswith('Q'):
            fn = wikidata_mid_to_fn_dict.get(token, "unknown_entity")
            seg_list[i] = seg_list[i].replace(token, fn)
    new_expression = ' '.join(seg_list)
    return new_expression

def extract_kg_elements(expression):
    entities, relations, types = set(), set(), set()
    token_list = [seg.strip(')(') for seg in expression.split()]
    for i, token in enumerate(token_list):
        if token.startswith('Q'):
            if token_list[i - 1] != 'P31':
                entity = wikidata_mid_to_fn_dict.get(token, token)
                entities.add(entity)
            else:
                typ = wikidata_mid_to_fn_dict.get(token, token)
                types.add(typ)
        elif token.startswith('P'):
            relation = wikidata_mid_to_fn_dict.get(token, token)
            relations.add(relation)
    return sorted(entities), sorted(relations), sorted(types)

In [20]:
for qa in data:
    qa['entities'], qa['relations'], qa['types'] = extract_kg_elements(qa['s_expression'])
sorted_data = sorted([{key: qa[key] for key in ('question', 's_expression', 's_expression_cores', 'entities')} for qa in data], key=lambda x: len(x['entities']), reverse=True)
for qa in sorted_data:
    qa['s_expression'] = sub_mid_to_fn(qa['s_expression'])
    qa['s_expression_cores'] = [sub_mid_to_fn(core) for core in qa['s_expression_cores']]
prompt_data = sorted_data[5::10][:10]
test_data = sorted_data[10::10][:10]
print(json.dumps(prompt_data, indent=2))

[
  {
    "question": "Which administrative territories have diplomatic relationships with approximately 1 political territory ? [SEP] Georgia, Belarus, Luxembourg [SEP] Do Ukrainka have diplomatic relations with those administrative territories ? [CTX]",
    "s_expression": "(ALL (IS_TRUE Ukrainka diplomatic_relation Georgia) (IS_TRUE Ukrainka diplomatic_relation Belarus) (IS_TRUE Ukrainka diplomatic_relation Luxembourg))",
    "s_expression_cores": [
      "(IS_TRUE Ukrainka diplomatic_relation Georgia)",
      "(IS_TRUE Ukrainka diplomatic_relation Belarus)",
      "(IS_TRUE Ukrainka diplomatic_relation Luxembourg)"
    ],
    "entities": [
      "Belarus",
      "Georgia",
      "Luxembourg",
      "Ukrainka"
    ]
  },
  {
    "question": "Is Anglic languages a portion of Oru\u00e7 Reis-class submarine and West Germanic languages ? [CTX]",
    "s_expression": "(ALL (IS_TRUE Oru\u00e7_Reis-class_submarine part_of Anglic_languages) (IS_TRUE West_Germanic_languages part_of Anglic_lan

In [21]:
examples = []
for qa in prompt_data:
    examples.append({'role': 'user', 'content': qa['question']})
    examples.append({'role': 'assistant', 'content': '\u001F'.join(qa['entities'])})
examples

[{'role': 'user',
  'content': 'Which administrative territories have diplomatic relationships with approximately 1 political territory ? [SEP] Georgia, Belarus, Luxembourg [SEP] Do Ukrainka have diplomatic relations with those administrative territories ? [CTX]'},
 {'role': 'assistant',
  'content': 'Belarus\x1fGeorgia\x1fLuxembourg\x1fUkrainka'},
 {'role': 'user',
  'content': 'Is Anglic languages a portion of Oruç Reis-class submarine and West Germanic languages ? [CTX]'},
 {'role': 'assistant',
  'content': 'Anglic_languages\x1fOruç_Reis-class_submarine\x1fWest_Germanic_languages'},
 {'role': 'user',
  'content': 'Which administrative territory was Gary Collier born at ? [SEP] Fort Worth [SEP] Is that administrative territory a sister city of Adamsville, New Brunswick and Yuen Long Kau Hui ? [CTX]'},
 {'role': 'assistant',
  'content': 'Adamsville,_New_Brunswick\x1fFort_Worth\x1fYuen_Long_Kau_Hui'},
 {'role': 'user',
  'content': 'Which people were born at Santa Cruz and have the g

In [None]:
import openai
# from time import sleep

temp = 0.3

api_key = 'key'
LLM_engine = 'Qwen/Qwen2.5-32B-Instruct'
openai.base_url = 'http://222.29.156.145:8000/v1/'

# api_key = 'sk-54964e5c3b8c4998a74f7d3e35b618ac'
# LLM_engine = 'deepseek-chat'
# openai.base_url = 'https://api.deepseek.com'

for qa in test_data:
	messages = [{'role': 'system', 'content': 
	"""Perform strict multi-turn dialogue entity extraction:
	- Entities include: person/location/organization names, specialized titles/terms
	- Process ONLY the query after the last [SEP]
	- Resolve all contextual references to preceding entities
	- Output entities joined with \u001F"""}] + examples + [{'role': 'user', 'content': qa['question']}]

	openai.api_key = api_key
	resp = openai.chat.completions.create(
		model=LLM_engine,
		messages=messages,
		temperature=temp,
		max_tokens=256,
		top_p=1,
		frequency_penalty=0,
		presence_penalty=0
	)

	qa['predicted_entities'] = sorted(resp.choices[0].message.content.split('\u001F'))

print(json.dumps(test_data, indent=2))

[
  {
    "question": "Which people are in Leroy & Stitch ? [SEP] Dakota Fanning, Chris Sanders, Zoe Caldwell [SEP] Which television programs do those people star in ? [CTX]",
    "s_expression": "(DISTINCT (AND (JOIN cast_member (VALUES Dakota_Fanning Chris_Sanders Zoe_Caldwell)) (JOIN instance_of television_program)))",
    "s_expression_cores": [
      "(AND (JOIN P161 (VALUES Q115541 Q201641 Q1995506)) (JOIN P31 Q15416))"
    ],
    "entities": [
      "Chris_Sanders",
      "Dakota_Fanning",
      "Zoe_Caldwell"
    ],
    "relations": [
      "cast_member",
      "instance_of"
    ],
    "types": [
      "television_program"
    ],
    "predicted_entities": [
      "Chris_Sanders",
      "Dakota_Fanning",
      "Leroy_%26_Stitch",
      "Zoe_Caldwell"
    ]
  },
  {
    "question": "What do Dolly Parton, Oscar Robertson and Carl Smith do for a living ? [CTX]",
    "s_expression": "(DISTINCT (AND (JOIN (R occupation) (VALUES Dolly_Parton Oscar_Robertson Carl_Smith)) (JOIN instance

In [14]:
from utils.execute_query import execute_query
from retriever.semantic_retriever import semantic_search

def get_1hop_relations(entity: str):
    query = ("SELECT DISTINCT ?p WHERE { { ?s ?p wd:" + entity + " . } UNION { wd:" + entity + " ?p ?o . } }")
    return execute_query(query)

def get_1hop_types(entity: str):
    query = ("SELECT DISTINCT ?o WHERE { wd:" + entity + " wdt:P31 ?o . }")
    return execute_query(query)

for qa in test_data:
    qa['1hop_relations'] = set()
    qa['1hop_types'] = set()
    for entity in qa['predicted_entities']:
        mid = semantic_search(entity)[0][1]
        qa['1hop_relations'].update([wikidata_mid_to_fn_dict.get(mid, mid) for mid in get_1hop_relations(mid)])
        qa['1hop_types'].update([wikidata_mid_to_fn_dict.get(mid, mid) for mid in get_1hop_types(mid)])
    qa['1hop_relations'] = list(qa['1hop_relations'])
    qa['1hop_types'] = list(qa['1hop_types'])
print(json.dumps(test_data, indent=2))

[
  {
    "question": "Which people are in Leroy & Stitch ? [SEP] Dakota Fanning, Chris Sanders, Zoe Caldwell [SEP] Which television programs do those people star in ? [CTX]",
    "s_expression": "(DISTINCT (AND (JOIN cast_member (VALUES Dakota_Fanning Chris_Sanders Zoe_Caldwell)) (JOIN instance_of television_program)))",
    "s_expression_cores": [
      "(AND (JOIN P161 (VALUES Q115541 Q201641 Q1995506)) (JOIN P31 Q15416))"
    ],
    "entities": [
      "Chris_Sanders",
      "Dakota_Fanning",
      "Zoe_Caldwell"
    ],
    "relations": [
      "cast_member",
      "instance_of"
    ],
    "types": [
      "television_program"
    ],
    "predicted_entities": [
      "Chris_Sanders",
      "Dakota_Fanning",
      "Leroy_%26_Stitch",
      "Zoe_Caldwell"
    ],
    "1hop_relations": [
      "MusicBrainz_artist_ID",
      "producer",
      "Elonet_person_ID",
      "family_name",
      "Scope.dk_person_ID",
      "instance_of",
      "occupation",
      "date_of_birth",
      "Open_M