In [None]:
import json
def load_json_data(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    return parsed

def find_best_matching_title(question_titles, text_annotations):
    """Return the question title that the text also mentions with the highest pageRank.

    Args:
        question_titles: iterable of question annotation dicts; only the
            ``'title'`` key of each is read. ``None`` is treated as empty.
        text_annotations: iterable of annotation groups; each group is read for
            an ``'annotation_data'`` list whose items carry ``'title'`` and
            ``'pageRank'``. Groups without ``'annotation_data'`` and items
            without ``'title'`` are skipped; a missing or null ``'pageRank'``
            counts as 0.

    Returns:
        The shared title whose text-side pageRank is largest. When several
        titles tie, the one that comes first in ``question_titles`` wins.
        Returns the string ``"NA"`` when no title is shared.
    """
    # Index the text side once (title -> best pageRank) so each question title
    # is a dictionary lookup instead of a rescan of every annotation.
    best_rank_by_title = {}
    for annotation_group in text_annotations or []:
        for annotation in annotation_group.get('annotation_data') or []:
            title = annotation.get('title')
            if title is None:
                continue
            rank = annotation.get('pageRank') or 0
            if title not in best_rank_by_title or rank > best_rank_by_title[title]:
                best_rank_by_title[title] = rank

    best_title = "NA"
    best_rank = None
    for question_title in question_titles or []:
        title = question_title.get('title')
        if title not in best_rank_by_title:
            continue
        rank = best_rank_by_title[title]
        # Strict '>' keeps the earliest question title on ties.
        if best_rank is None or rank > best_rank:
            best_title, best_rank = title, rank
    return best_title


def process_data(question_data, text_data):
    """Join every question with its passage and label the pair with a shared topic.

    Args:
        question_data: list of question dicts. Each must have ``'id'`` and
            ``'text'``; ``'annotations'['annotation_data']`` is used when present.
        text_data: list of passage dicts. Each must have ``'id'``; a passage
            that gets matched must also have ``'text'`` and ``'annotations'``.

    Returns:
        A list with one ``{"id", "question", "text", "topic"}`` dict per
        question that has a passage, in question order. ``topic`` is the value
        returned by ``find_best_matching_title``. Questions without a passage
        are left out.
    """
    # Index the passages once instead of rescanning text_data for every
    # question. setdefault keeps the first passage seen for an id, which is
    # what the previous linear search returned.
    passages_by_exact_id = {}
    passages_by_member_id = {}
    for entry in text_data:
        entry_id = entry['id']
        passages_by_exact_id.setdefault(entry_id, entry)
        # A passage id can list several question ids joined by ', ' (the later
        # indexing cell in this notebook splits the same file this way), so
        # every member id is indexed as a fallback for the exact match.
        if isinstance(entry_id, str):
            for member_id in entry_id.split(', '):
                passages_by_member_id.setdefault(member_id, entry)

    enriched_data = []

    for question_entry in question_data:
        question_id = question_entry['id']
        question_text = question_entry['text']

        # Questions without annotations still produce a record; their topic
        # simply has nothing to match against.
        annotations = question_entry.get('annotations') or {}
        if isinstance(annotations, dict):
            question_titles = annotations.get('annotation_data') or []
        else:
            question_titles = []

        # Exact id first so previously matched questions keep their passage.
        matching_text_entry = passages_by_exact_id.get(question_id)
        if matching_text_entry is None:
            matching_text_entry = passages_by_member_id.get(question_id)
        if not matching_text_entry:
            continue

        topic = find_best_matching_title(question_titles, matching_text_entry['annotations'])

        enriched_data.append({
            "id": question_id,
            "question": question_text,
            "text": matching_text_entry['text'],
            "topic": topic
        })

    return enriched_data


def main(
    question_path='/home/qiyu/Dev/ziqing/wiki/wikified_squad_question.json',
    text_path='/home/qiyu/Dev/ziqing/wiki/wikified_squad_text.json',
    output_path='enriched_squad.json',
):
    """Load the wikified questions and passages, enrich them, and save the result.

    Args:
        question_path: JSON file with the wikified questions.
        text_path: JSON file with the wikified passages.
        output_path: destination JSON file; a relative path is resolved against
            the current working directory.

    The defaults are the locations this notebook has always used, so calling
    ``main()`` with no arguments behaves as before.
    """
    question_data = load_json_data(question_path)
    text_data = load_json_data(text_path)

    enriched_data = process_data(question_data, text_data)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(enriched_data, output_file, indent=4, ensure_ascii=False)


# Run the SQuAD enrichment defined above as soon as this cell executes.
main()


In [None]:

def count_entries_in_json(file_path):
    """Print how many top-level entries the JSON file at ``file_path`` contains.

    The report line has the form ``'<file_path>':<count>``; nothing is returned.
    """
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        num_entries = len(json.load(json_file))

    print(f"'{file_path}':{num_entries}")

# Report the size of the enrichment output.
# NOTE(review): main() writes the relative path 'enriched_squad.json'; this absolute
# path is the same file only if the kernel runs from /home/qiyu/Dev/ziqing/wiki — confirm.
file_path = '/home/qiyu/Dev/ziqing/wiki/enriched_squad.json'

count_entries_in_json(file_path)


In [None]:
def remove_na_topics(file_path, output_path):
    """Copy the JSON list at ``file_path`` to ``output_path`` without "NA"-topic entries.

    Prints the number of entries that survive the filter, writes them as
    indented UTF-8 JSON, then prints the destination path. Returns nothing.
    """
    with open(file_path, mode='r', encoding='utf-8') as source_file:
        entries = json.load(source_file)

    # Keep only the records whose topic was actually resolved.
    filtered_data = list(filter(lambda entry: entry['topic'] != "NA", entries))

    num_filtered_entries = len(filtered_data)
    print(f"after: {num_filtered_entries}")

    with open(output_path, mode='w', encoding='utf-8') as output_file:
        json.dump(filtered_data, output_file, indent=4, ensure_ascii=False)
    print(f"write in '{output_path}'")


# Input: the enriched SQuAD records (presumably the file main() wrote — confirm the
# kernel's working directory matches this absolute location).
file_path = '/home/qiyu/Dev/ziqing/wiki/enriched_squad.json'

# Output: the same records minus those whose topic is "NA".
output_path = '/home/qiyu/Dev/ziqing/wiki/filtered_enriched_squad.json'

remove_na_topics(file_path, output_path)


In [None]:
import json

# Load the wikified SQuAD questions and passages used by the matching code below.
with open('/home/qiyu/Dev/ziqing/wiki/wikified_squad_question.json', mode='r', encoding='utf-8') as questions_file:
    questions_data = json.load(questions_file)

with open('/home/qiyu/Dev/ziqing/wiki/wikified_squad_text.json', mode='r', encoding='utf-8') as text_file:
    text_data = json.load(text_file)


# Map every individual id to its passage. A passage's 'id' field is split on ', '
# so each listed id gets its own key; when an id repeats, the later passage
# replaces the earlier one.
text_data_index = {
    single_id: passage
    for passage in text_data
    for single_id in passage['id'].split(', ')
}


# Build one enriched record per question that has annotations of its own and an
# annotated passage in the index.
enriched_squad = []

for question in questions_data:
    question_id = question.get('id')
    question_text = question.get('text')

    # Questions without any annotation payload cannot be given a topic.
    question_annotations = question.get('annotations', {})
    if not question_annotations:
        continue
    question_annotation_data = question_annotations.get('annotation_data', [])
    if not question_annotation_data:
        continue

    # The passage must exist in the index and carry annotations too.
    text_item = text_data_index.get(question_id)
    if not text_item:
        continue
    text_text = text_item.get('text')
    text_annotations = text_item.get('annotations', [])
    if not text_annotations:
        continue

    # One (title, question-side pageRank) pair for every question/passage
    # annotation pair that shares a title.
    found_titles = [
        (q_annotation.get('title'), q_annotation.get('pageRank', 0))
        for q_annotation in question_annotation_data
        for text_annotation in text_annotations
        for t_annotation in text_annotation.get('annotation_data', [])
        if q_annotation.get('title') == t_annotation.get('title')
    ]

    # max() returns the first pair with the largest pageRank; "NA" marks no overlap.
    selected_title = max(found_titles, key=lambda pair: pair[1])[0] if found_titles else "NA"

    enriched_squad.append({
        "id": question_id,
        "question": question_text,
        "text": text_text,
        "topic": selected_title,
    })


# Persist the enriched records. The file name lives in one variable so the
# confirmation message always names the file that was actually written
# (it previously reported 'enriched_squad.json').
new_enriched_squad_path = 'new_enriched_squad.json'
with open(new_enriched_squad_path, 'w', encoding='utf-8') as f:
    json.dump(enriched_squad, f, ensure_ascii=False, indent=4)

print(f"Enriched squad data saved to {new_enriched_squad_path}")


In [None]:

def count_entries_in_json(file_path):
    """Load the JSON file at ``file_path`` and print its top-level entry count.

    Output format: `` '<file_path>' : <count>``. The function returns nothing.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        loaded = json.load(handle)

    num_entries = len(loaded)
    print(f" '{file_path}' : {num_entries}")


# Report how many records the second enrichment pass produced.
# NOTE(review): that pass writes the relative path 'new_enriched_squad.json'; this
# absolute path matches only if the kernel runs from /home/qiyu/Dev/ziqing/wiki — confirm.
file_path = '/home/qiyu/Dev/ziqing/wiki/new_enriched_squad.json'

count_entries_in_json(file_path)


In [None]:
def remove_na_topics(file_path, output_path):
    """Write the entries of ``file_path`` whose topic is not "NA" to ``output_path``.

    The kept-entry count is printed before the write and the destination path
    after it. Output is indented UTF-8 JSON; nothing is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        all_entries = json.load(source)

    # Collect the survivors in their original order.
    filtered_data = []
    for entry in all_entries:
        if entry['topic'] == "NA":
            continue
        filtered_data.append(entry)

    num_filtered_entries = len(filtered_data)
    print(f"after: {num_filtered_entries}")

    with open(output_path, 'w', encoding='utf-8') as destination:
        json.dump(filtered_data, destination, indent=4, ensure_ascii=False)
    print(f"write '{output_path}'")

# NOTE(review): the preceding cells wrote and counted 'new_enriched_squad.json', but
# this filters the earlier 'enriched_squad.json' — confirm which file is intended.
file_path = '/home/qiyu/Dev/ziqing/wiki/enriched_squad.json'

# Relative path: the filtered copy is written to the kernel's current working directory.
output_path = 'filtered_enriched_squad.json'

remove_na_topics(file_path, output_path)


In [None]:
from pathlib import Path

def load_data(filepath):
    """Open ``filepath`` as UTF-8 text and return its parsed JSON content."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        content = json.load(source)
    return content

def find_highest_page_rank(titles, annotations):
    """Return the title in ``titles`` whose annotation has the largest pageRank.

    Only annotations whose ``'title'`` is in ``titles`` are considered, and a
    pageRank must exceed -1 to be selected. The first annotation reaching the
    maximum wins ties. Returns ``"NA"`` when no annotation qualifies.
    """
    # (title, pageRank) of the best candidate so far; -1 is the floor to beat.
    best = ("NA", -1)
    for entry in annotations:
        if entry['title'] not in titles:
            continue
        if entry['pageRank'] > best[1]:
            best = (entry['title'], entry['pageRank'])
    return best[0]

# Load the KhanQ passages (src) and questions (tgt). A file whose top level is
# not a list is wrapped in one so both can be iterated the same way.
src_data = load_data('/home/qiyu/Dev/ziqing/wiki/wikified_KhanQ_text.json')
if not isinstance(src_data, list):
    src_data = [src_data]

tgt_data = load_data('/home/qiyu/Dev/ziqing/wiki/wikified_KhanQ_question.json')
if not isinstance(tgt_data, list):
    tgt_data = [tgt_data]

# Pair each passage (src) with the question(s) (tgt) that share its id and
# pick the topic both sides mention.
enriched_data = []

# Group the questions by id once, so the join is a dictionary lookup rather
# than a scan of every question for every passage. Each per-id list keeps the
# questions in file order, so the output order is unchanged.
tgt_items_by_id = {}
for tgt_item in tgt_data:
    tgt_items_by_id.setdefault(tgt_item['id'], []).append(tgt_item)

for src_item in src_data:
    for tgt_item in tgt_items_by_id.get(src_item['id'], ()):
        tgt_annotation_data = tgt_item['annotations']['annotation_data']
        src_titles = {annot['title'] for annot in src_item['annotations']['annotation_data']}
        tgt_titles = {annot['title'] for annot in tgt_annotation_data}

        common_titles = src_titles & tgt_titles

        if not common_titles:
            topic = "NA"
        elif len(common_titles) == 1:
            topic = next(iter(common_titles))
        else:
            # Several shared titles: rank them by the question-side pageRank.
            topic = find_highest_page_rank(common_titles, tgt_annotation_data)

        enriched_data.append({
            'id': src_item['id'],
            'text': src_item['text'],
            'question': tgt_item['text'],
            'topic': topic
        })

# Serialise the enriched KhanQ records as indented UTF-8 JSON and report the path.
output_path = Path('/home/qiyu/Dev/ziqing/wiki/enriched_KhanQ.json')
with output_path.open('w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(enriched_data, ensure_ascii=False, indent=4))

print(f"save as {output_path}")


In [None]:
def remove_na_topics(file_path, output_path):
    """Drop every entry whose topic equals "NA" and save the rest to ``output_path``.

    Reads the JSON list at ``file_path``, prints how many entries remain,
    writes them as indented UTF-8 JSON and prints where they were saved.
    Returns nothing.
    """
    with open(file_path, 'r', encoding='utf-8') as in_file:
        records = json.load(in_file)

    filtered_data = [record for record in records if not record['topic'] == "NA"]

    num_filtered_entries = len(filtered_data)
    print(f"after: {num_filtered_entries}")

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(filtered_data, out_file, indent=4, ensure_ascii=False)
    print(f"save as '{output_path}'")

# Input: the enriched KhanQ records saved by the join cell above.
file_path = '/home/qiyu/Dev/ziqing/wiki/enriched_KhanQ.json'

# Relative path: the filtered copy is written to the kernel's current working directory.
output_path = 'filtered_enriched_KhanQ.json'

remove_na_topics(file_path, output_path)


In [None]:
import json

# Load the filtered KhanQ records and the wikified passages.
# NOTE(review): both paths are relative to the working directory, whereas an earlier
# cell reads wikified_KhanQ_text.json from /home/qiyu/Dev/ziqing/wiki/ — confirm they
# resolve to the same files.
with open('filtered_enriched_KhanQ.json', mode='r', encoding='utf-8') as enriched_file:
    enriched_data = json.load(enriched_file)

with open('wikified_KhanQ_text.json', mode='r', encoding='utf-8') as wikified_file:
    wikified_data = json.load(wikified_file)

# Passage id -> that passage's annotation list; a repeated id keeps the last passage.
wikified_dict = {}
for wikified_item in wikified_data:
    wikified_dict[wikified_item["id"]] = wikified_item["annotations"]["annotation_data"]

# For each record, pick the highest-pageRank passage annotation whose title is
# NOT the record's topic, and keep the record under that other title.
new_data = []

for item in enriched_data:
    item_id = item["id"]
    text = item["text"]
    topic = item["topic"]

    best_rank = None
    best_title = None
    for annotation in wikified_dict.get(item_id, []):
        if annotation["title"] == topic:
            continue
        # The first candidate is always taken; later ones must beat it strictly.
        if best_rank is None or annotation["pageRank"] > best_rank:
            best_rank = annotation["pageRank"]
            best_title = annotation["title"]

    # Records with no alternative (or an empty alternative title) are dropped.
    if not best_title:
        continue

    new_data.append({
        "id": item_id,
        "text": text,
        "topic": best_title
    })

# Write the alternative-topic records as indented UTF-8 JSON and confirm.
with open('othertopic_KhanQ.json', mode='w', encoding='utf-8') as output_handle:
    output_handle.write(json.dumps(new_data, ensure_ascii=False, indent=4))

print("save as 'othertopic_KhanQ.json'")
