# 1. Q&A Annotation for Events

In [7]:
import json
import random
import csv
import re

# 1. Question templates and the record field that answers each one.
#
# `templates` maps a category name to a list of entries. Each entry has:
#   "template"     - a str.format-style question; placeholders such as
#                    {event_name}, {event_time}, {venue} and {organization}
#                    are filled in from an event record.
#   "answer_field" - the key of the event record whose value is the
#                    reference answer for that question.
templates = {
    # Questions about a single event's name, organizer, venue and date.
    "event_info": [
        {
            "template": "What event is happening at {venue} on {event_time} organized by {organization}?",
            "answer_field": "event_name"
        },
        {
            "template": "Who is organizing the event '{event_name}'?",
            "answer_field": "organization"
        },
        {
            "template": "Which organization is hosting '{event_name}'?",
            "answer_field": "organization"
        },
        {
            "template": "Where is the event '{event_name}' held?",
            "answer_field": "venue"
        },
        {
            "template": "When is the event '{event_name}' scheduled to take place?",
            "answer_field": "event_time"
        },
        {
            "template": "What is the date of the event '{event_name}'?",
            "answer_field": "event_time"
        },
        {
            "template": "What time is the event '{event_name}' at {venue} organized by {organization}?",
            "answer_field": "event_time"
        },
        {
            "template": "Which event is taking place at {venue} organized by {organization} on {event_time}?",
            "answer_field": "event_name"
        },
        {
            "template": "What is the name of the event organized by {organization} at {venue} on {event_time}?",
            "answer_field": "event_name"
        },
        {
            "template": "What event is scheduled at {venue} on {event_time}?",
            "answer_field": "event_name"
        },
        {
            "template": "Which event is happening at {venue} on {event_time}?",
            "answer_field": "event_name"
        }
    ],
    # Questions centred on the venue / organization relationship.
    "venue_related_questions": [
        {
            "template": "At which venue will '{event_name}' take place?",
            "answer_field": "venue"
        },
        {
            "template": "What is the main venue for events organized by {organization}?",
            "answer_field": "venue"
        },
        {
            "template": "Which organization is hosting events at {venue}?",
            "answer_field": "organization"
        }
    ]
}

# 2. Build alternative spellings of an event date.

# Full month names keyed by the three-letter abbreviation; insertion order is
# calendar order, which is used to detect ranges that cross a year boundary.
_MONTH_FULL_NAMES = {
    "Jan": "January", "Feb": "February", "Mar": "March", "Apr": "April",
    "May": "May", "Jun": "June", "Jul": "July", "Aug": "August",
    "Sep": "September", "Oct": "October", "Nov": "November", "Dec": "December",
}

# "Fri, Oct 10, 2025" -> groups: month, day, year.
_SINGLE_DATE_RE = re.compile(r"\b\w{3}, (\w{3}) (\d{1,2}), (\d{4})\b")
# "Fri, Oct 10 - Sun, Oct 12, 2025" -> groups: start month, start day,
# end month, end day, year (the year is written only once, after the end date).
_RANGE_DATE_RE = re.compile(
    r"\b\w{3}, (\w{3}) (\d{1,2}) - \w{3}, (\w{3}) (\d{1,2}), (\d{4})\b"
)


def _date_spellings(month, day, year):
    """Return the alternative spellings of one date, without the weekday."""
    full_month = _MONTH_FULL_NAMES.get(month, month)
    return [
        f"{month} {day}, '{year[-2:]}",  # two-digit year
        f"{full_month} {day}, {year}",   # full month name
        f"{month} {day} {year}",         # no comma
        f"{month} {day}",                # month and day only
    ]


def _range_start_year(start_month, end_month, end_year):
    """Return the year of a range's start date, given the end date's year."""
    months = list(_MONTH_FULL_NAMES)
    if (
        start_month in months
        and end_month in months
        and months.index(start_month) > months.index(end_month)
    ):
        # e.g. "Dec 31 - Jan 1, 2026": the start falls in the previous year.
        return str(int(end_year) - 1)
    return end_year


def format_date_variations(event_time):
    """Return acceptable spellings of *event_time* joined with ``;``.

    Two input shapes are recognised (matched at the start of the stripped
    string):

    * a single date, e.g. ``"Fri, Oct 10, 2025"``;
    * a date range, e.g. ``"Fri, Oct 10 - Sun, Oct 12, 2025"``.

    The first variation is always the stripped original text. For a single
    date it is followed by the two-digit-year, full-month, comma-free and
    month-day spellings. For a range, every start spelling is combined with
    every end spelling as ``"<start> - <end>"``; the start date borrows the
    year written after the end date.

    Args:
        event_time: Date text taken from an event record.

    Returns:
        A ``;``-separated string of unique variations. Text in an
        unrecognised format is returned stripped but otherwise unchanged so
        the reference answer is never blank; a non-string input yields ``""``.
    """
    if not isinstance(event_time, str):
        return ""

    original = event_time.strip()
    variations = [original]

    # The single-date pattern is tried first; it cannot match a range because
    # it requires ", <year>" directly after the first day number.
    single = _SINGLE_DATE_RE.match(original)
    if single:
        variations.extend(_date_spellings(*single.groups()))
    else:
        ranged = _RANGE_DATE_RE.match(original)
        if ranged:
            start_month, start_day, end_month, end_day, year = ranged.groups()
            start_year = _range_start_year(start_month, end_month, year)
            starts = _date_spellings(start_month, start_day, start_year)
            ends = _date_spellings(end_month, end_day, year)
            variations.extend(
                f"{start} - {end}" for start in starts for end in ends
            )

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return ";".join(dict.fromkeys(variations))

# 3. Generate every question/answer pair for the loaded events.
def generate_questions_from_json(data, templates):
    """Instantiate every applicable template for every event record.

    Args:
        data: Iterable of event records (dicts), typically holding
            ``event_name``, ``event_time``, ``venue`` and ``organization``.
            It is iterated more than once, so it must be re-iterable
            (e.g. a list).
        templates: Mapping of category name to a list of dicts, each with a
            ``"template"`` (``str.format``-style text) and an
            ``"answer_field"`` (key of the record holding the answer).
            Templates may reference any key of the record plus
            ``{other_event}`` and ``{other_venue}``, which are filled with a
            randomly chosen name/venue of a different record (``"N/A"`` when
            there is none).

    Returns:
        A list of ``(question, answer)`` tuples in record order, then
        template order. For ``answer_field == "event_time"`` the answer is
        the ``;``-joined output of ``format_date_variations``.

    A template is skipped for a record when the record lacks the answer
    field or lacks a placeholder that the template actually uses; the latter
    case is reported on stdout. Records missing unrelated fields no longer
    abort the run or lose questions that do not need those fields.
    """
    questions_and_answers = []
    for item in data:
        # Distractor pools depend only on the current record, so build them
        # once per record instead of once per template. Records lacking the
        # field are ignored rather than raising KeyError.
        other_events = [
            d["event_name"] for d in data if d != item and "event_name" in d
        ]
        other_venues = [
            d["venue"] for d in data if d != item and "venue" in d
        ]

        # Computed on first use so records without "event_time" still work.
        date_variations = None

        for template_list in templates.values():
            for template_dict in template_list:
                template = template_dict["template"]
                answer_field = template_dict["answer_field"]

                # Nothing to answer with: skip this template for this record.
                if answer_field not in item:
                    continue

                # A fresh random distractor is drawn for every question.
                fields = {
                    **item,
                    "other_event": random.choice(other_events) if other_events else "N/A",
                    "other_venue": random.choice(other_venues) if other_venues else "N/A",
                }

                try:
                    # format_map only looks up placeholders the template
                    # uses, so unrelated missing fields do not matter.
                    question = template.format_map(fields)
                except KeyError as e:
                    print(f"Missing key {e} in item: {item}")
                    continue

                if answer_field == "event_time":
                    if date_variations is None:
                        date_variations = format_date_variations(item["event_time"])
                    answer = date_variations
                else:
                    answer = item[answer_field]

                questions_and_answers.append((question, answer))
    return questions_and_answers

# 4. Read the event records from a JSON file.
def load_json_data(file_path):
    """Parse the UTF-8 encoded JSON file at *file_path* and return its content."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# 5. Write the question/answer pairs to a CSV file.
def write_to_csv(file_name, questions_and_answers):
    """Write a header row and one row per (question, answer) pair to *file_name*.

    The file is created or overwritten as UTF-8. Each data row has three
    columns: an empty "Time Sensitive Label" cell, the question, and the
    reference answer.
    """
    header = ["Time Sensitive Label", "Question", "Reference Answer"]
    with open(file_name, mode='w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        # The label column is written as an empty string for every row.
        out.writerows(["", asked, expected] for asked, expected in questions_and_answers)

# 6. Script entry point.
def main():
    """Load the symphony events, generate Q&A pairs and save them as CSV.

    Reads the fixed input JSON path, runs the module-level ``templates``
    over every record, writes the result to the fixed output CSV path and
    prints a one-line summary.
    """
    source_path = "../raw_documents/Music_Culture/1_Symphony_Events.json"
    destination = "../../QA/Music_Symphony_Events.csv"

    qa_pairs = generate_questions_from_json(load_json_data(source_path), templates)

    # Persist the pairs, then report how many were written and where.
    write_to_csv(destination, qa_pairs)
    print(f"Generated {len(qa_pairs)} questions and answers, saved to {destination}")

if __name__ == "__main__":
    main()

Generated 784 questions and answers, saved to ../../QA/Music_Symphony_Events.csv
