## 收集SN-caption-test-align对应的比赛信息

In [4]:
import os
import shutil


def copy_original_files(data_dir, original_dir):
    """
    遍历 Data 文件夹下的比赛文件夹，复制 original 文件夹下对应的所有文件。

    Args:
        data_dir (str): Data 文件夹的路径。
        original_dir (str): original 文件夹的路径。
    """

    for season_dir in os.listdir(data_dir):
        data_season_path = os.path.join(data_dir, season_dir)
        original_season_path = os.path.join(original_dir, season_dir)

        # 检查 Data 文件夹下的赛季文件夹是否存在
        if os.path.isdir(data_season_path):
            # 检查 original 文件夹下的赛季文件夹是否存在
            if os.path.isdir(original_season_path):
                for match_dir in os.listdir(data_season_path):
                    data_match_path = os.path.join(data_season_path, match_dir)
                    original_match_path = os.path.join(original_season_path, match_dir)

                    # 检查 Data 文件夹下的比赛文件夹是否存在
                    if os.path.isdir(data_match_path):
                        # 检查 original 文件夹下的比赛文件夹是否存在
                        if os.path.isdir(original_match_path):
                            # 复制 original 文件夹下的所有文件到 Data 文件夹下的对应比赛文件夹
                            for filename in os.listdir(original_match_path):
                                file_path = os.path.join(
                                    original_match_path, filename)
                                if os.path.isfile(file_path):
                                    shutil.copy2(file_path, data_match_path)
                                    print(
                                        f"已复制 {season_dir}/{match_dir}/{filename} 到 {data_match_path}")
                        else:
                            print(
                                f"警告：original 文件夹下不存在 {season_dir}/{match_dir} 文件夹")
            else:
                print(f"警告：original 文件夹下不存在 {season_dir} 文件夹")

In [None]:
# 替换为您的 Data 和 original 文件夹路径
data_directory = "./Data"
original_directory = "../SoccerNet/ASR"

# 调用函数执行复制操作
copy_original_files(data_directory, original_directory)

In [None]:
# 替换为您的 Data 和 original 文件夹路径
data_directory = "./Data"
original_directory = "../SoccerNet/dataset-v2label"

# 调用函数执行复制操作
copy_original_files(data_directory, original_directory)

In [None]:
# 替换为您的 Data 和 original 文件夹路径
data_directory = "./Data"
original_directory = "../SoccerNet/Raw_match/"

# 调用函数执行复制操作
copy_original_files(data_directory, original_directory)

In [None]:
# 替换为您的 Data 和 original 文件夹路径
data_directory = "./Data"
original_directory = "../SoccerNet/LLaMA-ASR-Summary/events_json/"

# 调用函数执行复制操作
copy_original_files(data_directory, original_directory)

In [None]:
# 替换为您的 Data 和 original 文件夹路径
data_directory = "./Data"
original_directory = "../SoccerNet/LLaMA-ASR-Summary/events_json_visual_evidential/"

# 调用函数执行复制操作
copy_original_files(data_directory, original_directory)

In [None]:
# 替换为您的 Data 和 original 文件夹路径
data_directory = "./Data"
original_directory = "../SoccerNet/MatchTime_withtype/SN-Caption-test-align/"

# 调用函数执行复制操作
copy_original_files(data_directory, original_directory)

## 重命名文件夹

In [None]:
import os


def rename_json_files(folder_path):
    """
    递归地遍历文件夹，并重命名符合条件的 JSON 文件。

    Args:
        folder_path (str): 文件夹的路径。
    """

    for root, _, files in os.walk(folder_path):
        for file in files:
            if file == "sn-caption-test-align.json":
                old_file_path = os.path.join(root, file)
                new_file_path = os.path.join(root, "events_commentaries.json")
                os.rename(old_file_path, new_file_path)
                print(f"已重命名: {old_file_path} -> {new_file_path}")


# 替换为您的文件夹路径
folder_path = "./Data"  # 将 "your_folder_path" 替换为实际的文件夹路径

# 调用函数执行重命名操作
rename_json_files(folder_path)

## 测试 Offline Video Model

### llava-onevision-qwen2-7b-ov-hf

In [None]:
import subprocess
import cv2
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
from PIL import Image

model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
processor.tokenizer.padding_side = "left"

video_path = "/DB/data/jiayuanrao-1/sports/haokai_intern/Dataset/Long-Video-QA/Data/spain_laliga_2016-2017/2017-03-12 - 22-45 Real Madrid 2 - 1 Betis/1_224p_clipped.mp4"

In [None]:
question = "Which part of the body was used to score the goal? \na) Header (Head)\nb) Left foot\nc) Right foot\nd) Other body part"

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video", "path": video_path},
            {"type": "text", "text": question},
        ],
    },
]

inputs = processor.apply_chat_template(
    conversation,
    num_frames=8,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device, torch.float16)
out = model.generate(**inputs, max_new_tokens=60)
processor.batch_decode(out, skip_special_tokens=True,
                       clean_up_tokenization_spaces=True)

## 统计数据集事件

### SoccerNetV2

In [17]:
import os
import json


def count_average_events(directory):
    events = {}
    events_counts = 0
    match_count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                match_count += 1
                path = os.path.join(root, file)
                with open(path, 'r') as f:
                    data = json.load(f)
                    annotations = data['annotations']
                    events_counts += len(annotations)
                    for annotation in annotations:
                        label = annotation["label"]
                        if events.get(label) == None:
                            events[label] = 0
                        else:
                            events[label] += 1
    events = {key:value/match_count for key, value in events.items()}
    events_counts /= match_count

    return events, events_counts, match_count

In [18]:
directory = '../SoccerNet/dataset-v2label'
soccernetv2_events, soccernetv2_events_counts, match_count = count_average_events(directory)
soccernetv2_events, soccernetv2_events_counts, match_count

({'Kick-off': 5.13,
  'Shots on target': 11.638,
  'Offside': 4.194,
  'Indirect free-kick': 21.04,
  'Ball out of play': 63.618,
  'Throw-in': 37.834,
  'Clearance': 15.79,
  'Foul': 23.346,
  'Substitution': 5.676,
  'Shots off target': 10.51,
  'Corner': 9.67,
  'Goal': 3.404,
  'Direct free-kick': 4.398,
  'Yellow card': 4.092,
  'Penalty': 0.344,
  'Red card': 0.108,
  'Yellow->red card': 0.09},
 220.916,
 500)

### SoccerReplay-1988

In [1]:
import os
import json


def count_soccerreplay_1988_average_events(directory):
    events = {}
    events_count = 0
    match_count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                match_count += 1
                path = os.path.join(root, file)
                with open(path, 'r') as f:
                    data = json.load(f)
                    annotations = data['comments']
                    events_count += len(annotations)
                    for annotation in annotations:
                        label = annotation["comments_type"]
                        if events.get(label) == None:
                            events[label] = 0
                        else:
                            events[label] += 1
    return events, events_count, match_count


directory_path = '../SoccerReplay-1988/SoccerReplay-1988'
soccerreplay_1988_events, soccerreplay_1988_events_count, match_count = count_soccerreplay_1988_average_events(directory_path)
len(soccerreplay_1988_events), {
    key: [value, f'{value/match_count:.2f}'] for key, value in soccerreplay_1988_events.items()}, soccerreplay_1988_events_count/match_count, match_count

(26,
 {'statistics and summary': [6446, '3.24'],
  'start of half game': [7250, '3.65'],
  'foul with no card': [14010, '7.05'],
  'clearance': [19073, '9.59'],
  'corner': [20328, '10.23'],
  'goal': [7274, '3.66'],
  'off side': [4416, '2.22'],
  'injury': [3864, '1.94'],
  'substitution': [14441, '7.26'],
  'free kick': [2026, '1.02'],
  'saved by goal-keeper': [5975, '3.01'],
  'yellow card': [8025, '4.04'],
  'lead to corner': [7889, '3.97'],
  'show added time': [3547, '1.78'],
  'shot off target': [11024, '5.55'],
  'end of half game': [4030, '2.03'],
  '': [176, '0.09'],
  'foul lead to penalty': [1008, '0.51'],
  'penalty': [470, '0.24'],
  'ball possession': [4543, '2.29'],
  'ball out of play': [1665, '0.84'],
  'throw in': [310, '0.16'],
  'second yellow card': [163, '0.08'],
  'red card': [390, '0.20'],
  'var': [1500, '0.75'],
  'wcl-icon-settings-info-rounded': [98, '0.05']},
 75.43611670020121,
 1988)

In [18]:
import os
import json


def count_goals(directory):
    match = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                goals = 0
                path = os.path.join(root, file)
                with open(path, 'r') as f:
                    data = json.load(f)
                    annotations = data['comments']
                    for annotation in annotations:
                        label = annotation["comments_type"]
                        if label == 'goal':
                            goals += 1
                    if goals >= 10:
                        match.append([file, goals])
    return match


directory_path = '../SoccerReplay-1988/SoccerReplay-1988'
matches = count_goals(
    directory_path)
matches

[['2021-04-18_Lazio_5-3_Benevento_nF9kQM9n.json', 10],
 ['2020-10-05_Aston_Villa_7-2_Liverpool_6J0L2p0r.json', 11],
 ['2023-09-30_Aston_Villa_6-1_Brighton_E3fbIwnp.json', 10],
 ['2020-03-01_Lecce_2-7_Atalanta_Uo3vjpqp.json', 10],
 ['2022-08-27_Liverpool_9-0_Bournemouth_UsamHrKG.json', 11],
 ['2022-10-02_Manchester_City_6-3_Manchester_Utd_4fpwmb6T.json', 11],
 ['2019-08-14_FC_Copenhagen_1-2_Crvena_zvezda_YsUm9yEM.json', 15],
 ['2022-10-03_Girona_3-5_Real_Sociedad_8Gb9KrXQ.json', 10]]

#### 统计进球事件

In [None]:
import os
import json
import re


def extract_score_from_filename(filename):
    # 匹配形如 _3-3_ 或 _4-0_ 的比分
    match = re.search(r'_(\d+)-(\d+)_', filename)
    if match:
        return int(match.group(1)) + int(match.group(2))
    return None


def count_goals_and_compare(directory):
    mismatched_matches = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)

                # 提取文件名中包含的比分进球数总和
                expected_goals = extract_score_from_filename(file)
                if expected_goals is None:
                    continue  # 跳过无法识别比分的文件

                # 统计 comments 中 'goal' 类型的数量
                with open(file_path, 'r', encoding='utf-8') as f:
                    try:
                        data = json.load(f)
                        comments = data.get('comments', [])
                        actual_goals = sum(1 for c in comments if c.get(
                            'comments_type') == 'goal')

                        if expected_goals != actual_goals:
                            mismatched_matches.append({
                                'file': file,
                                'expected_goals': expected_goals,
                                'actual_goals': actual_goals
                            })
                    except Exception as e:
                        print(f"Error processing file {file}: {e}")
    return mismatched_matches


# 修改路径为你的实际路径
directory_path = '../SoccerReplay-1988/SoccerReplay-1988'
result = count_goals_and_compare(directory_path)

# 打印不一致的比赛文件
for match in result:
    print(
        f"{match['file']}: expected {match['expected_goals']}, found {match['actual_goals']}")

2019-01-20_Fiorentina_3-3_Sampdoria_GdfimYws.json: expected 6, found 7
2019-03-04_Napoli_1-2_Juventus_29Qtz4MD.json: expected 3, found 4
2019-04-30_Atalanta_2-0_Udinese_6w99xiq5.json: expected 2, found 1
2019-03-17_Genoa_2-0_Juventus_ObsVrY4o.json: expected 2, found 3
2019-05-27_Atalanta_3-1_Sassuolo_bBW7eQ6s.json: expected 4, found 5
2019-05-13_AS_Roma_2-0_Juventus_KM2WUasd.json: expected 2, found 3
2022-04-22_Burnley_2-0_Southampton_CzfH0o8b.json: expected 2, found 3
2021-08-28_Manchester_City_5-0_Arsenal_jwTbynnG.json: expected 5, found 6
2022-03-13_Manchester_Utd_3-2_Tottenham_0dQbr4hU.json: expected 5, found 6
2022-05-08_Brighton_4-0_Manchester_Utd_OEiic62a.json: expected 4, found 5
2021-11-08_West_Ham_3-2_Liverpool_0S6SM1kD.json: expected 5, found 6
2021-10-23_Everton_2-5_Watford_pfkXX6tH.json: expected 7, found 8
2022-04-09_Arsenal_1-2_Brighton_MmrxRsM3.json: expected 3, found 4
2021-09-19_West_Ham_1-2_Manchester_Utd_vuy4tYTC.json: expected 3, found 4
2022-02-20_Manchester_City_

In [3]:
len(result)

427

In [13]:
average_more_goals = sum(
    [abs(match['actual_goals']-match['expected_goals']) for match in result])/len(result)
average_more_goals

1.215456674473068

In [16]:
special_matches = [
    match for match in result if match['actual_goals'] < match['expected_goals']]
special_matches

[{'file': '2019-04-30_Atalanta_2-0_Udinese_6w99xiq5.json',
  'expected_goals': 2,
  'actual_goals': 1},
 {'file': '2020-09-20_Verona_3-0_AS_Roma_htMuKMqK.json',
  'expected_goals': 3,
  'actual_goals': 0},
 {'file': '2022-10-06_Sevilla_1-4_Dortmund_YmXZ9lSh.json',
  'expected_goals': 5,
  'actual_goals': 4},
 {'file': '2018-03-01_Atl._Madrid_4-0_Leganes_WCMR1KPk.json',
  'expected_goals': 4,
  'actual_goals': 3}]

In [None]:
# 6768是从比分上统计的ground truth进球数
6768 / (soccerreplay_1988_events['goal'])

0.9304371734946384

#### 统计换人事件

In [None]:
import json
import os
from datetime import datetime


def calculate_average_substitutions(base_path, season_threshold):
    early_season_substitutions = []
    late_season_substitutions = []

    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # 修正日期提取逻辑，从match_info.timestamp中获取日期
                match_date_str = data['match_info'].get(
                    'timestamp', '').split()[0]
                if not match_date_str:
                    continue

                try:
                    match_date = datetime.strptime(match_date_str, '%Y-%m-%d')
                except ValueError:
                    print(f"日期解析失败: {file}")
                    continue

                # 赛季计算逻辑保持不变
                season = f"{match_date.year}-{match_date.year + 1}" if match_date.month >= 8 else f"{match_date.year - 1}-{match_date.year}"

                # 统计换人次数（修正comments的获取方式）
                comments = data.get('comments', [])
                substitution_count = sum(1 for comment in comments if comment.get(
                    'comments_type', '').lower() == 'substitution')

                # 调试输出（可选）
                # print(f"{file}: {substitution_count}次换人，赛季{season}")

                # 分组统计
                if season < season_threshold:
                    early_season_substitutions.append(substitution_count)
                else:
                    late_season_substitutions.append(substitution_count)

    # 计算平均值（处理空列表情况）
    early_avg = sum(early_season_substitutions) / \
        len(early_season_substitutions) if early_season_substitutions else 0
    late_avg = sum(late_season_substitutions) / \
        len(late_season_substitutions) if late_season_substitutions else 0

    return early_avg, late_avg


# 修改路径为实际路径（注意路径分隔符）
base_path = '../SoccerReplay-1988/SoccerReplay-1988'  # 根据实际文件结构调整路径
season_threshold = '2019-2020'

# 调用函数并打印结果
early_avg, late_avg = calculate_average_substitutions(
    base_path, season_threshold)
print(f"早于2019-2020赛季的平均换人数量: {early_avg:.2f}")
print(f"晚于2019-2020赛季的平均换人数量: {late_avg:.2f}")

早于2019-2020赛季的平均换人数量: 5.69
晚于2019-2020赛季的平均换人数量: 7.79


: 

## 根据选定事件收集SoccerReplay-1988

In [None]:
import os
import json


def filter_comments_by_type(input_dir, output_dir, target_types):
    """
    过滤SoccerReplay-1988数据集中的注释事件，仅保留指定类型
    
    Args:
        input_dir (str): 原始数据集根目录路径
        output_dir (str): 过滤后数据保存目录
        target_types (list): 需要保留的comments_type列表
    """
    # 创建输出目录（如果不存在）
    os.makedirs(output_dir, exist_ok=True)

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith('.json'):
                input_path = os.path.join(root, file)
                rel_path = os.path.relpath(root, input_dir)
                output_subdir = os.path.join(output_dir, rel_path)
                os.makedirs(output_subdir, exist_ok=True)
                output_path = os.path.join(output_subdir, file)

                try:
                    with open(input_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # 过滤comments数组
                    filtered_comments = [
                        comment for comment in data.get('comments', [])
                        if comment.get('comments_type') in target_types
                    ]

                    data['comments'] = filtered_comments  # 更新数据

                    # 保存处理后的文件
                    with open(output_path, 'w', encoding='utf-8') as f:
                        json.dump(data, f, indent=2, ensure_ascii=False)

                    print(f"Processed: {file}")

                except json.JSONDecodeError as e:
                    print(f"JSON解析错误: {input_path}, 错误: {str(e)}")
                except Exception as e:
                    print(f"处理文件失败: {input_path}, 错误: {str(e)}")


if __name__ == "__main__":
    # 需要保留的事件类型列表
    valid_types = [
        'corner', 'substitution', 'yellow card',
        'second yellow card', 'red card', 'goal',
        'penalty'
    ]

    # 设置输入输出路径
    input_directory = "../SoccerReplay-1988/SoccerReplay-1988"
    output_directory = "../Long-Video-QA/Data"

    # 执行过滤操作
    filter_comments_by_type(input_directory, output_directory, valid_types)

In [None]:
import os 
import json

# 遍历指定目录下的所有JSON文件，为每个注释添加索引，并直接写回源文件。
def add_comment_index(directory):
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # 为每个注释添加索引
                for i, comment in enumerate(data.get('comments', [])):
                    comment['index'] = i + 1

                # 将修改后的数据写回原文件
                with open(file_path, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2, ensure_ascii=False)
                print(f"已处理文件: {file_path}")


input_directory = "../Long-Video-QA/Data" 
add_comment_index(input_directory)

In [11]:
from openai import OpenAI


def workflow(input_text, Instruction, follow_up_prompt=None, api_key="sk-FOvSp53sdWa96yE71d74F52a4d074d7f9128E7Bb4f113cE7"):
    client = OpenAI(api_key=api_key,
                    base_url="https://az.gptplus5.com/v1", timeout=2400.0)
    completion = client.chat.completions.create(
        model="chatgpt-4o-latest",
        messages=[
            {"role": "system", "content": Instruction},
            {"role": "user", "content": input_text}
        ],
        timeout=2400.0,
        stream=False
    )

    first_round_reply = completion.choices[0].message.content

    return first_round_reply


workflow("What is the score of the match?",
         "You are a soccer expert. Please answer the question as accurately as possible.")

"Unfortunately, I don't have real-time access to live match scores or events. To check the current score of a soccer match, you can visit a sports website, use a mobile app like ESPN or FlashScore, or check with broadcasters covering the game. Let me know if you need help finding information on a specific match!"

In [5]:
import os
import json

with open("./2017-08-12_Watford_3-3_Liverpool_AaZvBO5T_goals.json", 'r', encoding='utf-8') as f:
    data = json.load(f)
len(data)

15

## 生成QA对

#### 尝试JSON Mode

In [None]:
import json
from openai import OpenAI


QA_SCHEMA = {
    "name": "generate_qa_pairs",
    "description": "Returns an array of QA pair objects in the specified format.",
    "parameters": {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "question":         {"type": "string"},
                "choices": {
                    "type": "array",
                    "items": {"type": "string"},
                    "minItems": 2,
                    "maxItems": 4
                },
                "answer":           {"type": "string", "enum": ["A", "B", "C", "D"]},
                "type":             {"type": "string", "enum": ["past", "full", "future"]},
                "question_level":   {"type": "string", "enum": ["match_level", "team_level", "player_level"]},
                "related_event": {
                    "type": "array",
                    "items": {"type": "integer"}
                },
                "current_event_index": {
                    "anyOf": [
                        {"type": "integer"},
                        {"type": "null"}
                    ]
                },
                "index":            {"type": "integer"}
            },
            "required": ["question", "choices", "answer", "type", "question_level", "related_event", "current_event_index", "index"]
        }
    }
}


def workflow(input_text, Instruction, follow_up_prompt=None, api_key="sk-FOvSp53sdWa96yE71d74F52a4d074d7f9128E7Bb4f113cE7"):
    client = OpenAI(api_key=api_key,
                    base_url="https://az.gptplus5.com/v1", timeout=2400.0)

    completion = client.chat.completions.create(
        # model="deepseek-chat",
        model="chatgpt-4o-latest",
        messages=[
            {"role": "system", "content": Instruction},
            {"role": "user",   "content": input_text}
        ],
        functions=[QA_SCHEMA],
        function_call={"name": "generate_qa_pairs"},
        timeout=2400.0,
        stream=False
    )
    
    # 这里直接拿到已经是 JSON 格式的字符串
    args = completion.choices[0].message.function_call.arguments
    return json.loads(args)


def clean_json_block(text):
    if text.strip().startswith("```"):
        lines = text.strip().splitlines()
        return "\n".join(lines[1:-1])
    return text


def extract_goal_comments(json_file_path):
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    comments_list = data.get("comments", [])
    result = []

    for comment in comments_list:
        if comment.get("comments_type") != "goal":
            continue

        half = comment.get("half")
        if half not in [1, 2]:
            continue

        timestamp = comment.get("time_stamp", "")
        if not timestamp:
            continue

        comments_text = comment.get("comments_text", "")
        if not comments_text:
            continue

        index = comment.get("index", -1)  # 获取index，如果不存在则默认为-1
        half_str = "1st half" if half == 1 else "2nd half"
        commentary_line = f"{half_str} - {timestamp} \"{comments_text}\" (Index: {index})"
        result.append(commentary_line)

    return "\n".join(result)


def generate_qa_pairs(json_path, save_path):
    goal_comments = extract_goal_comments(json_path)

    input_text = f"""Follow these requirements:

1. CONTEXT:
The provided text contains goal-related commentary from a football match. Use this to create three types of QA pairs across three different levels. 
When done, call the function `generate_qa_pairs` to return your JSON object with a 'qa_pairs' property containing the array of QA pairs.

2. QA TYPES AND EXAMPLES:
A. Past Video QA (At least 3 questions per level)
Match Level Examples:
  * "How many goals have been scored in the match so far?"
  * "What is the current score of the match?"
  * "How many minutes passed between the first and second goals?"

Team Level Examples:
  * "How many goals has Liverpool scored up to this point?"
  * "Which team scored the most recent goal?"
  * "Has Watford managed to take the lead yet?"

Player Level Examples:
  * "Who scored the opening goal of the match?"
  * "Has Roberto Firmino scored any goals so far?"
  * "Which player provided the assist for the last goal?"

B. Full Video QA (At least 3 questions per level)
Match Level Examples:
  * "How many total goals were scored in the match?"
  * "In which half were more goals scored?"
  * "What was the final score of the match?"

Team Level Examples:
  * "How many goals did Watford score in total?"
  * "Which team scored more goals in the second half?"
  * "Did both teams score in both halves?"

Player Level Examples:
  * "Who scored the most goals in the match?"
  * "Which players scored for Liverpool?"
  * "Did any player score multiple goals?"

C. Future Prediction QA (At least 3 questions per level, Can be asked during the middle of the match)
Match Level Examples:
  * "Will there be more goals in this match?"
  * "Will the match end in a draw?"
  * "How many more goals will be scored in this match?"
  * "Who will score the next goal?"

Team Level Examples:
  * "Which team is more likely to score next?"
  * "Will the trailing team manage to equalize?"
  * "Will either team keep a clean sheet in the remaining time?"

Player Level Examples:
  * "Will any substitute player score in the remaining time?"
  * "Will any player complete a hat-trick?"
  * "Which striker is most likely to score next?"

3. REQUIREMENTS:
- Generate at least 5 questions of each type
- Questions must require the provided goal data to answer
- Every question must be a multiple choice question with 2-4 choices
- Questions must cover all three levels:
  * match_level: Questions about overall match statistics or events
  * team_level: Questions about specific team's performance
  * player_level: Questions about individual player's actions
- related_event must be a list of indices referring to relevant goal events
- Vary question formats and complexity
- Include specific details from the commentary
- Ensure answers are clear, concise and accurate
- For prediction questions, use knowledge of all events to provide correct answers
- For 'past' and 'future' types, include the 'current_event_index' field
- For 'full' type, set 'current_event_index' to None

5. COMMENTARY:
{goal_comments}

Generate the QA pairs in the specified JSON format only."""

    instruction = "You are a football analyst tasked with creating diverse QA pairs from goal-related match commentary. Ensure that answers are generated by considering ALL events in the match."

    qa_pairs_response = workflow(
        input_text=input_text, Instruction=instruction)
    qa_pairs = qa_pairs_response.get("qa_pairs", [])

    # qa_pairs = clean_json_block(qa_pairs)

    return qa_pairs


if __name__ == "__main__":
    match_path = "../Long-Video-QA/Data/england_epl_2017-2018/2017-08-12_watford-fc-liverpool-fc/2017-08-12_Watford_3-3_Liverpool_AaZvBO5T.json"
    save_path = "./2017-08-12_Watford_3-3_Liverpool_AaZvBO5T_goals.json"
    qa_pairs = generate_qa_pairs(match_path, save_path)

### 尝试使用deepseek

In [2]:
import json
from openai import OpenAI


def workflow(input_text, Instruction, follow_up_prompt=None, api_key="sk-3db847df235b46d894d71b425c57c0f0"):
    client = OpenAI(api_key=api_key,
                    base_url="https://api.deepseek.com", timeout=2400.0)
    completion = client.chat.completions.create(
        # model="chatgpt-4o-latest",
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": Instruction},
            {"role": "user", "content": input_text}
        ],
        timeout=2400.0,
        stream=False
    )

    first_round_reply = completion.choices[0].message.content

    return first_round_reply


workflow("What is the score of the match?",
         "You are a soccer expert. Please answer the question as accurately as possible.")

"I currently don't have access to live match scores. You can check the latest scores on sports websites like **ESPN**, **BBC Sport**, **SofaScore**, or **FlashScore**.  \n\nLet me know if you'd like historical match results or analysis on a specific game!"

In [1]:
import json
from openai import OpenAI


def workflow(input_text, Instruction, follow_up_prompt=None, api_key="sk-FOvSp53sdWa96yE71d74F52a4d074d7f9128E7Bb4f113cE7"):
    client = OpenAI(api_key=api_key,
                    base_url="https://az.gptplus5.com/v1", timeout=2400.0)
    completion = client.chat.completions.create(
        # model="chatgpt-4o-latest",
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": Instruction},
            {"role": "user", "content": input_text}
        ],
        timeout=2400.0,
        stream=False
    )

    first_round_reply = completion.choices[0].message.content

    return first_round_reply


def clean_json_block(text):
    if text.strip().startswith("```"):
        lines = text.strip().splitlines()
        return "\n".join(lines[1:-1])
    return text


def extract_goal_comments(json_file_path):
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    comments_list = data.get("comments", [])
    result = []

    for comment in comments_list:
        if comment.get("comments_type") != "goal":
            continue

        half = comment.get("half")
        if half not in [1, 2]:
            continue

        timestamp = comment.get("time_stamp", "")
        if not timestamp:
            continue

        comments_text = comment.get("comments_text", "")
        if not comments_text:
            continue

        index = comment.get("index", -1)  # 获取index，如果不存在则默认为-1
        half_str = "1st half" if half == 1 else "2nd half"
        commentary_line = f"{half_str} - {timestamp} \"{comments_text}\" (Index: {index})"
        result.append(commentary_line)

    return "\n".join(result)


def generate_qa_pairs(json_path, save_path):
    goal_comments = extract_goal_comments(json_path)

    input_text = f"""Follow these requirements:

1. CONTEXT:
The provided text contains goal-related commentary from a football match. Use this to create three types of QA pairs across three different levels:

2. QA TYPES AND EXAMPLES:
A. Past Video QA (At least 3 questions per level)
Match Level Examples:
  * "How many goals have been scored in the match so far?"
  * "What is the current score of the match?"
  * "How many minutes passed between the first and second goals?"

Team Level Examples:
  * "How many goals has Liverpool scored up to this point?"
  * "Which team scored the most recent goal?"
  * "Has Watford managed to take the lead yet?"

Player Level Examples:
  * "Who scored the opening goal of the match?"
  * "Has Roberto Firmino scored any goals so far?"
  * "Which player provided the assist for the last goal?"

B. Full Video QA (At least 3 questions per level)
Match Level Examples:
  * "How many total goals were scored in the match?"
  * "In which half were more goals scored?"
  * "What was the final score of the match?"

Team Level Examples:
  * "How many goals did Watford score in total?"
  * "Which team scored more goals in the second half?"
  * "Did both teams score in both halves?"

Player Level Examples:
  * "Who scored the most goals in the match?"
  * "Which players scored for Liverpool?"
  * "Did any player score multiple goals?"

C. Future Prediction QA (At least 3 questions per level, Can be asked during the middle of the match)
Match Level Examples:
  * "Will there be more goals in this match?"
  * "Will the match end in a draw?"
  * "How many more goals will be scored in this match?"
  * "Who will score the next goal?"

Team Level Examples:
  * "Which team is more likely to score next?"
  * "Will the trailing team manage to equalize?"
  * "Will either team keep a clean sheet in the remaining time?"

Player Level Examples:
  * "Will any substitute player score in the remaining time?"
  * "Will any player complete a hat-trick?"
  * "Which striker is most likely to score next?"

3. RESPONSE FORMAT:
You MUST ONLY RETURN A JSON ARRAY of objects with this structure:
{{
    "question": "The question text",
    "choices": ["A. Choice A", "B. Choice B", "C. Choice C", "D. Choice D"],  # 2-4 choices
    "answer": "The correct choice letter (A/B/C/D)",
    "type": "past/full/future",
    "question_level": "match_level/team_level/player_level",
    "related_event": [0, 1, 2],  # List of related event indices in ascending order
    "current_event_index": 5,  # Index of the current event for 'past' and 'future' types, None for 'full'
    "index": 0  # The index of the question in the original commentary, starting from 0
}}
You MUST NOT include any other text or explanation.

4. REQUIREMENTS:
- Generate at least 5 questions of each type
- Questions must require the provided goal data to answer
- Every question must be a multiple choice question with 2-4 choices
- Questions must cover all three levels:
  * match_level: Questions about overall match statistics or events
  * team_level: Questions about specific team's performance
  * player_level: Questions about individual player's actions
- related_event must be a list of indices referring to relevant goal events
- Vary question formats and complexity
- Include specific details from the commentary
- Ensure answers are clear, concise and accurate
- For prediction questions, use knowledge of all events to provide correct answers
- For 'past' and 'future' types, include the 'current_event_index' field
- For 'full' type, set 'current_event_index' to None

5. COMMENTARY:
{goal_comments}

Generate the QA pairs in the specified JSON format only."""

    instruction = "You are a football analyst tasked with creating diverse QA pairs from goal-related match commentary. Ensure that answers are generated by considering ALL events in the match."

    qa_pairs = workflow(input_text=input_text,
                        Instruction=instruction)

    qa_pairs = clean_json_block(qa_pairs)

    return qa_pairs


if __name__ == "__main__":
    match_path = "../Long-Video-QA/Data/england_epl_2017-2018/2017-08-12_watford-fc-liverpool-fc/2017-08-12_Watford_3-3_Liverpool_AaZvBO5T.json"
    save_path = "./2017-08-12_Watford_3-3_Liverpool_AaZvBO5T_goals.txt"
    qa_pairs = generate_qa_pairs(match_path, save_path)

In [2]:
qa_pairs

'[\n    {\n        "question": "How many goals have been scored in the match so far?",\n        "choices": ["A. 1", "B. 2", "C. 3", "D. 4"],\n        "answer": "C",\n        "type": "past",\n        "question_level": "match_level",\n        "related_event": [1, 3, 4],\n        "current_event_index": 4,\n        "index": 0\n    },\n    {\n        "question": "What is the current score of the match?",\n        "choices": ["A. 1:0", "B. 1:1", "C. 2:1", "D. 2:2"],\n        "answer": "C",\n        "type": "past",\n        "question_level": "match_level",\n        "related_event": [1, 3, 4],\n        "current_event_index": 4,\n        "index": 1\n    },\n    {\n        "question": "How many minutes passed between the first and second goals?",\n        "choices": ["A. 7 minutes", "B. 20 minutes", "C. 28 minutes", "D. 31 minutes"],\n        "answer": "B",\n        "type": "past",\n        "question_level": "match_level",\n        "related_event": [1, 3],\n        "current_event_index": 3,\n   

In [2]:
import os
path = './Data/england_epl_2017-2018/2017-08-12_watford-fc-liverpool-fc/2017-08-12_Watford_3-3_Liverpool_AaZvBO5T.json'
os.listdir('./Tiny_QA/Subject_Data/2017-08-12_watford-fc-liverpool-fc')[0]

'2017-08-12_Watford_3-3_Liverpool_AaZvBO5T.json'