In [2]:
# Merge three json files into one

import json
import re

def merge_json_files(file1, file2, file3, output_file):
    """Concatenate the contents of three JSON files into one JSON list.

    Each input is read in the order given. A top-level list contributes its
    elements individually; any other top-level value is added as a single
    element.

    Args:
        file1: Path of the first JSON file to read.
        file2: Path of the second JSON file to read.
        file3: Path of the third JSON file to read.
        output_file: Path of the JSON file to write the combined list to
            (UTF-8, non-ASCII characters kept as-is, 4-space indent).
    """
    combined = []

    for source_path in (file1, file2, file3):
        with open(source_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        # Flatten top-level lists; wrap everything else so it lands as one item.
        combined += payload if isinstance(payload, list) else [payload]

    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(combined, handle, ensure_ascii=False, indent=4)



In [None]:
if __name__ == "__main__":
    # Merge the SoccerNet test, train and valid annotation files (in that
    # order) into combined_data.json.
    merge_json_files(
        file1="codes/soccerNet/annotations_test.json",
        file2="codes/soccerNet/annotations_train.json",
        file3="codes/soccerNet/annotations_valid.json",
        output_file="combined_data.json",
    )

In [3]:
# Exact question strings used to route each answer to its explanation list.
question1, question2, question3 = (
    "Is it a foul or not? Why?",
    "What card would you give? Why?",
    "Could the referee have given advantage? Why?",
)

'''
The text cannot be converted to lower case because capitalization
carries information.
For example, 'No' must indicate false,
but 'no' could appear in the explanation, where it may not mean false.
'''

# Keyword -> decision lookup tables. Matching against these keys is
# case-sensitive, so every accepted capitalization is listed explicitly.

# Foul question (question1): True = foul, False = no foul.
key1 = {
    "Yes": True,
    "No": False,
    "No foul": False,
    "No Foul": False,
    "no foul": False,
    "Not a foul": False,
    "not a foul": False,
    "Not foul": False,
    "not foul": False,
    "yes": True,
}

# Card question (question2): every spelling maps to one of the canonical
# labels "Yellow", "Red" or "No card". A "no foul" style answer means no card.
key2 = {
    "Yellow": "Yellow",
    "Red": "Red",
    "No card": "No card",
    "No Card": "No card",
    "no card": "No card",
    "No foul": "No card",
    "No Foul": "No card",
    "no foul": "No card",
    "No": "No card",
    "yellow": "Yellow",
    "red": "Red",
    "Not a foul": "No card",
    "not a foul": "No card",
    "Not foul": "No card",
    "not foul": "No card",
}

# Advantage question (question3): True = advantage possible, False = not.
# A "no foul" style answer is treated as no advantage.
key3 = {
    "Yes": True,
    "No": False,
    "No advantage": False,
    "no advantage": False,
    "No Advantage": False,
    "yes": True,
    "No foul": False,
    "No Foul": False,
    "no foul": False,
    "Not a foul": False,
    "not a foul": False,
    "Not foul": False,
    "not foul": False,
}


In [4]:
# Change the formatting of data to video-based

def transform_data(file, output_file):
    """Regroup question/answer records by video path and write them as JSON.

    Reads a JSON collection of records from ``file``. Records that share the
    same ``path`` are collapsed into a single entry which keeps the metadata
    (video1, video2, league, number_of_games) of the first record seen and
    collects each answer in the explanation list matching its question.
    Answers to any other question are ignored. The decision lists ("foul",
    "card", "advantage", "punishment") and "punishment_explanation" are
    created empty.

    A warning is printed when a later record for a known path carries a
    different video1/video2 than the first one; its answer is still recorded.

    Args:
        file: Path of the input JSON file.
        output_file: Path of the JSON file that receives the path-keyed mapping.
    """
    with open(file, 'r', encoding='utf-8') as src:
        records = json.load(src)

    # (question text, target list) pairs, checked in this order with ==.
    routes = (
        (question1, "foul_explanation"),
        (question2, "card_explanation"),
        (question3, "advantage_explanation"),
    )

    grouped = {}
    for record in records:
        path = record.get("path", "")
        video1 = record.get("video1", "")
        video2 = record.get("video2", "")
        question = record.get("question", "")
        answer = record.get("answer", "")

        if path not in grouped:
            # First record for this path defines the entry's metadata.
            grouped[path] = {
                "video1": video1,
                "video2": video2,
                "league": record.get("league", ""),
                "number_of_games": record.get("number_of_games", ""),
                "foul": [],
                "card" : [],
                "advantage": [],
                "punishment": [],
                "foul_explanation": [],
                "card_explanation": [],
                "advantage_explanation": [],
                "punishment_explanation": []
            }
        elif video1 != grouped[path]["video1"] or video2 != grouped[path]["video2"]:
            print(f"Warning: Inconsistent video info for path {path}")

        entry = grouped[path]
        for prompt, target in routes:
            if question == prompt:
                entry[target].append(answer)
                break

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(grouped, dst, ensure_ascii=False, indent=4)


In [None]:
if __name__ == "__main__":
    # Regroup the merged annotations by video path.
    transform_data(file="combined_data.json", output_file="final_data.json")

In [5]:
# Use key words match to fill in the decision fields

def _match_whole_word(word, text):
    """Return True if ``word`` occurs in ``text`` between regex word boundaries."""
    return re.search(rf'\b{re.escape(word)}\b', text) is not None


def _decide(explanation, keyword_map):
    """Return the one decision implied by ``explanation``, else None.

    Every key of ``keyword_map`` found as a whole word contributes its mapped
    value. The result is that value when all matches agree, and None when
    nothing matched or the matches point to different decisions.
    """
    decisions = {
        decision
        for keyword, decision in keyword_map.items()
        if _match_whole_word(keyword, explanation)
    }
    return decisions.pop() if len(decisions) == 1 else None


def fill_in_decision(file, output_file="auto_labeled_data.json"):
    """Label every explanation by keyword matching and write the result.

    For each entry of the path-keyed JSON mapping in ``file``, each text in
    "foul_explanation", "card_explanation" and "advantage_explanation" is
    matched against ``key1``, ``key2`` and ``key3`` respectively, and one
    decision per text is appended to the entry's "foul", "card" and
    "advantage" list. A text that matches no keyword, or keywords with
    conflicting decisions, is labeled None.

    Args:
        file: Path of the input JSON file (mapping of path -> entry dict).
        output_file: Path of the JSON file to write the labeled mapping to.
            Defaults to "auto_labeled_data.json".
    """
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # (decision field, keyword table); the explanation list for each field is
    # stored under "<field>_explanation".
    fields = (("foul", key1), ("card", key2), ("advantage", key3))

    for value_dict in data.values():
        for field, keyword_map in fields:
            explanations = value_dict.get(f"{field}_explanation", [])
            # setdefault: tolerate entries that lack the decision list instead
            # of raising KeyError, consistent with the defensive .get above.
            value_dict.setdefault(field, []).extend(
                _decide(explanation, keyword_map) for explanation in explanations
            )

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


In [None]:
if __name__ == "__main__":
    # Auto-label the path-grouped data produced by transform_data.
    fill_in_decision(file="final_data.json")

In [None]:
# Find data items with complete labels and export as complete_items_data.json

with open("auto_labeled_data.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keep an item only when none of its foul/card/advantage label lists
# contains None.
complete_item_dict = {}
for path, value_dict in data.items():
    foul, card, advantage = (
        value_dict.get(field, []) for field in ("foul", "card", "advantage")
    )
    if None not in foul and None not in card and None not in advantage:
        complete_item_dict[path] = value_dict

# Paths are unique dict keys, so the number kept equals the dict size.
count = len(complete_item_dict)
print(count)

with open("complete_items_data.json", 'w', encoding='utf-8') as f:
    json.dump(complete_item_dict, f, ensure_ascii=False, indent=4)

1844


In [7]:
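# Keep items whose foul, card and advantage labels each have exactly one distinct value and export as complete_consistent_data.json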

with open("complete_items_data.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# An item is consistent when each of its foul/card/advantage label lists
# collapses to exactly one distinct value (an empty list does not qualify).
consistent_dict = {}
for path, value_dict in data.items():
    foul, card, advantage = (
        set(value_dict.get(field, [])) for field in ("foul", "card", "advantage")
    )
    if len(foul) == len(card) == len(advantage) == 1:
        consistent_dict[path] = value_dict

# Paths are unique dict keys, so the number kept equals the dict size.
count = len(consistent_dict)
print(count)

with open("complete_consistent_data.json", 'w', encoding='utf-8') as f:
    json.dump(consistent_dict, f, ensure_ascii=False, indent=4)

1675
