In [1]:
import json

def parse_gemini_json(s: str):
    """Parse a Gemini reply containing a JSON object into a normalized dict.

    The reply may be wrapped in a Markdown code fence (optionally tagged
    ``json``) and may carry surrounding whitespace. Keys of the JSON object
    are matched case-insensitively and with surrounding whitespace ignored.

    Args:
        s: Raw reply text.

    Returns:
        A dict with the keys:
          - "foul": bool, True when the decision mentions "foul" and contains
            no "no" substring.
          - "card": one of "Yellow", "Red", "No card".
          - "advantage": bool, True for "yes" / "true" / "1".
          - "punishment": one of "PK", "DB", "IFK", "FK", "None".
          - "foul_explanation", "card_explanation", "advantage_explanation",
            "punishment_explanation": the raw values from the reply, or ""
            when the corresponding key is absent.

    Raises:
        json.JSONDecodeError: If the payload is not valid JSON.
        ValueError: If the top-level JSON value is not an object.
    """
    import re  # local import: only needed for the word-boundary card check

    # str.strip(chars) removes a *set of characters*, not a prefix/suffix, and
    # stops at the first character outside that set. Trim whitespace first so
    # a trailing newline after the closing fence does not shield the backticks.
    text = s.strip().strip("`").strip()
    if text[:4].lower() == "json":
        # Drop the language tag that followed the opening fence.
        text = text[4:]
    j = json.loads(text)
    if not isinstance(j, dict):
        raise ValueError("Expected a JSON object at the top level.")

    def get(k, default=None):
        # Case-insensitive getter; falls back to `default` when the key is absent.
        wanted = k.strip().lower()
        for key, value in j.items():
            if key.strip().lower() == wanted:
                return value
        return default

    # ---- normalize helpers ----
    def to_bool_decision(v: str) -> bool:
        # Substring test: any "no" anywhere in the text (e.g. "No foul",
        # "not a foul") yields False.
        v = v.lower()
        return "no" not in v and "foul" in v

    def to_bool_yesno(v: str) -> bool:
        return str(v).strip().lower() in {"yes", "true", "1"}

    def norm_card(v: str) -> str:
        v = v.lower()
        if "yellow" in v:
            return "Yellow"
        # Match "red" only as a whole word so that text such as
        # "no card required" is not classified as a red card.
        if re.search(r"\bred\b", v):
            return "Red"
        return "No card"

    def norm_punish(v: str) -> str:
        # Order matters: "indirect" must be tested before the generic
        # "direct" / "free kick" check.
        # NOTE(review): a negated value such as "no penalty" still maps to
        # "PK" because the penalty check runs first — confirm whether the
        # model can emit such values.
        v = v.lower()
        if "penalty" in v or v == "pk":
            return "PK"
        if "drop" in v:
            return "DB"
        if "indirect" in v or v == "ifk":
            return "IFK"
        if "free kick" in v or "direct" in v or v in {"fk", "dfk"}:
            return "FK"
        return "None"

    foul_exp = get("decision explanation", "")
    card_exp = get("card explanation", "")
    punish_exp = get("punishment explanation", "")
    adv_exp = get("advantage explanation", "")

    return {
        "foul": to_bool_decision(str(get("decision", ""))),
        "card": norm_card(str(get("card", ""))),
        "advantage": to_bool_yesno(str(get("advantage", ""))),
        "punishment": norm_punish(str(get("punishment", ""))),
        "foul_explanation": foul_exp,
        "card_explanation": card_exp,
        "advantage_explanation": adv_exp,
        "punishment_explanation": punish_exp,
    }


In [2]:
import embedding

def indicator(bool1: bool, bool2: bool):
    """Return 1 when the two values compare equal, otherwise 0."""
    return 1 if bool1 == bool2 else 0

def get_card_cor(card_vlm: str, card_referee: str):
    """Score the model's card decision against the referee's card decision.

    Both labels are compared case-insensitively and with surrounding
    whitespace ignored, so "yellow" and "Yellow" count as an exact match.

    Args:
        card_vlm: Card label produced by the model.
        card_referee: Card label given by the referee.

    Returns:
        1 for an exact match, 0.6 when the model says yellow and the referee
        says red, 0.4 when the model says red and the referee says yellow,
        and 0 for every other combination.
    """
    vlm = card_vlm.strip().lower()
    referee = card_referee.strip().lower()

    if vlm == referee:
        return 1
    # Partial credit for confusing the two card colours; the weights are
    # intentionally asymmetric.
    if vlm == "yellow" and referee == "red":
        return 0.6
    if vlm == "red" and referee == "yellow":
        return 0.4
    return 0

def get_average_score(explanation_vlm: str, explanation_list_referee: list[str]):
    """Average the similarity between one explanation and several references.

    Each reference is compared with ``explanation_vlm`` through
    ``embedding.sentence_bert_sim`` and the arithmetic mean of the results is
    returned.

    Args:
        explanation_vlm: Explanation text produced by the model.
        explanation_list_referee: Reference explanations to compare against.

    Returns:
        The mean similarity, or None when there are no references (instead of
        failing with a ZeroDivisionError).
    """
    if not explanation_list_referee:
        return None

    # presumably sentence_bert_sim returns a numeric similarity — it is
    # defined in the project-local `embedding` module; verify there.
    scores = [
        embedding.sentence_bert_sim(explanation_vlm, explanation_referee)
        for explanation_referee in explanation_list_referee
    ]
    return sum(scores) / len(scores)


def calculate_correctness(response_dict: dict, referee_dict: dict):
    """Compare a parsed model response with the referee annotations.

    Only the foul, card and advantage fields are scored; punishment fields
    are not read, so they do not need to be present in either dict.

    Args:
        response_dict: Parsed model output with the keys "foul", "card",
            "advantage" and "<field>_explanation" for those three fields.
        referee_dict: Referee annotations under the same key names. Label
            entries are indexed with ``[0]`` and explanation entries are
            passed to ``get_average_score`` as a collection of references
            (presumably lists — confirm against the dataset). Empty entries
            mean "not annotated".

    Returns:
        A tuple ``([foul_cor, card_cor, advantage_cor],
        [foul_score, card_score, advantage_score])``. An element is None when
        the referee data needed to compute it is missing.
    """
    def first_label(key):
        # The first referee label for `key`, or None when none was recorded.
        labels = referee_dict[key]
        return labels[0] if labels else None

    foul = response_dict["foul"]
    card = response_dict["card"]
    advantage = response_dict["advantage"]

    foul_referee = first_label("foul")
    card_referee = first_label("card")
    advantage_referee = first_label("advantage")

    foul_cor, card_cor, advantage_cor = None, None, None

    if foul_referee is not None:
        foul_cor = indicator(foul, foul_referee)

    if card_referee:
        card_cor = get_card_cor(card, card_referee)

    # When the referee called no foul, the model's advantage answer is
    # compared against False regardless of the advantage annotation.
    # `== False` is deliberate: a missing (None) label must not take this branch.
    if foul_referee == False:
        advantage_cor = indicator(advantage, False)
    elif advantage_referee is not None:
        advantage_cor = indicator(advantage, advantage_referee)

    scores = []
    for field in ("foul", "card", "advantage"):
        explanation = response_dict[f"{field}_explanation"]
        references = referee_dict[f"{field}_explanation"]
        # Skip scoring when the referee gave no explanation for this field.
        scores.append(get_average_score(explanation, references) if references else None)

    return [foul_cor, card_cor, advantage_cor], scores


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Running the experiment

import gemini_call
import copy
# Per-metric accumulators; an entry is None when a sample could not be scored.
cor_dict = {"foul": [], "card": [], "advantage": []}
score_dict = {"foul": [], "card": [], "advantage": []}
all_responses_dict = {}
summary_dict = {}

count = 0
num_samples = 10

# Load the dataset up front so the file handle is not held open while the
# (slow) model calls run.
with open("complete_consistent_data.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

for path, value_dict in data.items():
    url1 = value_dict["video1"]
    url2 = value_dict["video2"]
    response = gemini_call.gemini_call(url1, url2, "prompt.txt")
    print(response)

    response_dict = parse_gemini_json(response)
    all_responses_dict[path] = response_dict

    cor_list, score_list = calculate_correctness(response_dict, value_dict)
    # Deep copy so the summary entry can be extended without touching the
    # dict stored in all_responses_dict.
    summary_dict[path] = copy.deepcopy(response_dict)
    summary_dict[path]["correctness"] = cor_list
    summary_dict[path]["scores"] = score_list

    # cor_list / score_list are ordered [foul, card, advantage].
    for index, metric in enumerate(("foul", "card", "advantage")):
        cor_dict[metric].append(cor_list[index])
        score_dict[metric].append(score_list[index])

    count += 1
    if count >= num_samples:
        break

with open("gemini_responses.json", 'w', encoding='utf-8') as f:
    json.dump(summary_dict, f, ensure_ascii=False, indent=4)

for key, values in cor_dict.items():
    scored = [v for v in values if v is not None]
    if scored:
        average_cor = sum(scored) / len(scored)
        print(f"Average correctness for {key}: {average_cor:.4f}")
    else:
        # No sample had referee data for this metric; avoid dividing by zero.
        average_cor = None
        print(f"Average correctness for {key}: n/a (no scored samples)")

for key, values in score_dict.items():
    scored = [v for v in values if v is not None]
    if scored:
        average_score = sum(scored) / len(scored)
        print(f"Average similarity score for {key}: {average_score:.4f}")
    else:
        # No sample had referee explanations for this metric.
        average_score = None
        print(f"Average similarity score for {key}: n/a (no scored samples)")

fps: 25.00, frames: 125, duration: 5.00 seconds
Saved 100 images to: frames_live
fps: 25.00, frames: 126, duration: 5.04 seconds
Saved 101 images to: frames_replay
```json
{
"decision": "Foul",
"decision explanation": "Chelsea player 21 made contact with WBA player 26's face with his arm after the ball was played, causing him to fall to the ground. This constitutes a direct free kick offense.",
"card": "Yellow",
"card explanation": "The contact was a reckless use of the arm to an opponent's face, deemed unsporting behavior rather than violent conduct or excessive force.",
"punishment": "Free Kick",
"punishment explanation": "The foul occurred in midfield, outside the penalty area, and is a direct free kick offense according to the Laws of the Game.",
"advantage": "No",
"advantage explanation": "The fouled player immediately went to ground holding his face, clearly indicating an injury or inability to continue the play, so no attacking advantage was apparent."
}
```
fps: 25.00, frames: 