In [None]:
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Path prefix of the IntPhys2 dataset. get_values_singlevideo builds the ground-truth
# path as YOUR_INTPHYS2_PATH + source + "/metadata.csv" by plain string concatenation,
# so a directory path must end with a "/".
YOUR_INTPHYS2_PATH = ""
# Path prefix of the model prediction CSVs. get_values_singlevideo appends
# "<seed>prompt<prompt>_<source>_<model>_<fps>fps.csv" directly to this string,
# so a directory path must end with a "/".
YOUR_MODEL_PREDICTION_PATH = ""

In [None]:
def calculate_accuracy(df_merged, df_gt_labels, col1, col2):
    """
    Score predictions against targets, normalising by the ground-truth size.

    Args:
      df_merged: DataFrame holding both the prediction and the target columns.
      df_gt_labels: Ground-truth DataFrame; only its row count is used, as the
        denominator of the accuracy. Ground-truth rows that have no matching
        row in ``df_merged`` therefore lower the accuracy.
      col1: Name of the prediction column. A value of 2 in this column is
        counted separately (fourth element of the result).
      col2: Name of the target column.

    Returns:
      A 4-tuple ``(accuracy, n_equal, n_ground_truth, n_col1_equal_2)`` where
      ``accuracy = n_equal / n_ground_truth`` and the three counts are ints.
    """
    predictions = df_merged[col1]
    # Rows whose prediction column holds the sentinel value 2.
    n_sentinel = (predictions == 2).sum()
    n_equal = (predictions == df_merged[col2]).sum()
    n_ground_truth = len(df_gt_labels)
    ratio = n_equal / float(n_ground_truth)
    return (ratio, int(n_equal), int(n_ground_truth), int(n_sentinel))



def label_pred_yes_no(row):
    """Turn a free-text yes/no model answer into a numeric label.

    The response is converted to ``str`` and matched case-insensitively, so
    "YES" and "NO" are recognised as well as "yes"/"Yes"/"no"/"No".

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a "response" entry.

    Returns:
        1 if the response contains "yes", otherwise 0 if it contains "no",
        otherwise 2 (answer could not be parsed).

    Note:
        Matching is by substring, not by word, and "yes" takes precedence:
        a response such as "not possible" yields 0 because it contains "no".
    """
    text = str(row["response"]).lower()
    if "yes" in text:
        return 1
    if "no" in text:
        return 0
    return 2

def label_pred_1_0(row):
    """Turn a free-text 1/0 model answer into a numeric label.

    The response is converted to ``str`` and searched for digit characters.

    Returns:
        1 if the text contains "1", otherwise 0 if it contains "0",
        otherwise 2 (neither digit present).
    """
    text = str(row["response"])
    # "1" is tested first, so a response containing both digits maps to 1.
    for digit, label in (("1", 1), ("0", 0)):
        if digit in text:
            return label
    return 2
        
def label_target(row):
    """Derive the numeric ground-truth label from the row's "type" entry.

    Returns:
        0 when "type" contains "Impossible", 1 when it contains "Possible".
        For any other value "Parsing Error!" is printed and None is returned.
    """
    video_type = row["type"]
    # "Impossible" is checked first; the match is case-sensitive.
    if "Impossible" in video_type:
        return 0
    if "Possible" in video_type:
        return 1
    print("Parsing Error!")
    return None

# Values of the ground-truth `env` column that make up each difficulty tier.
_DIFFICULTY_ENVS = {
    "Easy": ['BasicLevel_0'],
    "Medium": ["SaltFlats_0", "DesertMap_0", "RaceTrack_0", "TropicalIsland_0"],
    "Hard": ["PLVDaylight_0", "Egypt_0", "RuralAustralia03_0", "ParkingGarage_0", "None"],
}

# Values of the ground-truth `game_name` column for each camera setting.
# NOTE(review): spellings such as "Soldity" / "Scaffoling" are reproduced as-is;
# presumably they match the metadata CSV -- verify before "correcting" them.
_CAMERA_GAMES = {
    "FixedCamera": ["FixedMarryPoppins", "FixedJumpSolidity", "RotatingCup", "HotAirBallon", "SphereFallingDown", "SolidityFallingFlat"],
    "MovingCamera": ["SphereFallingDownSoldity", "BoxSoldity", "Scaffoling", "CameraSolidity", "JumpSolidity", "Box", "MovingAroundOccluder", "JailStone", "PrisonCell", "Restaurant"],
}

# Values of the ground-truth `condition` column for each physical property.
_PROPERTY_CONDITIONS = {
    "Permanence": ["permanence"],
    "Immutability": ["immutability", "immutability_texture"],
    "Continuity": ["continuity", "continuity_swap"],
    "Solidity": ["solidity"],
}

# Prompt ids whose responses are parsed as yes/no; every other id is parsed as 1/0.
_YES_NO_PROMPTS = (0, 2, 4, 5, 6)

# Models whose prediction `filename` values are reduced to the part after the last "/".
_BASENAME_ONLY_MODELS = ("Qwen2_5_VL", "Perception-LM-3B")


def get_values_singlevideo(model, source="Main", split="", subsplit='', fps=15, prompt=0, seed=1):
    """Score one prediction CSV against the ground-truth metadata of `source`.

    The prediction file is
    ``YOUR_MODEL_PREDICTION_PATH + "<seed>prompt<prompt>_<source>_<model>_<fps>fps.csv"``
    and the ground truth is ``YOUR_INTPHYS2_PATH + source + "/metadata.csv"``
    (both built by plain string concatenation). Predictions and ground truth
    are joined on the video file name.

    Args:
        model: Model name as used in the prediction file name. For
            "Qwen2_5_VL" and "Perception-LM-3B" only the last "/"-separated
            component of each predicted file name is kept.
        source: Dataset source folder. The `split` / `subsplit` filters are
            applied only when this is "Main".
        split: Camera filter, "FixedCamera" or "MovingCamera"; any other value
            (e.g. "" or "All") applies no camera filter.
        subsplit: Either a difficulty tier ("Easy", "Medium", "Hard") or a
            physical property ("Permanence", "Immutability", "Continuity",
            "Solidity"); any other value applies no filter.
        fps: Frame-rate component of the prediction file name.
        prompt: Prompt id component of the file name. Ids 0, 2, 4, 5 and 6 are
            parsed with `label_pred_yes_no`, all others with `label_pred_1_0`.
        seed: Seed component of the prediction file name.

    Returns:
        ``((accuracy, n_correct, n_ground_truth, n_unparsed), filecsv)`` -- the
        tuple produced by `calculate_accuracy` plus the prediction file path.
        The accuracy denominator is the number of ground-truth rows left after
        filtering, so videos without a prediction count as wrong.
    """
    ground_truth_labels = YOUR_INTPHYS2_PATH + source + "/metadata.csv"
    # The file name pattern is the same for every model.
    filecsv = (
        YOUR_MODEL_PREDICTION_PATH + str(seed) + "prompt" + str(prompt)
        + "_" + source + "_" + model + "_" + str(fps) + "fps.csv"
    )

    # Normalise predicted file names so they can be joined with the metadata names.
    predicted_labels = pd.read_csv(filecsv)
    predicted_labels['filename'] = predicted_labels['filename'].str.replace('.mp4', '', regex=False)
    if model in _BASENAME_ONLY_MODELS:
        predicted_labels['filename'] = predicted_labels['filename'].str.split('/').str[-1]

    df_gt_labels = pd.read_csv(ground_truth_labels).rename(columns={'name': 'filename'})

    # Force a common dtype on the join key.
    predicted_labels['filename'] = predicted_labels['filename'].astype(str)
    df_gt_labels['filename'] = df_gt_labels['filename'].astype(str)

    if source == 'Main':
        # Each filter only narrows the ground truth when its key is recognised;
        # difficulty and property keys are disjoint, so at most one of the two
        # `subsplit` filters is active.
        active_filters = (
            ('env', _DIFFICULTY_ENVS.get(subsplit)),
            ('game_name', _CAMERA_GAMES.get(split)),
            ('condition', _PROPERTY_CONDITIONS.get(subsplit)),
        )
        for column, allowed_values in active_filters:
            if allowed_values is not None:
                df_gt_labels = df_gt_labels.loc[df_gt_labels[column].isin(allowed_values)]

    df_merged = predicted_labels.merge(df_gt_labels, on='filename')
    df_merged["target"] = df_merged.apply(label_target, axis=1)
    response_parser = label_pred_yes_no if prompt in _YES_NO_PROMPTS else label_pred_1_0
    # The raw text response is replaced by its parsed numeric label.
    df_merged["response"] = df_merged.apply(response_parser, axis=1)
    df_merged = df_merged[['SceneIndex', 'filename', "target", "response", "env"]]

    return calculate_accuracy(df_merged, df_gt_labels, "response", "target"), filecsv

In [None]:
# Evaluate every model on the "Main" source, broken down by physical property
# (outer loop) and camera setting (inner loop). For each (property, camera)
# cell the best accuracy over all prompt / seed / frame-rate configurations is
# printed and collected into a LaTeX-style "xx.xx & " row.
# To break down by difficulty instead, iterate `split` over
# ["Easy", "Medium", "Hard", "All"].
list_data = []
for model in ["gemini-1.5-pro"]:
    new_dict = {}
    list_accs = []
    list_accs_final = []
    for split in ["Permanence", "Immutability", "Continuity", "Solidity"]:
        for camera in ["FixedCamera", "MovingCamera", "All"]:
            # Reset per camera: otherwise the maximum below is a running max
            # over every camera seen so far and the number reported for this
            # camera may actually belong to an earlier one.
            list_split_res = []
            list_files = []
            for prompt in [0]:
                for seed in [1]:
                    list_frames = [1]
                    for frame_number in list_frames:
                        # `camera` is passed as the `split` argument and the
                        # property name as `subsplit`.
                        result, filecsv = get_values_singlevideo(model, "Main", camera, split, fps=frame_number, prompt=prompt, seed=seed)
                        accuracy, correct_predictions, total_predictions, incorrect_predictions = result
                        list_files.append(filecsv)
                        list_accs.append(accuracy)
                        list_split_res.append(accuracy * 100)
            # new_dict is keyed by property only, so it ends up holding the
            # entry computed for the last camera ("All").
            # NOTE(review): list_accs is never reset, so this std spans every
            # run evaluated so far for the model, not just this cell -- confirm
            # that is intended.
            new_dict[split] = (np.max(list_split_res), np.std(list_accs))
            max_elem = np.argmax(list_split_res)
            print(camera, model, " ", split, " ", np.max(list_split_res), list_files[max_elem])
            list_accs_final.append(list_split_res[max_elem])
    print(model)
    s_results = "".join("{:.2f} & ".format(acc) for acc in list_accs_final)
    print(s_results)
    list_data.append(new_dict)