In [None]:
import openai, json, tqdm
import numpy as np
import regex as re
import os

In [None]:
# Read the key from the environment instead of pasting it into the notebook,
# so the secret is never stored in the saved file or in any cell output.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export your OpenAI API key before running this notebook."
    )
openai.api_key = api_key

def chat_classify(gt, pred, model: str = "gpt-3.5-turbo"):
    """Ask an OpenAI chat model to extract object/action lists from two paragraphs.

    Args:
        gt: Inserted into the prompt as paragraph 1 (described there as the
            human-composed text). Converted with ``str``.
        pred: Inserted into the prompt as paragraph 2 (described there as the
            AI-generated text). Converted with ``str``.
        model: Model name forwarded to ``openai.ChatCompletion.create``.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns for a single
        user message carrying the prompt below.
    """
    # The continuation lines keep their leading whitespace on purpose: it is
    # part of the text sent to the model.
    prompt = f"""I will provide you two paragraphs. The first paragraph is human-composed and the second paragraph is generated by an AI model. I want to evaluate the hallucination in the second paragraph. You will extract the object and action words or phrases from the following text. The objects should have a tangible meaning and consist of no more than two words; non-tangible objects. or objects irrelevant to the actions, should not be extracted. The action words or phrases should only relate to the extracted objects. Also, you must convert the corresponding actions to their complete root form. Then, for the final answer, please generate 4 lists and must transfer the synonyms in 4 lists into the same word. Please directly output the final object and action lists in two paragraphs, respectively as in the form in the example below without any justifications or intermediate steps.
             
             Here is an example:
             1. The video captures a dog's cautious interaction with a metal toy inside a house. The dog appears wary and maintains a distance from the unfamiliar object, barking to express its disapproval and possibly intimidation. As the toy moves, the dog's reaction is to bark and lean backward, showing a clear sign of being unsettled by the toy's motion. When the toy momentarily ceases movement, the dog also stops, remaining alert and attentive. At the end of the image, when the toy comes to a halt, the dog looks up, still processing the strange encounter with the inanimate object.
             2. The video is a collage of multiple pictures featuring two dogs playing with a toy alligator. The dogs are in various positions, with some of them standing on the toy alligator, while others are interacting with it in different ways. The collage captures the dogs' playfulness and excitement as they engage with the toy alligator. 
             The lists are:
             Object list 1: [dog, toy, house] 
             Action list 1: [interaction, bark, express intimidation, move, lean backward, stop, look up] 
             Object list 2: [dog, toy] 
             Action list 2: [play, stand, interaction] 
             Here are the paragraphs for which you should generate object and action lists:

             1. {str(gt)}
             2. {str(pred)}

             Remember, The objects should have a tangible meaning and consist of no more than two words; non-tangible objects should not be extracted. The action words or phrases should only relate to the extracted objects. The object and action lists for each paragraph are (only give 2 object lists and 2 action lists; NOTHING MORE NOTHING LESS):"""

    user_turn = {"role": "user", "content": prompt}
    return openai.ChatCompletion.create(
        model=model,
        messages=[user_turn],
    )

def get_obj_action_lists(llm_output):
    """
    Parse the "Object list N: [...]" and "Action list N: [...]" entries of an LLM reply.

    First paragraph is the ground truth and second paragraph is the LLM generated description.

    The text is lower-cased before matching, so matching is case-insensitive
    and every returned item is lower case. Lists are returned in the order
    they appear in the text; the list number is matched but not used for
    ordering. Items are split on commas, stripped of surrounding whitespace,
    and empty items are dropped, so "[]" yields [] rather than [''].

    Returns:
    object_lists: List of object lists extracted from the LLM output (should be length 2)
    action_lists: List of action lists extracted from the LLM output (should be length 2)
    """
    text = llm_output.lower()

    def _extract(kind):
        # `.*?` is non-greedy so two lists sharing one line are not merged
        # into a single match; `\s*` tolerates missing or extra spacing.
        bodies = re.findall(rf"{kind} list\s*\d+\s*:\s*\[(.*?)\]", text)
        return [
            [item.strip() for item in body.split(",") if item.strip()]
            for body in bodies
        ]

    return _extract("object"), _extract("action")

In [None]:
json_path = ## PATH TO YOUR JSON FILE
# Name the output after the input file: its basename with ".json" removed.
json_name = os.path.basename(json_path).replace('.json', '')
with open(json_path) as in_file:
    data = json.load(in_file)
data_eval_lists = {}

MAX_RETRIES = 3  # extra attempts after the first request


def _extract_lists(gt_desc, pred_desc, vid_name, max_retries=MAX_RETRIES):
    """Query the LLM until its reply parses into 2 object lists and 2 action lists.

    Makes up to ``max_retries + 1`` attempts. Each attempt starts from a clean
    state, so nothing from an earlier attempt or an earlier video leaks into
    the result.

    Returns:
        (response, object_lists, action_lists) where
        - on success, ``response`` is the reply text and both lists have length 2;
        - if every request or parse raised before any reply was received,
          ``response`` is None and both lists are ``[]``;
        - otherwise ``response`` is the last reply received and both lists are
          the string marker 'Error in extracting'.
    """
    response = None
    for attempt in range(max_retries + 1):
        try:
            response = chat_classify(gt_desc, pred_desc).choices[0].message['content']
            object_lists, action_lists = get_obj_action_lists(response)
        except Exception as err:  # not a bare except: Ctrl-C must still stop the run
            print(f'\tRequest failed for {vid_name} (attempt {attempt + 1}/{max_retries + 1}): {err!r}')
            continue

        if len(object_lists) == 2 and len(action_lists) == 2:
            return response, object_lists, action_lists

        print(f'\tError in extracting for {vid_name}. Attempt {attempt + 1}/{max_retries + 1}')

    if response is None:
        # No reply was ever received, so there is nothing to parse or show.
        return None, [], []

    # Show the last reply so the malformed output can be inspected.
    print(f'\t\t{response}')
    return response, 'Error in extracting', 'Error in extracting'


for content in tqdm.tqdm(data, total=len(data)):
    vid_name = content['video_name'].replace('.mp4', '')
    vid_gt_desc = content['A']
    vid_pred_desc = content['pred']

    response, object_lists, action_lists = _extract_lists(vid_gt_desc, vid_pred_desc, vid_name)

    data_eval_lists[vid_name] = {
        'gt': vid_gt_desc,
        'pred': vid_pred_desc,
        'llm_response': response,
        'object_lists': object_lists,
        'action_lists': action_lists,
    }

# save
with open(f'./OBJ-ACTIONS_{json_name}.json', 'w') as out_file:
    json.dump(data_eval_lists, out_file, indent=4)

In [None]:
j_path = f'./OBJ-ACTIONS_{json_name}.json'
# Close the file right after reading so it can be deleted later on any platform.
with open(j_path) as results_file:
    oa_data = json.load(results_file)


def _precision_recall_f1(reference_items, predicted_items):
    """Return set-based (precision, recall, F1) of predicted items against reference items.

    Duplicates are ignored because both inputs are turned into sets. Any
    ratio whose denominator is zero is reported as 0.
    """
    reference, predicted = set(reference_items), set(predicted_items)
    tp = len(reference & predicted)
    fp = len(predicted - reference)
    fn = len(reference - predicted)

    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    return precision, recall, f1


def _is_list_pair(value):
    """True when value is a list holding exactly two lists (reference, prediction)."""
    return (
        isinstance(value, list)
        and len(value) == 2
        and all(isinstance(item, list) for item in value)
    )


a_re, a_pre, a_f1 = [], [], []
o_re, o_pre, o_f1 = [], [], []
skipped = 0

for vid in oa_data:
    action_lists = oa_data[vid]['action_lists']
    object_lists = oa_data[vid]['object_lists']

    # Failed extractions are stored as [] or as an error string; indexing a
    # string would silently score single characters, so skip anything that is
    # not a proper [reference, prediction] pair.
    if not (_is_list_pair(action_lists) and _is_list_pair(object_lists)):
        skipped += 1
        continue

    # Action metrics
    a_precision, a_recall, a_f1_score = _precision_recall_f1(action_lists[0], action_lists[1])
    a_re.append(a_recall)
    a_pre.append(a_precision)
    a_f1.append(a_f1_score)

    # Object metrics
    o_precision, o_recall, o_f1_score = _precision_recall_f1(object_lists[0], object_lists[1])
    o_re.append(o_recall)
    o_pre.append(o_precision)
    o_f1.append(o_f1_score)

# print metrics
print(j_path)
if skipped:
    print(f'Skipped {skipped} video(s) without valid object/action lists')

if not a_f1:
    # np.mean of an empty list is nan (with a warning); say so explicitly instead.
    print('No videos with valid object/action lists; metrics are undefined.')
else:
    # Values are printed in the same order as the header: precision, recall, F1.
    print('Object Precision/Recall/F1')
    print(f'{np.mean(o_pre):.4f} / {np.mean(o_re):.4f} / {np.mean(o_f1):.4f}')

    print('Action Precision/Recall/F1')
    print(f'{np.mean(a_pre):.4f} / {np.mean(a_re):.4f} / {np.mean(a_f1):.4f}')

In [None]:
# Delete the JSON file; skip if it is already gone so re-running this cell does not raise.
if os.path.exists(j_path):
    os.remove(j_path)