- The original dataset is over 27 GB.
- First download the 100-image sample of the dataset from https://mega.nz/folder/0SQCmJzZ#a51rhBc4zbfe6VwGKqyrQA
- The dataset is already prepared.
- LLaVA captioning was done manually using the demo at https://llava.hliu.cc/

In [None]:
import pandas as pd
import numpy as np
import cv2
import os
import re
import random
import json
from glob import glob
import shutil
import pickle
from PIL import Image
from pprint import pprint
from tqdm import tqdm
import time
import torch
from nltk.translate.bleu_score import sentence_bleu
from ast import literal_eval
import string
from lavis.models import load_model_and_preprocess

# Set up the device to use. Always build a torch.device so that `device` has the
# same type on GPU and CPU machines (the CPU fallback used to be a bare string).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

import openai

# Read the OpenAI API key from a local text file.
with open('api_key.txt') as f:
    # Strip surrounding whitespace: editors usually leave a trailing newline in
    # the file, and a key containing "\n" is rejected as an invalid credential.
    openai_api_key = f.read().strip()

# Set OpenAI API key.

openai.api_key = openai_api_key


In [None]:
# Working directory with forward slashes only, and the sample folder below it.
root_folder = "/".join(os.getcwd().split('\\'))
sample_100 = root_folder + "/sample_100"

# Read the validation annotations (JSON lines) and keep only the columns used
# in this notebook.
# NOTE(review): 'objects' is not among the kept columns although a later cell
# reads annot_df['objects'] — confirm which version of this cell is current.
annot_df = pd.read_json(sample_100 + '/val.jsonl', lines=True).loc[:, [
    "movie", "img_fn", "metadata_fn", "answer_likelihood", "answer_orig",
    "question_orig", "rationale_orig", "answer_choices", "question",
    "rationale_choices", "rationale_label",
]]

# Keep one row per value of the 'movie' column (the last occurrence).
filtered_df = annot_df.drop_duplicates(keep='last', subset='movie')

def sample100(filepath):
    """Try to annotate one image and record it in the global ``sample`` list.

    Calls ``superimposed_image(filepath)``; if that succeeds, ``str(filepath)``
    is appended to ``sample``. Any ordinary failure is skipped on purpose
    (best effort), so the file is simply not added.

    Args:
        filepath: value taken from the ``img_fn`` column.
    """
    try:
        superimposed_image(filepath)
    except Exception:
        # Best effort: skip files that cannot be processed. ``Exception`` (not a
        # bare ``except``) so that KeyboardInterrupt/SystemExit still stop the run.
        return
    sample.append(str(filepath))

# Build the list of images to use: try every 50th annotation row.

sample = []

i = 0
while i < len(annot_df["img_fn"]):
    sample100(annot_df["img_fn"][i])
    i += 50

def get_image_str(filepath):
    """Return ``sample_100`` + ``"new_"`` + the second "/"-separated part of *filepath*.

    NOTE(review): there is no "/" between ``sample_100`` and ``new_``; this
    matches the save path built in ``superimposed_image`` — confirm it is intended.
    """
    second_part = filepath.split("/")[1]
    return f"{sample_100}new_{second_part}"

# Add an 'objects' column: look up each image's objects by its 'img_fn'.
# NOTE(review): captioned_df is not defined in the visible cells, and the list
# is built from captioned_df but assigned to filtered_df — confirm both frames
# have the same rows in the same order.

new_dict = dict(zip(annot_df['img_fn'], annot_df['objects']))
objects = [new_dict[img] for img in captioned_df['img_fn']]
filtered_df['objects'] = objects

def get_image_str(filepath):
    """Return the second "/"-separated part of *filepath* without its extension.

    Raises IndexError when *filepath* contains no "/".
    """
    second_part = filepath.split("/")[1]
    stem, _extension = os.path.splitext(second_part)
    return stem

# Create a dictionary mapping image names (with ".jpg" appended) to captions.
# NOTE(review): `a` (names) and `b` (captions) are not defined in the visible
# cells — confirm where they come from.

mapping_dict = dict(zip([name + ".jpg" for name in a], b))

# Add a new column 'captions' by converting each 'img_fn' with get_image_str
# and looking the result up in mapping_dict. The lookup uses Series.map: the
# previous code called .map on a plain Python list, which raises AttributeError.
# NOTE(review): the keys end in ".jpg"; verify that the get_image_str version
# in scope produces matching strings, otherwise every caption will be NaN.

filtered_df['captions'] = filtered_df['img_fn'].map(get_image_str).map(mapping_dict)

In [None]:
# Append "_<position>" to every item of a list given as a string literal
def add_suffix_to_list(lst_str):
    """Parse *lst_str* with ``literal_eval`` and return ``["<item>_<index>", ...]``."""
    suffixed = []
    for position, element in enumerate(literal_eval(lst_str)):
        suffixed.append("{}_{}".format(element, position))
    return suffixed

# Turn a list given as a string literal into a dict keyed 1, 2, 3, ...
def convert_to_dict(obj_str):
    """Parse *obj_str* with ``literal_eval`` and number its items starting at 1."""
    return dict(enumerate(literal_eval(obj_str), start=1))


def alphabetic_enumerate_dict(iterable, start=0):
    """Map spreadsheet-style letter labels to the items of *iterable*.

    Index 0 is labelled ``'a'``, 25 ``'z'``, 26 ``'aa'``, 27 ``'ab'``, ...,
    701 ``'zz'``, 702 ``'aaa'`` and so on, so any number of items is supported.
    Labels for indices 0-25 are unchanged from the previous implementation,
    which skipped ``'aa'``..``'az'`` (index 26 produced ``'ba'``) and raised
    IndexError for very long inputs.

    Args:
        iterable: items to label, in order.
        start: index of the first item (default 0, i.e. label ``'a'``).

    Returns:
        dict mapping each label to its item, in iteration order.

    Raises:
        ValueError: if *start* is negative.
    """
    if start < 0:
        raise ValueError("start must be non-negative")

    alphabet = string.ascii_lowercase
    base = len(alphabet)
    result_dict = {}
    for idx, item in enumerate(iterable, start=start):
        # Bijective base-26: work on idx + 1 and subtract 1 before each divmod
        # so that there is no "zero" digit (after 'z' comes 'aa', not 'ba').
        letters = []
        remaining = idx + 1
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, base)
            letters.append(alphabet[offset])
        result_dict["".join(reversed(letters))] = item
    return result_dict

In [None]:
# This code was used to superimpose the bounding boxes and polygons on the images

def superimposed_image(filepath):
    """Draw the annotated boxes, labels and polygons on an image and save it.

    Reads ``<movies_path>/<name>.json`` and ``<movies_path>/<name>.jpg`` (where
    ``<name>`` is *filepath* without its extension), draws one rectangle and one
    text label per entry of ``data["boxes"]`` and one closed polyline per entry
    of ``data["segms"]``, writes the result under the ``sample_100`` prefix and
    copies the original .json and .jpg files into ``sample_100``.

    Args:
        filepath: relative image path containing at least one "/" (the second
            "/"-separated part is used as the output file name).

    Raises:
        FileNotFoundError: if the JSON file is missing or the image cannot be read.
        IndexError: if *filepath* contains no "/".
    """
    file_name = os.path.splitext(filepath)[0]
    save_path = re.split("/", file_name)[1]
    json_path = f"{movies_path}/{file_name}.json"
    image_path = f"{movies_path}/{file_name}.jpg"

    # read json file
    with open(json_path) as file:
        data = json.load(file)

    # read image file; cv2.imread returns None instead of raising when the file
    # is missing or unreadable, so fail here with a clear message.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    for i, box in enumerate(data["boxes"]):
        # Each box starts with x1, y1, x2, y2; any further values (the original
        # code unpacked a fifth one as `score`) are not needed for drawing.
        x1, y1, x2, y2 = (int(value) for value in box[:4])

        # Draw bounding box
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Label is the object name suffixed with its index, e.g. "person_0"
        label = f'{data["names"][i]}_{i}'

        # Add label text just above the top-left corner of the box
        label_text = f"Object {i}: {label}"
        cv2.putText(image, label_text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    for polygon in data["segms"]:
        # assumes each entry converts to an array of (x, y) points — TODO confirm
        pts = np.array(polygon, dtype=np.int32)
        pts = pts.reshape((-1, 1, 2))  # Reshape to match the required format
        cv2.polylines(image, [pts], isClosed=True, color=(0, 0, 255), thickness=2)

    # NOTE(review): there is no "/" between sample_100 and "new_", so the file is
    # written next to the sample_100 folder rather than inside it — confirm intent.
    save_path = f"{sample_100}new_{save_path}.jpg".replace('\\','/')

    # Save the annotated image
    cv2.imwrite(save_path, image)

    # Copy the original image and json file into the sample folder
    shutil.copy(json_path, f"{sample_100}")
    shutil.copy(image_path, f"{sample_100}")
    

def sample100(filepath):
    """Try to annotate one image and record it in the global ``sample`` list.

    Calls ``superimposed_image(filepath)``; if that succeeds, ``str(filepath)``
    is appended to ``sample``. Any ordinary failure is skipped on purpose
    (best effort), so the file is simply not added.

    Args:
        filepath: value taken from the ``img_fn`` column.
    """
    try:
        superimposed_image(filepath)
    except Exception:
        # Best effort: skip files that cannot be processed. ``Exception`` (not a
        # bare ``except``) so that KeyboardInterrupt/SystemExit still stop the run.
        return
    sample.append(str(filepath))

    
# Try every 75th annotation row until `sample` holds 100 images.
for i in range(0, len(annot_df["img_fn"]), 75):
    # `>=` and checking before the call: if `sample` already contains 100 or
    # more entries (e.g. from an earlier cell), an `== 100` test after the
    # append would never be true and the loop would run over the whole file.
    if len(sample) >= 100:
        break
    sample100(annot_df["img_fn"][i])

# Final dataframe: annotation rows whose image is in `sample`, one row per
# image (last occurrence kept), written to context_df.csv without the index.

filtered_df = (
    annot_df.loc[annot_df['img_fn'].isin(tuple(sample))]
    .drop_duplicates(keep="last", subset='img_fn')
)
filtered_df.to_csv("context_df.csv", index=False)


from nltk.translate.meteor_score import meteor_score

def calculate_meteor(reference, translated):
    """Return the METEOR score of *translated* against the single *reference*.

    Both strings are tokenized by whitespace and scored with
    ``alpha=0.9, beta=3, gamma=0.5``.
    """
    return meteor_score(
        [reference.split()],
        translated.split(),
        alpha=0.9,
        beta=3,
        gamma=0.5,
    )
# Apply calculate_meteor row by row (reference: 'rationale_orig', candidate:
# 'rationale4_llava') and store the result in a new 'meteor_score' column.
# NOTE(review): completed_df is not defined in the visible cells — presumably
# loaded from one of the completed_*.csv files; confirm.

completed_df['meteor_score'] = completed_df.apply(lambda row: calculate_meteor(row["rationale_orig"],
                                                                               row["rationale4_llava"]), axis=1)

#### BLIP-2 Captioning

In [None]:
# Load the BLIP-2 pre-trained model (FlanT5-XXL variant) in eval mode together
# with its image preprocessors; the third return value is not used.

model, vis_processors, _ = load_model_and_preprocess(name="blip2_t5",
                                                     model_type="pretrain_flant5xxl",
                                                     is_eval=True,
                                                     device=device)

# Caption every image listed in filtered_df['img_fn'].
# (A stray pre-loop line that preprocessed `raw_image` before any image had been
# opened was removed; the same preprocessing happens for each image below.)

blip_captions = []

for path in filtered_df['img_fn']:
    # Open inside a context manager so the file handle is closed right away;
    # convert() returns a fully loaded RGB copy that stays valid afterwards.
    with Image.open(f"{sample_100}/{path}") as image_file:
        raw_image = image_file.convert("RGB")
    # Preprocess and add a leading batch dimension
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    caption = model.generate({"image": image, "prompt": "Question: Can you tell me about this image in detail? Answer:"})
    print(caption)
    display(raw_image.resize((596, 437)))
    blip_captions.append(caption)
    


#### GPT Prompting

In [None]:
# Few-shot prompt used with BLIP-2 captions. It contains three worked examples,
# each made of: an "objects =" listing, a short bracketed caption, a question
# introduced by "#", four answer choices labelled a-d, and the chosen answer.
# The trailing backslashes are line continuations inside the string literal,
# so the pieces are joined WITHOUT newline characters between them.
# NOTE(review): the examples are not formatted consistently — example 1 wraps
# the choices in [...], example 2 in {...}, and example 3 writes
# "objects =[...]" with square brackets. Left untouched because changing the
# text changes what the model sees; confirm whether this is intended.
blip_prompt = "objects ={0: 'person', 1: 'person', 2: 'person', 3: 'person', 4: 'horse', 5: 'bottle', 6: 'cup', 7: 'cup',\
8: 'cup', 9: 'cup', 10: 'chair', 11: 'chair', 12: 'diningtable'}\
['a woman is sitting at a bar with a man and a woman']\
# Why is 4 touching 1 on the arm?\
[a: [[1], 'is', 'trying', 'to', 'calm', [0], 'down', ',', 'and', 'tell', 'her', 'to', 'stay', 'where', 'she', 'is', '.'],\
b: [[3], 'wants', 'to', 'show', 'her', 'a', 'message', 'she', 'has', 'just', 'been', 'sent', '.'],\
c: ['She', 'is', 'drawing','his', 'attention', 'to', 'something', '.'],\
d: [[0], 'has', 'just', 'arrived', ',', [1], 'has', 'not', 'seen', 'her', 'in','awhile', 'and', 'is', 'warmly', 'greeting', 'her', '.']]\
She is drawing his attention to something.\
objects ={1: 'person', 2: 'person', 3: 'person', 4: 'person', 5: 'person', 6: 'person', 7: 'person', 8: 'person', 9: 'person',\
10: 'person', 11: 'person', 12: 'person', 13: 'tie', 14: 'tie', 15: 'tie', 16: 'tie', 17: 'tie', 18: 'tie',19: 'tie', 20:'tie'}\
['a man in a suit and a woman in a suit are sitting in a courtroom']\
# Why is 8 sitting at the front of the courtroom?\
{a: [[7], 'is', 'in', 'a', 'jury', 'and', 'is', 'examining', 'evidence', '.'],\
b: [[7], 'is', 'upset', 'about', 'the', 'verdict','of', 'a', 'case', '.'],\
c: [[7], 'is', 'providing', 'security', 'to', 'the', 'witnesses', 'and', 'to', 'the', 'defendants','as', 'well', '.'],\
d: [[11], 'is', 'the', 'ultimate', 'decision', 'maker', '.']}\
8 is providing security to the witnesses and to the defendants as well.\
objects =[1: 'person', 2: 'person', 3: 'bowl']\
['Two women are laying on the floor in a kitchen with blood splattered on the floor']\
# why does 2 have stuff on her face?\
[a:[[1], 'smells', 'really', 'bad', '.'],\
b: [[1], 'got', 'hit', 'in', 'the', 'face', 'with', 'some', 'food', '.'],\
c: [[1], 'is','trying', 'to', 'cover', 'up', 'a', 'bruise', '.'],\
d: [[0], 'has', 'just', 'been', 'in', 'a', 'fight', 'with', 'someone', '.']]\
2 got hit in the face with some food."

In [None]:
# Few-shot prompt used with LLaVA captions. Same layout as blip_prompt: three
# worked examples, each with an "objects =" listing, a caption, a question
# introduced by "#", four answer choices labelled a-d, and the chosen answer.
# The trailing backslashes are line continuations inside the string literal,
# so the pieces are joined WITHOUT newline characters between them (the only
# newlines in the text are the explicit "\n\n" escapes inside the captions).
# NOTE(review): the third example uses a short bracketed caption and [...]
# delimiters identical to blip_prompt's third example instead of a long
# paragraph-style caption like examples 1 and 2 — looks like a copy-paste;
# left untouched because changing the text changes what the model sees.
llava_prompt = "objects ={0: 'person', 1: 'person', 2: 'person', 3: 'person', 4: 'horse', 5: 'bottle', 6: 'cup', 7: 'cup',\
8: 'cup', 9: 'cup', 10: 'chair', 11: 'chair', 12: 'diningtable'}\
The image depicts a group of people gathered in a bar setting. There are four people in the scene, with two men and two\
women engaged in conversation. They are all sitting around a dining table, and there are chairs placed around it.The bar\
is well-stocked with various types of glassware, including numerous wine glasses and cups. Some of the wine glasses are\
placed on the table, while others are scattered throughout the scene. There are also several cups placed around the table\
and on other surfaces.\n\nThe atmosphere appearsto be casual and social, with the group enjoying each other's company and\
having drinks together.\
# Why is 4 touching 1 on the arm?\
{a: [[1], 'is', 'trying', 'to', 'calm', [0], 'down', ',', 'and', 'tell', 'her', 'to', 'stay', 'where', 'she', 'is', '.'],\
b: [[3], 'wants', 'to', 'show', 'her', 'a', 'message', 'she', 'has', 'just', 'been', 'sent', '.'],\
c: ['She', 'is', 'drawing','his', 'attention', 'to', 'something', '.'],\
d: [[0], 'has', 'just', 'arrived', ',', [1], 'has', 'not', 'seen', 'her', 'in','awhile', 'and', 'is', 'warmly', 'greeting', 'her', '.']}\
She is drawing his attention to something.\
objects ={1: 'person', 2: 'person', 3: 'person', 4: 'person', 5: 'person', 6: 'person', 7: 'person', 8: 'person', 9: 'person',\
10: 'person', 11: 'person', 12: 'person', 13: 'tie', 14: 'tie', 15: 'tie', 16: 'tie', 17: 'tie', 18: 'tie',19: 'tie', 20:'tie'}\
The image features a group of people sitting in chairs, likely attending a formal event or a courtroom setting. There are\
several individuals in the scene, with some of them wearing ties. The people are seated next to each other, some facing\
forward while others are turned to the side or slightly away from the camera.\n\nThe chairs are arranged in rows, with \
some chairs being more prominent in the foreground and others further back in the scene. The people in the chairs appear\
to be paying attention to something happening in front of them, possibly a speaker or a presentation.\
# Why is 8 sitting at the front of the courtroom?\
{a: [[7], 'is', 'in', 'a', 'jury', 'and', 'is', 'examining', 'evidence', '.'],\
b: [[7], 'is', 'upset', 'about', 'the', 'verdict','of', 'a', 'case', '.'],\
c: [[7], 'is', 'providing', 'security', 'to', 'the', 'witnesses', 'and', 'to', 'the', 'defendants','as', 'well', '.'],\
d: [[11], 'is', 'the', 'ultimate', 'decision', 'maker', '.']}\
8 is providing security to the witnesses and to the defendants as well.\
objects =[1: 'person', 2: 'person', 3: 'bowl']\
['Two women are laying on the floor in a kitchen with blood splattered on the floor']\
# why does 2 have stuff on her face?\
[a:[[1], 'smells', 'really', 'bad', '.'],\
b: [[1], 'got', 'hit', 'in', 'the', 'face', 'with', 'some', 'food', '.'],\
c: [[1], 'is','trying', 'to', 'cover', 'up', 'a', 'bruise', '.'],\
d: [[0], 'has', 'just', 'been', 'in', 'a', 'fight', 'with', 'someone', '.']]\
2 got hit in the face with some food."


In [None]:
# Chat model used for both the answer and the rationale request.
MODEL = "gpt-3.5-turbo"


def LM(prompt):
    """Ask the chat model for an answer, then for the rationale behind it.

    The first request sends *prompt* with temperature 0. After a 10-second
    pause, a second request sends the prompt, the answer and the sentence
    "What is the rationale for that?" concatenated, with temperature 1.

    Returns:
        (answer, explanation) — the message text of each response.
    """

    def _chat(user_content, temperature):
        # One single-turn request; returns only the text of the first choice.
        response = openai.ChatCompletion.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": user_content},
            ],
            temperature=temperature,
        )
        return response['choices'][0]['message']['content']

    answer = _chat(prompt, 0)
    follow_up = prompt + answer + "What is the rationale for that?"

    # Pause between the two requests.
    time.sleep(10)

    explanation = _chat(follow_up, 1)
    return answer, explanation

In [None]:
def user_input(i, blip=True):
    """Build the query part of the prompt for row *i*.

    Concatenates, with no extra separators: "objects =" and the row's objects,
    the row's caption (from ``blip_captions`` when *blip* is truthy, otherwise
    from ``llava_captions``), " # " and the question, and the answer choices
    relabelled by ``alphabetic_enumerate_dict``.
    """
    captions = blip_captions if blip else llava_captions
    pieces = [
        "objects =",
        str(objects[i]),
        str(captions[i]),
        " # ",
        str(question_orig[i]),
        str(alphabetic_enumerate_dict(literal_eval(answer_choices[i]))),
    ]
    return "".join(pieces)

In [None]:
# Module-level aliases for the context_df columns; user_input() reads
# objects, blip_captions, llava_captions, question_orig and answer_choices.
# NOTE(review): context_df is not defined in the visible cells (the read below
# is commented out) — confirm which file it is loaded from.
#context_df = pd.read_csv("completed_6.csv")

objects = context_df["new_objects1"]
answer_choices = context_df["answer_choices"]
question_orig = context_df["question_orig"]
answer_orig = context_df["answer_orig"]
rationale_orig = context_df["rationale_orig"]
blip_captions = context_df["blip_captions"]
# The LLaVA captions are stored in the plain 'captions' column.
llava_captions = context_df["captions"]

In [None]:
# Query the language model for every row, once with the LLaVA caption and once
# with the BLIP-2 caption, and collect the answers and rationales.
answer_blip = []
rationale_blip = []
answer_llava = []
rationale_llava = []

# Iterate over all rows of context_df instead of a hard-coded 100, so that the
# result lists always match the dataframe length when assigned below.
for i in tqdm(range(len(context_df))):
    try:
        llava_context = llava_prompt + user_input(i, blip=False)
        blip_context = blip_prompt + user_input(i, blip=True)
        # Get response with llava context
        llava_response = LM(llava_context)
        answer_llava.append(llava_response[0])
        rationale_llava.append(llava_response[1])
        time.sleep(10)
        # Get response with blip context
        blip_response = LM(blip_context)
        answer_blip.append(blip_response[0])
        rationale_blip.append(blip_response[1])
        time.sleep(30)
    except BaseException:
        # Print the failing row index (also on a manual interrupt) so the run
        # can be resumed from there, then let the error propagate.
        print(i)
        raise

context_df["answer4_blip"] = answer_blip
context_df["rationale4_blip"] = rationale_blip
context_df["answer2_llava"] = answer_llava
context_df["rationale2_llava"] = rationale_llava

context_df.to_csv("completed_1.csv", index=False)