# **Active Object Localization with Deep Reinforcement Learning**

In [None]:
!Nvidia-smi

In [None]:
# GPU setup
import os
# Expose only GPU 0 to this process
os.environ["CUDA_VISIBLE_DEVICES"]="0" 
import tensorflow.compat.v1 as tf
# TF1-style session whose GPU options enable allow_growth
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Cap TensorFlow at memory_limit = 1024*7 (presumably MB, i.e. 7 GB — confirm) on the first GPU
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=(1024*7))])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") 
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    # NOTE(review): the Session above is created before this call, so the limit
    # may be rejected and only this message printed — confirm on a fresh kernel.
    print(e)

In [None]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

# Download required dataset 
!kaggle datasets download -d huanghanchina/pascal-voc-2012 -p pascal


**Extracting the zip file:**

In [None]:
#!pwd
os.chdir('./pascal')  #change dir
!unzip -q pascal-voc-2012.zip

In [None]:
import os
# The extraction cell above already changes into ./pascal, so a second
# unconditional chdir fails on "Run All". Only move when the dataset folder
# (./VOC2012, the path load_data reads from) is not yet visible from here.
if not os.path.isdir('./VOC2012'):
    os.chdir('./pascal')  #change dir

**Install and import necessary libraries:**

In [None]:
!pip install xmltodict

In [None]:
!pip install opencv-python

In [None]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import cv2
import sys
import xmltodict
import math
import random
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop, SGD, Adam
from tensorflow.keras.applications.vgg16 import VGG16
import matplotlib.pyplot as plt

**Preparing and loading images from the dataset along with their bounding boxes:**

In [None]:

def get_object_name(object_number):
    """
    Translate an object id (1-20) into the matching object name

    :param object_number: integer id of the object class, 1 through 20
    :return: name of the object class
    :raises KeyError: if object_number is not one of the 20 known ids
    """
    # Class names listed in id order; ids start at 1
    class_names = (
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    )
    id_to_name = {class_id: name for class_id, name in enumerate(class_names, start=1)}

    return id_to_name[object_number]

def read_image_index(object_name, dataset_path, test):
    """
    Reading the name of images from the txt file of target object

    Each record of the index file is "<image_name> <label>", separated by one
    or more spaces. Every image whose label is not -1 is kept.

    :param object_name: name of the object class, used to build the file name
    :param dataset_path: path to dataset, including the trailing separator
    :param test: if True read "<object_name>_trainval.txt",
                 otherwise read "<object_name>_train.txt"
    :return: list of image names (without file extension)
    """
    suffix = "_trainval.txt" if test else "_train.txt"
    index_file_path = dataset_path + "ImageSets/Main/" + object_name + suffix

    index_list = []
    with open(index_file_path, 'r') as f:
        for line in f:
            # Split on any run of whitespace: records are padded with a varying
            # number of spaces, and a blank line must not raise IndexError.
            fields = line.split()
            if len(fields) < 2:
                continue
            image_name, label = fields[0], fields[1]
            if label != "-1":
                index_list.append(image_name)

    return index_list

def read_image(image_index, dataset_path):
    """
    Read every listed image from the JPEGImages folder of the dataset

    :param image_index: list of image names (without extension)
    :param dataset_path: path to dataset, including the trailing separator
    :return: list with one cv2.imread result per image name, in the same order
    """
    jpeg_folder = dataset_path + "JPEGImages/"
    return [cv2.imread(jpeg_folder + name + ".jpg") for name in image_index]
def load_annotation(image_index, object_name, dataset_path):
    """
    Loading bounding boxes around objects in images

    For each image only the FIRST annotated object named object_name
    contributes a box. An image without such an object contributes nothing,
    so the result can be shorter than image_index.

    :param image_index: list of image names
    :param object_name: name of the object to be detected
    :param dataset_path: path to dataset, including the trailing separator
    :return: list of (xmin, ymin, xmax, ymax) integer tuples
    """
    def _to_box(xml_object):
        # Go through float() so coordinates written as decimals do not crash int()
        bndbox = xml_object["bndbox"]
        return tuple(int(float(bndbox[key])) for key in ("xmin", "ymin", "xmax", "ymax"))

    bounding_box_list = []
    annotation_path = dataset_path + "Annotations/"
    for each_image in image_index:
        path = annotation_path + each_image + ".xml"
        # Close the file deterministically instead of leaking the handle
        with open(path, 'rb') as xml_file:
            xml = xmltodict.parse(xml_file)

        xml_objects = xml['annotation']['object']
        # A single <object> is parsed as a dict, several as a list; normalise
        if not isinstance(xml_objects, list):
            xml_objects = [xml_objects]

        for each_object in xml_objects:
            if each_object["name"] == object_name:
                bounding_box_list.append(_to_box(each_object))
                break

    return bounding_box_list

def load_data(object_number, test):
    """
    Loading dataset images for a specific class by calling corresponding functions
    and saving images and their annotations into arrays

    :param object_number: id of the object class (see get_object_name)
    :param test: selects the index file (see read_image_index) and the names
                 of the saved .npy files
    :return: (image_list, bounding_box_list); image_list is a 1-D object array
             holding one image per entry, bounding_box_list is the array built
             from load_annotation's tuples
    """
    dataset_path = "./VOC2012/"
    object_name = get_object_name(object_number)
    image_index = read_image_index(object_name, dataset_path, test)

    # Images have different sizes. np.asarray on such a ragged list raises on
    # current NumPy, so fill an object array element by element instead.
    images = read_image(image_index, dataset_path)
    image_list = np.empty(len(images), dtype=object)
    for position, each_image in enumerate(images):
        image_list[position] = each_image

    bounding_box_list = np.asarray(load_annotation(image_index, object_name, dataset_path))

    # Saved as "<name>_valimage.npy"/"<name>_valbox.npy" for the test split and
    # "<name>_image.npy"/"<name>_box.npy" otherwise.
    prefix = object_name + ("_val" if test else "_")
    np.save(prefix + "image.npy", image_list)
    np.save(prefix + "box.npy", bounding_box_list)

    return image_list, bounding_box_list


**Showing one sample picture from each of the 20 classes of the dataset:**

In [None]:
# Set the size of the figure to be displayed
plt.figure(figsize=(15,15))

# Index of the image shown for every class
sample_index = 15

# Loop through the 20 object classes
for i in range(1,21):

    # Load the images and bounding boxes of the i-th class
    image_list, bounding_box_list = load_data(i,test=False)

    # Work on a copy so the loaded image is not modified by the drawing call
    im = image_list[sample_index].copy()
    xmin, ymin, xmax, ymax = (int(v) for v in bounding_box_list[sample_index])

    # Draw a blue (BGR) rectangle, 3 px thick, around the object of interest
    cv2.rectangle(im, (xmin, ymin), (xmax, ymax), (255, 0, 0), 3)

    # cv2 stores pixels as BGR while matplotlib expects RGB: convert for display
    plt.subplot(5,4,i)
    plt.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))

    # Set the title of the plot to be the object name
    plt.title(str(get_object_name(i)))

# Show the complete figure
plt.show()


**Showing 10 sample test images (this function is called after training the agent and testing it on 100 test images):**

In [None]:
def test2():
    """
    Run the trained agent greedily on test images 30-39 of object class 1,
    plot each final image and print the average IoU.

    Relies on names defined in the training cell (history_size, create_vgg16,
    create_q_model, extract_feature, compute_q, compute_mask, crop_image,
    compute_iou) and on the weights file "model.h5".

    NOTE(review): a later cell defines another function named test2 that takes
    an iou_threshold argument; once that cell has run, it replaces this one.
    """
    # Load the test data for object 1
    object_number = 1
    image_list, bounding_box_list = load_data(object_number ,test=True)
    iou = []

    # Load the VGG16 and Deep Q models
    vgg16 = create_vgg16()
    deep_q = create_q_model()
    deep_q.load_weights("model.h5")

    # Loop through a subset of images in the test data
    for i in range(30,40):
        bounding_box = bounding_box_list[i]
        image = image_list[i]
        # -1 marks an empty history slot
        history = [-1] * history_size
        height, width, channel = np.shape(image)
        # The search starts from a box covering the whole image
        current_mask = np.asarray([0, 0, width, height])
        feature = extract_feature(image, history, vgg16)
        end = False
        masks = []
        step = 0
        
        # Keep taking actions until the end is reached
        while not end:

            # Compute the Q values for the current state
            q_value = compute_q(feature, deep_q)

            # Select the action with the highest Q value
            action = np.argmax(q_value)

            # Update the history with the selected action
            history = history[1:]
            history.append(action)

            # If the end state is reached or the maximum number of steps is reached
            if action == 8 or step == 10: #steps should be changed to 40
                end = True

                # Visualize the predicted box and ground truth box for the current image
                plt.figure()
                new_mask = current_mask
                cv2.rectangle(image, (int(new_mask[0]), int(new_mask[1])),
                              (int(new_mask[2]), int(new_mask[3])), (255, 0, 0), 1)

                predicted_box = cv2.rectangle(image, (int(new_mask[0]), int(new_mask[1])),
                              (int(new_mask[2]), int(new_mask[3])), (0, 0, 255), 2)

                groundtruth= cv2.rectangle(image, (int(bounding_box[0]), int(bounding_box[1])),
                              (int(bounding_box[2]), int(bounding_box[3])), (0, 255, 0), 2)

                # NOTE(review): the third positional argument of cv2.bitwise_and is
                # presumably the output array, not a mask — confirm this call is needed.
                test_result = cv2.bitwise_and(image, image, groundtruth)
                
                # NOTE(review): the pixels come from cv2.imread (presumably BGR) while
                # plt.imshow treats them as RGB — confirm the displayed colours.
                plt.imshow(test_result)
                plt.title('predicted box is shown in bold blue and ground truth box is shown in bold green \n Search path shown in red')
                plt.show()

            # If the end state is not reached
            else:
                new_mask = compute_mask(action, current_mask)

            # Crop the image based on the new mask
            # (the crop is taken from the image that already carries the drawn rectangles)
            cropped_image = crop_image(image, new_mask)

            # Extract features from the cropped image and update the current mask
            feature = extract_feature(cropped_image, history, vgg16)
            masks.append(new_mask)
            current_mask = new_mask

            # Visualize the search path
            cv2.rectangle(image, (int(current_mask[0]), int(current_mask[1])),
                          (int(current_mask[2]), int(current_mask[3])), (255, 0, 0), 1)
            step += 1

        # Compute the IoU for the predicted box and the ground truth box and store it in the iou list
        mask = masks[-1]
        iou.append(compute_iou(mask,bounding_box))

    # Print the average IoU over the evaluated images
    print(sum(iou)/len(iou))


**Testing the agent on 100 test images and calculating the average IoU:**

In [None]:
import cv2
import numpy as np
import pandas as pd

# Define the test function
def test():
    """
    Run the trained agent greedily on up to 100 test images of object class 1,
    write each annotated image to ./result and print the average IoU.

    Relies on names defined in the training cell (history_size, create_vgg16,
    create_q_model, extract_feature, compute_q, compute_mask, crop_image,
    compute_iou) and on the weights file "model.h5".
    """
    # Set the object number and load the test data
    object_number = 1
    image_list, bounding_box_list = load_data(object_number,test=True)

    # Initialize the intersection over union (IOU) list and create VGG16 and Q models
    iou = []
    vgg16 = create_vgg16()
    deep_q = create_q_model()

    # Load the pretrained weights for the Q model
    deep_q.load_weights("model.h5")

    # The output folder must exist before cv2.imwrite is called
    os.makedirs("./result", exist_ok=True)

    image = None
    mask = None

    # Never index past the data that was actually loaded
    num_images = min(100, len(image_list), len(bounding_box_list))
    for i in range(num_images):
        # Get the bounding box and image
        bounding_box = bounding_box_list[i]
        image = image_list[i]

        # Initialize the history list (-1 = empty slot), current mask, and step count
        history = [-1] * history_size
        height, width, channel = np.shape(image)
        current_mask = np.asarray([0, 0, width, height])
        step = 0

        # The first Q-value query needs the features of the initial state.
        # Without this line `feature` is read before it is ever assigned.
        feature = extract_feature(image, history, vgg16)

        # Set the end flag to False and initialize the masks list
        end = False
        masks = []

        # While not at the end, keep predicting new masks
        while not end:
            # Compute the Q value for the current state
            q_value = compute_q(feature, deep_q)

            # Choose the action with the highest Q value
            action = np.argmax(q_value)

            # Update the history list and append the chosen action
            history = history[1:]
            history.append(action)

            # If the action is 8 (stop) or the step count reaches 10, end the loop
            if action == 8 or step == 10:
                end = True
                print("end")
                # Save the image with the ground truth bounding box drawn on it
                new_mask = current_mask
                cv2.rectangle(image, (int(bounding_box[0]), int(bounding_box[1])),
                              (int(bounding_box[2]), int(bounding_box[3])), (0, 0, 255), 1)
                cv2.imwrite("./result/plane_result%d.jpg" % i, image)
            else:
                # Compute the new mask based on the chosen action
                new_mask = compute_mask(action, current_mask)

            # Crop the image based on the new mask and extract features from the cropped image
            cropped_image = crop_image(image, new_mask)
            feature = extract_feature(cropped_image, history, vgg16)

            # Append the new mask to the masks list, update the current mask, and increment the step count
            masks.append(new_mask)
            current_mask = new_mask
            cv2.rectangle(image, (int(current_mask[0]), int(current_mask[1])),
                          (int(current_mask[2]), int(current_mask[3])), (0, 255, 0), 1)
            step += 1

        # Compute the IOU between the predicted mask and the ground truth bounding box and append to the IOU list
        mask = masks[-1]
        iou.append(compute_iou(mask,bounding_box))

    # Nothing was evaluated: avoid dividing by zero and drawing on a missing image
    if not iou:
        print("no test images were evaluated")
        return

    # Print the average IOU score
    print(sum(iou)/len(iou))

    # Draw the final bounding box on the last image and display it inline.
    # cv2.imshow needs a GUI window, which notebook kernels usually lack, so the
    # image is shown with matplotlib after converting BGR to RGB.
    cv2.rectangle(image, (int(mask[0]), int(mask[1])),
                   (int(mask[2]),int(mask[3])),(0, 255, 0), 2)
    plt.figure()
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title('image')
    plt.show()


**Training the agent:** 
(to see the images of test results scroll down in the output box)

In [None]:
# Set parameters for the Deep Q-Learning algorithm
history_size = 10                 # number of most recent actions one-hot encoded into the state
action_option = 9                # number of possible actions (ids 0-7 change the box, id 8 is the trigger)
max_steps = 20                   # step index at which training forces the trigger action
experience_sample_size = 20       # number of transitions drawn from the replay buffer per training batch
max_experience_size = 1000       # the replay buffer drops its oldest entry once it is longer than this
gamma = 0.1                      # discount factor
epsilon_change_steps = 10         # epsilon is lowered by 0.1 after every epoch numbered below this value
loss_arr = []                     # batch losses; averaged, printed and cleared every 100 entries


# Define function to extract features from image and previous actions
def extract_feature(image, history, vgg16):
    """
    Build the state vector of the agent: image features followed by a one-hot
    encoding of the recent actions.

    :param image: image array accepted by cv2.resize, reshaped to (1, 224, 224, 3)
    :param history: sequence of history_size action ids; -1 marks an empty slot
    :param vgg16: model whose layer 20 output is used as the image feature
                  (presumably a 4096-wide fully connected layer — confirm)
    :return: array of shape (1, n) holding the concatenated features
    """
    # Slot `slot` owns a block of action_option entries; the taken action sets one of them
    history_feature = np.zeros(action_option * history_size)
    for slot in range(history_size):
        past_action = history[slot]
        if past_action != -1:
            history_feature[slot * action_option + past_action] = 1

    # Run the resized image through the network up to layer 20
    feature_extractor = K.function([vgg16.layers[0].input], [vgg16.layers[20].output])
    resized_image = cv2.resize(image, (224, 224))
    network_input = [resized_image.reshape(1, 224, 224, 3)]
    image_feature = np.ndarray.flatten(feature_extractor(network_input)[0])

    # Image part first, history part second, wrapped as a batch of one
    return np.array([np.concatenate((image_feature, history_feature))])


# Define function to compute Q-values for given state using deep Q-network model
def compute_q(feature, deep_q_model):
    """
    Query the Q-network for one state.

    :param feature: input passed unchanged to deep_q_model.predict
    :param deep_q_model: object exposing predict(feature) that returns an ndarray
    :return: the prediction flattened to one dimension
    """
    q_values = deep_q_model.predict(feature)
    return q_values.flatten()



def compute_mask(action, current_mask):
    """
    Apply one transformation action to a box.

    :param action: action id; ids 0-7 shift or scale the box, any other id
                   leaves the coordinates unchanged
    :param current_mask: box as (x1, y1, x2, y2)
    :return: new box as an array (x1, y1, x2, y2) with x1 <= x2 and y1 <= y2
    """
    # Every action moves the edges by this fraction of the current box size
    image_rate = 0.1
    w = image_rate * (current_mask[2] - current_mask[0])
    h = image_rate * (current_mask[3] - current_mask[1])

    # action id -> (dx1, dy1, dx2, dy2)
    shifts = {
        0: (w, 0, w, 0),        # x1 and x2 both increase
        1: (-w, 0, -w, 0),      # x1 and x2 both decrease
        2: (0, h, 0, h),        # y1 and y2 both increase
        3: (0, -h, 0, -h),      # y1 and y2 both decrease
        4: (-w, -h, w, h),      # grow in both directions
        5: (w, h, -w, -h),      # shrink in both directions
        6: (0, h, 0, -h),       # shrink along y only
        7: (w, 0, -w, 0),       # shrink along x only
    }
    dx1, dy1, dx2, dy2 = shifts.get(action, (0, 0, 0, 0))

    x1, y1, x2, y2 = np.array([current_mask[0] + dx1, current_mask[1] + dy1,
                               current_mask[2] + dx2, current_mask[3] + dy2])

    # Re-order the corners in case a shrink made them cross over
    return np.array([min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)])

def compute_iou(mask, ground_truth):
    """
    Intersection over union of two boxes.

    :param mask: box as (x1, y1, x2, y2)
    :param ground_truth: box as (x1, y1, x2, y2)
    :return: intersection area divided by union area; 0.0 when the union has
             no area (e.g. both boxes are degenerate) instead of dividing by zero
    """
    dx = min(mask[2], ground_truth[2]) - max(mask[0], ground_truth[0])
    dy = min(mask[3], ground_truth[3]) - max(mask[1], ground_truth[1])

    # The boxes overlap only when both extents are non-negative
    inter_area = dx * dy if (dx >= 0 and dy >= 0) else 0

    mask_area = (mask[2] - mask[0]) * (mask[3] - mask[1])
    ground_truth_area = (ground_truth[2] - ground_truth[0]) * (ground_truth[3] - ground_truth[1])
    union_area = mask_area + ground_truth_area - inter_area

    # A box shrunk to zero size compared with an empty ground truth has no union
    if union_area <= 0:
        return 0.0

    return inter_area / union_area


def compute_reward(action, ground_truth, current_mask):
    """
    Reward for a box-transforming action.

    :param action: action id passed to compute_mask
    :param ground_truth: ground truth box (x1, y1, x2, y2)
    :param current_mask: box before the action
    :return: 1 if the action strictly increases the IoU with the ground truth,
             otherwise -1
    """
    candidate_mask = compute_mask(action, current_mask)
    iou_after = compute_iou(candidate_mask, ground_truth)
    iou_before = compute_iou(current_mask, ground_truth)

    return 1 if iou_before < iou_after else -1


def compute_end_reward(current_mask, ground_truth):
    """
    Reward for the trigger (stop) action.

    :param current_mask: final box (x1, y1, x2, y2)
    :param ground_truth: ground truth box (x1, y1, x2, y2)
    :return: 3 if the IoU of the two boxes is above 0.5, otherwise -3
    """
    final_iou = compute_iou(current_mask, ground_truth)
    return 3 if final_iou > 0.5 else -3


def select_action(feature, ground_truth_box, step, q_value, epsilon, current_mask):
    """
    Choose the next action during training.

    :param feature: current state vector (not read by this function)
    :param ground_truth_box: ground truth box used to guide exploration
    :param step: index of the current step in the episode
    :param q_value: Q-values of the current state, one per action
    :param epsilon: probability of exploring instead of taking the best action
    :param current_mask: current box (x1, y1, x2, y2)
    :return: id of the chosen action
    """
    # The step budget is used up: force the trigger
    if step == max_steps:
        return 8

    # Exploit: take the action with the highest Q-value
    if random.random() > epsilon:
        return np.argmax(q_value)

    # Explore, guided by the ground truth: stop when stopping would be rewarded
    if compute_end_reward(current_mask, ground_truth_box) > 0:
        return 8

    # Otherwise pick at random among the moves with a non-negative reward
    rewards = np.asarray([compute_reward(candidate, ground_truth_box, current_mask)
                          for candidate in range(action_option - 1)])
    positive_reward_index = np.where(rewards >= 0)[0]

    # No move helps: fall back to any of the 9 actions
    if len(positive_reward_index) == 0:
        positive_reward_index = np.asarray(range(9))

    return np.random.choice(positive_reward_index)


def execute_action(action, history, ground_truth_box, current_mask):
    """
    Apply an action and report its outcome.

    :param action: id of the action to apply; 8 is the trigger
    :param history: list of recent action ids
    :param ground_truth_box: ground truth box used to compute the reward
    :param current_mask: box before the action
    :return: (new_mask, reward, end, history); for the trigger the box and the
             history are returned unchanged and end is True, otherwise the
             history is a new list with the oldest entry dropped and the
             action appended
    """
    if action != 8:
        moved_mask = compute_mask(action, current_mask)
        step_reward = compute_reward(action, ground_truth_box, current_mask)
        updated_history = history[1:]
        updated_history.append(action)
        return moved_mask, step_reward, False, updated_history

    final_reward = compute_end_reward(current_mask, ground_truth_box)
    return current_mask, final_reward, True, history


def compute_target(reward, new_feature, model):
    """
    Q-learning target of a non-terminal transition.

    :param reward: reward received for the transition
    :param new_feature: state vector reached after the action
    :param model: Q-network used to evaluate the next state
    :return: reward plus gamma times the largest Q-value of the next state
    """
    best_next_q = np.amax(compute_q(new_feature, model))
    return reward + gamma * best_next_q


def crop_image(image, new_mask):
    """
    Cut the region described by a box out of an image and resize it to 224x224.

    :param image: image array of shape (height, width, channel)
    :param new_mask: box as (x1, y1, x2, y2); coordinates are truncated to
                     integers and clamped to the image
    :return: the crop resized to 224x224, or a (224, 224, 3) array of zeros
             when the clamped box has no area
    """
    height, width, channel = np.shape(image)
    x1, y1, x2, y2 = np.asarray(new_mask).astype("int")

    # Clamp every coordinate on both sides. A negative right/bottom edge would
    # otherwise be read as "count from the end" by the slice below.
    x1, x2 = np.clip([x1, x2], 0, width)
    y1, y2 = np.clip([y1, y2], 0, height)

    cropped_image = image[y1:y2, x1:x2]
    new_height, new_width = np.shape(cropped_image)[:2]

    if new_height == 0 or new_width == 0:
        return np.zeros((224, 224, 3))

    # cv2.resize returns the resized image; it does not resize in place
    return cv2.resize(cropped_image, (224, 224))


def experience_replay(deep_q_model, experience):
    """
    Train the Q-network on one batch sampled from the replay buffer.

    :param deep_q_model: Q-network exposing predict and train_on_batch
    :param experience: list of [feature, action, new_feature, reward, end]
    :return: None; the batch loss is appended to the global loss_arr, whose
             mean is printed and which is reset once it holds 100 losses
    """
    global loss_arr

    # random.choices samples with replacement
    sample = random.choices(experience, k=experience_sample_size)
    targets = np.zeros((experience_sample_size, action_option))

    for row, (feature, action, new_feature, reward, end) in enumerate(sample):
        # Terminal transitions use the bare reward, others add the discounted next-state value
        target = reward if end else compute_target(reward, new_feature, deep_q_model)

        # Start from the current predictions so only the taken action contributes to the loss
        targets[row, :] = compute_q(feature, deep_q_model)
        targets[row][action] = target

    x = np.concatenate([transition[0] for transition in sample])

    loss = deep_q_model.train_on_batch(x, targets)
    loss_arr.append(loss)
    if len(loss_arr) == 100:
        print("loss %s" % str(sum(loss_arr) / len(loss_arr)))
        loss_arr = []


def train_deep_q(training_epoch, epsilon, image_list, bounding_box_list, deep_q_model, vgg16):
    """
    Train the Q-network with epsilon-greedy episodes and experience replay.

    :param training_epoch: number of passes over the training images
    :param epsilon: initial exploration probability; lowered by 0.1 after every
                    epoch numbered below epsilon_change_steps
    :param image_list: sequence of images, one per training sample
    :param bounding_box_list: sequence of ground truth boxes, aligned with image_list
    :param deep_q_model: Q-network to train
    :param vgg16: feature extraction network passed to extract_feature
    :return: the trained deep_q_model (also saved to "my_tmp_model.h5" after
             every epoch)
    """
    experience = []

    # len() instead of np.shape(): the images have different sizes, and
    # np.shape() on such a list would try to build a rectangular array.
    training_set_size = min(len(image_list), len(bounding_box_list))
    # At most 1000 images per epoch, but never more than are available
    images_per_epoch = min(1000, training_set_size)

    for current_epoch in range(1, training_epoch + 1):

        print("Now starting epoch %d" % current_epoch)

        for i in range(images_per_epoch):
            image = image_list[i]
            ground_truth_box = bounding_box_list[i]
            # -1 marks an empty history slot
            history = [-1] * history_size
            height, width, channel = np.shape(image)
            # Every episode starts from a box covering the whole image
            current_mask = np.asarray([0, 0, width, height])
            feature = extract_feature(image, history, vgg16)
            end = False
            step = 0
            total_reward = 0

            while not end:
                q_value = compute_q(feature, deep_q_model)
                action = select_action(feature, ground_truth_box, step, q_value, epsilon, current_mask)
                new_mask, reward, end, history = execute_action(action, history, ground_truth_box, current_mask)
                cropped_image = crop_image(image, new_mask)
                new_feature = extract_feature(cropped_image, history, vgg16)

                # Drop the oldest transition once the buffer is longer than its limit
                if len(experience) > max_experience_size:
                    experience = experience[1:]
                experience.append([feature, action, new_feature, reward, end])

                experience_replay(deep_q_model, experience)
                feature = new_feature
                current_mask = new_mask
                step += 1
                total_reward += reward

            print("Image %d, total reward %i" % (i, total_reward))

        if current_epoch < epsilon_change_steps:
            epsilon -= 0.1
            print("current epsilon is %f" % epsilon)

        # Checkpoint after every epoch
        tf.keras.models.save_model(deep_q_model, "my_tmp_model.h5")

    return deep_q_model


HUBER_DELTA = 1.0
def smoothL1(y_true, y_pred):
    """
    Smooth L1 (Huber) loss summed over all elements.

    Errors below HUBER_DELTA are penalised quadratically, larger errors linearly.

    :param y_true: target tensor
    :param y_pred: predicted tensor
    :return: scalar tensor holding the summed loss
    """
    abs_error = K.abs(y_true - y_pred)
    quadratic_part = 0.5 * abs_error ** 2
    linear_part = HUBER_DELTA * (abs_error - 0.5 * HUBER_DELTA)
    return K.sum(tf.where(abs_error < HUBER_DELTA, quadratic_part, linear_part))


def create_q_model():
    """
    Build and compile the Q-network.

    The input is the 4096 image features followed by the one-hot action
    history (action_option * history_size values). The output has one linear
    unit per action.

    :return: compiled Sequential model trained with smoothL1 and Adam
    """
    model = Sequential()
    model.add(Dense(1024, input_shape=(4096 + action_option*history_size,), activation='relu'))
    model.add(Dense(1024, activation='relu'))
    # One Q-value per action: sized from action_option like the input layer,
    # rather than repeating the literal 9.
    model.add(Dense(action_option, activation='linear'))
    model.compile(loss=smoothL1, optimizer='adam')
    return model


def create_vgg16():
    """
    Build the VGG16 network with ImageNet weights, including its top layers.

    :return: the VGG16 model used by extract_feature
    """
    # NOTE(review): `pooling` is presumably ignored when include_top=True — confirm
    return VGG16(weights='imagenet', include_top=True , pooling='max')


def main():
    """
    Load the training data of all 20 classes, train the Q-network and save it
    to "model.h5".
    """
    # object_number = int(sys.argv[1])
    # training_epoch = sys.argv[2]
    # epsilon = sys.argv[3]
    training_epoch = 10
    epsilon = 1
    image_list = []
    bounding_box_list = []
    for i in range(1,21):
        image_data, bounding_box_data = load_data(i, test=False)
        # train_deep_q indexes these lists per image, so add the individual
        # images and boxes rather than one array per class.
        image_list.extend(image_data)
        bounding_box_list.extend(bounding_box_data)
    deep_q_model = create_q_model()
    vgg16 = create_vgg16()
    trained_model = train_deep_q(training_epoch, epsilon, image_list, bounding_box_list, deep_q_model, vgg16)
    trained_model.save("model.h5")



In [None]:
#if __name__ == '__main__':
main()

In [None]:
test()

In [None]:
test2()

In [None]:
#history_size = 10
#action_option = 9
#max_steps = 20
#experience_sample_size = 20
#max_experience_size = 1000
#gamma = 0.1
#epsilon_change_steps = 10
loss_arr = []
def test2(iou_threshold, object_number=1):
    """
    Run the trained agent greedily on up to 101 test images and score the
    final boxes over a sweep of IoU thresholds.

    NOTE: this definition shares its name with the earlier zero-argument
    test2 and replaces it once this cell has run.

    :param iou_threshold: kept for backward compatibility; the returned score
                          sweeps its own thresholds (0.5 to 0.95 in steps of
                          0.05) and does not read this value
    :param object_number: id of the object class to evaluate (see get_object_name)
    :return: mean, over the thresholds, of the fraction of final boxes whose
             IoU with the ground truth of the same image reaches the
             threshold; 0.0 when no image was evaluated
    """
    # load_data requires the class id; calling it with only test=True raises TypeError
    image_list, bounding_box_list = load_data(object_number, test=True)
    vgg16 = create_vgg16()
    deep_q = create_q_model()
    deep_q.load_weights("model.h5")

    max_search_steps = 40
    # Never index past the data that was actually loaded
    num_images = min(101, len(image_list), len(bounding_box_list))

    final_boxes = []
    for i in range(num_images):
        image = image_list[i]
        # -1 marks an empty history slot
        history = [-1] * history_size
        height, width, channel = np.shape(image)
        # The search starts from a box covering the whole image
        current_mask = np.asarray([0, 0, width, height])
        feature = extract_feature(image, history, vgg16)

        for step in range(max_search_steps):
            action = np.argmax(compute_q(feature, deep_q))
            history = history[1:]
            history.append(action)

            # Stop on the trigger action or when the step budget is used up
            if action == 8 or step == max_search_steps - 1:
                break

            current_mask = compute_mask(action, current_mask)
            cropped_image = crop_image(image, current_mask)
            feature = extract_feature(cropped_image, history, vgg16)

        final_boxes.append(current_mask.tolist())

    # Nothing to score: avoid dividing by zero
    if not final_boxes:
        return 0.0

    # Each prediction is scored against the ground truth of its own image.
    # The IoUs are computed once and reused for every threshold.
    iou_scores = [compute_iou(box, bounding_box_list[i]) for i, box in enumerate(final_boxes)]

    ap_list = []
    for iou_thresh in np.arange(0.5, 1.0, 0.05):
        true_positives = sum(1 for score in iou_scores if score >= iou_thresh)
        ap_list.append(true_positives / len(iou_scores))

    mAP = np.mean(ap_list)
    return mAP
