In [None]:
################# HELPER PREPROCESS ######################################

import json
import numpy as np
from random import randint
import cv2


# Root folders on the mounted Google Drive.
ALL_DET = "/content/drive/MyDrive/HOI_Data/All_data/"
INFOS = "/content/drive/MyDrive/HOI_Data/infos/"

all_data_directory = ALL_DET

# Annotation file for each split.
ann_file_train = '{}Annotations_vcoco/train_annotations.json'.format(all_data_directory)
ann_file_val = '{}Annotations_vcoco/val_annotations.json'.format(all_data_directory)
ann_file_test = '{}Annotations_vcoco/test_annotations.json'.format(all_data_directory)

# Load the three annotation files in train / val / test order.
_loaded = []
for _path in (ann_file_train, ann_file_val, ann_file_test):
    with open(_path) as f:
        _loaded.append(json.load(f))
ANNOTATIONS_TRAIN, ANNOTATIONS_VAL, ANNOTATIONS_TEST = _loaded

# Folders holding the per-image object-detection JSON files.
OBJ_path_train = '{}Object_Detections_vcoco/train/'.format(all_data_directory)
OBJ_path_test = '{}Object_Detections_vcoco/val/'.format(all_data_directory)

# Verb names listed in id order; the position in this tuple is the verb id.
_VERB_NAMES = (
    'carry', 'catch', 'cut_instr', 'cut_obj', 'drink',
    'eat_instr', 'eat_obj', 'hit_instr', 'hit_obj', 'hold',
    'jump', 'kick', 'lay', 'look', 'point',
    'read', 'ride', 'run', 'sit', 'skateboard',
    'ski', 'smile', 'snowboard', 'stand', 'surf',
    'talk_on_phone', 'throw', 'walk', 'work_on_computer',
)
VERB_TO_ID = {name: idx for idx, name in enumerate(_VERB_NAMES)}

# Minimum IoU for an annotated box to be matched to a detected box.
MATCHING_IOU = 0.5
NUMBER_OF_VERBS = 29

def get_detections(segment_key, flag):
    """Load the filtered detections and cleaned annotation of one image.

    Detections are split into persons and other objects and kept only when
    their score reaches the per-kind threshold (0.6 for persons, 0.3 for
    objects). Box coordinates are converted to image (pixel) coordinates.

    Args:
        segment_key: integer image id; used as the annotation key and to
            build the detection file name.
        flag: split name, one of 'train', 'val' or 'test'. The 'val' split
            reads its detections from the train detection folder.

    Returns:
        d_p_boxes: person boxes as [left, top, right, bottom] in image coordinates.
        d_o_boxes: object boxes as [left, top, right, bottom] in image coordinates.
        scores_p: scores of the person boxes.
        scores_o: scores of the object boxes.
        class_id_persons: class ids of the person boxes.
        class_id_objects: class ids of the object boxes.
        annotation: cleaned annotation, e.g.
            [{'person_box': [0, 1, 352, 145],
              'hois': [{'verb': 'cut_obj', 'obj_box': [117, 175, 522, 332]}]}].
        img_shape: [W, H] of the image, as stored in the detection file.

    Raises:
        ValueError: if ``flag`` is not 'train', 'val' or 'test'.
    """

    SCORE_PER = 0.6
    SCORE_OBJ = 0.3
    # Upper bound on how many person boxes are kept (objects: one fewer).
    select_threshold = 2000000

    if flag == 'train':
        annotation = ANNOTATIONS_TRAIN[str(segment_key)]
        cur_obj_paths = OBJ_path_train + "COCO_train2014_%.12i.json" % (segment_key)
    elif flag == 'test':
        annotation = ANNOTATIONS_TEST[str(segment_key)]
        cur_obj_paths = OBJ_path_test + "COCO_val2014_%.12i.json" % (segment_key)
    elif flag == 'val':
        annotation = ANNOTATIONS_VAL[str(segment_key)]
        cur_obj_paths = OBJ_path_train + "COCO_train2014_%.12i.json" % (segment_key)
    else:
        # Previously an unknown flag fell through and crashed later on an
        # unbound local; fail early with a clear message instead.
        raise ValueError(
            "flag must be 'train', 'val' or 'test', got {!r}".format(flag)
        )

    annotation = clean_up_annotation(annotation)

    with open(cur_obj_paths) as f:
        detections = json.load(f)

    img_H = detections['H']
    img_W = detections['W']
    img_shape = [img_W, img_H]
    persons_d, objects_d = analyze_detections(detections, SCORE_PER, SCORE_OBJ)
    d_p_boxes, scores_p, class_id_persons = get_boxes_det(persons_d, img_H, img_W)
    d_o_boxes, scores_o, class_id_objects = get_boxes_det(objects_d, img_H, img_W)

    if len(d_p_boxes) > select_threshold:
        d_p_boxes = d_p_boxes[:select_threshold]
        scores_p = scores_p[:select_threshold]
        class_id_persons = class_id_persons[:select_threshold]

    max_objects = select_threshold - 1
    if len(d_o_boxes) > max_objects:
        d_o_boxes = d_o_boxes[:max_objects]
        scores_o = scores_o[:max_objects]
        class_id_objects = class_id_objects[:max_objects]

    return d_p_boxes, d_o_boxes, scores_p, scores_o, class_id_persons, class_id_objects, annotation, img_shape




def analyze_detections(detections, SCORE_PER, SCORE_OBJ):

    """Split the detections of one image into confident persons and objects.

    A detection whose 'class_str' is 'person' is kept when its score is at
    least SCORE_PER; any other detection is kept when its score is at least
    SCORE_OBJ.

    Returns:
        (persons, objects): two lists of the retained detection dicts, in
        their original order.
    """

    persons, objects = [], []

    for entry in detections['detections']:
        if entry['class_str'] == 'person':
            bucket, cutoff = persons, SCORE_PER
        else:
            bucket, cutoff = objects, SCORE_OBJ

        if entry['score'] >= cutoff:
            bucket.append(entry)

    return persons, objects


def get_boxes_det(dets, img_H, img_W):

    """Collect boxes, scores and class ids from a list of detections.

    Each detection stores 'box_coords' as [top, left, bottom, right]; the
    horizontal values are multiplied by img_W and the vertical ones by img_H,
    and the box is re-ordered to [left, top, right, bottom].

    Args:
        dets: detections such as
            [{'class_str': 'tie', 'score': 0.063, 'class_no': 28,
              'box_coords': [0.06, 0.109, 0.847, 0.535]}]
        img_H: image height.
        img_W: image width.

    Returns:
        (boxes, scores, class_no): three parallel lists.
    """

    boxes, scores, class_no = [], [], []

    for det in dets:
        y_min, x_min, y_max, x_max = det['box_coords']
        scores.append(det['score'])
        class_no.append(det['class_no'])
        boxes.append([x_min * img_W, y_min * img_H, x_max * img_W, y_max * img_H])

    return boxes, scores, class_no




def clean_up_annotation(annotation):

    """
    Group the raw per-interaction annotation entries of an image by person.

    Box coordinates are truncated to ints. Entries that share the same
    (truncated) person box are merged into one record; an interaction with no
    object keeps an empty 'obj_box'.

    Args:
        annotation: raw entries, e.g.
            [{'person_bbx': [0.96, 1.07, 352., 145],
              'Verbs': 'cut_obj',
              'object': {'obj_bbx': [117.61, 175.46, 522.51, 332.6]}},
             {'person_bbx': [0.96, 1.07, 352., 145],
              'Verbs': 'hold',
              'object': {'obj_bbx': [163.17, 50.3, 231.19, 116]}}]

    Returns:
        One record per distinct person box, in first-seen order, e.g.
            [{'person_box': [0, 1, 352, 145],
              'hois': [{'verb': 'cut_obj', 'obj_box': [117, 175, 522, 332]},
                       {'verb': 'hold', 'obj_box': [163, 50, 231, 116]}]}]
    """

    grouped = {}

    for entry in annotation:
        person_box = [int(value) for value in entry['person_bbx']]
        raw_obj_box = entry['object']['obj_bbx']

        interaction = {
            'verb': entry['Verbs'],
            'obj_box': [int(value) for value in raw_obj_box] if len(raw_obj_box) != 0 else [],
        }

        # The int-truncated person box is the grouping key.
        record = grouped.setdefault(
            tuple(person_box), {'person_box': person_box, 'hois': []}
        )
        record['hois'].append(interaction)

    return list(grouped.values())





def get_compact_detections(segment_key, flag):

    """Return the detections of an image as normalised numpy arrays.

    Needed for building the attention maps. Boxes are divided by
    [img_W, img_H, img_W, img_H], so they are expressed relative to the image
    size rather than in image coordinates. A placeholder object box
    [0, 0, 0, 0] is put in front of the detected objects to stand for
    interactions without an object, and class id 1 is inserted at the front
    of the object class ids for it.

    Returns:
        dict with keys
            'person_bbx': array of normalised person boxes,
            'objects_bbx': array of normalised object boxes, placeholder first,
            'person_bbx_score': scores of the person boxes,
            'object_bbx_score': scores of the detected objects (no entry is
                added for the placeholder),
            'class_id_objects': object class ids, placeholder first.
    """

    (d_p_boxes, d_o_boxes, scores_p, scores_o,
     class_id_persons, class_id_objects, annotation, img_shape) = get_detections(segment_key, flag)

    img_W, img_H = img_shape[0], img_shape[1]
    scale = np.array([img_W, img_H, img_W, img_H])

    class_id_objects.insert(0, 1)

    if len(d_p_boxes) != 0:
        persons_np = np.array(d_p_boxes, np.float32)
    else:
        # Keep a well-formed (0, 4) array when no person was detected.
        persons_np = np.zeros([0, 4], np.float32)

    objects_np = np.array([[0, 0, 0, 0]] + d_o_boxes, np.float32)

    return {
        'person_bbx': persons_np / scale,
        'objects_bbx': objects_np / scale,
        'person_bbx_score': scores_p,
        'object_bbx_score': scores_o,
        'class_id_objects': class_id_objects
    }


def get_attention_maps(segment_key, flag):
    """Build the two-channel box maps for every person/object pair of an image.

    The pairs are enumerated person-major: for each person box, every object
    box (placeholder included) is paired with it in order.

    Returns:
        The per-pair maps produced by union_BOX, concatenated along the
        first axis.
    """
    dets = get_compact_detections(segment_key, flag)
    person_rois = dets['person_bbx']
    object_rois = dets['objects_bbx']

    pair_maps = [
        union_BOX(person_rois[p_idx], object_rois[o_idx], segment_key)
        for p_idx in range(len(person_rois))
        for o_idx in range(len(object_rois))
    ]

    return np.concatenate(pair_maps)


def union_BOX(roi_pers, roi_objs, segment_key, H=64, W=64):

    """
    Rasterise one person box and one object box onto an H x W grid.

    Both boxes are multiplied by H and truncated to ints; channel 0 of the
    result is set to 100 inside the person box and channel 1 inside the
    object box (both box edges inclusive). ``segment_key`` is not used.

    Returns:
        Array of shape (1, 2, H, W).
    """
    assert H == W
    rois_on_grid = [np.array(roi * H, dtype=int) for roi in (roi_pers, roi_objs)]

    canvas = np.zeros([1, 2, H, W])
    for channel, grid_box in enumerate(rois_on_grid):
        rows = slice(grid_box[1], grid_box[3] + 1)
        cols = slice(grid_box[0], grid_box[2] + 1)
        canvas[0, channel, rows, cols] = 100

    return canvas



def get_compact_label(segment_key, flag):

    """Build the verb labels for every detected person/object pair of an image.

    Annotated boxes are matched to detected boxes by IoU, so the labels refer
    to the detector's boxes rather than to the annotated ones:

    * each detected person is matched to the annotated person with the
      highest IoU and skipped when that IoU is below MATCHING_IOU;
    * verbs annotated without an object are written to object slot 0;
    * verbs annotated with an object are written to slot ``do + 1`` of every
      detected object ``do`` whose IoU with the annotated object box is not
      below MATCHING_IOU.

    Returns:
        dict with
            'labels_all': int32 array of shape
                (num_persons, num_objects + 1, NUMBER_OF_VERBS); the +1 slot
                is for verbs without an object.
            'labels_single': array of shape (num_persons * (num_objects + 1), 1)
                holding 1 for a pair that has at least one verb, else 0.
    """

    d_p_boxes, d_o_boxes, scores_p, scores_o, class_id_persons, class_id_objects, annotation, img_shape = get_detections(segment_key, flag)

    no_person_dets, no_obj_dets = len(d_p_boxes), len(d_o_boxes)
    labels_np = np.zeros([no_person_dets, no_obj_dets+1, NUMBER_OF_VERBS], np.int32)

    a_p_boxes = [ann['person_box'] for ann in annotation]

    # The guard must be on *person* detections: testing the object count here
    # dropped the object-less verb labels (slot 0) of every image in which
    # persons were detected but no object was.
    if no_person_dets != 0 and len(a_p_boxes) != 0:
        iou_mtx = get_iou_mtx(a_p_boxes, d_p_boxes)
        max_iou_each_det = np.max(iou_mtx, axis=0)
        index_max_each_det = np.argmax(iou_mtx, axis=0)

        for dd in range(no_person_dets):
            if max_iou_each_det[dd] < MATCHING_IOU:
                continue
            hoi_anns = annotation[index_max_each_det[dd]]['hois']

            # Verbs without an object go to the placeholder slot 0.
            for no_hoi in hoi_anns:
                if len(no_hoi['obj_box']) == 0:
                    labels_np[dd, 0, VERB_TO_ID[no_hoi['verb']]] = 1

            object_hois = [oi for oi in hoi_anns if len(oi['obj_box']) != 0]
            a_o_boxes = [oi['obj_box'] for oi in object_hois]

            if a_o_boxes and d_o_boxes:
                iou_mtx_o = get_iou_mtx(a_o_boxes, d_o_boxes)
                for do in range(len(d_o_boxes)):
                    for ao in range(len(a_o_boxes)):
                        if iou_mtx_o[ao, do] < MATCHING_IOU:
                            continue
                        verb_idx = VERB_TO_ID[object_hois[ao]['verb']]
                        labels_np[dd, do+1, verb_idx] = 1

    # One row per (person, object) pair; a pair is positive if any verb is set.
    comp_labels = labels_np.reshape(no_person_dets*(no_obj_dets+1), NUMBER_OF_VERBS)
    labels_single = np.array([1 if i.any() else 0 for i in comp_labels])
    labels_single = labels_single.reshape(np.shape(labels_single)[0], 1)

    return {'labels_all': labels_np, 'labels_single': labels_single}





def get_iou_mtx(anns, dets):

    """Return the (n_ann, n_dets) matrix of IoU values between two box lists.

    Entry [i, j] is the IoU of annotated box i and detected box j.
    """

    iou_mtx = np.zeros([len(anns), len(dets)])

    for row, ann_box in enumerate(anns):
        for col, det_box in enumerate(dets):
            iou_mtx[row, col] = IoU_box(ann_box, det_box)

    return iou_mtx


def IoU_box(box1, box2):
    """Return the intersection over union of two boxes.

    Args:
        box1 : left1, top1, right1, bottom1
        box2 : left2, top2, right2, bottom2

    Returns:
        Intersection area divided by union area, or 0.0 when the union area
        is not positive (e.g. both boxes are degenerate), which previously
        caused a division by zero.
    """

    left1, top1, right1, bottom1 = box1
    left2, top2, right2, bottom2 = box2

    # Clamp at 0 so disjoint boxes give an empty intersection.
    inter_w = max(0, min(right1, right2) - max(left1, left2))
    inter_h = max(0, min(bottom1, bottom2) - max(top1, top2))
    area_intersection = inter_w * inter_h

    area1 = (right1 - left1) * (bottom1 - top1)
    area2 = (right2 - left2) * (bottom2 - top2)
    area_union = area1 + area2 - area_intersection

    if area_union <= 0:
        return 0.0

    return area_intersection / area_union




def get_bad_detections(segment_key, flag):

    """Tell whether an image has an empty label array.

    'labels_all' has shape (persons, objects + 1, verbs), so it is empty
    exactly when no person was detected in the image.

    Returns:
        True when 'labels_all' holds no element, otherwise False.
    """

    labels_all = get_compact_label(segment_key, flag)['labels_all']
    return labels_all.size == 0


def dry_run():
    """Scan every annotated image of each split for bad detections.

    Image ids are visited in ascending numeric order, split by split
    (train, then val, then test).

    Returns:
        (bad_detections_train, bad_detections_val, bad_detections_test):
        lists of the integer image ids for which get_bad_detections is true.
    """

    splits = (
        ('train', ANNOTATIONS_TRAIN),
        ('val', ANNOTATIONS_VAL),
        ('test', ANNOTATIONS_TEST),
    )

    # Convert and sort all ids up front, before any detection file is read.
    ordered_ids = [
        (split_name, sorted(int(key) for key in annotations.keys()))
        for split_name, annotations in splits
    ]

    ######### detect bad detections #############

    bad_per_split = []
    for split_name, image_ids in ordered_ids:
        bad_ids = []
        for segkey in image_ids:
            if get_bad_detections(segkey, split_name):
                bad_ids.append(segkey)
        bad_per_split.append(bad_ids)

    bad_detections_train, bad_detections_val, bad_detections_test = bad_per_split
    return bad_detections_train, bad_detections_val, bad_detections_test

In [None]:
############ CALCULATE AP CLASSWISE #######################################

from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
import pandas as pd
import torch
import numpy as np
import os
import random

NO_VERBS = 29
VERBS_NO_COCO = 80

# Verb names indexed by verb id (despite the name, this maps id -> verb).
VERB2ID = [
    'carry', 'catch', 'cut_instr', 'cut_obj', 'drink',
    'eat_instr', 'eat_obj', 'hit_instr', 'hit_obj', 'hold',
    'jump', 'kick', 'lay', 'look', 'point',
    'read', 'ride', 'run', 'sit', 'skateboard',
    'ski', 'smile', 'snowboard', 'stand', 'surf',
    'talk_on_phone', 'throw', 'walk', 'work_on_computer',
]

# The 80 class names, 'person' first.
coco_verbs = [
    'person',
    'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

threshold = 0.1

def class_AP(*args):
    """Compute the per-verb average precision and the interaction AP.

    Positional arguments:
        args[0]: predicted verb scores, indexed as [:, verb_id].
        args[1]: true verb labels, indexed as [:, verb_id].
        args[2]: predicted scores of the single interaction class.
        args[3]: true labels of the single interaction class.

    Returns:
        (result, interaction): ``result`` is a list of (verb_name, AP)
        tuples, one per verb, followed by ('Mean', mean AP);
        ``interaction`` is [('AP', AP of the single interaction class)].
        All APs are percentages.

    Any error raised by average_precision_score now propagates to the caller;
    the previous version dropped into an interactive debugger instead and
    then continued with an unset or stale AP value.
    """
    predicted_score = args[0]
    true_score = args[1]
    predicted_single_class = args[2]
    true_single_class = args[3]

    result = []
    ap_total = 0

    for k in range(NO_VERBS):
        verb_ap = average_precision_score(true_score[:, k], predicted_score[:, k]) * 100
        ap_total += verb_ap
        result.append((VERB2ID[k], verb_ap))

    result.append(('Mean', ap_total / NO_VERBS))

    interaction_ap = average_precision_score(true_single_class, predicted_single_class) * 100
    return result, [('AP', interaction_ap)]

In [None]:
############ DATA LOADER VCOCO ##########################

from __future__ import print_function, division
import pickle
import json
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# Pre-computed ids of the images to exclude, one entry per split.
_BAD_DETECTIONS_PKL = "/content/drive/MyDrive/HOI_Data/Checkpoints/bad_detections.pkl"

with open(_BAD_DETECTIONS_PKL, "rb") as f:
    data = pickle.load(f)

bad_detections_train, bad_detections_val, bad_detections_test = (
    data["train"],
    data["val"],
    data["test"],
)

NO_VERB = 29

def vcoco_collate(batch):
    """Merge a list of dataset samples into batch tensors.

    Args:
        batch: samples with the keys 'image', 'image_id', 'labels_all' and
            'labels_single'.

    Returns:
        A list of five tensors:
            images: the sample images, stacked.
            labels_all: per-pair verb labels of all samples, reshaped to
                (pairs, NO_VERB) per sample and concatenated.
            labels_single: per-pair interaction labels, concatenated.
            image_id: the integer image ids, stacked.
            pairs_info: the shape of each sample's 'labels_all', stacked.
    """
    image, image_id, pairs_info = [], [], []
    labels_all, labels_single = [], []

    for item in batch:
        pair_labels = item['labels_all']
        interact_labels = item['labels_single']

        image.append(item['image'])
        image_id.append(torch.tensor(int(item['image_id'])))
        pairs_info.append(torch.tensor(np.shape(pair_labels)))

        # Number of person/object pairs of this sample.
        tot_HOI = int(np.shape(interact_labels)[0])
        labels_all.append(torch.tensor(pair_labels.reshape(tot_HOI, NO_VERB)))
        labels_single.append(torch.tensor(interact_labels))

    return [
        torch.stack(image),
        torch.cat(labels_all),
        torch.cat(labels_single),
        torch.stack(image_id),
        torch.stack(pairs_info),
    ]

class Rescale:
    """Resize an image to a requested size.

    ``output_size`` is either an int, in which case the shorter image side is
    scaled to it and the other side keeps the aspect ratio, or a
    (height, width) tuple used as is.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        h, w = image.shape[:2]
        target = self.output_size

        if not isinstance(target, int):
            new_h, new_w = target
        elif h > w:
            new_h, new_w = target * h / w, target
        else:
            new_h, new_w = target, target * w / h

        return transform.resize(image, (int(new_h), int(new_w)))

class ToTensor:
    """Turn an (H, W, C) ndarray image into a float (C, H, W) tensor."""

    def __call__(self, image):
        channel_first_axes = (2, 0, 1)
        channel_first = image.transpose(channel_first_axes)
        tensor = torch.from_numpy(channel_first)
        return tensor.float()

class vcoco_Dataset:
    """Map-style dataset over the annotated images of one V-COCO split.

    The split ('train', 'val' or 'test') is taken from the annotation file
    name (the part before the first underscore). Images listed in the
    matching ``bad_detections_*`` list are left out.
    """

    def __init__(self, json_file_image, root_dir, transform=None):
        """Load the annotation file and select the usable image ids.

        Args:
            json_file_image: path of the split's annotation JSON file.
            root_dir: folder that contains the images of the split.
            transform: optional callable applied to each image array.

        Raises:
            ValueError: if the file name does not start with a known split.
        """
        with open(json_file_image) as json_file_:
            self.vcoco_frame_file = json.load(json_file_)
        self.flag = json_file_image.split('/')[-1].split('_')[0]

        if self.flag == 'train':
            bad_ids = bad_detections_train
        elif self.flag == 'val':
            bad_ids = bad_detections_val
        elif self.flag == 'test':
            bad_ids = bad_detections_test
        else:
            raise ValueError(
                "cannot infer split from file name {!r}".format(json_file_image)
            )

        self.vcoco_frame = self._filter_bad(self.vcoco_frame_file.keys(), bad_ids)
        self.root_dir = root_dir
        self.transform = transform

    @staticmethod
    def _filter_bad(keys, bad_ids):
        """Return the keys whose id is not listed in ``bad_ids``.

        Ids are compared as whole strings. The previous substring test
        against ``str(bad_ids)`` also removed every key that merely occurred
        inside another id (e.g. '123' when 11234 was bad).
        """
        excluded = {str(bad_id) for bad_id in bad_ids}
        return [key for key in keys if key not in excluded]

    def __len__(self):
        return len(self.vcoco_frame)

    def __getitem__(self, idx):
        """Return the image, its labels and its id for position ``idx``."""
        frame_key = self.vcoco_frame[idx]

        # The test split uses the val2014 images, the others train2014.
        prefix = 'COCO_val2014_' if self.flag == 'test' else 'COCO_train2014_'
        img_pre_suffix = prefix + str(frame_key).zfill(12) + '.jpg'

        all_labels = get_compact_label(int(frame_key), self.flag)

        img_name = os.path.join(self.root_dir, img_pre_suffix)
        image = np.array(Image.open(img_name).convert('RGB'))

        if self.transform:
            image = self.transform(image)

        return {
            'image': image,
            'labels_all': all_labels['labels_all'],
            'labels_single': all_labels['labels_single'],
            'image_id': frame_key,
        }

In [None]:
####################### POOL PAIRING ########################################

import torch
import torch.nn as nn
import numpy as np
import math


def get_pool_loc(ims, image_id, flag_, size=(7, 7), spatial_scale=1, batch_size=1):

    """Pool the person and object regions of each image to a fixed size.

    For every image of the batch the normalised detection boxes are scaled to
    the spatial size of ``ims[im]``, the corresponding region is cropped
    (box edges inclusive) and reduced with adaptive max pooling to ``size``.

    Args:
        ims: per-image tensors; each is read as (C, H, W).
        image_id: per-image ids, convertible with ``int``.
        flag_: per-image split codes read from ``flag_[im][0]``:
            0 -> 'train', 1 -> 'val', 2 -> 'test'. Any other code keeps the
            split of the previous image ('train' for the first one).
        size: output size of the adaptive max pooling.
        spatial_scale: not used; kept so existing callers keep working. The
            scale is always derived from the size of ``ims[im]``.
        batch_size: number of images to process.

    Returns:
        pers_out: stacked pooled person regions.
        objs_out: stacked pooled object regions.
        spatial_locs: list of [x1, y1, x2, y2, x2-x1, y2-y1]; for each image
            the persons come first, then the objects.
        union_box_out: the box maps of all images, concatenated.

    Tensors are created on the CUDA device when one is available and on the
    CPU otherwise (previously CUDA was mandatory).
    """

    spatial_locs = []
    union_box_out = []
    pers_out = []
    objs_out = []

    flag = 'train'
    split_by_code = {0: 'train', 1: 'val', 2: 'test'}
    max_pool = nn.AdaptiveMaxPool2d(size)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _pool_rois(feature_map, rois, pooled_out):
        """Crop and pool every roi, recording its location in spatial_locs."""
        for roi_val in rois:
            x1, y1, x2, y2 = int(roi_val[0]), int(roi_val[1]), int(roi_val[2]), int(roi_val[3])
            region = feature_map[..., y1:(y2+1), x1:(x2+1)]
            pooled_out.append(max_pool(region))
            spatial_locs.append([x1, y1, x2, y2, x2-x1, y2-y1])

    for im in range(batch_size):
        this_image = int(image_id[im])
        flag = split_by_code.get(int(flag_[im][0]), flag)

        a = get_compact_detections(this_image, flag)
        roi_pers, roi_objs = a['person_bbx'], a['objects_bbx']
        union_box = get_attention_maps(this_image, flag)
        union_box_out.append(torch.tensor(union_box).to(device).float())

        image_this_im = ims[im]
        H, W = image_this_im.size()[1], image_this_im.size()[2]
        # Normalised boxes -> coordinates on this image tensor.
        scale = [W, H, W, H]
        roi_pers, roi_objs = roi_pers*scale, roi_objs*scale

        _pool_rois(image_this_im, roi_pers, pers_out)
        _pool_rois(image_this_im, roi_objs, objs_out)

    return torch.stack(pers_out), torch.stack(objs_out), spatial_locs, torch.cat(union_box_out)

def extract_spatial(hum_box, obj_box):

    """Compute the relative spatial features of a person box and an object box.

    Both boxes are [x1, y1, x2, y2, width, height]. The features are

        [dx / wo, dy / ho, log(wh / wo), log(hh / ho)]

    where (dx, dy) is the offset between the two top-left corners (replaced
    by 0.001 when it is exactly 0), (wh, hh) is the person size and (wo, ho)
    the object size. Any size that is 0 is replaced by 1, so a zero-sized
    object leaves the corresponding term undivided.

    Returns:
        A float tensor with the four features, on the CUDA device when one
        is available and on the CPU otherwise (previously CUDA was mandatory).
    """

    x1h, y1h, wh, hh = float(hum_box[0]), float(hum_box[1]), float(hum_box[4]), float(hum_box[5])
    x1o, y1o, wo, ho = float(obj_box[0]), float(obj_box[1]), float(obj_box[4]), float(obj_box[5])

    # Zero extents would break the ratios and the logarithms below.
    if wh == 0.0:
        wh = 1.0
    if hh == 0.0:
        hh = 1.0
    norm_w = wo if wo != 0 else 1.0
    norm_h = ho if ho != 0 else 1.0

    diff_x = 0.001 if x1h - x1o == 0 else x1h - x1o
    diff_y = 0.001 if y1h - y1o == 0 else y1h - y1o

    extract = torch.FloatTensor([
        diff_x / norm_w,
        diff_y / norm_h,
        math.log(wh / norm_w),
        math.log(hh / norm_h),
    ])

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return extract.to(device)


def pairing(pers, objs, context, spatial_locs, pairs_info):

    """Build the person/object pair features of every image in the batch.

    ``pairs_info[b][0]`` and ``pairs_info[b][1]`` give the number of person
    and object entries of image ``b``; they are used to slice ``pers``,
    ``objs`` and ``spatial_locs`` (persons first, then objects, per image).
    Each pair feature is the concatenation of the person feature, the object
    feature and their spatial features, followed by the image's context.

    Returns:
        (pairs, persons, objects): the pair features, person features and
        object features of all images, each concatenated over the batch.
    """

    pairs_out, pers_out, objs_out = [], [], []
    loc_cursor = pers_cursor = objs_cursor = 0

    for batch in range(len(pairs_info)):
        info = pairs_info[batch]
        n_pers = int(info[0])
        n_objs = int(info[1])
        n_locs = int(info[0]+info[1])

        batch_pers = pers[pers_cursor:pers_cursor+n_pers]
        batch_objs = objs[objs_cursor:objs_cursor+n_objs]
        batch_context = context[batch]

        batch_locs = spatial_locs[loc_cursor:loc_cursor+n_locs]
        pers_locs = batch_locs[0:n_pers]
        objs_locs = batch_locs[n_pers:n_pers+n_objs]

        # Person-major enumeration of all pairs of this image.
        pers_objs = [
            torch.cat([p_feat, o_feat, extract_spatial(pers_locs[p_idx], objs_locs[o_idx])], 0)
            for p_idx, p_feat in enumerate(batch_pers)
            for o_idx, o_feat in enumerate(batch_objs)
        ]
        pers_objs_batch = torch.stack(pers_objs)

        # Append the same context vector to every pair of the image.
        tiled_context = batch_context.repeat(pers_objs_batch.size()[0], 1)
        pairs_out.append(torch.cat([pers_objs_batch, tiled_context], 1))
        pers_out.append(batch_pers)
        objs_out.append(batch_objs)

        loc_cursor += n_locs
        pers_cursor += n_pers
        objs_cursor += n_objs

    return torch.cat(pairs_out), torch.cat(pers_out), torch.cat(objs_out)

In [None]:
################# PREDICTION VISUALIZATION ##################################

import numpy as np
import pickle
import cv2
import pandas as pd
import json
from google.colab.patches import cv2_imshow

# Load directory JSON

all_data_dir = ALL_DET

# Detection folders and image folders; 'val' images live in train2014.
OBJ_PATH_train_s = '{}Object_Detections_vcoco/train/'.format(all_data_dir)
OBJ_PATH_test_s = '{}Object_Detections_vcoco/val/'.format(all_data_dir)
image_dir_train = '{}Data_vcoco/train2014'.format(all_data_dir)
image_dir_val = '{}Data_vcoco/train2014'.format(all_data_dir)
image_dir_test = '{}Data_vcoco/val2014'.format(all_data_dir)

# Verb names in id order, and the two lookup tables derived from them.
_VERB_NAMES_2 = (
    'carry', 'catch', 'cut_instr', 'cut_obj', 'drink',
    'eat_instr', 'eat_obj', 'hit_instr', 'hit_obj', 'hold',
    'jump', 'kick', 'lay', 'look', 'point',
    'read', 'ride', 'run', 'sit', 'skateboard',
    'ski', 'smile', 'snowboard', 'stand', 'surf',
    'talk_on_phone', 'throw', 'walk', 'work_on_computer',
)
VERB2ID_2 = {name: idx for idx, name in enumerate(_VERB_NAMES_2)}
ID2VERB = {idx: name for name, idx in VERB2ID_2.items()}

# Let pandas print wide frames without truncating columns.
pd.options.display.width = None
pd.options.display.max_colwidth = None
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 200)

def visual(image_id, flag, pairs_info, score_HOI, score_interact, score_obj_box, score_per_box, score_REL, score_HOI_pair, ground_truth):
    """Show, pair by pair, the boxes and the top-5 verbs of each image.

    For every person/object pair the person box is drawn in green and the
    object box in red on a copy of the image, and the five highest ground
    truth and predicted verb scores are printed. Pressing ESC stops the
    visualisation.

    Args:
        image_id: per-image ids, convertible with ``int``.
        flag: split name, 'train', 'val' or 'test'.
        pairs_info: per-image counts; ``[b][0]`` persons, ``[b][1]`` objects.
        score_HOI: predicted verb scores, one row per pair over the batch.
        ground_truth: true verb labels, one row per pair over the batch.
        score_interact, score_obj_box, score_per_box, score_REL,
        score_HOI_pair: accepted but not used.

    Raises:
        ValueError: if ``flag`` is not a known split.
    """
    ESC_KEY = 27
    start = 0

    def _top5(values):
        """Return the five highest (verb, score) entries, best first."""
        return [(ID2VERB[ind], float("%.2f" % values[ind])) for ind in np.argsort(values)[-5:][::-1]]

    for batch in range(len(image_id)):
        this_image = int(image_id[batch])
        a = get_compact_detections(this_image, flag)
        person_bbxn = a['person_bbx']
        obj_bbxn = a['objects_bbx']

        this_batch_pers = int(pairs_info[batch][0])
        this_batch_objs = int(pairs_info[batch][1])
        increment = this_batch_pers * this_batch_objs

        # Rows of this image inside the batch-wide score arrays.
        ground_truth_this_batch = ground_truth[start:start + increment]
        score_HOI_this_batch = score_HOI[start:start + increment]
        start += increment

        if flag == 'train':
            cur_obj_path_s = OBJ_PATH_train_s + "COCO_train2014_%.12i.json" % this_image
            image_dir_s = image_dir_train + '/COCO_train2014_%.12i.jpg' % this_image
        elif flag == 'test':
            cur_obj_path_s = OBJ_PATH_test_s + "COCO_val2014_%.12i.json" % this_image
            image_dir_s = image_dir_test + '/COCO_val2014_%.12i.jpg' % this_image
        elif flag == 'val':
            cur_obj_path_s = OBJ_PATH_train_s + "COCO_train2014_%.12i.json" % this_image
            image_dir_s = image_dir_val + '/COCO_train2014_%.12i.jpg' % this_image
        else:
            raise ValueError(
                "flag must be 'train', 'val' or 'test', got {!r}".format(flag)
            )

        with open(cur_obj_path_s) as fp:
            detections = json.load(fp)

        img_H = detections['H']
        img_W = detections['W']

        # Normalised boxes -> image coordinates.
        person_bbx = np.array([img_W, img_H, img_W, img_H], dtype=float) * person_bbxn
        obj_bbx = np.array([img_W, img_H, img_W, img_H], dtype=float) * obj_bbxn
        img = cv2.imread(image_dir_s, 3)

        start_index = 0
        for person_box in person_bbx:
            for object_box in obj_bbx:
                img_temp = img.copy()
                ground_truth_this_sample = ground_truth_this_batch[start_index]
                score_HOI_this_sample = score_HOI_this_batch[start_index]

                print(score_HOI_this_sample)

                pred = [
                    ('GROUND_TRUTH', _top5(ground_truth_this_sample)),
                    ('TOTAL_PREDICTION', _top5(score_HOI_this_sample)),
                ]
                prediction = pd.DataFrame(pred, columns=['Name', 'Prediction'])

                x, y, w, h = int(person_box[0]), int(person_box[1]), int(person_box[2] - person_box[0]), int(person_box[3] - person_box[1])
                cv2.rectangle(img_temp, (x, y), (x + w, y + h), (0, 255, 0), 3)

                x, y, w, h = int(object_box[0]), int(object_box[1]), int(object_box[2] - object_box[0]), int(object_box[3] - object_box[1])
                cv2.rectangle(img_temp, (x, y), (x + w, y + h), (0, 0, 255), 3)

                print('\nPredictions (Five Highest Confidence Class):\n{}'.format(prediction))

                cv2_imshow(img_temp)
                start_index += 1

                # ESC ends the whole visualisation. The earlier version only
                # closed the windows and kept looping, and read the key after
                # the loops even when no pair had been shown.
                if cv2.waitKey(0) == ESC_KEY:
                    cv2.destroyAllWindows()
                    return

    cv2.destroyAllWindows()

In [None]:
##### This script refines the predictions based on the objects detected by the object detector, following the work of https://github.com/vt-vl-lab/iCAN #######


import numpy as np
import pickle

# Load the prior table that apply_prior indexes, from INFOS + 'prior.pickle'.
# The file is opened in binary mode and unpickled with encoding='latin1'
# (presumably because it was written by Python 2 -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load a trusted prior.pickle.
with open(INFOS+'prior.pickle', 'rb') as fp:
    priors = pickle.load(fp, encoding='latin1')

def apply_prior(Object, prediction_HOI_in):
    """Build a prior array with the same shape as ``prediction_HOI_in``.

    Row ``r`` of the result is ``priors[int(Object[r])]``, i.e. the entry of the
    module-level ``priors`` table selected by the r-th value of ``Object``.
    The values of ``prediction_HOI_in`` are never read; only its shape is used.

    Returns:
        A new float array; the inputs are left untouched.
    """
    prior_rows = np.ones(prediction_HOI_in.shape)

    for row in range(len(prior_rows)):
        selected = int(Object[row])
        prior_rows[row] = priors[selected]

    return prior_rows

In [None]:
################ PROPER INFERENCE #####################################

import json
import torch
import numpy as np

all_data_dir = ALL_DET

# Directories holding the per-image object-detection JSON files opened by infer_format.
# Note that the "test" constant points at the 'val/' folder.
OBJ_PATH_train_s = all_data_dir + 'Object_Detections_vcoco/train/'
OBJ_PATH_test_s = all_data_dir + 'Object_Detections_vcoco/val/'
# number_of_roles[v] is how many consecutive entries of proper_keys belong to
# verb index v (29 entries, summing to 54 = len(proper_keys)).
number_of_roles = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2]

# Output keys in verb order, grouped as number_of_roles describes.
# Agent keys for cut/eat/hit appear twice because each of those verbs has both
# an *_instr and an *_obj entry; infer_format keeps the larger score for them.
proper_keys = [
    'carry_agent', 'carry_obj', 'catch_agent', 'catch_obj', 'cut_agent', 'cut_instr', 'cut_agent', 'cut_obj',
    'drink_agent', 'drink_instr', 'eat_agent', 'eat_instr', 'eat_agent', 'eat_obj', 'hit_agent', 'hit_instr',
    'hit_agent', 'hit_obj', 'hold_agent', 'hold_obj', 'jump_agent', 'jump_instr', 'kick_agent', 'kick_obj',
    'lay_agent', 'lay_instr', 'look_agent', 'look_obj', 'point_agent', 'point_instr', 'read_agent', 'read_obj',
    'ride_agent', 'ride_instr', 'run_agent', 'sit_agent', 'sit_instr', 'skateboard_agent', 'skateboard_instr',
    'ski_agent', 'ski_instr', 'smile_agent', 'snowboard_agent', 'snowboard_instr', 'stand_agent', 'surf_agent',
    'surf_instr', 'talk_on_phone_agent', 'talk_on_phone_instr', 'throw_agent', 'throw_obj', 'walk_agent',
    'work_on_computer_agent', 'work_on_computer_instr'
]

def infer_format(image_id, all_scores_batch, flag, all_detections, pairs_info):
    """Append one detection dict per human-object pair to ``all_detections``.

    For every image in ``image_id`` the rows stored in ``all_scores_batch`` are
    combined with the image size read from that image's detection JSON file.

    Args:
        image_id: sequence of image ids; each one is converted with ``int``.
        all_scores_batch: mapping keyed by ``(image_id, name)`` where ``name`` is
            ``'score'`` (one row of per-verb scores per pair), ``'pers_bbx'`` or
            ``'obj_bbx'`` (one 4-value box per pair). This is the layout that
            ``filtering`` produces.
        flag: ``'train'``, ``'val'`` or ``'test'``; selects which detection file
            is opened ('train' and 'val' both read the train2014 files).
        all_detections: list that is extended in place.
        pairs_info: not used; kept so existing callers keep working.

    Returns:
        ``all_detections`` (the same list object). Each appended element looks like
        ``{'person_box': [x1, y1, x2, y2], 'image_id': 233, 'carry_agent': 0.45,
        'carry_obj': [x1, y1, x2, y2, score], ..., 'work_on_computer_instr': [...]}``.
        Boxes are the stored boxes multiplied element-wise by ``[W, H, W, H]``
        (presumably normalised boxes scaled back to pixels -- TODO confirm).

    Raises:
        ValueError: if ``flag`` is not one of 'train', 'val', 'test'.
    """
    for raw_id in image_id:
        this_image = int(raw_id)
        persons = all_scores_batch[this_image, 'pers_bbx']
        objects = all_scores_batch[this_image, 'obj_bbx']
        all_scores = all_scores_batch[this_image, 'score']

        if flag == 'train' or flag == 'val':
            cur_obj_path_s = OBJ_PATH_train_s + "COCO_train2014_%.12i.json" % (this_image)
        elif flag == 'test':
            cur_obj_path_s = OBJ_PATH_test_s + "COCO_val2014_%.12i.json" % (this_image)
        else:
            # Previously this fell through and failed later with an unbound-variable error.
            raise ValueError("flag must be 'train', 'val' or 'test', got {!r}".format(flag))

        with open(cur_obj_path_s) as fp:
            detections = json.load(fp)

        img_H = detections['H']
        img_W = detections['W']
        scale = np.array([img_W, img_H, img_W, img_H], dtype=float)

        for item_no, role_ids in enumerate(all_scores):
            person_bbx = scale * persons[item_no]
            obj_bbx = scale * objects[item_no]

            infer_dict = {}
            infer_dict['person_box'] = person_bbx.tolist()
            infer_dict['image_id'] = this_image

            # dict_index walks through proper_keys; verb `index` owns the next
            # number_of_roles[index] keys.
            dict_index = 0
            for index, k in enumerate(role_ids):
                instances = number_of_roles[index]
                for j in range(instances):
                    key = proper_keys[dict_index + j]
                    if key.endswith('agent'):
                        # Some agent keys occur for two verb variants (e.g. cut_instr /
                        # cut_obj); keep the larger of the two scores.
                        if key not in infer_dict or k > infer_dict[key]:
                            infer_dict[key] = float(k)
                    else:
                        # Role entries carry the object box followed by the score.
                        infer_dict[key] = np.append(obj_bbx, k).tolist()

                dict_index += instances

            all_detections.append(infer_dict)

    return all_detections

In [None]:
############################# MODEL ###############################################


from __future__ import print_function, division
import torch
import torch.nn as nn
import os
import numpy as np
import torchvision.models as models
from torchvision.models import ResNet152_Weights


# Module-level hyperparameters for the HOI model defined below.
lin_size = 1024  # base width used to size the first Linear layer of the interaction / visual / graph heads
ids = 80  # not referenced in the code visible in this cell -- TODO confirm where it is used
context_size = 1024  # not referenced in the code visible in this cell
sp_size = 1024  # not referenced in the code visible in this cell
mul = 3  # not referenced in the code visible in this cell
deep = 512  # not referenced in the code visible in this cell
pool_size = (10, 10)  # passed to get_pool_loc as `size`; also the AvgPool2d kernel for person / object features
pool_size_pose = (18, 5, 5)  # not referenced in the code visible in this cell




class Flatten(nn.Module):
    """Module that collapses every dimension after the first into one."""

    def forward(self, x):
        # Keep the leading dimension and let view() infer the flattened size.
        leading = x.size(0)
        return x.view(leading, -1)


class HOI_Detector(nn.Module):
    """Multi-stream human-object interaction model.

    ``forward`` returns four logit tensors, in this order: verb logits from the
    visual stream, one interaction logit per pair, verb logits from the graph
    stream, and verb logits from the spatial-map ("attention") stream. The
    verb heads have 29 outputs.
    """

    @staticmethod
    def _residual_branch():
        """Build the 1x1-conv stack (1024 -> 512 -> 512 -> 1024) whose output is added back to its input."""
        return nn.Sequential(
            nn.Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False),
            nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False),
            nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False),
            nn.BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
            nn.ReLU(inplace=False),
        )

    def __init__(self):

        super().__init__()
        # Backbone: the first seven children of torchvision's ResNet-152 with its default weights.
        model = models.resnet152(weights=ResNet152_Weights.DEFAULT)
        self.flatten = Flatten()
        self.Conv_pretrain = nn.Sequential(*list(model.children())[0:7])

        ##### Conv blocks for humans, objects and context #########################
        # Three independent copies of the same stack. They are created in the
        # original order so parameter names and initialisation order are unchanged.
        self.Conv_people = self._residual_branch()
        self.Conv_objects = self._residual_branch()
        self.Conv_context = self._residual_branch()

        ########## Attention Feature Model ###########
        # Consumes the 2-channel `union_box` map returned by get_pool_loc.
        # NOTE(review): the final 13x13 average pool suggests a 64x64 input map -- confirm.
        self.conv_sp_map = nn.Sequential(
            nn.Conv2d(2, 64, kernel_size=(5, 5)),
            nn.MaxPool2d(kernel_size=(2, 2)),
            nn.Conv2d(64, 32, kernel_size=(5, 5)),
            nn.MaxPool2d(kernel_size=(2, 2)),
            nn.AvgPool2d((13, 13), padding=0, stride=(1, 1))
        )

        self.spmap_up = nn.Sequential(
            nn.Linear(32, 512),
            nn.ReLU()
        )

        ###### Prediction model for attention features #######################
        self.lin_spmap_tail = nn.Sequential(
            nn.Linear(512, 29)
        )

        ###### Graph Model basic Structure ##################################
        self.peo_to_obj_w = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU()
        )

        self.obj_to_peo_w = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU()
        )

        ############### Interaction prediction model for visual feature ###########################
        self.lin_single_head = nn.Sequential(
            nn.Linear(lin_size*3+4, 1024),
            nn.Linear(1024, 512),
            nn.ReLU()
        )

        self.lin_single_tail = nn.Sequential(
            nn.Linear(512, 1)
        )

        ######################### Prediction Model for visual features ######################################
        self.lin_visual_head = nn.Sequential(
            nn.Linear(lin_size*3+4, 1024),
            nn.Linear(1024, 512),
            nn.ReLU()
        )

        self.lin_visual_tail = nn.Sequential(
            nn.Linear(512, 29)
        )

        ######################### Prediction Model for graph features #######################################
        self.lin_graph_head = nn.Sequential(
            nn.Linear(lin_size*2, 1024),
            nn.Linear(1024, 512),
            nn.ReLU()
        )

        self.lin_graph_tail = nn.Sequential(
            nn.Linear(512, 29)
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, x, pairs_info, pairs_info_augmented, image_id, flag_, phase):
        """Compute the four prediction streams for a batch of images.

        Args:
            x: image batch fed to the backbone.
            pairs_info: one entry per image; ``int(entry[0])`` is used as the
                number of people and ``int(entry[1])`` as the number of object
                slots. The first object slot of every image is handled
                separately from the rest (see the graph section below).
            pairs_info_augmented: not used.
            image_id: forwarded to ``get_pool_loc``.
            flag_: forwarded to ``get_pool_loc``.
            phase: not used.

        Returns:
            ``[lin_visual, lin_single, lin_graph, lin_att]`` -- all logits (no
            sigmoid applied).
        """
        out1 = self.Conv_pretrain(x)

        rois_people, rois_objects, spatial_locs, union_box = get_pool_loc(out1, image_id, flag_, size=pool_size, spatial_scale=25, batch_size=len(pairs_info))

        #### Pooling operations (parameter-free, so building them per call is harmless) ####
        feat_h, feat_w = out1.size()[2], out1.size()[3]
        hum_pool = nn.AvgPool2d(pool_size, padding=0, stride=(1, 1))
        obj_pool = nn.AvgPool2d(pool_size, padding=0, stride=(1, 1))
        context_pool = nn.AvgPool2d((feat_h, feat_w), padding=0, stride=(1, 1))

        # Human, object and context streams: residual conv block -> average pool -> flatten.
        out2_people = self.flatten(hum_pool(self.Conv_people(rois_people) + rois_people))
        out2_objects = self.flatten(obj_pool(self.Conv_objects(rois_objects) + rois_objects))
        out2_context = self.flatten(context_pool(self.Conv_context(out1) + out1))

        # Attention feature from the spatial map.
        out2_union = self.spmap_up(self.flatten(self.conv_sp_map(union_box)))

        ################ Making Essential Pairing ###################################
        pairs, people, objects_only = pairing(out2_people, out2_objects, out2_context, spatial_locs, pairs_info)

        ############### Interaction probability ######################################
        lin_single_h = self.lin_single_head(pairs)
        lin_single_t = lin_single_h * out2_union
        lin_single = self.lin_single_tail(lin_single_t)
        interaction_prob = self.sigmoid(lin_single)

        ####################### Graph Model Base Structure ###################################
        people_f = []
        objects_f = []
        pairs_f = []
        start_p = 0
        start_o = 0
        start_c = 0

        for l in pairs_info:
            n_people = int(l[0])
            n_slots = int(l[1])

            ########### Slicing ######################
            people_this_batch = people[start_p:start_p + n_people]
            no_peo = len(people_this_batch)
            object_slots = objects_only[start_o:start_o + n_slots]
            # Slot 0 is kept apart from the real objects and is never refined.
            no_objects_this_batch = object_slots[0]
            objects_this_batch = object_slots[1:]
            no_obj = len(objects_this_batch)
            interaction_prob_this_batch = interaction_prob[start_c:start_c + n_slots * n_people]

            if no_obj == 0:
                people_this_batch_r = people_this_batch
                objects_this_batch_r = no_objects_this_batch.view([1, 1024])

            else:
                ########## Adjacency #####################
                # Each person contributes no_obj + 1 consecutive probabilities; the
                # first of every group (the slot-0 pairing) is dropped.
                # Assumes pairs are ordered person-major -- TODO confirm against `pairing`.
                adj_l = [probs for index_prob, probs in enumerate(interaction_prob_this_batch) if index_prob % (no_obj + 1) != 0]

                adj_po = torch.cat(adj_l).view(len(adj_l), 1)
                # NOTE(review): the object->person adjacency reuses the same flattened
                # values and is reshaped to [no_obj, no_peo] with view(), not transposed.
                # Left as is because trained checkpoints depend on it -- confirm intent.
                adj_op = adj_po

                ######### Finding out refined graph features ##########
                people_this_batch_r = people_this_batch + torch.mm(adj_po.view([no_peo, no_obj]), self.peo_to_obj_w(objects_this_batch))

                objects_this_batch_r = objects_this_batch + torch.mm(adj_op.view([no_obj, no_peo]), self.obj_to_peo_w(people_this_batch))

                objects_this_batch_r = torch.cat((no_objects_this_batch.view([1, 1024]), objects_this_batch_r))

            #### Restructuring ######
            people_f.append(people_this_batch_r)
            objects_f.append(objects_this_batch_r)

            # Every refined person concatenated with every object slot of the same image.
            pairs_f.append(torch.stack([torch.cat((i, j)) for i in people_this_batch_r for j in objects_this_batch_r]))

            start_p += n_people
            start_o += n_slots
            start_c += n_people * n_slots

        pairs_graph = torch.cat(pairs_f)

        ########## Prediction from visual features #################
        lin_h = self.lin_visual_head(pairs)
        lin_t = lin_h * out2_union
        lin_visual = self.lin_visual_tail(lin_t)

        ########### Prediction from graph features ##################
        lin_graph_h = self.lin_graph_head(pairs_graph)
        lin_graph_t = lin_graph_h * out2_union
        lin_graph = self.lin_graph_tail(lin_graph_t)

        ########## Prediction from attention features ##############
        lin_att = self.lin_spmap_tail(out2_union)

        return [lin_visual, lin_single, lin_graph, lin_att]

In [None]:
######################### TRAIN TEST ###################################################

import torch
import torch.nn as nn
import time
import errno
import os
import gc
import pickle
import shutil
import json
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
import random
from tqdm import tqdm




# Module-level sigmoid; train_test applies it to the model's outputs.
sigmoid = nn.Sigmoid()


### Fixing seeds ####

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One shared seed for the torch, NumPy and Python RNGs.
seed = 10
torch.manual_seed(seed)


np.random.seed(seed)
random.seed(seed)

# NOTE(review): created without an explicit `dim`; not referenced in the code visible in this cell.
softmax = nn.Softmax()

##########################

###################  parameters for person to object class mapping #############################

# None of these four values is referenced in the code visible in this cell -- TODO confirm where they are consumed.
SCORE_TH = 0.6
SCORE_OBJ = 0.3
epoch_to_change = 400
thres_hold = -1

###############################################################################################

############# Loss function definition ########################################################

# Only loss_com_combine is called by train_test below; it is a plain BCELoss
# because train_test feeds it a product of sigmoid outputs, not logits.
loss_com = nn.BCEWithLogitsLoss(reduction='sum')
loss_com_class = nn.BCEWithLogitsLoss(reduction='none')
loss_com_combine = nn.BCELoss(reduction='none')
loss_com_single = nn.BCEWithLogitsLoss(reduction='sum')

###############################################################################################

# Width of the per-pair score rows handled by filtering().
no_of_classes = 29

##### Helper function ######

#### Fixing the seeds for all threads ###########

def _init_fn(worker_id):
    """Seed NumPy's global RNG with the module-level ``seed``.

    ``worker_id`` is accepted but not used, so every call applies the same seed.
    """
    fixed_seed = int(seed)
    np.random.seed(fixed_seed)


################################################

############# Extending number of people #####################################################

def extend(inputt, extend_number):
    """Repeat every row of ``inputt`` ``extend_number`` times, keeping copies adjacent.

    ``[[a], [b]]`` with ``extend_number=2`` becomes ``[[a], [a], [b], [b]]``.

    Args:
        inputt: array-like whose first axis enumerates rows; each row must hold
            ``np.shape(inputt)[-1]`` values.
        extend_number: number of copies of each row.

    Returns:
        A new 2-D array of shape ``(len(inputt) * extend_number, width)``; integer
        and float inputs come back as float64, as before.
    """
    width = np.shape(inputt)[-1]
    rows = np.asarray(inputt)
    # The old implementation concatenated onto a float64 seed row, which promoted
    # the result; keep that dtype contract.
    out_dtype = np.result_type(rows.dtype, np.float64)
    # One vectorised repeat instead of re-concatenating the result for every row.
    repeated = np.repeat(rows.reshape(len(rows), width), extend_number, axis=0)
    return repeated.astype(out_dtype, copy=False)


def extend_object(inputt, extend_number):
    """Stack ``extend_number`` copies of the whole row block of ``inputt``.

    ``[[a], [b]]`` with ``extend_number=2`` becomes ``[[a], [b], [a], [b]]``.

    Args:
        inputt: array-like of rows, each holding ``np.shape(inputt)[-1]`` values.
        extend_number: number of copies of the block.

    Returns:
        A new 2-D array of shape ``(rows * extend_number, width)``; integer and
        float inputs come back as float64, as before. An ``extend_number`` of 0
        or an input without rows now yields an empty ``(0, width)`` array; the
        previous list-based version raised a dimension-mismatch ValueError.
    """
    width = np.shape(inputt)[-1]
    rows = np.asarray(inputt)
    out_dtype = np.result_type(rows.dtype, np.float64)
    # reshape(-1, 0) is ambiguous for empty input, so spell the empty shape out.
    block = rows.reshape(-1, width) if rows.size else rows.reshape(0, width)
    return np.tile(block, (extend_number, 1)).astype(out_dtype, copy=False)


################################# Filtering the results for preparing the output as per V-COCO ######################

def filtering(predicted_HOI, true, persons_np, objects_np, filters, pairs_info, image_id):
    """Split the flat per-pair rows of a batch into per-image arrays.

    Image ``a`` owns ``int(pairs_info[a][0] * pairs_info[a][1])`` consecutive rows.

    Args:
        predicted_HOI: per-pair score rows (``no_of_classes`` values each).
        true: not used for the result; kept for call compatibility.
        persons_np: per-pair person boxes (4 values each).
        objects_np: per-pair object boxes (4 values each).
        filters: only its length is used, as the number of available rows.
        pairs_info: per-image (people, objects) counts.
        image_id: per-image ids, converted with ``int`` to build the keys.

    Returns:
        dict with keys ``(image_id, 'score')``, ``(image_id, 'pers_bbx')`` and
        ``(image_id, 'obj_bbx')`` mapping to float arrays of shape
        ``(rows, no_of_classes)`` / ``(rows, 4)`` / ``(rows, 4)``.
        An image with zero pairs gets empty arrays. Previously such an image
        stalled the grouping, so it and every later image were missing.
        An image whose rows are not all available is left out, as before.
    """
    increment = [int(i[0] * i[1]) for i in pairs_info]
    total_rows = len(filters)

    dict_1 = {}
    start = 0

    for a, count in enumerate(increment):
        stop = start + count
        if stop > total_rows:
            # Incomplete group: the row-by-row version never emitted it either.
            break

        key = int(image_id[a])
        # Slicing replaces the old per-row np.concatenate, which was quadratic and
        # also filled two accumulators that were never returned.
        dict_1[key, 'score'] = np.array(predicted_HOI[start:stop], dtype=float).reshape(count, no_of_classes)
        dict_1[key, 'pers_bbx'] = np.array(persons_np[start:stop], dtype=float).reshape(count, 4)
        dict_1[key, 'obj_bbx'] = np.array(objects_np[start:stop], dtype=float).reshape(count, 4)
        start = stop

    return dict_1

########################################################################################################################

########### Saving Checkpoint ##########################

def save_checkpoint(state, filename = 'checkpoint.pth.tar'):
    """Serialise ``state`` to ``filename`` with ``torch.save``."""
    torch.save(obj=state, f=filename)

######################################################

########## LIS function from https://github.com/DirtyHarryLYL/Transferable-Interactiveness-Network #####################

def LIS(x, T, k, w):
    """Scaled logistic curve ``T / (1 + exp(k - w * x))``.

    Works element-wise on arrays; train_test applies it to detection scores.
    """
    exponent = k - w * x
    denominator = 1 + np.exp(exponent)
    return T / denominator

########################################################################################################################

def train_test(model, optimizer, scheduler, dataloader, number_of_epochs, break_point, saving_epoch, folder_name, batch_size, infr, start_epoch, mean_best, visualize):
    """Run the train / val / test loop, a test-only pass, or a visualisation pass.

    Args:
        model: called as ``model(inputs, pairs_info, pairs_info, image_id, nav, phase)``
            and expected to return four logit tensors (visual, single, graph, attention).
        optimizer, scheduler: torch optimizer and LR scheduler; their state is
            stored in the checkpoints.
        dataloader: mapping from phase name to an iterable of batches; a batch is
            indexed as (inputs, labels, labels_single, image_id, pairs_info).
        number_of_epochs: number of epochs to run in normal training mode.
        break_point: not used.
        saving_epoch: period (in epochs) for writing checkpoints and test detections.
        folder_name: output directory for result.pickle, test*.pickle and checkpoints.
        batch_size: not used.
        infr: ``'t'`` runs a single test-only pass (unless ``visualize`` is set).
        start_epoch: index of the first epoch.
        mean_best: best test score seen so far; a higher one triggers a
            ``bestcheckpoint.pth.tar`` save.
        visualize: ``'f'`` to disable, otherwise ``'train'``, ``'val'`` or ``'test'``
            to run one pass over that split and call ``visual`` on every batch.

    Returns:
        None. Results are written to ``folder_name`` and printed.
    """
    ######################## Creating the folder where the result will be stored ######################################
    os.makedirs(folder_name, exist_ok=True)

    file_name = folder_name + '/' + 'result.pickle'

    loss_epoch_train = []
    loss_epoch_val = []
    loss_epoch_test = []
    initial_time = time.time()
    result = []

    ############## Freeing the cached GPU memory and declaring the phases ##########################

    torch.cuda.empty_cache()
    phases = ['train', 'val', 'test']

    if infr == 't' and visualize == 'f':  # running from a pretrained model to store the test result
        start_epoch = start_epoch - 1
        phases = ['test']
        end_of_epochs = start_epoch + 1
        print("Only doing testing for storing results from a model")

    elif visualize != 'f':
        if visualize not in phases:
            print("Error! Asked to show visualization from a unknown set. The choice should be among train, val, test")
            return

        phases = [visualize]
        end_of_epochs = start_epoch + 1
        print("only showing predictions from a model")

    else:
        end_of_epochs = start_epoch + number_of_epochs

    # Integer tag for each phase, packed next to the epoch number into `nav`.
    phase_codes = {'train': 0, 'val': 1}

    ############################ Starting the epochs #################################################################

    for epoch in range(start_epoch, end_of_epochs):

        # NOTE(review): the scheduler is stepped at the start of the epoch, before any
        # optimizer.step(). PyTorch recommends the opposite order; left unchanged
        # because moving it would shift the learning-rate schedule of existing runs.
        scheduler.step()
        print('Epoch {}/{}'.format(epoch+1,end_of_epochs))
        print('-' * 10)
        initial_time_epoch = time.time()

        for phase in phases:

            # 'val' is run in training mode too, and it also receives gradient
            # updates further down.
            if phase == 'train' or phase == 'val':
                model.train()
            else:
                model.eval()

            print('In {}'.format(phase))

            detections_test = []

            # Accumulators start with one dummy row that is sliced off ([1:]) before use.
            true_scores = np.ones([1, 29], dtype=int)
            true_scores_single = np.ones([1, 1], dtype=int)
            predicted_scores = np.ones([1, 29], dtype=float)
            predicted_scores_single = np.ones([1, 1], dtype=float)
            acc_epoch = 0

            torch.cuda.empty_cache()

            ########### Starting the iterations ###################################################################

            for step, batch_data in enumerate(tqdm(dataloader[phase])):

                if step % 20 == 0:
                    torch.cuda.empty_cache()

                inputs = batch_data[0].to(device)
                labels = batch_data[1].to(device)
                labels_single = batch_data[2].to(device)
                image_id = batch_data[3]
                pairs_info = batch_data[4]
                min_batch_size = len(pairs_info)

                optimizer.zero_grad()

                nav = torch.tensor([[phase_codes.get(phase, 2), epoch]]*min_batch_size).to(device)

                true = (labels.data).cpu().numpy()
                true_single = (labels_single.data).cpu().numpy()

                with torch.set_grad_enabled(phase=='train' or phase=='val'):

                    model_out = model(inputs, pairs_info, pairs_info, image_id, nav, phase)
                    outputs = model_out[0]         ## Visual
                    outputs_single = model_out[1]
                    outputs_combine = model_out[2] ## graph
                    outputs_gem = model_out[3]     ## attention

                    predicted_HOI = sigmoid(outputs).data.cpu().numpy()
                    predicted_HOI_combine = sigmoid(outputs_combine).data.cpu().numpy()
                    predicted_single = sigmoid(outputs_single).data.cpu().numpy()
                    predicted_gem = sigmoid(outputs_gem).data.cpu().numpy()
                    predicted_HOI_pair = predicted_HOI

                    # Per-pair arrays, each seeded with a dummy first row (dropped via [1:]).
                    persons_score_extended = np.zeros([1, 1])
                    objects_score_extended = np.zeros([1, 1])
                    class_ids_extended = np.zeros([1, 1])
                    persons_np_extended = np.zeros([1, 4])
                    objects_np_extended = np.zeros([1, 4])

                    ################### Extending persons, obj scores to multiply with all pairs ######################

                    for batch in range(len(pairs_info)):

                        this_image = int(image_id[batch])
                        n_pers = int(pairs_info[batch][0])
                        n_objs = int(pairs_info[batch][1])

                        scores_total = get_compact_detections(this_image, phase)
                        persons_score, objects_score, persons_np, objects_np, class_ids = scores_total['person_bbx_score'], scores_total['object_bbx_score'], scores_total['person_bbx'], scores_total['objects_bbx'], scores_total['class_id_objects']

                        # A score of 1 is prepended for the extra leading object slot.
                        objects_score.insert(0, float(1))

                        # Person rows are repeated once per object slot ...
                        temp_scores = extend(np.array(persons_score).reshape(len(persons_score), 1), n_objs)
                        persons_score_extended = np.concatenate([persons_score_extended, temp_scores])

                        temp_scores = extend(persons_np, n_objs)
                        persons_np_extended = np.concatenate([persons_np_extended, temp_scores])

                        # ... and the object block is tiled once per person.
                        temp_scores = extend_object(np.array(objects_score).reshape(len(objects_score), 1), n_pers)
                        objects_score_extended = np.concatenate([objects_score_extended, temp_scores])

                        temp_scores = extend_object(objects_np, n_pers)
                        objects_np_extended = np.concatenate([objects_np_extended, temp_scores])

                        temp_scores = extend_object(np.array(class_ids).reshape(len(class_ids),1), n_pers)
                        class_ids_extended = np.concatenate([class_ids_extended, temp_scores])

                    ########## Applying LIS ##################################

                    persons_score_extended = LIS(persons_score_extended, 8.3, 12, 10)
                    objects_score_extended = LIS(objects_score_extended, 8.3, 12, 10)

                    ########## Multiplying the scores from different streams along with the prior function from ican ###

                    predicted_HOI = (predicted_HOI*predicted_HOI_combine*predicted_single*predicted_gem*persons_score_extended[1:]*objects_score_extended[1:])

                    loss_mask = apply_prior(class_ids_extended[1:], predicted_HOI)
                    predicted_HOI = loss_mask * predicted_HOI

                    ######## Calculating loss ###################################################################

                    N_b = min_batch_size * 29
                    hum_obj_mask = torch.Tensor(objects_score_extended[1:]*persons_score_extended[1:]*loss_mask).to(device)

                    lossf = torch.sum(loss_com_combine(sigmoid(outputs)*sigmoid(outputs_combine)*sigmoid(outputs_single)*hum_obj_mask*sigmoid(outputs_gem), labels.float())) / N_b

                    lossc = lossf.item()

                    acc_epoch += lossc

                    if phase == 'train' or phase == 'val':
                        lossf.backward()
                        optimizer.step()

                    #################################################################################################

                    del lossf
                    del model_out
                    del inputs
                    del outputs
                    del labels

                ########################### If visualization ######################################################

                if visualize != 'f':
                    visual(image_id, phase, pairs_info, predicted_HOI, predicted_single, objects_score_extended[1:], persons_score_extended[1:], predicted_HOI_combine, predicted_HOI_pair, true)

                ########## preparing for storing results #########################################################
                predicted_scores = np.concatenate((predicted_scores, predicted_HOI), axis=0)
                true_scores = np.concatenate((true_scores, true), axis=0)
                predicted_scores_single = np.concatenate((predicted_scores_single, predicted_single), axis=0)
                true_scores_single = np.concatenate((true_scores_single, true_single), axis=0)

                ################### Storing the result in V-COCO format ########################################

                if phase == 'test':

                    if (epoch+1)%saving_epoch==0 or infr=='t':

                        all_scores = filtering(predicted_HOI, true, persons_np_extended[1:], objects_np_extended[1:],predicted_single, pairs_info, image_id)

                        infer_format(image_id, all_scores, phase, detections_test, pairs_info)

                # NOTE: `break_point` is currently not applied; every batch of the loader is processed.

            if phase == 'train':

                loss_epoch_train.append((acc_epoch))

                AP, AP_single = class_AP(predicted_scores[1:,:],true_scores[1:,:],predicted_scores_single[1:,],true_scores_single[1:,])

                AP_train = pd.DataFrame(AP,columns =['Name_TRAIN', 'Score_TRAIN'])
                AP_train_single = pd.DataFrame(AP_single,columns =['Name_TRAIN', 'Score_TRAIN'])

            elif phase == 'val':

                loss_epoch_val.append((acc_epoch))
                AP,AP_single=class_AP(predicted_scores[1:,:],true_scores[1:,:],predicted_scores_single[1:,],true_scores_single[1:,])

                AP_val = pd.DataFrame(AP,columns =['Name_VAL', 'Score_VAL'])
                AP_val_single = pd.DataFrame(AP_single,columns =['Name_VAL', 'Score_VAL'])

            elif phase == 'test':

                loss_epoch_test.append((acc_epoch))
                AP,AP_single=class_AP(predicted_scores[1:,:],true_scores[1:,:],predicted_scores_single[1:,],true_scores_single[1:,])
                AP_test = pd.DataFrame(AP,columns =['Name_TEST', 'Score_TEST'])
                AP_test_single = pd.DataFrame(AP_single,columns =['Name_TEST', 'Score_TEST'])

                if (epoch+1)%saving_epoch==0 or infr=='t':
                    file_name_p=folder_name+'/'+'test{}.pickle'.format(epoch+1)
                    with open(file_name_p, 'wb') as handle:
                        pickle.dump(detections_test, handle)

        if 'test' not in phases:
            # Visualising only train or val: no test scores exist, so there is nothing
            # to checkpoint or report (this used to crash on the unbound AP_test).
            continue

        ##################################### Saving the model ########################################################

        # Row 29 of the test table is used as the model-selection score
        # (presumably the mean over the 29 classes -- TODO confirm against class_AP).
        mean=AP_test.to_records(index=False)[29][1]

        ##### Best Model ############

        if mean>mean_best and infr!='t':

            mean_best = mean
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'mean_best': mean_best,
                'optimizer': optimizer.state_dict(),
                'scheduler':scheduler.state_dict()
            }, filename=folder_name+'/'+'bestcheckpoint.pth.tar')

        ############################

        if (epoch+1)%saving_epoch==0  and infr!='t':

            # NOTE(review): the file name is offset by 12 relative to the stored 'epoch' -- confirm intent.
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'mean_best': mean_best,
                'optimizer': optimizer.state_dict(),
                'scheduler':scheduler.state_dict()
            }, filename=folder_name+'/'+str(epoch + 1 + 12)+'checkpoint.pth.tar')

        ###########################

        if phases == ['test']:
            # Test-only pass (inference or test visualisation): only test tables exist.
            AP_final=pd.concat([AP_test],axis=1)
            AP_final_single=pd.concat([AP_test_single],axis=1)

        else:
            AP_final=pd.concat([AP_train,AP_val,AP_test],axis=1)
            AP_final_single=pd.concat([AP_train_single,AP_val_single,AP_test_single],axis=1)

        result.append(AP_final)
        with open(file_name, 'wb') as handle:
            pickle.dump(result, handle)

        time_elapsed = time.time() - initial_time_epoch
        print('APs in EPOCH:{}'.format(epoch+1))
        print(AP_final)
        print(AP_final_single)

        try:
            print('Loss_train:{},Loss_validation:{},Loss_test:{}'.format(loss_epoch_train[epoch-start_epoch],loss_epoch_val[epoch-start_epoch],loss_epoch_test[epoch-start_epoch]))

        except IndexError:
            # Train / val losses are empty in a test-only pass.
            print('Loss_test:{}'.format(loss_epoch_test[epoch-start_epoch]))

        print('This epoch completes in {:.0f}m {:.06f}s'.format(
                        time_elapsed // 60, time_elapsed % 60))

        if infr=='t':
            break

    time_elapsed = time.time() - initial_time
    print('The whole process runs for {:.0f}h {:.0f}m {:0f}s'.format(time_elapsed //3600, (time_elapsed % 3600) // 60,((time_elapsed % 3600)%60)%60))

    return

In [None]:
####################### MAIN #######################################################


from __future__ import print_function, division
import torch
import torch.nn as nn
import argparse
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
import sys
import warnings
warnings.filterwarnings("ignore")

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# One fixed seed shared by every random number generator touched below.
seed = 10
os.environ['PYTHONHASHSEED'] = str(seed)

# Python's and NumPy's global generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: default, current CUDA device, and all CUDA devices.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Request deterministic cuDNN kernels and switch off the cuDNN autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def _init_fn(worker_id):
    """Seed NumPy's global generator inside a DataLoader worker.

    Used as the ``worker_init_fn`` of the DataLoaders built in this script.

    Args:
        worker_id: Index of the worker process. It is not used.

    NOTE(review): the module-level ``seed`` is applied unchanged, so every
    worker starts from the same NumPy random state -- confirm this is the
    intended behaviour.
    """
    numpy_seed = int(seed)
    np.random.seed(numpy_seed)



# --- Optimisation settings ---------------------------------------------------
learning_rate = 0.01      # learning rate of the main ("trainables") parameter group
batch_size = 1            # batch size given to every DataLoader
number_of_epochs = 1      # forwarded to train_test
breaking_point = 10       # forwarded to train_test; meaning not visible here -- TODO confirm

# --- Checkpointing -----------------------------------------------------------
first_word = "/content/drive/MyDrive/HOI_Data/Checkpoints"  # checkpoint folder
check = 'best'            # prefix of the '<check>checkpoint.pth.tar' file that is resumed from
saving_epoch = 1          # forwarded to train_test; presumably the checkpoint interval in epochs

# --- Mode switches ('t' = on, 'f' = off) -------------------------------------
resume_model = 't'        # load model weights from the checkpoint before running
hyp = 'f'                 # also restore optimizer and scheduler state from the checkpoint
infr = 't'                # forwarded to train_test; presumably inference-only mode -- TODO confirm
visualize = 'f'           # forwarded to train_test; meaning not visible here -- TODO confirm
############################################

# Root directory holding both the annotation files and the image folders.
all_data_dir = ALL_DET

# Annotation file of each split.
annotation_train = f'{all_data_dir}Annotations_vcoco/train_annotations.json'
annotation_val = f'{all_data_dir}Annotations_vcoco/val_annotations.json'
annotation_test = f'{all_data_dir}Annotations_vcoco/test_annotations.json'

# Image folder of each split; train and val read from the same folder.
image_dir_train = f'{all_data_dir}Data_vcoco/train2014/'
image_dir_val = f'{all_data_dir}Data_vcoco/train2014/'
image_dir_test = f'{all_data_dir}Data_vcoco/val2014/'

# One dataset per split, each with its own rescale-to-400x400 + ToTensor pipeline.
vcoco_train = vcoco_Dataset(
    annotation_train,
    image_dir_train,
    transform=transforms.Compose([Rescale((400, 400)), ToTensor()]),
)
vcoco_val = vcoco_Dataset(
    annotation_val,
    image_dir_val,
    transform=transforms.Compose([Rescale((400, 400)), ToTensor()]),
)
vcoco_test = vcoco_Dataset(
    annotation_test,
    image_dir_test,
    transform=transforms.Compose([Rescale((400, 400)), ToTensor()]),
)

# Train and val loaders shuffle; the test loader keeps dataset order.
dataloader_train = DataLoader(
    dataset=vcoco_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=vcoco_collate,
    worker_init_fn=_init_fn,
)
dataloader_val = DataLoader(
    dataset=vcoco_val,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    collate_fn=vcoco_collate,
    worker_init_fn=_init_fn,
)
dataloader_test = DataLoader(
    dataset=vcoco_test,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    collate_fn=vcoco_collate,
    worker_init_fn=_init_fn,
)

# Split name -> loader mapping handed to train_test.
dataloader = {
    'train': dataloader_train,
    'val': dataloader_val,
    'test': dataloader_test,
}

# Folder where checkpoints are read from and written to.
folder_name = f'{first_word}'

### Building Model, Optimizer and Scheduler ###
res = HOI_Detector()

# Parameter buckets filled by the loop below (`single` is not filled here).
trainables = []
not_trainables = []
spmap = []
single = []

# Sort every parameter by the name of the top-level sub-module that owns it.
for name, p in res.named_parameters():
    if name.partition('.')[0] == 'Conv_pretrain':
        # Freeze these parameters; they are kept out of the optimizer.
        p.requires_grad = False
        not_trainables.append(p)
    elif name.partition('.')[0] in ('conv_sp_map', 'spmap_up'):
        spmap.append(p)
    else:
        trainables.append(p)

# Two parameter groups: the main one at `learning_rate`, and the
# conv_sp_map / spmap_up parameters at a fixed base rate of 0.001.
optim1 = optim.SGD(
    params=[
        dict(params=trainables, lr=learning_rate),
        dict(params=spmap, lr=0.001),
    ],
    weight_decay=0.0001,
    momentum=0.9,
)


def lambda1(epoch):
    """LR multiplier of the first group: 1.0 before epoch 10, 10 for epochs 10-27, then 1."""
    if epoch < 10:
        return 1.0
    if epoch < 28:
        return 10
    return 1


def lambda2(epoch):
    """LR multiplier of the second group: always 1."""
    return 1


scheduler = optim.lr_scheduler.LambdaLR(optim1, lr_lambda=[lambda1, lambda2])

res.to(device)

# Defaults used when nothing is resumed (or when resuming fails).
epoch = 0
mean_best = 0
# Stays None unless a checkpoint file was actually read, so the optimizer /
# scheduler restore below can tell "no checkpoint" from "bad checkpoint".
checkpoint = None

if resume_model == 't':
    try:
        # map_location lets a checkpoint written on a GPU be loaded on whatever
        # `device` this run uses (e.g. a CPU-only machine).
        checkpoint = torch.load(
            folder_name + '/' + check + 'checkpoint.pth.tar',
            map_location=device,
        )
        res.load_state_dict(checkpoint['state_dict'], strict=True)
        epoch = checkpoint['epoch']
        mean_best = checkpoint['mean_best']
        print(f"=> loaded checkpoint when best_prediction mAP {mean_best} and epoch {checkpoint['epoch']}")
    except Exception as err:
        # Best effort: keep running, but report why the load failed, because
        # the model otherwise continues with its freshly initialised weights.
        print('Failed to load checkpoint')
        print(f'Reason: {err!r}')

if hyp == 't':
    print('Loading previous Hyperparameters')
    if checkpoint is None:
        # No checkpoint was read above, so there is no state to restore.
        print('Failed to load previous Hyperparameters')
    else:
        try:
            optim1.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
        except Exception as err:
            print('Failed to load previous Hyperparameters')
            print(f'Reason: {err!r}')

# Start training / evaluation from the (possibly restored) epoch and best mAP.
train_test(res, optim1, scheduler, dataloader, number_of_epochs, breaking_point, saving_epoch, folder_name, batch_size, infr, epoch, mean_best, visualize)