In [1]:
import json
import statistics
import random
import copy
import string

import pandas as pd
import numpy as np

In [2]:
#set random seed for reproducibility
random.seed(1)

In [3]:
captions_filename = "annotations/captions_train2014.json"

annotation = json.load(open(captions_filename))

images = annotation["images"]

captions = annotation["annotations"]

images[0]
    

{'license': 5,
 'file_name': 'COCO_train2014_000000057870.jpg',
 'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000057870.jpg',
 'height': 480,
 'width': 640,
 'date_captured': '2013-11-14 16:28:13',
 'flickr_url': 'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg',
 'id': 57870}

In [4]:
# Create dictionary of images with image id as key and metadata as value
images_dict = {}
for image in images:
    images_dict[image["id"]] = {"file_name": image["file_name"], "captions": []}
list(images_dict.items())[0:5]

[(57870, {'file_name': 'COCO_train2014_000000057870.jpg', 'captions': []}),
 (384029, {'file_name': 'COCO_train2014_000000384029.jpg', 'captions': []}),
 (222016, {'file_name': 'COCO_train2014_000000222016.jpg', 'captions': []}),
 (520950, {'file_name': 'COCO_train2014_000000520950.jpg', 'captions': []}),
 (69675, {'file_name': 'COCO_train2014_000000069675.jpg', 'captions': []})]

In [5]:
captions[0:5]

[{'image_id': 318556,
  'id': 48,
  'caption': 'A very clean and well decorated empty bathroom'},
 {'image_id': 116100,
  'id': 67,
  'caption': 'A panoramic view of a kitchen and all of its appliances.'},
 {'image_id': 318556,
  'id': 126,
  'caption': 'A blue and white bathroom with butterfly themed wall tiles.'},
 {'image_id': 116100,
  'id': 148,
  'caption': 'A panoramic photo of a kitchen and dining room'},
 {'image_id': 379340,
  'id': 173,
  'caption': 'A graffiti-ed stop sign across the street from a red car '}]

In [6]:
#define gender labels
male_labels = ["man", "male"]
female_labels = ["woman", "female"]

In [7]:
# Associate captions with images
for caption in captions:
    image_id = caption["image_id"]
    caption_text = caption["caption"]
    if not (caption_text in images_dict[image_id]["captions"]):
        images_dict[image_id]["captions"].append(caption_text.lower())

list(images_dict.items())[0:5]

[(57870,
  {'file_name': 'COCO_train2014_000000057870.jpg',
   'captions': ['a restaurant has modern wooden tables and chairs.',
    'a long restaurant table with rattan rounded back chairs.',
    'a long table with a plant on top of it surrounded with wooden chairs ',
    'a long table with a flower arrangement in the middle for meetings',
    'a table is adorned with wooden chairs with blue accents.']}),
 (384029,
  {'file_name': 'COCO_train2014_000000384029.jpg',
   'captions': ['a man preparing desserts in a kitchen covered in frosting.',
    'a chef is preparing and decorating many small pastries.',
    'a baker prepares various types of baked goods.',
    'a close up of a person grabbing a pastry in a container',
    'close up of a hand touching various pastries.']}),
 (222016,
  {'file_name': 'COCO_train2014_000000222016.jpg',
   'captions': ['a big red telephone booth that a man is standing in',
    'a person standing inside of a phone booth ',
    'this is an image of a man in

In [8]:
def is_whole_word_in_text(word, text):
    #replace punctuation with spaces
    for p in string.punctuation:
        text = text.replace(p, " ")
    words = text.split()
    return word in words

print(is_whole_word_in_text("man", "a woman is eating a sandwich"))
print(is_whole_word_in_text("man", "a man."))

False
True


In [9]:
len(images_dict.items())

82783

In [10]:
# Classify gender of images, removing ones that don't have a clear gender
images_dict_items = list(images_dict.items())
deleted_image_ids = []
for image_id, image_data in images_dict_items:
    male = False
    female = False
    for caption in image_data["captions"]:
        if any(is_whole_word_in_text(label, caption) for label in male_labels):
            male = True
        if any(is_whole_word_in_text(label, caption) for label in female_labels):
            female = True
    if male ^ female:
        if male:
            images_dict[image_id]["gender"] = "male"
        else:
            images_dict[image_id]["gender"] = "female"
    else:
        deleted_image_ids.append(image_id)
        del images_dict[image_id]

len(images_dict.items())

23279

In [11]:
len(deleted_image_ids)

59504

In [12]:
list(images_dict.items())[0:5]

[(384029,
  {'file_name': 'COCO_train2014_000000384029.jpg',
   'captions': ['a man preparing desserts in a kitchen covered in frosting.',
    'a chef is preparing and decorating many small pastries.',
    'a baker prepares various types of baked goods.',
    'a close up of a person grabbing a pastry in a container',
    'close up of a hand touching various pastries.'],
   'gender': 'male'}),
 (222016,
  {'file_name': 'COCO_train2014_000000222016.jpg',
   'captions': ['a big red telephone booth that a man is standing in',
    'a person standing inside of a phone booth ',
    'this is an image of a man in a phone booth.',
    'a man is standing in a red phone booth.',
    'a man using a phone in a phone booth.'],
   'gender': 'male'}),
 (69675,
  {'file_name': 'COCO_train2014_000000069675.jpg',
   'captions': ['a child and woman are cooking in the kitchen.',
    "a woman glances at a young girl's cooking on the stovetop",
    'a young girl and a woman preparing food in a kitchen.',
    

In [13]:
for image_id, image_data in images_dict.items():
    del image_data["captions"]

list(images_dict.items())[0:5]

[(384029, {'file_name': 'COCO_train2014_000000384029.jpg', 'gender': 'male'}),
 (222016, {'file_name': 'COCO_train2014_000000222016.jpg', 'gender': 'male'}),
 (69675, {'file_name': 'COCO_train2014_000000069675.jpg', 'gender': 'female'}),
 (547471, {'file_name': 'COCO_train2014_000000547471.jpg', 'gender': 'male'}),
 (90570, {'file_name': 'COCO_train2014_000000090570.jpg', 'gender': 'male'})]

# Combing targets

In [14]:
instaces_filename = "annotations/instances_train2014.json"

instance_annotations = json.load(open(instaces_filename))

cat_annotations_raw = instance_annotations["annotations"]

categories = instance_annotations["categories"]

In [15]:
categories[0:5]

[{'supercategory': 'person', 'id': 1, 'name': 'person'},
 {'supercategory': 'vehicle', 'id': 2, 'name': 'bicycle'},
 {'supercategory': 'vehicle', 'id': 3, 'name': 'car'},
 {'supercategory': 'vehicle', 'id': 4, 'name': 'motorcycle'},
 {'supercategory': 'vehicle', 'id': 5, 'name': 'airplane'}]

In [16]:
cat_annotations = []
for antn in cat_annotations_raw:
    new_antn = {"id": antn["id"], "image_id": antn["image_id"], "category_id": antn["category_id"]}
    cat_annotations.append(new_antn)

cat_annotations[0:5]

[{'id': 86, 'image_id': 480023, 'category_id': 58},
 {'id': 89, 'image_id': 50518, 'category_id': 58},
 {'id': 93, 'image_id': 142589, 'category_id': 58},
 {'id': 113, 'image_id': 209263, 'category_id': 58},
 {'id': 116, 'image_id': 15307, 'category_id': 58}]

In [17]:
# create a category dictionary
cat_dict = {}
for cat in categories:
    cat_dict[cat["id"]] = cat["name"]

# Associate categories with their names
for antn in cat_annotations:
    antn["category_name"] = cat_dict[antn["category_id"]]
    del antn["category_id"]

cat_annotations[0:5]

[{'id': 86, 'image_id': 480023, 'category_name': 'hot dog'},
 {'id': 89, 'image_id': 50518, 'category_name': 'hot dog'},
 {'id': 93, 'image_id': 142589, 'category_name': 'hot dog'},
 {'id': 113, 'image_id': 209263, 'category_name': 'hot dog'},
 {'id': 116, 'image_id': 15307, 'category_name': 'hot dog'}]

In [18]:
# Associate categories with images
for image_id, image_data in images_dict.items():
    image_data["targets"] = []

for antn in cat_annotations:
    image_id = antn["image_id"]
    category_name = antn["category_name"]
    if (image_id in images_dict):
        image_data = images_dict[image_id]
        if not (category_name in image_data["targets"]):
            image_data["targets"].append(category_name)

#remove images that do not include person targets
items = list(images_dict.items())
for image_id, image_data in items:
    if not ("person" in image_data["targets"]):
        del images_dict[image_id]

len(images_dict.items())

23010

# Making final JSON format

In [19]:
target_img_dict = {}
for category in categories:
    category_name = category["name"]
    target_img_dict[category_name] = {}
    for image_id, image_data in images_dict.items():
        if category_name in image_data["targets"]:
            target_img_dict[category_name][image_id] = image_data.copy()
            target_img_dict[category_name][image_id]["target"] = category_name
            target_img_dict[category_name][image_id]["other_targets"] = list(filter(lambda x: x != category_name, image_data["targets"]))
            del target_img_dict[category_name][image_id]["targets"]

list(target_img_dict.items())[1]

('bicycle',
 {283524: {'file_name': 'COCO_train2014_000000283524.jpg',
   'gender': 'male',
   'target': 'bicycle',
   'other_targets': ['potted plant', 'cup', 'knife', 'bowl', 'person']},
  530683: {'file_name': 'COCO_train2014_000000530683.jpg',
   'gender': 'female',
   'target': 'bicycle',
   'other_targets': ['dog',
    'car',
    'person',
    'traffic light',
    'bus',
    'handbag']},
  441488: {'file_name': 'COCO_train2014_000000441488.jpg',
   'gender': 'male',
   'target': 'bicycle',
   'other_targets': ['person', 'cell phone', 'bench']},
  64897: {'file_name': 'COCO_train2014_000000064897.jpg',
   'gender': 'male',
   'target': 'bicycle',
   'other_targets': ['person', 'skateboard', 'backpack']},
  481736: {'file_name': 'COCO_train2014_000000481736.jpg',
   'gender': 'male',
   'target': 'bicycle',
   'other_targets': ['person', 'surfboard', 'car']},
  57745: {'file_name': 'COCO_train2014_000000057745.jpg',
   'gender': 'male',
   'target': 'bicycle',
   'other_targets': [

In [20]:
# Convert dictionary to json file 
with open('data.json', 'w') as fp:
    json.dump(target_img_dict, fp, indent=4)

# Filter targets not strongly relevant

In [21]:
# gather targets that have less than 100 images in the training set

while True:
    irrelevant_targets = []
    target_img_dict_items = target_img_dict.items()
    for category in categories:
        category_name = category["name"]
        if not (category_name in target_img_dict):
            continue
        #print("category: ", category_name, " number of images: ", len(target_img_dict[category_name]))
        if len(target_img_dict[category_name]) < 100:
            irrelevant_targets.append(category_name)
    if not irrelevant_targets:
        break

    print("irrelevant targets: ", irrelevant_targets)

    # remove irrelevant targets from target_img_dict
    images_to_remove = set()
    for target in irrelevant_targets:
        images_to_remove.update(list(target_img_dict[target].keys()))
        del target_img_dict[target]
    print("images to remove: ", images_to_remove)
    print("number of images to remove: ", len(images_to_remove))
    for target, image_dict in target_img_dict.items():
        for image_id in images_to_remove:
            if image_id in image_dict:
                #print("removing image: ", image_id, " from target: ", target)
                del image_dict[image_id]

irrelevant targets:  ['bear', 'zebra', 'broccoli', 'carrot', 'toaster', 'hair drier']
images to remove:  {21504, 118785, 267266, 157186, 266756, 457734, 449032, 349709, 403982, 122382, 516624, 24081, 443410, 478227, 221717, 121366, 159768, 129563, 111646, 284703, 318497, 300578, 208931, 210471, 528938, 37931, 69675, 502827, 95278, 429614, 439346, 430643, 560691, 411189, 356916, 71224, 163385, 546361, 74820, 361547, 95308, 438349, 11856, 86611, 334423, 391258, 434270, 118369, 561763, 29799, 444010, 364139, 116334, 395888, 397938, 77426, 529524, 341621, 77432, 313465, 207481, 243327, 208000, 444033, 147073, 369283, 209027, 407173, 73864, 198793, 515210, 114316, 102550, 290967, 208022, 545944, 229530, 410779, 544408, 541343, 85666, 329380, 20136, 243373, 116911, 72370, 407734, 362166, 178872, 443579, 553660, 234685, 524476, 142016, 218305, 431817, 478410, 431820, 564729, 109774, 440527, 537293, 52433, 416978, 360147, 318677, 437464, 53464, 494811, 229084, 543454, 48863, 97504, 493792, 365

In [22]:
def read_filtered_targets(file_path):
    target_list = []
    with open(file_path, 'r') as file:
        for line in file:
            target_list.append(line.strip().replace("_", " "))  # strip() removes leading/trailing whitespaces and newlines
    return target_list

filtered_targets = read_filtered_targets("filtered_targets.txt")
len(filtered_targets)

66

In [23]:
category_names = [x["name"] for x in categories]
irrelevant_targets_MALS = list(set(category_names) - set(filtered_targets))
irrelevant_targets_MALS

['orange',
 'parking meter',
 'bird',
 'toaster',
 'apple',
 'scissors',
 'bear',
 'vase',
 'sheep',
 'carrot',
 'hair drier',
 'broccoli',
 'stop sign',
 'zebra']

In [24]:
weird_targets = [x for x in irrelevant_targets_MALS if x not in irrelevant_targets]
weird_targets

['orange',
 'parking meter',
 'bird',
 'toaster',
 'apple',
 'scissors',
 'bear',
 'vase',
 'sheep',
 'carrot',
 'hair drier',
 'broccoli',
 'stop sign',
 'zebra']

In [25]:
len(target_img_dict["vase"])

283

In [26]:
[id for id, data in target_img_dict["vase"].items()]

[467809,
 116268,
 261479,
 48711,
 177858,
 312247,
 301631,
 445965,
 490099,
 571799,
 296153,
 31536,
 387045,
 334364,
 557527,
 567268,
 276200,
 56608,
 52634,
 172506,
 252069,
 41319,
 221840,
 350044,
 55276,
 272741,
 347403,
 374239,
 258414,
 416878,
 151566,
 244597,
 171139,
 352357,
 444409,
 275982,
 24175,
 464766,
 201273,
 146972,
 578866,
 387087,
 198607,
 355638,
 342340,
 56287,
 509442,
 189750,
 496942,
 34795,
 575938,
 19994,
 105529,
 449913,
 421108,
 212311,
 446064,
 2083,
 330164,
 465123,
 456035,
 361437,
 298034,
 464810,
 355440,
 523997,
 175101,
 221663,
 233319,
 350940,
 362768,
 482728,
 304385,
 259067,
 267216,
 297724,
 97130,
 309840,
 90962,
 289075,
 252093,
 529676,
 152036,
 20421,
 503557,
 161749,
 347016,
 180351,
 571603,
 555586,
 260957,
 538537,
 158501,
 297995,
 432604,
 106123,
 398301,
 87101,
 282733,
 38718,
 551169,
 249455,
 551522,
 80911,
 158362,
 256190,
 358421,
 580026,
 72792,
 306439,
 362879,
 13106,
 185818,
 30

In [27]:
for img_id, img_data in target_img_dict["vase"].items():
    if not "person" in img_data["other_targets"]:
        print(img_id)