In [638]:
from pycocotools.coco import COCO
import pandas as pd
import nltk
from tqdm.notebook import tqdm

In [639]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package punkt to /Users/b-o-o-p/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/b-o-o-p/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/b-o-o-p/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [640]:
dataDir='./data'
dataType='val2017'
annFile='{}/annotations/instances_{}.json'.format(dataDir, dataType)

In [641]:
coco=COCO(annFile)

loading annotations into memory...
Done (t=0.59s)
creating index...
index created!


In [642]:
# get all images containing given categories, select one at random
cats = coco.loadCats(coco.getCatIds())
catsIds=[cat['id'] for cat in cats]
cats = coco.getCatIds()
print(cats[0])

1


In [643]:
imgIds = coco.getImgIds()
print(len(imgIds))
imgs = coco.loadImgs(ids=imgIds)
print(imgs[0])

5000
{'license': 4, 'file_name': '000000397133.jpg', 'coco_url': 'http://images.cocodataset.org/val2017/000000397133.jpg', 'height': 427, 'width': 640, 'date_captured': '2013-11-14 17:02:52', 'flickr_url': 'http://farm7.staticflickr.com/6116/6255196340_da26cf2c9e_z.jpg', 'id': 397133}


In [644]:
# display COCO categories and supercategories
cats = coco.loadCats(coco.getCatIds())
nms=[cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))

nms = set([cat['supercategory'] for cat in cats])
print('COCO supercategories: \n{}'.format(' '.join(nms)))

COCO categories: 
person bicycle car motorcycle airplane bus train truck boat traffic light fire hydrant stop sign parking meter bench bird cat dog horse sheep cow elephant bear zebra giraffe backpack umbrella handbag tie suitcase frisbee skis snowboard sports ball kite baseball bat baseball glove skateboard surfboard tennis racket bottle wine glass cup fork knife spoon bowl banana apple sandwich orange broccoli carrot hot dog pizza donut cake chair couch potted plant bed dining table toilet tv laptop mouse remote keyboard cell phone microwave oven toaster sink refrigerator book clock vase scissors teddy bear hair drier toothbrush

COCO supercategories: 
furniture person vehicle outdoor sports electronic food indoor accessory kitchen appliance animal


In [645]:
annFile = '{}/annotations/captions_{}.json'.format(dataDir,dataType)
coco_caps=COCO(annFile)

loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


In [646]:
# load and display caption annotations
annIds = coco_caps.getAnnIds(imgIds=imgIds[0], iscrowd=None)
print(imgIds[0])
anns = coco_caps.loadAnns(annIds)
coco_caps.showAnns(anns)
print(anns)

397133
A man is in a kitchen making pizzas.
Man in apron standing on front of oven with pans and bakeware
A baker is working in the kitchen rolling dough.
A person standing by a stove in a kitchen.
A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.
[{'image_id': 397133, 'id': 370509, 'caption': 'A man is in a kitchen making pizzas.'}, {'image_id': 397133, 'id': 370584, 'caption': 'Man in apron standing on front of oven with pans and bakeware'}, {'image_id': 397133, 'id': 372252, 'caption': 'A baker is working in the kitchen rolling dough.'}, {'image_id': 397133, 'id': 372765, 'caption': 'A person standing by a stove in a kitchen.'}, {'image_id': 397133, 'id': 375891, 'caption': 'A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.'}]


In [647]:
# initialize COCO api for person keypoints annotations
annFile = '{}/annotations/person_keypoints_{}.json'.format(dataDir, dataType)
coco_kps=COCO(annFile)

annIds = coco_kps.getAnnIds(imgIds=imgIds[0], iscrowd=None)
anns = coco_kps.loadAnns(annIds)
print(anns)
print(int(anns[0]['category_id']))
print(cats[int(anns[0]['category_id'])]['name'])

loading annotations into memory...
Done (t=0.22s)
creating index...
index created!
[{'segmentation': [[446.71, 70.66, 466.07, 72.89, 471.28, 78.85, 473.51, 88.52, 473.51, 98.2, 462.34, 111.6, 475.74, 126.48, 484.67, 136.16, 494.35, 157.74, 496.58, 174.12, 498.07, 182.31, 485.42, 189.75, 474.25, 189.01, 470.53, 202.4, 475.74, 337.12, 469.04, 347.54, 455.65, 343.08, 450.44, 323.72, 441.5, 255.99, 433.32, 250.04, 406.52, 340.1, 397.59, 344.56, 388.66, 330.42, 408.01, 182.31, 396.85, 186.77, 392.38, 177.84, 389.4, 166.68, 390.89, 147.32, 418.43, 119.04, 434.06, 111.6, 429.6, 98.94, 428.85, 81.08, 441.5, 72.89, 443.74, 69.92]], 'num_keypoints': 13, 'area': 17376.91885, 'iscrowd': 0, 'keypoints': [433, 94, 2, 434, 90, 2, 0, 0, 0, 443, 98, 2, 0, 0, 0, 420, 128, 2, 474, 133, 2, 396, 162, 2, 489, 173, 2, 0, 0, 0, 0, 0, 0, 419, 214, 2, 458, 215, 2, 411, 274, 2, 458, 273, 2, 402, 333, 2, 465, 334, 2], 'image_id': 397133, 'bbox': [388.66, 69.92, 109.41, 277.62], 'category_id': 1, 'id': 200887}, {'

In [648]:
def getCaps(coco, imgId):
    caps = coco.loadAnns(coco.getAnnIds(imgIds=imgId))
    return list(map(lambda cap: cap['caption'], caps))

def get_url(coco, imgId):
    imgs = coco.loadImgs(ids=imgId)
    return next(img['coco_url'] for img in imgs if img['id'] == imgId)


def get_categories(coco, imgId):
    try:
        anns = coco.loadAnns(coco.getAnnIds(imgIds=imgId))
        if len(anns):
            catIds = [ann['category_id'] for ann in anns if ann['image_id'] == imgId]
            cats = coco.loadCats(coco.getCatIds(catIds=catIds))
            result = [cat['name'] for cat in cats]
            result.append('man')
            result.append('woman')
            result.append('human')
            result.append('people')
            result.append('person')
            return result
        else:
            return None
    except KeyError:
        return None

In [649]:
def process_cap(cap, cats):
    words = nltk.word_tokenize(cap.lower())
    no_punkt = [word for word in words if word.isalnum()]
    tags = nltk.pos_tag(no_punkt)
    catsL = [cat.lower() for cat in cats]
    nounsAndAdjs = [word for word,pos in tags if (pos == 'NN' or pos == 'NNS')]

    Lem = WordNetLemmatizer()
    words = [Lem.lemmatize(word) for word in nounsAndAdjs]
    result = []

    for word in words:
        isSyn = False
        for syn in nltk.corpus.wordnet.synsets(word):
            hypernyms = [h.name().split('.')[0] for h in syn.hypernyms()]
            synonyms = [l.name() for l in syn.lemmas()]
            hyponyms = [h.name().split('.')[0] for h in syn.hyponyms()]
            meronyms = [m.name().split('.')[0] for m in syn.part_meronyms()]

            if any(cat == word or
                   cat in hyponyms
                   or cat in synonyms
                   or cat in hypernyms
                   or cat in meronyms
                   for cat in catsL):
                isSyn = True

        if not isSyn:
            result.append(word)

    return result


In [650]:
def getImgsWithCaptions(coco, cocoCaps, imgs):
    result = []

    for img in tqdm(imgs):
        id = img['id']
        categories = get_categories(coco, id) or []
        url = get_url(coco, id)
        caps = getCaps(cocoCaps, id)
        if categories:
            for cap in caps:
                objects = list(set(process_cap(cap, categories)))
                result.append({
                    'id': id,
                    'cap': cap,
                    'url': url,
                    'background': objects,
                })

    return result

In [651]:
imgsWithCaptions = getImgsWithCaptions(coco, coco_caps, imgs)

df = pd.DataFrame(imgsWithCaptions, columns=['id', 'cap', 'url', 'background'])
df.head().style.hide_index()

  0%|          | 0/5000 [00:00<?, ?it/s]

id,cap,url,background
397133,A man is in a kitchen making pizzas.,http://images.cocodataset.org/val2017/000000397133.jpg,"['kitchen', 'pizza']"
397133,Man in apron standing on front of oven with pans and bakeware,http://images.cocodataset.org/val2017/000000397133.jpg,"['apron', 'front', 'standing', 'pan', 'bakeware']"
397133,A baker is working in the kitchen rolling dough.,http://images.cocodataset.org/val2017/000000397133.jpg,"['kitchen', 'baker', 'dough']"
397133,A person standing by a stove in a kitchen.,http://images.cocodataset.org/val2017/000000397133.jpg,"['kitchen', 'stove']"
397133,A table with pies being made and a person standing near a wall with pots and pans hanging on the wall.,http://images.cocodataset.org/val2017/000000397133.jpg,"['pie', 'pan', 'wall', 'table', 'pot']"


In [652]:
backgrounds = set()

for info in imgsWithCaptions:
    b = info['background']
    for background in b:
        backgrounds.add(background)

print('Total number of caps: {}'.format(len(imgsWithCaptions)))
print('Number of unique background words: {}'.format(len(backgrounds)))

Total number of caps: 24774
Number of unique background words: 3662


In [653]:
imgWithMountains = [img for img in imgsWithCaptions if 'mountain' in img['background']]

df = pd.DataFrame(imgWithMountains, columns=['id', 'cap', 'url', 'background'])
df.head().style.hide_index()

id,cap,url,background
500663,Cattle are grazing in a bright green pasture along a river and mountains.,http://images.cocodataset.org/val2017/000000500663.jpg,"['pasture', 'river', 'mountain']"
785,A young woman is skiing down the mountain slope.,http://images.cocodataset.org/val2017/000000000785.jpg,"['mountain', 'slope']"
99242,An older man is skiing down a snowy mountain.,http://images.cocodataset.org/val2017/000000099242.jpg,"['snowy', 'mountain']"
541055,A group of people on skis at a peak of a mountain.,http://images.cocodataset.org/val2017/000000541055.jpg,"['ski', 'peak', 'mountain']"
409475,A smiling pair of skiers with a huge snow covered mountain behind them.,http://images.cocodataset.org/val2017/000000409475.jpg,"['pair', 'snow', 'skier', 'mountain']"
