# Selecting and Splitting Data

In [1]:
from bs4 import BeautifulSoup
import os
import numpy as np
import json
import pickle
from sklearn.model_selection import train_test_split

In [8]:
def get_bnbox(soup):
    objs = soup.find_all('object')
    golds = {}
    for obj in objs:
        names = obj.find_all('name')
        bndboxes = obj.find_all('bndbox')
        if len(bndboxes) > 0:
            boxes = [get_coordinates(box) for box in bndboxes]
            for name in names:
                if name.string in golds:
                    golds[name.string].extend(boxes)
                else:
                    golds[name.string] = boxes
    return golds


def get_coordinates(bnbox):
    # takes the xml coordinates and returns a tuple (ymin, xmin, ymax, xmax)
    ymin = int(bnbox.find('ymin').string)
    xmin = int(bnbox.find('xmin').string)
    ymax = int(bnbox.find('ymax').string)
    xmax = int(bnbox.find('xmax').string)
    # debug
    #print(ymin, xmin, ymax, xmax)
    #print(type(ymin), type(xmin), type(ymax), type(xmax))
    return (ymin, xmin, ymax, xmax)


## helper function to turn a numpy unsigned box into a python tuple of (normal,
# signed) python integers
def as_int_tuple(np_box):
    box = [int(i) for i in np_box]
    return tuple(box)


## Function to remove objects from the dictionary that appear multiple times
#  in an image
def remove_multiple_objs(dict_obj):
    single_objs_by_file = {}
    
    for file in dict_obj.keys():
        single_objs_by_file[file] = {}
        
        for obj in dict_obj[file].keys():
            if len(dict_obj[file][obj]) == 1:
                single_objs_by_file[file][obj] = dict_obj[file][obj]
        
    return single_objs_by_file

def get_obj_names(text):
    temp = {}
    d = {}
    
    closing_bracket = 0
    while text.find("#", closing_bracket) != -1:
        hash_sign = text.find("#", closing_bracket) + 1
        closing_bracket = text.find("]", hash_sign)
        
        description = text[hash_sign:closing_bracket].lower().split("/")
        obj = description[0]
        phrase = description[-1]
        phrase = phrase.split(" ")[1:]
        
        if obj in temp:
            temp[obj].extend(phrase)
        else:            
            temp[obj] = phrase
    
    stopwords = ["a","an","the","his","her","their","one","two","three","four","five","six","seven","eight",\
                "nine","ten","blue","red","yellow","black","white","orange","green","purple","pink","gold","silver"]
    
    for obj in temp.keys():
        l = []
        c = []
        for word in temp[obj]:
            if word in stopwords:
                continue
            elif word not in l:
                l.append(word)
                c.append(1)
            else:
                c[l.index(word)] += 1
        
        if len(l) > 0:
            if c.count(max(c)) == 1:
                d[obj] = {"word":l[np.argmax(c)]}
    return d


def remove_categories(dict_obj, obj_cat):
    
    new_dict_obj = {}
    
    for file in dict_obj.keys():
        new_dict_obj[file] = {}
        used_cat = []
        
        for obj in dict_obj[file].keys():
        # Categories that occur more than once are removed:
            if obj not in obj_cat[file]:
                print("Obj not in obj_cat file: {}, obj: {}".format(file,obj))
                continue
            elif obj_cat[file][obj] in used_cat:
                continue
        # Categories "other" and "scene" are removed:
            elif obj_cat[file][obj] == "other":
                continue
            elif obj_cat[file][obj] == "scene":
                continue
            else:
                used_cat.append(obj_cat[file][obj])
                l = []
                l.append(dict_obj[file][obj])
                l.append(obj_cat[file][obj])
                new_dict_obj[file][obj] = l
        
        # Images with only 1 remaining object are removed:
        if len(new_dict_obj[file]) < 2:
            del new_dict_obj[file]

    return new_dict_obj

## Sentences

### Extract object names from `Sentences`.

In [3]:
sents = os.listdir("F30k/Flickr30kEntities/Sentences")
obj_names_by_file = {}

for sent in sents:
    file_name = sent.split(".")[0]
    
    with open(("F30k/Flickr30kEntities/Sentences/"+file_name+".txt"), "r") as f:
        contents = f.read()
        
        obj_names_by_file[file_name] = get_obj_names(contents)

obj_names_by_file 

{'3187924573': {'0': {'word': 'it'},
  '99600': {'word': 'bird'},
  '99601': {'word': 'seagulls'},
  '99602': {'word': 'beak'},
  '99604': {'word': 'water'},
  '99605': {'word': 'fish'}},
 '2856053254': {'77019': {'word': 'boy'},
  '77022': {'word': 'park'},
  '77024': {'word': 'hands'},
  '77025': {'word': 'ladder'}},
 '445242295': {'169815': {'word': 'boat'},
  '169817': {'word': 'water'},
  '169818': {'word': 'shore'},
  '169819': {'word': 'man'}},
 '298883053': {'0': {'word': 'he'},
  '85802': {'word': 'plastic'},
  '85803': {'word': 'legos'},
  '85805': {'word': 'playpen'}},
 '284644694': {'76591': {'word': 'girl'},
  '76593': {'word': 'man'},
  '76594': {'word': 'spectators'},
  '76597': {'word': 'toothbrush'},
  '76598': {'word': 'paste'},
  '76599': {'word': 'india'}},
 '2464118785': {'0': {'word': 'that'},
  '51429': {'word': 'women'},
  '51434': {'word': 'woman'},
  '51436': {'word': 'target'},
  '51437': {'word': 'glass'},
  '51439': {'word': 'game'},
  '51440': {'word': 'ca

## Annotations

### Create a Dictionary of Objects

In [14]:
annots = os.listdir("F30k/Flickr30kEntities/Annotations")

dict_of_objects = {} # {filename: {objects: [coordinates]}

for ann in annots:
    file_name = ann.split(".")[0]
    with open(("F30k/Flickr30kEntities/Annotations/"+file_name+".xml"), "r") as f:
        contents = f.read()

    soup = BeautifulSoup(contents, "xml")
    
    dict_of_objects[file_name] = get_bnbox(soup)

## Remove Multiple Objects

In [18]:
# Applying remove_multiple_objs:
dict_of_objects = remove_multiple_objs(dict_of_objects)

31783


### Putting the Dictionaries Together

In [53]:
dict_words_boxes = {}
# ALSO REMOVEs MULTIPLE IDENTICAL WORDS FROM THE SAME IMAGE

for file in obj_names_by_file.keys():
    dict_words_boxes[file] = {}
    used_words = []
    
    for obj in obj_names_by_file[file].keys():
        
        word = obj_names_by_file[file][obj]["word"]
        if word in used_words:
            continue # SKIPS OBJECTS WITH A USED WORD. NOT RANDOMLY SELECTED!
        else:
            used_words.append(word)
        
        if obj in dict_of_objects[file]:
            dict_words_boxes[file][obj] = {}         
            dict_words_boxes[file][obj]["word"] = obj_names_by_file[file][obj]["word"]
            dict_words_boxes[file][obj]["bnbox"] = dict_of_objects[file][obj]
    
    if len(dict_words_boxes[file]) < 2: # REMOVES IMAGES WITH FEWER THAN 2 OBJECTS REMAINING
        del dict_words_boxes[file]

File: 2464118785
{'51436': {'word': 'target', 'bnbox': [(29, 370, 151, 499)]}}

File: 7006590104
{'263824': {'word': 'ball', 'bnbox': [(259, 34, 301, 77)]}}

File: 16397322
{'18001': {'word': 'woman', 'bnbox': [(157, 14, 272, 93)]}}

File: 3599124739
{'130549': {'word': 'surf', 'bnbox': [(76, 1, 375, 499)]}}

File: 110600869
{'2672': {'word': 'jacket', 'bnbox': [(136, 4, 338, 319)]}}

File: 2468906010
{}

File: 3271495320
{'105961': {'word': 'waterside', 'bnbox': [(126, 1, 343, 407)]}}

File: 717673249
{'266598': {'word': 'grass', 'bnbox': [(1, 1, 332, 498)]}}

File: 4099476662
{'155392': {'word': 'man', 'bnbox': [(10, 132, 473, 473)]}}

File: 4104001356
{}

File: 2040964486
{}

File: 4891714957
{'215597': {'word': 'man', 'bnbox': [(19, 3, 353, 289)]}}

File: 3956420030
{'149504': {'word': 'bench', 'bnbox': [(357, 1, 472, 334)]}}

File: 4242041141
{'160116': {'word': 'softball', 'bnbox': [(327, 432, 363, 468)]}}

File: 799431781
{'277480': {'word': 'mouth', 'bnbox': [(130, 210, 154, 26

File: 3344692671
{'111315': {'word': 'vests', 'bnbox': [(143, 226, 241, 300)]}}

File: 188552459
{'22614': {'word': 'poster', 'bnbox': [(23, 111, 358, 343)]}}

File: 2764057744
{'71350': {'word': 'pit', 'bnbox': [(247, 89, 373, 276)]}}

File: 878758390
{}

File: 4940289938
{'221041': {'word': 'woman', 'bnbox': [(86, 263, 400, 500)]}}

File: 1467533293
{'14020': {'word': 'pink', 'bnbox': [(140, 168, 309, 225)]}}

File: 377872472
{'142296': {'word': 'dog', 'bnbox': [(105, 1, 169, 73)]}}

File: 3105691757
{}

File: 3415589320
{'116472': {'word': 'grass', 'bnbox': [(2, 3, 374, 500)]}}

File: 4015350856
{}

File: 4361532957
{}

File: 501699433
{'227064': {'word': 'child', 'bnbox': [(129, 332, 331, 499)]}}

File: 3184031654
{'99287': {'word': 'woman', 'bnbox': [(77, 242, 320, 480)]}}

File: 145291221
{'13413': {'word': 'sign', 'bnbox': [(127, 216, 175, 280)]}}

File: 4075205914
{'154337': {'word': 'others', 'bnbox': [(106, 56, 331, 499)]}}

File: 1457762320
{'13604': {'word': 'water', 'bnbox

File: 1461329041
{'13753': {'word': 'concrete', 'bnbox': [(225, 1, 371, 500)]}}

File: 3169276423
{'97942': {'word': 'woman', 'bnbox': [(52, 228, 332, 451)]}}

File: 2301379282
{'40059': {'word': 'baby', 'bnbox': [(49, 103, 392, 318)]}}

File: 4726714032
{}

File: 239807547
{}

File: 7425128736
{}

File: 908636680
{'283363': {'word': 'marshmallows', 'bnbox': [(240, 146, 253, 159)]}}

File: 3343197133
{'111151': {'word': 'skyline', 'bnbox': [(4, 2, 273, 499)]}}

File: 835606668
{'281179': {'word': 'people', 'bnbox': [(116, 2, 246, 134)]}}

File: 2774504145
{}

File: 4689716914
{}

File: 2772084628
{}

File: 4612024952
{}

File: 4629445758
{'184345': {'word': 'table', 'bnbox': [(241, 282, 334, 500)]}}

File: 97105139
{}

File: 6302275137
{'252036': {'word': 'competitor', 'bnbox': [(22, 199, 338, 487)]}}

File: 3468694409
{'120344': {'word': 'station', 'bnbox': [(7, 13, 194, 494)]}}

File: 76466808
{'272649': {'word': 'plane', 'bnbox': [(74, 20, 300, 499)]}}

File: 4671423167
{}

File: 46

{'185042': {'word': 'umbrella', 'bnbox': [(1, 238, 169, 415)]}}

File: 2705099283
{}

File: 4503659154
{'173586': {'word': 'couple', 'bnbox': [(182, 39, 500, 323)]}}

File: 4889806295
{'215254': {'word': 'man', 'bnbox': [(108, 225, 333, 349)]}}

File: 4752961136
{'198824': {'word': 'man', 'bnbox': [(44, 99, 309, 233)]}}

File: 2656914235
{}

File: 1084040636
{}

File: 4792134256
{'203181': {'word': 'table', 'bnbox': [(246, 1, 375, 499)]}}

File: 6948564341
{'262247': {'word': 'net', 'bnbox': [(43, 166, 264, 345)]}}

File: 3776839227
{}

File: 4716793596
{'194401': {'word': 'statue', 'bnbox': [(15, 220, 237, 345)]}}

File: 4571895229
{'179740': {'word': 'area', 'bnbox': [(1, 1, 232, 500)]}}

File: 2971381841
{'84777': {'word': 'island', 'bnbox': [(240, 46, 334, 383)]}}

File: 3431487300
{}

File: 22930048
{'39584': {'word': 'red', 'bnbox': [(67, 109, 230, 203)]}}

File: 2424556844
{'48444': {'word': 'blond', 'bnbox': [(127, 179, 375, 299)]}}

File: 2937365149
{}

File: 189740668
{'22868

File: 369047365
{}

File: 4701498512
{'192822': {'word': 'hats', 'bnbox': [(75, 293, 208, 481)]}}

File: 2782759274
{'72748': {'word': 'room', 'bnbox': [(1, 1, 333, 500)]}}

File: 2493825916
{'53203': {'word': 'boy', 'bnbox': [(66, 171, 271, 276)]}}

File: 2864854530
{'77613': {'word': 'women', 'bnbox': [(191, 311, 316, 387)]}}

File: 3168841415
{'97867': {'word': 'frisbee', 'bnbox': [(175, 206, 200, 265)]}}

File: 4862788297
{'212508': {'word': 'girl', 'bnbox': [(6, 5, 332, 294)]}}

File: 2087954558
{'27851': {'word': 'leader', 'bnbox': [(248, 143, 333, 180)]}}

File: 3473320907
{'120760': {'word': 'face', 'bnbox': [(287, 84, 304, 96)]}}

File: 3446191973
{}

File: 4545817922
{}

File: 3780768589
{'142358': {'word': 'building', 'bnbox': [(1, 2, 333, 500)]}}

File: 4888378070
{}

File: 277167533
{}

File: 3375070563
{'113943': {'word': 'white', 'bnbox': [(28, 122, 229, 334)]}}

File: 2980958891
{'85335': {'word': 'building', 'bnbox': [(4, 4, 404, 363)]}}

File: 242324909
{'48324': {'wo


File: 7983388093
{'277279': {'word': 'african-american', 'bnbox': [(7, 52, 328, 212)]}}

File: 4637341301
{'185294': {'word': 'dock', 'bnbox': [(138, 2, 330, 498)]}}

File: 3191120264
{}

File: 2127566743
{'30238': {'word': 'guitars', 'bnbox': [(150, 192, 275, 338)]}}

File: 5338568818
{}

File: 3928848343
{'148469': {'word': 'ride', 'bnbox': [(1, 43, 300, 500)]}}

File: 3111261053
{'94359': {'word': 'table', 'bnbox': [(289, 269, 375, 500)]}}

File: 4782588680
{'202284': {'word': 'street', 'bnbox': [(332, 3, 429, 499)]}}

File: 363056425
{}

File: 122188825
{'5499': {'word': 'pavement', 'bnbox': [(190, 231, 219, 289)]}}

File: 3069786374
{'91306': {'word': 'audience', 'bnbox': [(1, 1, 455, 361)]}}

File: 3313869283
{}

File: 268704620
{'66155': {'word': 'snow', 'bnbox': [(4, 1, 352, 500)]}}

File: 4650780114
{'186777': {'word': 'purses', 'bnbox': [(252, 363, 332, 406)]}}

File: 355869840
{}

File: 3923857105
{}

File: 6221289833
{'250463': {'word': 'boy', 'bnbox': [(7, 98, 315, 255)]}

File: 7651701676
{'272735': {'word': 'suit', 'bnbox': [(73, 301, 326, 412)]}}

File: 3273489163
{'106081': {'word': 'platform', 'bnbox': [(2, 1, 487, 332)]}}

File: 4083474327
{'154637': {'word': 'bicycle', 'bnbox': [(102, 227, 267, 370)]}}

File: 3879927955
{'146763': {'word': 'woman', 'bnbox': [(48, 87, 267, 225)]}}

File: 4444783987
{'168883': {'word': 'teacher', 'bnbox': [(25, 1, 333, 100)]}}

File: 487487795
{}

File: 4693542894
{}

File: 7036810145
{}

File: 6474991347
{'254745': {'word': 'basketball', 'bnbox': [(29, 212, 87, 276)]}}

File: 4552824261
{}

File: 3713324467
{'138857': {'word': 'man', 'bnbox': [(70, 187, 403, 297)]}}

File: 649596742
{}

File: 4089379359
{'154887': {'word': 'game', 'bnbox': [(273, 3, 411, 260)]}}

File: 8192398089
{'280266': {'word': 'man', 'bnbox': [(160, 131, 312, 201)]}}

File: 3482787182
{}

File: 4877121562
{'214033': {'word': 'bottle', 'bnbox': [(385, 76, 471, 99)]}}

File: 3093713234
{'93042': {'word': 'man', 'bnbox': [(21, 130, 116, 263)]}}


File: 2695001634
{}

File: 3389458786
{'114817': {'word': 'boy', 'bnbox': [(52, 227, 258, 398)]}}

File: 185571867
{'22035': {'word': 'sidewalk', 'bnbox': [(233, 57, 396, 341)]}}

File: 2251277216
{'36902': {'word': 'sidewalk', 'bnbox': [(175, 1, 340, 284)]}}

File: 32473949
{'104145': {'word': 'white', 'bnbox': [(33, 137, 102, 223)]}}

File: 512542833
{}

File: 533363682
{}

File: 2159440067
{}

File: 6536482681
{'255378': {'word': 'ball', 'bnbox': [(123, 100, 166, 160)]}}

File: 2249783869
{}

File: 4658311580
{'187691': {'word': 'street', 'bnbox': [(218, 1, 345, 230)]}}

File: 5696880945
{'241212': {'word': 'water', 'bnbox': [(5, 5, 323, 499)]}}

File: 32080607
{'101174': {'word': 'man', 'bnbox': [(166, 202, 261, 234)]}}

File: 2480820830
{'52600': {'word': 'waves', 'bnbox': [(57, 2, 140, 500)]}}

File: 4819349895
{}

File: 4725459622
{'195690': {'word': 'field', 'bnbox': [(224, 1, 333, 499)]}}

File: 3454315016
{}

File: 4967261262
{'224206': {'word': 'spectators', 'bnbox': [(367, 

File: 2147623140
{}

File: 4483343442
{'172125': {'word': 'sneakers', 'bnbox': [(419, 251, 463, 284)]}}

File: 3133044777
{'95743': {'word': 'man', 'bnbox': [(12, 326, 194, 469)]}}

File: 5214515583
{'231113': {'word': 'guitar', 'bnbox': [(166, 168, 255, 357)]}}

File: 2705925410
{'67537': {'word': 'man', 'bnbox': [(44, 75, 267, 192)]}}

File: 4851434789
{'210756': {'word': 'pond', 'bnbox': [(311, 1, 374, 394)]}}

File: 3074265400
{'91571': {'word': 'building', 'bnbox': [(2, 2, 369, 500)]}}

File: 508958120
{'228621': {'word': 'rock', 'bnbox': [(198, 29, 324, 181)]}}

File: 1357724865
{'9825': {'word': 'man', 'bnbox': [(299, 254, 343, 276)]}}

File: 2288988751
{'39242': {'word': 'table', 'bnbox': [(285, 1, 333, 112)]}}

File: 3580941967
{'129280': {'word': 'window', 'bnbox': [(31, 15, 249, 315)]}}

File: 6283123175
{'251641': {'word': 'volleyball', 'bnbox': [(70, 140, 444, 232)]}}

File: 3653497668
{'134384': {'word': 'cake', 'bnbox': [(251, 196, 326, 414)]}}

File: 4238141532
{'159977

{'202687': {'word': 'woman', 'bnbox': [(113, 223, 286, 315)]}}

File: 931028778
{'283648': {'word': 'symbol', 'bnbox': [(7, 77, 128, 198)]}}

File: 6677999435
{'256794': {'word': 'tree', 'bnbox': [(1, 373, 172, 433)]}}

File: 2102835360
{'28948': {'word': 'flags', 'bnbox': [(2, 402, 102, 469)]}}

File: 3864394764
{'146240': {'word': 'man', 'bnbox': [(161, 66, 224, 110)]}}

File: 3990553639
{'151088': {'word': 'track', 'bnbox': [(164, 3, 474, 369)]}}

File: 365150037
{'134230': {'word': 'wallet', 'bnbox': [(242, 190, 279, 219)]}}

File: 2981162237
{}

File: 155210731
{'16150': {'word': 'woman', 'bnbox': [(70, 114, 317, 272)]}}

File: 289639811
{'79873': {'word': 'field', 'bnbox': [(177, 1, 375, 500)]}}

File: 6195608205
{}

File: 3430779304
{'117702': {'word': 'calf', 'bnbox': [(179, 318, 291, 471)]}}

File: 141066782
{'11767': {'word': 'man', 'bnbox': [(11, 4, 373, 210)]}}

File: 2526086176
{}

File: 6887324285
{'260425': {'word': 'purple', 'bnbox': [(101, 132, 138, 174)]}}

File: 5048


File: 824123145
{}

File: 2268115375
{'37948': {'word': 'man', 'bnbox': [(147, 109, 438, 225)]}}

File: 1304961697
{}

File: 4355004220
{'164614': {'word': 'man', 'bnbox': [(163, 145, 499, 319)]}}

File: 2237153173
{'36079': {'word': 'sidewalk', 'bnbox': [(233, 2, 333, 499)]}}

File: 3332202255
{}

File: 3249624518
{'104308': {'word': 'girl', 'bnbox': [(8, 6, 333, 487)]}}

File: 3719712280
{'139336': {'word': 'frisbee', 'bnbox': [(204, 324, 234, 369)]}}

File: 506211106
{'227779': {'word': 'man', 'bnbox': [(18, 172, 313, 274)]}}

File: 10002456
{}

File: 4553348746
{}

File: 2926675936
{'81988': {'word': "'", 'bnbox': [(6, 136, 127, 345)]}}

File: 428512135
{}

File: 7417714642
{'269544': {'word': 'fence', 'bnbox': [(195, 1, 330, 500)]}}

File: 2647755755
{'63793': {'word': 'child', 'bnbox': [(188, 29, 267, 264)]}}

File: 3633169600
{'132878': {'word': 'hair', 'bnbox': [(156, 158, 217, 245)]}}

File: 2372820502
{'44814': {'word': 'building', 'bnbox': [(1, 103, 100, 456)]}}

File: 3433

File: 3111897772
{'94400': {'word': 'mcdonalds', 'bnbox': [(2, 1, 244, 234)]}}

File: 278496691
{'72840': {'word': 'injured', 'bnbox': [(64, 199, 300, 436)]}}

File: 3827175236
{'144546': {'word': 'church', 'bnbox': [(57, 108, 158, 208)]}}

File: 4434580214
{'167743': {'word': 'cane', 'bnbox': [(90, 328, 264, 358)]}}

File: 3101442804
{'93634': {'word': 'students', 'bnbox': [(125, 6, 374, 497)]}}

File: 4545767663
{'176942': {'word': 'chairs', 'bnbox': [(236, 175, 286, 287)]}}

File: 3142761553
{'96255': {'word': 'orange', 'bnbox': [(63, 128, 207, 366)]}}

File: 2536993581
{'56178': {'word': 'railway', 'bnbox': [(43, 420, 375, 500)]}}

File: 396763804
{'150046': {'word': 'sand', 'bnbox': [(128, 1, 375, 500)]}}

File: 2511019188
{'54285': {'word': 'grass', 'bnbox': [(5, 5, 375, 500)]}}

File: 4533479526
{'176111': {'word': 'table', 'bnbox': [(64, 71, 333, 498)]}}

File: 4616045208
{}

File: 2064417101
{}

File: 3495546122
{'122540': {'word': 'street', 'bnbox': [(198, 1, 299, 500)]}}

Fi

File: 14868339
{'14625': {'word': 'elevator', 'bnbox': [(3, 2, 307, 296)]}}

File: 3365602213
{'113184': {'word': 'boy', 'bnbox': [(347, 173, 500, 328)]}}

File: 6074401666
{'247541': {'word': 'lab', 'bnbox': [(2, 3, 333, 499)]}}

File: 4930631660
{'219480': {'word': 'bench', 'bnbox': [(132, 138, 310, 364)]}}

File: 3158680604
{}

File: 4752283706
{'198743': {'word': 'crowd', 'bnbox': [(120, 1, 271, 497)]}}

File: 4923964340
{}

File: 2148982
{'31223': {'word': 'kitchen', 'bnbox': [(1, 1, 439, 499)]}}

File: 5938205507
{}

File: 6271611352
{}

File: 3154528397
{'97016': {'word': 'skier', 'bnbox': [(64, 196, 366, 387)]}}

File: 3411335115
{'116179': {'word': 'buildings', 'bnbox': [(122, 136, 252, 271)]}}

File: 501471231
{}

File: 250892549
{'54208': {'word': 'blue', 'bnbox': [(244, 284, 267, 307)]}}

File: 3274691778
{}

File: 72009374
{'267033': {'word': 'beer', 'bnbox': [(262, 105, 375, 160)]}}

File: 4900530053
{'216571': {'word': 'man', 'bnbox': [(28, 3, 361, 314)]}}

File: 4679881


File: 2493974889
{'53214': {'word': 'path', 'bnbox': [(331, 48, 499, 282)]}}

File: 309493622
{}

File: 6514004309
{}

File: 2914160252
{}

File: 1017675163
{'555': {'word': 'gymnast', 'bnbox': [(14, 63, 289, 331)]}}

File: 6959709744
{}

File: 3220650628
{}

File: 5371586024
{}

File: 732671252
{'268520': {'word': 'trampoline', 'bnbox': [(162, 1, 371, 428)]}}

File: 2793597468
{'73253': {'word': 'open', 'bnbox': [(240, 76, 335, 299)]}}

File: 3412450683
{'116252': {'word': 'bird', 'bnbox': [(16, 37, 367, 365)]}}

File: 4963810508
{'223798': {'word': 'young', 'bnbox': [(15, 131, 375, 368)]}}

File: 7454547004
{'270001': {'word': 'leash', 'bnbox': [(108, 129, 140, 272)]}}

File: 137430967
{'10467': {'word': 'people', 'bnbox': [(34, 3, 332, 500)]}}

File: 378969539
{'142670': {'word': 'sign', 'bnbox': [(139, 210, 215, 335)]}}

File: 7099368287
{'265359': {'word': 'ball', 'bnbox': [(7, 122, 58, 175)]}}

File: 6086876085
{'247809': {'word': 'man', 'bnbox': [(3, 62, 469, 272)]}}

File: 347

File: 2705099765
{'67189': {'word': 'sandals', 'bnbox': [(250, 117, 280, 160)]}}

File: 4752463128
{}

File: 2838619742
{'76048': {'word': 'man', 'bnbox': [(1, 110, 333, 500)]}}

File: 4741494498
{'197630': {'word': 'cowboy', 'bnbox': [(23, 96, 248, 266)]}}

File: 27323284
{'69135': {'word': 'man', 'bnbox': [(147, 234, 387, 321)]}}

File: 5526034
{'238275': {'word': 'plane', 'bnbox': [(1, 2, 217, 290)]}}

File: 7024547367
{'264123': {'word': 'table', 'bnbox': [(86, 95, 307, 378)]}}

File: 4567985212
{}

File: 3429142249
{'117546': {'word': 'climber', 'bnbox': [(71, 122, 223, 176)]}}

File: 3330019493
{'109948': {'word': 'forest', 'bnbox': [(3, 1, 467, 331)]}}

File: 6744211811
{'257407': {'word': 'child', 'bnbox': [(10, 146, 334, 289)]}}

File: 6006191679
{'246550': {'word': 'water', 'bnbox': [(203, 1, 334, 500)]}}

File: 2689001252
{'66339': {'word': 'orange', 'bnbox': [(181, 435, 218, 491)]}}

File: 4483576244
{}

File: 2101741007
{'28794': {'word': 'surface', 'bnbox': [(195, 61, 334

{'168871': {'word': 'street', 'bnbox': [(36, 1, 402, 499)]}}

File: 6789373753
{}

File: 122190424
{'5514': {'word': 'rocks', 'bnbox': [(196, 203, 233, 303)]}}

File: 3516285214
{'124082': {'word': 'wave', 'bnbox': [(30, 1, 227, 500)]}}

File: 3393940726
{'115044': {'word': 'guitar', 'bnbox': [(156, 1, 304, 387)]}}

File: 25818051
{'59567': {'word': 'tracks', 'bnbox': [(121, 1, 321, 498)]}}

File: 1867334641
{'22250': {'word': 'crosses', 'bnbox': [(67, 157, 92, 177)]}}

File: 3273585735
{}

File: 444881000
{}

File: 6177258327
{'249236': {'word': 'river', 'bnbox': [(173, 3, 332, 500)]}}

File: 506455461
{'227881': {'word': 'apparel', 'bnbox': [(186, 212, 286, 292)]}}

File: 8117746605
{}

File: 4069578876
{'154124': {'word': ',', 'bnbox': [(42, 1, 266, 153)]}}

File: 4875847698
{'213918': {'word': 'window', 'bnbox': [(1, 3, 462, 500)]}}

File: 4783219327
{'202376': {'word': 'street', 'bnbox': [(207, 4, 371, 498)]}}

File: 4860835912
{}

File: 5925539370
{'245484': {'word': 'red', 'bnbo

File: 2845080955
{'76422': {'word': 'flag', 'bnbox': [(56, 50, 437, 210)]}}

File: 3207358897
{'101097': {'word': 'square', 'bnbox': [(183, 1, 375, 499)]}}

File: 405534993
{'153566': {'word': 'other', 'bnbox': [(153, 331, 188, 354)]}}

File: 3453369116
{'119315': {'word': 'woman', 'bnbox': [(122, 149, 498, 319)]}}

File: 3717309680
{}

File: 3759999147
{'141592': {'word': 'asphalt', 'bnbox': [(338, 1, 500, 327)]}}

File: 3724150944
{}

File: 128391889
{'7181': {'word': 'paddles', 'bnbox': [(312, 372, 373, 477)]}}

File: 52989402
{'233160': {'word': 'table', 'bnbox': [(276, 161, 375, 344)]}}

File: 4871324186
{'213436': {'word': 'woman', 'bnbox': [(136, 67, 327, 182)]}}

File: 1629549272
{}

File: 2600883097
{'60738': {'word': 'water', 'bnbox': [(18, 6, 176, 457)]}}

File: 3527407036
{}

File: 1916798494
{}

File: 507419384
{'228188': {'word': 'table', 'bnbox': [(94, 162, 222, 327)]}}

File: 2421156369
{'48106': {'word': 'cobblestone', 'bnbox': [(126, 3, 332, 500)]}}

File: 791338571
{

File: 174984835
{'19913': {'word': 'woman', 'bnbox': [(31, 48, 490, 314)]}}

File: 2521213787
{}

File: 503872649
{'227372': {'word': 'woman', 'bnbox': [(14, 256, 288, 409)]}}

File: 4688351939
{'191039': {'word': 'patient', 'bnbox': [(179, 244, 365, 375)]}}

File: 2414384480
{'47710': {'word': 'train', 'bnbox': [(42, 3, 272, 500)]}}

File: 184326428
{'21857': {'word': 'woman', 'bnbox': [(126, 175, 251, 267)]}}

File: 527440136
{}

File: 4562632645
{'178717': {'word': 'of', 'bnbox': [(165, 54, 500, 310)]}}

File: 3306212559
{'108211': {'word': 'sun', 'bnbox': [(167, 245, 188, 276)]}}

File: 8066894280
{'278497': {'word': 'hand', 'bnbox': [(154, 204, 179, 222)]}}

File: 7003919692
{}

File: 6453399365
{'254452': {'word': 'ball', 'bnbox': [(265, 192, 311, 239)]}}

File: 272940778
{'69040': {'word': 'black', 'bnbox': [(1, 1, 465, 400)]}}

File: 7190066807
{'266835': {'word': 'man', 'bnbox': [(48, 7, 320, 347)]}}

File: 4712292911
{'193954': {'word': 'bus', 'bnbox': [(4, 68, 390, 488)]}}



File: 2446315531
{}

File: 246300869
{'51341': {'word': 'sign', 'bnbox': [(87, 13, 173, 495)]}}

File: 3668244651
{}

File: 2784073733
{'72807': {'word': 'man', 'bnbox': [(198, 220, 489, 335)]}}

File: 3319405494
{'109088': {'word': 'sun', 'bnbox': [(8, 252, 75, 319)]}}

File: 4685179348
{}

File: 2902599503
{'80291': {'word': 'woman', 'bnbox': [(353, 198, 500, 282)]}}

File: 1525153022
{'15628': {'word': 'water', 'bnbox': [(239, 1, 334, 499)]}}

File: 2894309943
{'79686': {'word': 'leader', 'bnbox': [(108, 190, 372, 332)]}}

File: 2511752119
{}

File: 1489471156
{'14675': {'word': 'street', 'bnbox': [(344, 1, 444, 474)]}}

File: 3139238055
{}

File: 3712008738
{}

File: 3689975998
{'137020': {'word': 'woman', 'bnbox': [(95, 144, 446, 262)]}}

File: 2644206088
{'63478': {'word': 'basket', 'bnbox': [(229, 25, 308, 137)]}}

File: 3864680925
{'146247': {'word': 'pillar', 'bnbox': [(1, 295, 251, 333)]}}

File: 2838590130
{}

File: 4725691880
{'195797': {'word': 'wares', 'bnbox': [(309, 303

File: 2957526701
{'84035': {'word': 'bag', 'bnbox': [(296, 135, 434, 236)]}}

File: 1014785440
{'476': {'word': 'street', 'bnbox': [(157, 1, 333, 500)]}}

File: 4738781753
{'197376': {'word': 'red-hair', 'bnbox': [(84, 201, 237, 299)]}}

File: 4755599999
{}

File: 4810458132
{}

File: 241031670
{'47007': {'word': 'stage', 'bnbox': [(81, 59, 122, 448)]}}

File: 779266595
{'275281': {'word': 'laugh', 'bnbox': [(182, 73, 201, 100)]}}

File: 4763738062
{'200403': {'word': 'dock', 'bnbox': [(63, 1, 319, 500)]}}

File: 395805433
{'149585': {'word': 'eyes', 'bnbox': [(93, 177, 112, 195)]}}

File: 5626262792
{'239991': {'word': 'stage', 'bnbox': [(243, 1, 331, 385)]}}

File: 3780240908
{}

File: 6907176643
{}

File: 2756636539
{}

File: 4889242095
{'215171': {'word': 'table', 'bnbox': [(154, 210, 233, 282)]}}

File: 298920219
{'85807': {'word': 'dog', 'bnbox': [(48, 333, 92, 499)]}}

File: 1375501091
{'10599': {'word': 'bag', 'bnbox': [(110, 1, 471, 149)]}}

File: 2922668858
{'81560': {'word':

File: 2957702898
{}

File: 4818426478
{'206697': {'word': 'jeans', 'bnbox': [(271, 251, 425, 294)]}}

File: 7411341578
{}

File: 3728164558
{'140038': {'word': 'railing', 'bnbox': [(189, 421, 317, 477)]}}

File: 5487530231
{'237412': {'word': 'sky', 'bnbox': [(1, 1, 360, 332)]}}

File: 1082110795
{'1964': {'word': 'shirts', 'bnbox': [(115, 250, 234, 353)]}}

File: 2899549022
{}

File: 4808416879
{'205128': {'word': 'crosswalk', 'bnbox': [(252, 1, 333, 333)]}}

File: 4953541345
{'222771': {'word': 'boy', 'bnbox': [(152, 384, 303, 447)]}}

File: 509264638
{'228762': {'word': 'bag', 'bnbox': [(201, 203, 251, 240)]}}

File: 4201060050
{'158969': {'word': 'dumpster', 'bnbox': [(261, 20, 392, 317)]}}

File: 7376855592
{}

File: 3482668767
{}

File: 3201910861
{'100559': {'word': 'glasses', 'bnbox': [(209, 193, 241, 263)]}}

File: 4584261734
{'180427': {'word': 'bacon', 'bnbox': [(165, 338, 183, 355)]}}

File: 2573625591
{'58953': {'word': 'field', 'bnbox': [(47, 1, 375, 497)]}}

File: 378434

{'102045': {'word': 'street', 'bnbox': [(454, 1, 500, 400)]}}

File: 226697434
{}

File: 3195320645
{}

File: 871909953
{'282605': {'word': 'woman', 'bnbox': [(66, 429, 140, 463)]}}

File: 3325910784
{'109553': {'word': 'dog', 'bnbox': [(40, 138, 300, 290)]}}

File: 2573811416
{}

File: 2999638340
{'86414': {'word': 'dog', 'bnbox': [(28, 15, 322, 280)]}}

File: 89253626
{}

File: 2485788356
{'52895': {'word': 'floor', 'bnbox': [(63, 2, 500, 331)]}}

File: 381767781
{}

File: 7348146586
{'268816': {'word': 'grass', 'bnbox': [(220, 1, 332, 499)]}}

File: 7027460567
{'264271': {'word': 'pavement', 'bnbox': [(238, 4, 366, 498)]}}

File: 3038760935
{'89304': {'word': 'lawn', 'bnbox': [(178, 3, 335, 499)]}}

File: 32627994
{}

File: 3472540184
{'120698': {'word': 'grass', 'bnbox': [(3, 2, 423, 500)]}}

File: 2174648405
{}

File: 2520892855
{'54865': {'word': 'street', 'bnbox': [(273, 1, 374, 500)]}}

File: 2916920568
{'81253': {'word': 'beachfront', 'bnbox': [(174, 1, 334, 500)]}}

File: 324


File: 7185642084
{'266750': {'word': 'banner', 'bnbox': [(62, 43, 183, 400)]}}

File: 3643684044
{'133729': {'word': 'table', 'bnbox': [(237, 35, 331, 494)]}}

File: 2542415282
{}

File: 4805567702
{'204711': {'word': 'rv', 'bnbox': [(2, 5, 247, 323)]}}

File: 3231749680
{'102907': {'word': 'floor', 'bnbox': [(87, 1, 374, 500)]}}

File: 2354064281
{}

File: 4491304968
{'172842': {'word': 'hands', 'bnbox': [(261, 214, 295, 248)]}}

File: 241347114
{}

File: 4445485475
{'168992': {'word': 'building', 'bnbox': [(44, 34, 241, 455)]}}

File: 134653574
{'9351': {'word': 'women', 'bnbox': [(191, 19, 493, 235)]}}

File: 2209317
{'34270': {'word': 'black', 'bnbox': [(217, 388, 333, 474)]}}

File: 3551717658
{}

File: 3609061538
{'131304': {'word': 'road', 'bnbox': [(179, 1, 284, 164)]}}

File: 4718146904
{}

File: 5536018957
{'238438': {'word': 'roof', 'bnbox': [(115, 3, 333, 500)]}}

File: 3855516061
{'145787': {'word': 'white', 'bnbox': [(87, 324, 173, 433)]}}

File: 363701716
{'133190': {'w

File: 5846714531
{'243988': {'word': 'person', 'bnbox': [(214, 431, 259, 458)]}}

File: 152809468
{'15686': {'word': 'bird', 'bnbox': [(117, 224, 226, 331)]}}

File: 450138714
{'173463': {'word': 'table', 'bnbox': [(238, 80, 375, 500)]}}

File: 229059021
{'39407': {'word': 'bench', 'bnbox': [(89, 88, 270, 434)]}}

File: 4603713542
{'181837': {'word': 'scarf', 'bnbox': [(38, 247, 273, 347)]}}

File: 3670681503
{'135701': {'word': 'mother', 'bnbox': [(109, 418, 331, 499)]}}

File: 4773293976
{'201379': {'word': 'black', 'bnbox': [(27, 334, 142, 449)]}}

File: 4935686055
{'220432': {'word': 'woman', 'bnbox': [(53, 442, 298, 500)]}}

File: 3719461451
{}

File: 4718134968
{'194822': {'word': 'street', 'bnbox': [(241, 2, 499, 498)]}}

File: 2537000363
{}

File: 2536991887
{}

File: 2831394884
{'75396': {'word': 'man', 'bnbox': [(1, 1, 324, 398)]}}

File: 2091171488
{'28174': {'word': 'woman', 'bnbox': [(17, 11, 499, 411)]}}

File: 4683565867
{'190116': {'word': 'blue', 'bnbox': [(235, 1, 332

File: 3722006169
{'139567': {'word': 'car', 'bnbox': [(107, 96, 251, 375)]}}

File: 4631909374
{'184673': {'word': 'man', 'bnbox': [(271, 77, 387, 156)]}}

File: 2340111592
{'42704': {'word': 'grass', 'bnbox': [(56, 5, 374, 498)]}}

File: 535309053
{'234382': {'word': 'woman', 'bnbox': [(186, 179, 248, 209)]}}

File: 4930823945
{'219614': {'word': 'other', 'bnbox': [(54, 310, 330, 427)]}}

File: 2260369648
{'37440': {'word': 'woman', 'bnbox': [(2, 299, 440, 500)]}}

File: 4711637472
{}

File: 3173461705
{'98244': {'word': 'fence', 'bnbox': [(3, 1, 240, 500)]}}

File: 2596900481
{'60444': {'word': 'basket', 'bnbox': [(195, 51, 290, 121)]}}

File: 4793108729
{}

File: 3385593926
{'114602': {'word': 'fence', 'bnbox': [(71, 1, 158, 499)]}}

File: 3534592364
{'125633': {'word': 'children', 'bnbox': [(122, 2, 375, 500)]}}

File: 141364977
{'11916': {'word': 'blond', 'bnbox': [(60, 113, 138, 179)]}}

File: 4437028723
{}

File: 32628083
{}

File: 2285664
{'39003': {'word': 'sidewalk', 'bnbox':

File: 43244430
{}

File: 3357209547
{}

File: 6295775965
{}

File: 4891975016
{'215617': {'word': 'street', 'bnbox': [(116, 3, 423, 379)]}}

File: 3374776293
{'113924': {'word': 'grass', 'bnbox': [(222, 1, 375, 500)]}}

File: 286797291
{'77815': {'word': 'sidewalk', 'bnbox': [(211, 1, 375, 500)]}}

File: 7396941510
{'269342': {'word': 'orange', 'bnbox': [(219, 102, 338, 281)]}}

File: 396154083
{'149778': {'word': 'woman', 'bnbox': [(51, 352, 115, 462)]}}

File: 2468921460
{'51721': {'word': 'net', 'bnbox': [(80, 237, 199, 497)]}}

File: 4810830329
{}

File: 6993511259
{'263453': {'word': 'man', 'bnbox': [(36, 176, 415, 276)]}}

File: 5381043783
{'235000': {'word': 'street', 'bnbox': [(258, 82, 374, 500)]}}

File: 3300679815
{}

File: 7669392800
{'273022': {'word': 'road', 'bnbox': [(287, 1, 500, 500)]}}

File: 6829605875
{}

File: 98377566
{'284907': {'word': 'cage', 'bnbox': [(1, 27, 500, 459)]}}

File: 2536978777
{}

File: 5973146939
{'246295': {'word': 'wicketkeeper', 'bnbox': [(13

File: 5458061796
{}

File: 968081289
{'284487': {'word': 'water', 'bnbox': [(89, 1, 499, 433)]}}

File: 2291958125
{'39518': {'word': 'calf', 'bnbox': [(220, 169, 435, 346)]}}

File: 3065560742
{'91101': {'word': 'ice', 'bnbox': [(210, 3, 385, 500)]}}

File: 6872034011
{'259990': {'word': 'snow', 'bnbox': [(104, 1, 333, 500)]}}

File: 4545487394
{'176926': {'word': 'bicycle', 'bnbox': [(181, 190, 374, 322)]}}

File: 2685518281
{}

File: 3677927146
{'136268': {'word': 'grass', 'bnbox': [(100, 1, 333, 500)]}}

File: 2427094466
{}

File: 4902710339
{}

File: 3159569570
{}

File: 3251646144
{'104483': {'word': 'large', 'bnbox': [(1, 1, 422, 500)]}}

File: 4746185365
{}

File: 2766926202
{'71564': {'word': 'water', 'bnbox': [(165, 5, 333, 499)]}}

File: 6184114561
{'249422': {'word': 'field', 'bnbox': [(157, 4, 373, 499)]}}

File: 2916012955
{'81167': {'word': 'gate', 'bnbox': [(87, 292, 159, 392)]}}

File: 3498240367
{'122813': {'word': 'picture', 'bnbox': [(30, 85, 500, 333)]}}

File: 437

## Saving the Dictionary with Words and Boxes

The bounding boxes were included for an earlier form of the model. In the eventual model, the bounding boxes were no longer necessary.

In [None]:
json_file = json.dumps(dict_words_boxes)
f = open("dict_words_boxes.json","w")
f.write(json_file)
f.close()

## Splitting Data

In [59]:
files = list(cleaned_dict.keys())

train_data, test_data = train_test_split(files, test_size= 1000)
train_data, validation_data = train_test_split(train_data, test_size= 1000)

'3187924573'

In [62]:
print(len(train_data))
print(train_data[:5])

21545
['3352791995', '2905250733', '870186901', '5188555891', '3201845131']


In [63]:
print(len(validation_data))
print(validation_data[:5])

1000
['4461621138', '4721533619', '4053707091', '7666764962', '4756096577']


In [64]:
print(len(test_data))
print(test_data[:5])

1000
['3483140026', '4467946265', '6156276992', '338857661', '175556963']


### Saving Splits

In [65]:
with open("train_data.txt", "wb") as fp:
    pickle.dump(train_data, fp)

with open("validation_data.txt", "wb") as fp:
    pickle.dump(validation_data, fp)

with open("test_data.txt", "wb") as fp:
    pickle.dump(test_data, fp)

In [68]:
print(train_data2[300])
cleaned_dict[train_data2[300]]

4728183104


{'196117': {'bnbox': [[20, 154, 374, 277]], 'word': 'man'},
 '196118': {'bnbox': [[110, 143, 146, 169]], 'word': 'beer'},
 '196119': {'bnbox': [[160, 101, 192, 121]], 'word': 'glass'},
 '196122': {'bnbox': [[150, 77, 194, 161]], 'word': 'food'},
 '196123': {'bnbox': [[97, 135, 166, 180]], 'word': 'tap'},
 '196124': {'bnbox': [[167, 83, 405, 209]], 'word': 'cart'}}