In [1]:
import os
import ijson
import dask.bag as db
import pandas as pd
import dask.dataframe as ddf
from dask.diagnostics import ProgressBar

ProgressBar().register()

file_path = "../dataset/annotation/instances_val2017.json"

In [2]:
def stream_to_partitions(items, chunk_size=50_000):
    batch=[]
    for obj in items:
        batch.append(obj)
        if len(batch) >= chunk_size:
            yield pd.DataFrame(batch)
            batch.clear()
    
    if batch:
        yield pd.DataFrame(batch)



In [3]:
# --- Annotations ---
with open(file_path, "rb") as f:
    annotations_iter = ijson.items(f, "annotations.item")
    annotations_partitions = list(stream_to_partitions(annotations_iter, chunk_size=5000))

annotations_ddf = ddf.from_pandas(
    pd.concat(annotations_partitions, ignore_index=True),
    npartitions=len(annotations_partitions)
)

# --- Images ---
with open(file_path, "rb") as f:
    images_iter = ijson.items(f, "images.item")
    images_partitions = list(stream_to_partitions(images_iter, chunk_size=1000))

images_ddf = ddf.from_pandas(
    pd.concat(images_partitions, ignore_index=True),
    npartitions=len(images_partitions)
)

In [9]:
annotations_ddf.drop(columns=["area", "iscrowd", "id"])
annotations_ddf.head()

[########################################] | 100% Completed | 104.08 ms


Unnamed: 0,segmentation,area,iscrowd,image_id,bbox,category_id,id
0,"[[Decimal('510.66'), Decimal('423.01'), Decima...",702.1057499999998,0,289343,"[Decimal('473.07'), Decimal('395.93'), Decimal...",18,1768
1,"[[Decimal('289.74'), Decimal('443.39'), Decima...",27718.476299999995,0,61471,"[Decimal('272.1'), Decimal('200.23'), Decimal(...",18,1773
2,"[[Decimal('147.76'), Decimal('396.11'), Decima...",78969.31690000003,0,472375,"[Decimal('124.71'), Decimal('196.18'), Decimal...",18,2551
3,"[[Decimal('260.4'), Decimal('231.26'), Decimal...",108316.66515000002,0,520301,"[Decimal('112.71'), Decimal('154.82'), Decimal...",18,3186
4,"[[Decimal('200.61'), Decimal('253.97'), Decima...",75864.53530000002,0,579321,"[Decimal('200.61'), Decimal('89.65'), Decimal(...",18,3419


In [5]:
images_ddf.head()

[########################################] | 100% Completed | 101.84 ms


Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,id
0,4,000000397133.jpg,http://images.cocodataset.org/val2017/00000039...,427,640,2013-11-14 17:02:52,http://farm7.staticflickr.com/6116/6255196340_...,397133
1,1,000000037777.jpg,http://images.cocodataset.org/val2017/00000003...,230,352,2013-11-14 20:55:31,http://farm9.staticflickr.com/8429/7839199426_...,37777
2,4,000000252219.jpg,http://images.cocodataset.org/val2017/00000025...,428,640,2013-11-14 22:32:02,http://farm4.staticflickr.com/3446/3232237447_...,252219
3,1,000000087038.jpg,http://images.cocodataset.org/val2017/00000008...,480,640,2013-11-14 23:11:37,http://farm8.staticflickr.com/7355/8825114508_...,87038
4,6,000000174482.jpg,http://images.cocodataset.org/val2017/00000017...,388,640,2013-11-14 23:16:55,http://farm8.staticflickr.com/7020/6478877255_...,174482


In [None]:
images_ddf.drop(columns=["license", "coco_url", "id"])
images_ddf = images_ddf.rename(columns={"id": "image_id"})
images_ddf.head()

[########################################] | 100% Completed | 106.07 ms


Unnamed: 0,license,file_name,coco_url,height,width,date_captured,flickr_url,image_id
0,4,000000397133.jpg,http://images.cocodataset.org/val2017/00000039...,427,640,2013-11-14 17:02:52,http://farm7.staticflickr.com/6116/6255196340_...,397133
1,1,000000037777.jpg,http://images.cocodataset.org/val2017/00000003...,230,352,2013-11-14 20:55:31,http://farm9.staticflickr.com/8429/7839199426_...,37777
2,4,000000252219.jpg,http://images.cocodataset.org/val2017/00000025...,428,640,2013-11-14 22:32:02,http://farm4.staticflickr.com/3446/3232237447_...,252219
3,1,000000087038.jpg,http://images.cocodataset.org/val2017/00000008...,480,640,2013-11-14 23:11:37,http://farm8.staticflickr.com/7355/8825114508_...,87038
4,6,000000174482.jpg,http://images.cocodataset.org/val2017/00000017...,388,640,2013-11-14 23:16:55,http://farm8.staticflickr.com/7020/6478877255_...,174482


In [7]:
merged_ddf = annotations_ddf.merge(
    images_ddf,
    on="image_id",
    how="left"
)
merged_ddf.head()

[########################################] | 100% Completed | 106.26 ms


Unnamed: 0,segmentation,area,iscrowd,image_id,bbox,category_id,id,license,file_name,coco_url,height,width,date_captured,flickr_url
0,"[[Decimal('322.06'), Decimal('341.82'), Decima...",344.57855000000023,0,227491,"[Decimal('307.0'), Decimal('326.77'), Decimal(...",60,1080902,3,000000227491.jpg,http://images.cocodataset.org/val2017/00000022...,480,640,2013-11-19 18:13:44,http://farm1.staticflickr.com/41/102772079_df7...
1,"[[Decimal('230.48'), Decimal('150.3'), Decimal...",2262.98595,0,261116,"[Decimal('230.48'), Decimal('140.46'), Decimal...",61,1083776,1,000000261116.jpg,http://images.cocodataset.org/val2017/00000026...,375,500,2013-11-19 21:09:38,http://farm2.staticflickr.com/1226/686014029_4...
2,"[[Decimal('45.71'), Decimal('180.09'), Decimal...",1449.4411999999995,0,261116,"[Decimal('16.08'), Decimal('137.77'), Decimal(...",61,1084731,1,000000261116.jpg,http://images.cocodataset.org/val2017/00000026...,375,500,2013-11-19 21:09:38,http://farm2.staticflickr.com/1226/686014029_4...
3,"[[Decimal('494.61'), Decimal('156.55'), Decima...",4036.1712,0,261116,"[Decimal('408.22'), Decimal('111.24'), Decimal...",61,1085149,1,000000261116.jpg,http://images.cocodataset.org/val2017/00000026...,375,500,2013-11-19 21:09:38,http://farm2.staticflickr.com/1226/686014029_4...
4,"[[Decimal('29.93'), Decimal('141.43'), Decimal...",47714.037049999984,0,324715,"[Decimal('29.93'), Decimal('71.09'), Decimal('...",73,1102939,1,000000324715.jpg,http://images.cocodataset.org/val2017/00000032...,333,500,2013-11-19 22:16:56,http://farm1.staticflickr.com/84/244801562_344...


In [8]:
image_id_to_find = 397133  # example ID from your dataset

filtered_ddf = merged_ddf[merged_ddf["image_id"] == image_id_to_find]
filtered_df = filtered_ddf.persist()
filtered_ddf

[########################################] | 100% Completed | 106.46 ms


Unnamed: 0_level_0,segmentation,area,iscrowd,image_id,bbox,category_id,id,license,file_name,coco_url,height,width,date_captured,flickr_url
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
,string,string,int64,int64,string,int64,int64,int64,string,string,int64,int64,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...
