In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [2]:
import os
import ijson
import pandas as pd
import dask
import dask.dataframe as ddf
from typing import List, Optional
# dask.config.set({"dataframe.convert-string": False})

In [3]:
from src.utils.config_loader import load_config
cfg = load_config("../config.yaml")
data_cfg = cfg['data']

In [4]:
input_dir = os.path.join('..', data_cfg['annotations_dir'])
output_dir = os.path.join('..', data_cfg['processed_dir'])

In [5]:
def load_annotations_file(file_name: str, key: str, columns: Optional[List[str]] = None, chunk_size=10_000):
    chunks = []
    with open(os.path.join(input_dir, file_name), 'rb') as data:
        # Stream items using ijson
        objects = ijson.items(data, f'{key}.item')
        current_chunk = []
        
        for obj in objects:
            current_chunk.append(obj)
            if len(current_chunk) >= chunk_size:
                chunks.append(pd.DataFrame(current_chunk, columns=columns))
                current_chunk = []
        
        if current_chunk:
            chunks.append(pd.DataFrame(current_chunk, columns=columns))
    
    if not chunks:
        return ddf.from_pandas(pd.DataFrame(columns=columns), npartitions=1)
    
    # Convert list of Pandas DataFrames to a Dask DataFrame
    dask_chunks = [ddf.from_pandas(chunk, npartitions=1) for chunk in chunks]
    return ddf.concat(dask_chunks)

In [6]:
instances_images = load_annotations_file(
                        file_name="instances_val2017.json",
                        key="images",
                        columns=["file_name", "height", "width", "id"],
                        chunk_size=1_000
                    )
instances_annots = load_annotations_file(
                        file_name="instances_val2017.json",
                        key="annotations",
                        columns=['segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id'],
                        chunk_size=10_000
                    )
instances_catego = load_annotations_file(
                        file_name="instances_val2017.json",
                        key="categories",
                        columns=['supercategory', 'id', 'name'],
                        chunk_size=1_000
                    )

In [7]:
instances_images

Unnamed: 0_level_0,file_name,height,width,id
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,string,int64,int64,int64
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


In [8]:
instances_annots

Unnamed: 0_level_0,segmentation,area,iscrowd,image_id,bbox,category_id,id
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,string,string,int64,int64,string,int64,int64
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [9]:
instances_catego

Unnamed: 0_level_0,supercategory,id,name
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,string,int64,string
79,...,...,...


In [10]:
stuff_images = load_annotations_file(
                        file_name="stuff_val2017.json",
                        key="images",
                        columns=["file_name", "height", "width", "id"],
                        chunk_size=1_000
                    )
stuff_annots = load_annotations_file(
                        file_name="stuff_val2017.json",
                        key="annotations",
                        columns=['segmentation', 'area', 'iscrowd', 'image_id', 'bbox', 'category_id', 'id'],
                        chunk_size=10_000
                    )
stuff_catego = load_annotations_file(
                        file_name="stuff_val2017.json",
                        key="categories",
                        columns=['supercategory', 'id', 'name'],
                        chunk_size=1_000
                    )

In [11]:
stuff_images

Unnamed: 0_level_0,file_name,height,width,id
npartitions=5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,string,int64,int64,int64
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


In [12]:
stuff_annots

Unnamed: 0_level_0,segmentation,area,iscrowd,image_id,bbox,category_id,id
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,string,string,int64,int64,string,int64,int64
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [13]:
stuff_catego

Unnamed: 0_level_0,supercategory,id,name
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,string,int64,string
91,...,...,...


In [14]:
images_combined = ddf.concat([instances_images, stuff_images], axis=0)
images_combined = images_combined.drop_duplicates()
images_combined

Unnamed: 0_level_0,file_name,height,width,id
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,string,int64,int64,int64
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


In [15]:
images_combined.compute()

Unnamed: 0,file_name,height,width,id
0,000000397133.jpg,427,640,397133
1,000000037777.jpg,230,352,37777
9,000000331352.jpg,500,351,331352
18,000000181666.jpg,425,640,181666
23,000000460347.jpg,640,427,460347
...,...,...,...,...
952,000000236690.jpg,399,640,236690
957,000000272212.jpg,480,640,272212
966,000000549220.jpg,640,480,549220
969,000000574297.jpg,427,640,574297


In [16]:
annots_combined = ddf.concat([instances_annots, stuff_annots], axis=0)
annots_combined = annots_combined.drop_duplicates()
annots_combined

Unnamed: 0_level_0,segmentation,area,iscrowd,image_id,bbox,category_id,id
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,string,string,int64,int64,string,int64,int64
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [17]:
annots_combined.compute()

Unnamed: 0,segmentation,area,iscrowd,image_id,bbox,category_id,id
9,"[[Decimal('115.46'), Decimal('164.53'), Decima...",2613.772699999999,0,404484,"[Decimal('86.93'), Decimal('90.76'), Decimal('...",18,7981
13,"[[Decimal('387.99'), Decimal('176.5'), Decimal...",2991.9213,0,65485,"[Decimal('387.99'), Decimal('97.43'), Decimal(...",18,10176
14,"[[Decimal('0.97'), Decimal('59.05'), Decimal('...",179462.9809,0,498286,"[Decimal('0.97'), Decimal('9.8'), Decimal('467...",18,12182
17,"[[Decimal('563.38'), Decimal('247.89'), Decima...",15711.620349999997,0,67213,"[Decimal('402.7'), Decimal('58.79'), Decimal('...",18,13714
28,"[[Decimal('109.78'), Decimal('1.08'), Decimal(...",27667.51730000001,0,522007,"[Decimal('38.74'), Decimal('0.0'), Decimal('16...",64,20979
...,...,...,...,...,...,...,...
2745,{'counts': 'TPZ51Y=3N2M21O02N2M1O2O1OO02O01O00...,1112.0,0,580418,"[Decimal('406.0'), Decimal('301.0'), Decimal('...",162,20032745
2762,{'counts': '`7\\1k1YOdN`2Q7WN[JYOdN`2Q7WN[JXOe...,82530.0,0,581100,"[Decimal('0.0'), Decimal('151.0'), Decimal('64...",124,20032762
2768,{'counts': 'e5n3Q`02O2N1N3N23NL2O2M4M2M3N3L100...,79346.0,0,581206,"[Decimal('0.0'), Decimal('88.0'), Decimal('479...",100,20032768
2782,{'counts': 'g^<\\1ha03M3M2M3N1O1O2N1O1O1O1O1O1...,68797.0,0,581357,"[Decimal('20.0'), Decimal('416.0'), Decimal('5...",144,20032782


In [18]:
catego_combined = ddf.concat([instances_catego, stuff_catego], axis=0) \
                    .drop_duplicates() \
                    .reset_index()
catego_combined

Unnamed: 0_level_0,index,supercategory,id,name
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,int64,string,int64,string
,...,...,...,...
,...,...,...,...


In [28]:
# Rename columns explicitly before merging to prevent ambiguous suffixes
images_renamed = images_combined.rename(columns={"id": "image_id_pk"})
annots_renamed = annots_combined.rename(columns={"id": "annotation_id"})

ddf_combined = ddf.merge(
    left=images_renamed, 
    right=annots_renamed, 
    how="inner", 
    left_on="image_id_pk", 
    right_on="image_id"
).merge(
    catego_combined, 
    how="inner", 
    left_on="category_id", 
    right_on="id"
)
ddf_combined

Unnamed: 0_level_0,file_name,height,width,id_x,segmentation,area,iscrowd,image_id,bbox,category_id,id_y,index,supercategory,id,name
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,string,int64,int64,int64,string,string,int64,int64,string,int64,int64,int64,string,int64,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [29]:
# Drop the original category_id and annotation_id first
ddf_combined = ddf_combined.drop(columns=["category_id", "annotation_id"])

# Rename columns to final names
ddf_combined = ddf_combined.rename(columns={
    "image_id_pk": "id", 
    "id": "old_category_id", 
    "index": "category_id"
})