In [None]:
import sys

# Make the local pixano checkout and the BOP toolkit importable from this kernel.
sys.path.extend(
    [
        "/home/maximilien/work/pixano/",
        "/home/maximilien/work/lib/bop_toolkit",
    ]
)

# Directory under which the imported dataset is written (note: shadows the builtin `dir`).
dir = "/home/maximilien/work/pixano/pixano/data/bop"
# Path to the COCO-format ground truth of the test split.
coco = "/home/maximilien/work/adapt-2023_with_gt/test/coco_gt.json"

In [None]:
import lance
import duckdb

import bop_toolkit_lib.dataset.bop_webdataset as btk

import webdataset as wds

import json
import numpy as np
import time
from pathlib import Path
from PIL import Image as pilImage
import pyarrow as pa

from pixano.core import *
from pixano.data import data_loader
from pixano.core.arrow_types import *
from pixano.transforms import image_to_binary


In [None]:

from lance import LanceDataset
from matplotlib import category
from pixano.core.dataset import DatasetInfo


class BOPImporter(data_loader.DataLoader):
    """Import BOP WebDataset shards into a Lance dataset in Pixano format.

    Attributes:
        shard_split: Mapping from split name to the list of shard paths of that split.
        info: Dataset metadata; its ``features`` string dict defines the output schema.
        target_dir: Location the Lance dataset and ``spec.json`` are written to.
        coco_root: Root directory containing ``<split>/<scene>/scene_gt_coco.json``.
    """

    # Bounding size handed to PIL's thumbnail() when building the preview image.
    PREVIEW_SIZE = (128, 128)

    def __init__(
        self,
        shard_split: dict[str, list[str]],
        info: DatasetInfo,
        target_dir: Path,
        coco_root: Path = Path("/home/maximilien/work/adapt-2023_with_gt"),
    ):
        """Store the import configuration.

        Args:
            shard_split: Mapping from split name to the shard paths of that split.
            info: Dataset metadata (features, name, ...).
            target_dir: Destination of the Lance dataset (``str`` or ``Path``).
            coco_root: Root directory of the per-scene COCO ground-truth files.
                Defaults to the location that was previously hard-coded.
        """
        # NOTE(review): the base-class initializer is not invoked here — confirm that
        # data_loader.DataLoader does not rely on it.
        self.shard_split = shard_split
        self.info = info
        self.target_dir = target_dir
        self.coco_root = coco_root

    @property
    def features(self) -> Features:
        """Features object parsed from the string dict stored in ``self.info``."""
        return Features.from_string_dict(self.info.features)

    def create_json(self):
        """Write ``vars(self.info)`` as ``spec.json`` inside ``target_dir``."""
        # Build the path with pathlib so that both str and Path targets work
        # (``Path + str`` raises TypeError).
        spec_path = Path(self.target_dir) / "spec.json"
        with open(spec_path, "w") as f:
            json.dump(vars(self.info), f, indent=4)

    def fields(self):
        """Return the Arrow fields derived from ``self.features``."""
        return self.features.to_fields()

    def schema(self):
        """Return the Arrow schema built from ``self.fields()``."""
        return pa.schema(self.fields())

    @staticmethod
    def _load_scene_annotations(coco_json_path):
        """Index the annotations of one scene COCO file by image key.

        The key is ``'000' + ann['image_id']``, i.e. the value that get_row()
        compares against the sample key with ``_`` replaced by ``-``.
        Annotations keep their order of appearance in the file.
        """
        with open(coco_json_path, 'r') as f:
            data = json.load(f)

        annotations_by_image = {}
        for ann in data['annotations']:
            annotations_by_image.setdefault('000' + ann['image_id'], []).append(ann)
        return annotations_by_image

    def get_row(self):
        """Yield one single-row ``pa.RecordBatch`` per sample of every split.

        Each sample is decoded from the WebDataset shards, and its object ids and
        masks are looked up in the scene's ``scene_gt_coco.json`` under
        ``coco_root``. Column order follows ``self.fields()``.

        Raises:
            FileNotFoundError: If the COCO ground-truth file of a scene is missing.
        """
        # The schema does not change between rows: compute the fields once.
        fields = self.fields()

        for split, shard_list in self.shard_split.items():
            print(split)

            pipeline = wds.DataPipeline(
                wds.SimpleShardList(shard_list),
                wds.tarfile_to_samples()
            )

            # Parsed COCO annotations per scene file, so each file is read once per
            # split instead of once per image (costs memory for the scenes seen).
            scene_annotations = {}

            for row in pipeline:
                sample = btk.decode_sample(
                    row,
                    decode_camera=True,
                    decode_rgb=True,
                    decode_gray=False,
                    decode_depth=True,
                    decode_gt=True,
                    decode_gt_info=True,
                    decode_mask_visib=False,
                    decode_mask=False,
                    rgb_suffix='.png'
                )

                # Sample key has the form "<scene>_<image>".
                key = row["__key__"]
                scene, _image = key.split('_')
                coco_json_path = Path(self.coco_root) / split / scene / "scene_gt_coco.json"

                # RGB image: encode the full-resolution JPEG *before* building the
                # preview, because PIL's thumbnail() resizes the image in place.
                rgb_pil = pilImage.fromarray(sample['im_rgb'])
                rgb_bytes = image_to_binary(rgb_pil, format="JPEG")
                preview_pil = rgb_pil.copy()
                preview_pil.thumbnail(self.PREVIEW_SIZE)
                preview = image_to_binary(preview_pil)

                rgb = Image(f"media/rgb/{key}.jpeg", rgb_bytes, preview)
                rgbs = ImageType.Array.from_list([rgb])

                # Depth map
                depth = DepthImage(depth_map=sample["im_depth"], shape=sample["im_depth"].shape)
                depths = DepthImageType.Array.from_list([depth])

                # Camera parameters
                cameras = CameraType.Array.from_list([Camera.from_dict(sample['camera'])])

                # Per-object attributes
                nb_object = len(sample['gt'])

                # Category ids
                category_id = [sample['gt'][i]['object_id'] for i in range(nb_object)]
                category_id_arr = pa.array([category_id])

                # Poses (flattened rotation and translation)
                gt = [
                    Pose(
                        sample['gt'][i]['cam_R_m2c'].flatten(),
                        sample['gt'][i]['cam_t_m2c'].flatten(),
                    )
                    for i in range(nb_object)
                ]
                gt_arr = PoseType.Array.from_lists([gt])

                # Ground-truth info, with both bounding boxes wrapped as BBox
                gt_infos = [
                    GtInfo.from_dict(
                        {
                            **sample["gt_info"][i],
                            "bbox_obj": BBox.from_xywh(sample["gt_info"][i]["bbox_obj"]),
                            "bbox_visib": BBox.from_xywh(sample["gt_info"][i]["bbox_visib"]),
                        }
                    )
                    for i in range(nb_object)
                ]
                gt_infos_arr = GtInfoType.Array.from_lists([gt_infos])

                # Object ids and masks from the scene COCO file.
                # NOTE(review): assumes annotations of one image are listed in the
                # same order as the other per-object attributes — TODO confirm.
                if coco_json_path not in scene_annotations:
                    scene_annotations[coco_json_path] = self._load_scene_annotations(coco_json_path)
                image_annotations = scene_annotations[coco_json_path].get(key.replace('_', '-'), [])

                object_ids = [ann['id'] for ann in image_annotations]
                masks = [
                    CompressedRLE.from_urle(
                        ann['segmentation'],
                        ann['segmentation']['size'][0],
                        ann['segmentation']['size'][1],
                    )
                    for ann in image_annotations
                ]

                masks_arr = CompressedRLEType.Array.from_lists([masks])
                object_ids_arr = pa.array([object_ids])

                # Assemble the row; array order must match the order of `fields`.
                struct_arr = pa.StructArray.from_arrays(
                    [
                        pa.array([key]),
                        rgbs,
                        depths,
                        cameras,
                        category_id_arr,
                        object_ids_arr,
                        masks_arr,
                        gt_arr,
                        gt_infos_arr,
                        pa.array([split])
                    ],
                    fields=fields
                )

                yield pa.RecordBatch.from_struct_array(struct_arr)

    def import_dataset(self, max_rows_per_file: int = 1024*1024, max_rows_per_group: int = 1024) -> LanceDataset:
        """Import the dataset to Pixano format and write ``spec.json``.

        Args:
            max_rows_per_file (int, optional): Forwarded to ``lance.write_dataset``.
                Defaults to 1024*1024.
            max_rows_per_group (int, optional): Forwarded to ``lance.write_dataset``.
                Defaults to 1024.

        Returns:
            LanceDataset: The dataset written to ``target_dir``.
        """
        reader = pa.RecordBatchReader.from_batches(self.schema(), self.get_row())
        ds = lance.write_dataset(
            reader,
            self.target_dir,
            max_rows_per_file=max_rows_per_file,
            max_rows_per_group=max_rows_per_group,
        )

        # Record the real row count before serialising the dataset info.
        self.info.num_elements = ds.count_rows()

        # Create spec.json
        self.create_json()

        return ds


In [None]:
import os

def _list_shards(shard_dir):
    """Return the paths of the ``.tar`` files directly inside ``shard_dir``.

    The result is sorted because ``os.listdir`` returns entries in arbitrary
    order, which would make the row order of the imported dataset vary
    between runs.
    """
    return sorted(
        os.path.join(shard_dir, shard)
        for shard in os.listdir(shard_dir)
        if shard.endswith(".tar")
    )


shard_test_dir = "/home/maximilien/work/adapt-2023_with_gt/shard/shard_test/"
shard_test_list = _list_shards(shard_test_dir)

shard_validation_dir = "/home/maximilien/work/adapt-2023_with_gt/shard/shard_validation/"
shard_validation_list = _list_shards(shard_validation_dir)

# Split name -> shard paths, as expected by BOPImporter.
shard_split = {'test': shard_test_list, 'validation': shard_validation_list}

In [None]:
# Column name -> feature type string; the insertion order defines the column order
# of the rows produced by the importer.
features_dict = dict(
    id='str',
    rgb='Image',
    depth='DepthImage',
    camera='Camera',
    category_id='[int]',
    objects_id='[str]',
    masks='[CompressedRLE]',
    gt='[Pose]',
    gt_info='[GtInfo]',
    split='str',
)

# Dataset metadata handed to the importer.
bop_info = DatasetInfo(
    id="0",
    name="Bop",
    description="Bop dataset",
    features=features_dict,
)

# The Lance dataset is written to "<dir>/bop_ds.lance".
bop_importer = BOPImporter(shard_split, bop_info, f"{dir}/bop_ds.lance")


In [None]:
# Run the import with the default max_rows_per_file / max_rows_per_group values;
# the result is the Lance dataset that was written.
bop_ds = bop_importer.import_dataset()

In [None]:

# Re-open the dataset from disk (same location as the `dir + '/bop_ds.lance'`
# target given to the importer).
r = lance.dataset("/home/maximilien/work/pixano/pixano/data/bop/bop_ds.lance/")



In [None]:
# Convert only the first record batch of a scan requested with limit=10 into a
# pandas DataFrame `a`; the loop is left immediately after that first batch.
for batch in r.to_batches(limit=10):
    a = batch.to_pandas()
    break



In [24]:
a.head()
# Notes (translated from French) — presumably statistics still to be computed:
# mask area
# category distribution
# pose statistics per scene


Unnamed: 0,id,rgb,depth,camera,category_id,objects_id,masks,gt,gt_info,split
0,000005_000065,"{'uri': 'media/rgb/000005_000065.jpeg', 'bytes...",{'bytes': b'\x00\xc0hD\x00\xc0jD\x00\x00nD\x00...,"{'cam_K': [617.52197265625, 0.0, 321.682006835...","[1, 2, 3]","[6nQpr5HXD5MRQBvpaqifXi, QSwWYGs57CRCu9Eu53hZa...","[{'size': [480, 640], 'counts': b'Pff61o>1O001...","[{'cam_R_m2c': [-0.726655, -0.612474, -0.31120...",[{'bbox_obj': {'coords': [458. 0. 38. 15.]...,test
1,000003_000072,"{'uri': 'media/rgb/000003_000072.jpeg', 'bytes...",{'bytes': b'\x00@%D\x00\x00%D\x00\xc0$D\x00\xc...,"{'cam_K': [617.52197265625, 0.0, 321.682006835...",[2],[9UCjNbsyZqSkobj2MtUry5],"[{'size': [480, 640], 'counts': b'd[[7:c>6L2M3...","[{'cam_R_m2c': [0.492073, -0.790511, 0.364631,...",[{'bbox_obj': {'coords': [502. 39. 32. 32.]...,test
2,000014_000070,"{'uri': 'media/rgb/000014_000070.jpeg', 'bytes...",{'bytes': b'\x00\x00\x06D\x00\xc0\x05D\x00\xc0...,"{'cam_K': [617.52197265625, 0.0, 321.682006835...","[9, 9, 9, 9, 9, 10, 10, 11, 11, 12, 12]","[HKMfJUWUBaS5SwXLQhHTtC, diWAmvNZGFS4AowTouQ5N...","[{'size': [480, 640], 'counts': b'^]P31n>3M2N3...","[{'cam_R_m2c': [-0.439467, 0.732672, -0.51967,...",[{'bbox_obj': {'coords': [205. 327. 31. 40.]...,test
3,000006_000141,"{'uri': 'media/rgb/000006_000141.jpeg', 'bytes...",{'bytes': b'\x00\x00\x9aC\x00\x00\x9aC\x00\x80...,"{'cam_K': [617.52197265625, 0.0, 321.682006835...","[1, 2, 3]","[j3oHNFo8M4jgKX6f6hVFZF, ENV7F8EmABMRw4BBAEZLh...","[{'size': [480, 640], 'counts': b'j[[21k>8J3N3...","[{'cam_R_m2c': [0.874432, -0.237646, 0.422954,...",[{'bbox_obj': {'coords': [160. 352. 55. 35.]...,test
4,000003_000246,"{'uri': 'media/rgb/000003_000246.jpeg', 'bytes...",{'bytes': b'\x00@/D\x00\x80.D\x00\x80.D\x00\x0...,"{'cam_K': [617.52197265625, 0.0, 321.682006835...","[1, 2, 2, 3]","[FTQcCymrjHxAm8w8nGrWUV, cfVjQjU5RtUTK3K8mQJ8M...","[{'size': [480, 640], 'counts': b'jXb68g>3L3N2...","[{'cam_R_m2c': [-0.753183, -0.413673, -0.51145...",[{'bbox_obj': {'coords': [448. 276. 46. 37.]...,test


In [None]:
# Rebuild a DepthImage from the stored 'depth' value and display it.
# NOTE(review): `r` is the Lance dataset handle, not a row or a DataFrame, so
# `r['depth']` may not be valid — presumably the 'depth' value of a single row
# (e.g. taken from `a`) was intended. TODO confirm.
dim:DepthImage = DepthImage.from_dict(r['depth'])
dim.display()

In [None]:
# Fetch the first five rows through DuckDB as a pandas DataFrame. No table named
# `bop_ds` is registered in this notebook, so this presumably relies on DuckDB
# resolving the Python variable of that name — verify.
duckdb.query("SELECT * FROM bop_ds LIMIT 5").to_df()
