## Import Modules

In [None]:
# ! pip install -U albumentations opencv-python
# ! pip install datasets
from datasets import load_dataset

import json

## Data Labeling

In [None]:
# Data labelling: see the Label Studio guides below (quick start and local storage setup).
# https://labelstud.io/guide/get_started.html#Quick-start
# https://labelstud.io/guide/storage.html#Local-storage

## COCO format to JSONL conversion

We perform this conversion for convenience. If COCO-format annotations are used directly instead, please note that the Hugging Face dataset builder has to be changed accordingly.

In [25]:
# Convert the COCO-style annotation file into JSONL: one JSON object per image, of the form
#   {"file_name": <image path>, "objects": {"bbox": [...], "categories": [...]}}

# Load the MS COCO JSON file. The encoding is given explicitly so that reading does not
# depend on the platform's default locale encoding.
coco_file_path = 'data/plant-sample-segmentation-dataset/metadata.json'
with open(coco_file_path, 'r', encoding='utf-8') as file:
    coco_data = json.load(file)

# Map each image ID to "data/" + the last path component of its original file name
image_info = {image['id']: "data/"+image['file_name'].split("/")[-1] for image in coco_data['images']}

# One (initially empty) objects record per image, so images without annotations
# still get a line in the output
image_objects = {image_id: {'bbox': [], 'categories': []} for image_id in image_info}

# Append every annotation's bounding box and category to its image's record.
# An annotation whose image_id is missing from coco_data['images'] raises KeyError.
# NOTE(review): bbox values are copied through unchanged — presumably COCO-style
# [x_min, y_min, width, height]; confirm against the labelling tool's export.
for annotation in coco_data['annotations']:
    objects = image_objects[annotation['image_id']]
    objects['bbox'].append(annotation['bbox'])
    objects['categories'].append(annotation['category_id'])

# Write one JSON object per line (JSONL), again with an explicit UTF-8 encoding
output_file_path = 'data/plant-sample-segmentation-dataset/metadata.jsonl'
with open(output_file_path, 'w', encoding='utf-8') as file:
    for image_id, objects in image_objects.items():
        line = {'file_name': image_info[image_id], 'objects': objects}
        file.write(json.dumps(line) + '\n')

**Sample JSONL output:**

{"file_name": "data/1318182025.jpg", "objects": {"bbox": [[3743.3308550185875, 7655.788104089219, 2469.2453531598517, 1251.5353159851297], [2390.31970260223, 8084.241635687732, 1262.8104089219332, 417.1784386617103], [3608.0297397769523, 259.3271375464684, 2390.31970260223, 575.0297397769516], [3596.7546468401483, 868.1821561338289, 2063.3420074349438, 541.2044609665426]], "categories": [1, 0, 0, 0]}}

{"file_name": "data/1318212360.jpg", "objects": {"bbox": [[1576.1598513011154, 8037.2973977695165, 2246.8661710037177, 558.9219330855032], [3890.0966542750934, 7511.9107806691445, 2347.4721189591082, 1442.0185873605942]], "categories": [0, 1]}}

## Hugging Face (HF) Dataset Creation

Please feel free to use one of the methods below to load your dataset. The HF loading script offers great flexibility.

REFERENCES: 
1. https://huggingface.co/docs/datasets/image_dataset
2. https://huggingface.co/datasets/cppe-5/blob/main/cppe-5.py

### Dataset loaded using HF loading script

In [58]:
# Load the dataset from the local dataset directory. Per the section heading this goes
# through a HF loading script — presumably one stored in that directory; confirm it exists.
# The recorded output shows a DatasetDict with a single "train" split.
dataset = load_dataset("data/plant-sample-segmentation-dataset/")
# Bare expression so the notebook displays the dataset summary
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'objects'],
        num_rows: 15
    })
})

In [59]:
# Show the annotations ("objects") of the first example in the "train" split
dataset["train"][0]["objects"]

{'bbox': [[3743.330810546875,
   7655.7880859375,
   2469.245361328125,
   1251.5352783203125],
  [2390.31982421875, 8084.24169921875, 1262.8104248046875, 417.1784362792969],
  [3608.02978515625, 259.3271484375, 2390.31982421875, 575.0297241210938],
  [3596.754638671875, 868.18212890625, 2063.342041015625, 541.2044677734375]],
 'categories': [1, 0, 0, 0]}

### Dataset loaded using HF image folder based loader

In [50]:
# ImageFolder based data loader ()
dataset = load_dataset("imagefolder", data_dir="data/plant-sample-segmentation-dataset/", split="train")
dataset

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['image', 'objects'],
    num_rows: 15
})

In [51]:
# Show the annotations ("objects") of the first example (dataset is indexed directly here)
dataset[0]["objects"]

{'bbox': [[3743.3308550185875,
   7655.788104089219,
   2469.2453531598517,
   1251.5353159851297],
  [2390.31970260223, 8084.241635687732, 1262.8104089219332, 417.1784386617103],
  [3608.0297397769523, 259.3271375464684, 2390.31970260223, 575.0297397769516],
  [3596.7546468401483,
   868.1821561338289,
   2063.3420074349438,
   541.2044609665426]],
 'categories': [1, 0, 0, 0]}

In [61]:
# This step is necessary to push the dataset we created above, to HF Datasets
# ! huggingface-cli login

In [60]:
# Upload the dataset to the Hugging Face Hub repository "KabilanM/plant-label-classification".
# Requires being logged in first (see the `huggingface-cli login` cell above).
# NOTE(review): `dataset` is whichever loader cell ran last; the execution counts are out
# of order, so confirm which variant is meant to be pushed on a fresh top-to-bottom run.
dataset.push_to_hub("KabilanM/plant-label-classification")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/596 [00:00<?, ?B/s]

### Now our dataset has been pushed to the HF hub, which we will access from our training notebook/script.