## Import modules

In [2]:
from datasets import load_dataset

## Dataset Import (from HF hub)

In [4]:
dataset = load_dataset("KabilanM/plant-label-classification", cache_dir="data/plant-sample-segmentation-dataset/")
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/452M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/379M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/15 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'objects'],
        num_rows: 15
    })
})

In [5]:
dataset["train"].features

{'image': Image(decode=True, id=None),
 'objects': Sequence(feature={'bbox': Sequence(feature=Value(dtype='float32', id=None), length=4, id=None), 'categories': ClassLabel(names=['Old Label', 'New Label'], id=None)}, length=-1, id=None)}

In [14]:
example = dataset["train"][0]
# example["image"] # image is too big to load (takes a while)

In [15]:
bboxes, categories = example["objects"]["bbox"], example["objects"]["categories"]
print(bboxes, end="\n\n")
print(categories)

[[3743.330810546875, 7655.7880859375, 2469.245361328125, 1251.5352783203125], [2390.31982421875, 8084.24169921875, 1262.8104248046875, 417.1784362792969], [3608.02978515625, 259.3271484375, 2390.31982421875, 575.0297241210938], [3596.754638671875, 868.18212890625, 2063.342041015625, 541.2044677734375]]

[1, 0, 0, 0]


## Prepare dataset for the model

In [16]:
# As per the model, we need to normalize the bounding values using the below function

def normalize_bbox(bbox, size):
    return [
        int(1000 * bbox[0] / size[0]),
        int(1000 * bbox[1] / size[1]),
        int(1000 * bbox[2] / size[0]),
        int(1000 * bbox[3] / size[1]),
    ]

In [64]:
from transformers import AutoProcessor, TrOCRProcessor, VisionEncoderDecoderModel

# we'll use the Auto API here - it will load LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub
layoutlm_processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", 
                                          apply_ocr=True, 
                                          cache_dir="data/plant-sample-segmentation-dataset/")

# trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten",
#                                                 cache_dir="data/plant-sample-segmentation-dataset/") 
# trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten",
#                                                        cache_dir="data/plant-sample-segmentation-dataset/")

In [51]:
from datasets.features import ClassLabel

features = dataset["train"].features
column_names = dataset["train"].column_names
column_names

image_column_name = "image"
boxes_column_name = "bbox"
label_column_name = "categories"

In [52]:
id2label = {"0": "Old Label", "1": "New Label"}

In [71]:
def prepare_examples(examples):
    images = examples[image_column_name]
    
#     # Use processor to convert cropped image to input format
#     inputs = trocr_processor(images=images, return_tensors="pt")
#     # Generate prediction
#     outputs = trocr_model.generate(**inputs)
#     # Decode the prediction
#     words = trocr_processor.decode(outputs[0])
#     print(words)
    
    boxes = [item[boxes_column_name] for item in examples["objects"][:]]
    label_categories = [item[label_column_name] for item in examples["objects"][:]]

    encoding = layoutlm_processor(images, 
                                  words, 
                                  boxes=boxes, 
                                  word_labels=label_categories,
                                  truncation=True, padding="max_length")

    return encoding

In [72]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

# we need to define custom features for `set_format` (used later on) to work properly
features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'categories': Sequence(feature=Value(dtype='int64')),
})

train_dataset = dataset["train"].map(
    prepare_examples,
    batched=True,
    remove_columns=column_names,
    features=features,
)

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

ImportError: 
LayoutLMv3FeatureExtractor requires the PyTesseract library but it was not found in your environment. You can install it with pip:
`pip install pytesseract`. Please note that you may need to restart your runtime after installation.
