# Put together dataset
This notebook merges the painting data with the annotations. Two versions of the dataset are created:
- one includes all objects, even if they have no descriptions or bounding boxes
- the other includes only objects that have both bounding boxes and descriptions

### 0. Import libraries and data

In [None]:
import json
import polars as pl
from tqdm import tqdm

# Directory prefixes, relative to this notebook. Each one ends with "/" because
# file names are appended to it directly inside f-strings.
# Directory the annotations JSON is read from.
ANNOTATIONS_PATH = "../../data/annotations/"
# Directory the enhanced paintings JSON is read from.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/filtered_paintings/"
# Directory the merged dataset files are written to.
PROCESSED_DATA_PATH = "../../data/processed/"

In [None]:
# Load the annotations with the standard-library JSON parser. The encoding is
# given explicitly so the file is decoded as UTF-8 regardless of the platform's
# default locale encoding.
with open(f"{ANNOTATIONS_PATH}filtered_and_refined_annotations.json", encoding="utf-8") as f:
    annotations = json.load(f)

# Load the paintings through polars and convert the frame to a list of row dicts
# so the entries can be extended with an "objects" key further down.
paintings_data = pl.read_json(f"{INTERMEDIATE_DATA_PATH}filtered_paintings_enhanced_data.json").to_dicts()

### 1. Create the first version of the dataset with all objects

In [None]:
# Reshape every annotation in place: "objects" goes from {label: description} to
# {label: {"description": ..., "bounding_boxes": [...]}} and the separate
# top-level "bounding_boxes" list is folded into it.
for annotation in annotations:
    objects = annotation["objects"]

    # Only values of existing keys are replaced, so the dict keeps its size
    # and can safely be iterated while being updated.
    for label in objects:
        objects[label] = {"description": objects[label], "bounding_boxes": []}

    # Each bounding box entry starts with the label of the object it belongs
    # to; its next two items are stored as one box for that object. A label
    # that is missing from "objects" raises KeyError.
    for bbox in annotation["bounding_boxes"]:
        boxes_for_label = objects[bbox[0]]["bounding_boxes"]
        boxes_for_label.append([bbox[1], bbox[2]])

    # The boxes now live inside "objects"; drop the redundant top-level list.
    del annotation["bounding_boxes"]


In [None]:
# Index the annotated objects by painting id once, so each painting needs a
# single dictionary lookup instead of a scan over all annotations.
objects_by_painting_id = {}
for annotation in annotations:
    # setdefault: if a painting id appears more than once, the first annotation wins.
    objects_by_painting_id.setdefault(annotation["painting_id"], annotation["objects"])

for painting in tqdm(paintings_data):
    # Paintings without an annotation get an empty list under "objects".
    # A fresh list is created per painting, so the placeholders are not shared.
    painting["objects"] = objects_by_painting_id.get(painting["id"], [])

In [None]:
# Export the paintings together with every annotated object as indented JSON.
all_objects_path = f"{PROCESSED_DATA_PATH}paintings_with_all_objects.json"
with open(all_objects_path, "w") as output_file:
    json.dump(paintings_data, output_file, indent=4)

### 2. Create the second version of the dataset with only objects that have bboxes and descriptions

In [None]:
# Collect the paintings that have at least one object with both a non-empty
# description and at least one bounding box.
filtered_paintings_data = []

for painting in paintings_data:
    # Nothing to filter when the painting has no objects at all.
    if not painting["objects"]:
        continue

    complete_objects = {
        label: details
        for label, details in painting["objects"].items()
        if len(details["description"]) > 0 and len(details["bounding_boxes"]) > 0
    }

    if complete_objects:
        # NOTE: the entry is shared with paintings_data, so replacing "objects"
        # here also changes that painting inside paintings_data.
        painting["objects"] = complete_objects
        filtered_paintings_data.append(painting)

In [None]:
# Export only the paintings that kept at least one fully described object.
filtered_objects_path = f"{PROCESSED_DATA_PATH}paintings_with_filtered_objects.json"
with open(filtered_objects_path, "w") as output_file:
    json.dump(filtered_paintings_data, output_file, indent=4)

### 3. Split into train and test