In [None]:
import json
import os
import pandas as pd
import shutil

from datetime import datetime
from typing import List

In [None]:
# Folder holding the recorded images and the metadata csv file. Both are assumed to
# live in this same folder; image files are searched for recursively below it.
data_folder = "../datasets/oor/data_collection/"

# Name of the metadata csv file (joined onto data_folder when it is read)
metadata_file = "edw - extraction - all.csv"

# Landing zone files will be prepared and stored here, which can be uploaded afterwards.
# NOTE(review): this folder lies inside data_folder, so images copied here by an
# earlier run are found again by the recursive image search of the next run.
output_folder = "../datasets/oor/data_collection/landingzone"

# File name ending used to select the image files below data_folder
FILETYPE = ".jpg"
# NOTE(review): `suffix` is not referenced by any code visible in this notebook -
# TODO confirm whether it is still needed or should be wired into the output names.
suffix = ""  # Optional, append a suffix to redo the ingestion

In [None]:
def fix_metadata(raw_metadata: pd.DataFrame) -> pd.DataFrame:
    """
    Validate the raw metadata and repair malformed GPS coordinates.

    Checks performed on the "object_class" column:
      * the column must be present,
      * it must not contain empty cells,
      * it may only contain the values 2, 3 or 4.

    The "gps_lat" and "gps_lon" columns, when present, are converted to float.
    Values that contain more than one dot keep only the first dot
    (e.g. "52.123.456" becomes 52.123456).

    The input frame is not modified; a corrected copy is returned, so the
    calling cell can be re-run without depending on earlier mutations.

    Args:
        raw_metadata: metadata as read from the csv file.

    Returns:
        A new DataFrame with validated classes and repaired coordinates.

    Raises:
        ValueError: if "object_class" is missing, has empty cells, or holds
            values other than 2, 3 or 4.
    """
    # Guard clauses: fail fast on a missing or incomplete object class column.
    if "object_class" not in raw_metadata.columns:
        raise ValueError("The 'object_class' column is not present.")
    if raw_metadata["object_class"].isnull().any():
        raise ValueError("The 'object_class' column contains empty cells.")

    valid_values = {2, 3, 4}
    invalid_rows = raw_metadata.loc[~raw_metadata["object_class"].isin(valid_values), "object_class"]
    if not invalid_rows.empty:
        invalid_values = invalid_rows.unique()
        raise ValueError(f"Invalid values in 'object_class' column found: {invalid_values}. Expected only 2, 3, or 4.")

    def fix_coordinate(coord):
        """Drop every dot after the first one and return the value as float."""
        whole, dot, fraction = str(coord).partition('.')
        return float(f"{whole}{dot}{fraction.replace('.', '')}")

    # Work on a copy so the caller's frame keeps the values read from the csv.
    fixed_metadata = raw_metadata.copy()
    for column in ("gps_lat", "gps_lon"):
        if column in fixed_metadata.columns:
            fixed_metadata[column] = fixed_metadata[column].apply(fix_coordinate)

    return fixed_metadata

def generate_detection_metadata(run_metadata: pd.DataFrame) -> List[dict]:
    """
    Generate detection_metadata from prepared run_metadata.

    Rows are grouped by the "pylon://0_frame_counter" column and the group key
    is used as the image file name. Every row of a group becomes one detection
    carrying that row's "object_class" together with a fixed placeholder
    bounding box, confidence and tracking id. Timestamp and GPS position of an
    image are taken from the first row of its group; the unix timestamp is
    converted with datetime.fromtimestamp (local time of the running machine).

    Each returned entry owns its own "project" dict, and each detection owns
    its own "boundingBox" dict, so editing one entry never affects another.

    Args:
        run_metadata: frame with the columns "pylon://0_frame_counter",
            "object_class", "timestamp", "gps_lat" and "gps_lon".

    Returns:
        One dict per image, in the order the groupby yields the groups.
    """
    project = {
        "model_name": "manual_collection",
        "aml_model_version": 0,
        "project_version": "0",
        "customer": "THOR"
    }
    # Arbitrary placeholder box; copied for every detection (never shared).
    default_bounding_box = {
        "x_center": 0.5,
        "y_center": 0.5,
        "width": 0.5,
        "height": 0.5,
    }

    detection_jsons = []
    for image_name, group in run_metadata.groupby(by="pylon://0_frame_counter"):
        detections = [
            {
                # native int keeps the value JSON-serialisable and integral
                "object_class": int(object_class),
                "confidence": 1.0,
                "tracking_id": -1,
                "boundingBox": dict(default_bounding_box),
            }
            for object_class in group["object_class"]
        ]

        # Image-level fields come from the first row of the group.
        first_row = group.iloc[0]
        timestamp = datetime.fromtimestamp(first_row["timestamp"]).isoformat()

        detection_jsons.append(
            {
                "image_file_name": image_name,
                "image_file_timestamp": timestamp,
                "gps_data": {
                    "latitude": first_row["gps_lat"],
                    "longitude": first_row["gps_lon"],
                    "altitude": 0.0,
                    "coordinate_time_stamp": timestamp
                },
                "project": dict(project),
                "detections": detections,
            }
        )

    return detection_jsons

def get_image_names_in_folder(folder: str, filetype: str = FILETYPE) -> List[str]:
    """
    Recursively collect the paths of all files below `folder` whose file name
    ends with `filetype` (default: the module-level FILETYPE).

    Each path is the directory the file was found in joined with the file
    name. The paths are returned as a sorted list.
    """
    matches = [
        os.path.join(current_dir, entry)
        for current_dir, _, entries in os.walk(folder, topdown=True)
        for entry in entries
        if entry.endswith(filetype)
    ]
    matches.sort()
    return matches

def get_date_from_metadata(metadata: pd.DataFrame) -> str:
    """
    Return the date of the first metadata row as a "YYYY-MM-DD" string.

    The unix value in the "timestamp" column of the first row (by position,
    independent of the index labels) is converted with datetime.fromtimestamp,
    i.e. it is interpreted in the local timezone of the running machine.
    The resulting date is also printed.

    Args:
        metadata: frame with a "timestamp" column holding unix timestamps.

    Returns:
        The formatted date, e.g. "2025-01-28".

    Raises:
        ValueError: if metadata contains no rows.
    """
    if len(metadata) == 0:
        raise ValueError("Cannot determine the collection date: metadata contains no rows.")

    # Positional access: a label lookup (.loc[0, ...]) fails on frames whose
    # index does not contain the label 0, e.g. after filtering rows.
    unix_timestamp = metadata["timestamp"].iloc[0]
    date_string = datetime.fromtimestamp(unix_timestamp).strftime("%Y-%m-%d")
    print(f"timestamp of detections: {date_string}")
    return date_string

In [None]:
# Collect the paths of all recorded images below the data folder
image_names = get_image_names_in_folder(data_folder)

# Read the metadata csv that sits in the data folder
raw_metadata = pd.read_csv(
    os.path.join(data_folder, metadata_file)
)

# Validate / repair the raw metadata, then derive one detection_metadata entry per image
run_metadata = fix_metadata(raw_metadata)
detection_metadata = generate_detection_metadata(run_metadata)

In [None]:
# Date of collection is date of folder in landingzone (e.g. 2025-01-28)
date = get_date_from_metadata(metadata=run_metadata)

detection_metadata_folder = os.path.join(output_folder, "detection_metadata", date)
images_folder = os.path.join(output_folder, "images", date)

os.makedirs(detection_metadata_folder, exist_ok=True)
os.makedirs(images_folder, exist_ok=True)

# Write one json file per image, named after the image with its extension replaced by .json
for detection in detection_metadata:
    name, _ = os.path.splitext(detection["image_file_name"])
    filename = os.path.join(detection_metadata_folder, f"{name}.json")
    with open(filename, 'w') as f:
        json.dump(detection, f, indent=4)

# Copy images (for convenience)
# The output folder may lie inside the folder that was searched for images. Images
# copied by an earlier run are then part of image_names; copying them again would
# either raise shutil.SameFileError (same date folder) or duplicate images of another
# date into this one. Skip every source that already lives below the output folder.
output_root = os.path.normcase(os.path.abspath(output_folder))
for img in image_names:
    if os.path.normcase(os.path.abspath(img)).startswith(output_root + os.sep):
        continue
    dest_file = os.path.join(images_folder, os.path.basename(img))
    shutil.copy2(src=img, dst=dest_file)