In [1]:
# Data Harmonizer - Pydantic Models and Configuration Loading
# This notebook demonstrates how to load and work with data harmonization configurations

from pydantic import BaseModel
from typing import List
import yaml


In [2]:

class FieldMapping(BaseModel):
    """Pairs one class name of an origin dataset with the target class name it maps to."""
    # Class name on the origin side; map_classes looks it up in the origin class list.
    source_field: str
    # Class name on the target side; map_classes looks it up in the target class list.
    target_field: str


class DataMapping(BaseModel):
    """Describes one origin dataset together with the class mappings to apply to it."""
    # Directory of the origin dataset; "/data.yaml" and the "/test", "/valid",
    # "/train" sub-folders are read from beneath it.
    origin: str
    # One entry per origin class that should be carried over into the target dataset.
    fields_map: List[FieldMapping]


In [3]:
def load_config_mappings(config_path: str) -> dict:
    """
    Load the harmonizer configuration from a YAML file.

    The document must be a mapping with the keys ``target_classes``,
    ``target_dataset_path`` and ``datasets``. Every item of ``datasets`` must
    provide ``origin`` and ``fields_map``, and every item of ``fields_map`` must
    provide ``source_field`` and ``target_field``.

    Args:
        config_path: Path to the YAML configuration file

    Returns:
        Dict with ``target_classes`` and ``target_dataset_path`` taken unchanged
        from the file, and ``datasets`` as a list of DataMapping objects.

    Raises:
        KeyError: If any of the keys listed above is missing.
        Errors raised while opening or parsing the file, or while building the
        Pydantic models, are not caught here and propagate to the caller.
    """
    # Read as UTF-8 explicitly: without it the locale's default encoding is used,
    # so non-ASCII class names or paths would be decoded differently per machine.
    with open(config_path, 'r', encoding='utf-8') as file:
        config_data = yaml.safe_load(file)

    datasets = []
    for mapping_config in config_data["datasets"]:
        # Build the field mappings first so a malformed entry fails before the
        # enclosing DataMapping is constructed.
        field_mappings = [
            FieldMapping(
                source_field=field["source_field"],
                target_field=field["target_field"],
            )
            for field in mapping_config["fields_map"]
        ]
        datasets.append(
            DataMapping(origin=mapping_config["origin"], fields_map=field_mappings)
        )

    return {
        "target_classes": config_data["target_classes"],
        "target_dataset_path": config_data["target_dataset_path"],
        "datasets": datasets,
    }

In [4]:
import os
def generate_internal_structure(config, step_path):
    """
    Create the ``images`` and ``labels`` folders of one split in the target dataset.

    Args:
        config: Configuration dict; only ``config["target_dataset_path"]`` is read.
        step_path: Name of the split sub-folder below the target dataset path.

    Missing parent directories are created too; folders that already exist are
    left as they are.
    """
    split_dir = os.path.join(config["target_dataset_path"], step_path)
    for leaf in ("images", "labels"):
        # exist_ok=True replaces a separate exists() check followed by makedirs(),
        # which fails if the folder appears between the two calls.
        os.makedirs(os.path.join(split_dir, leaf), exist_ok=True)

def generate_target_dataset_path(config):
    """
    Create the target dataset folder and its per-split sub-folders.

    Args:
        config: Configuration dict; ``config["target_dataset_path"]`` is the
            folder to create.

    Returns:
        The value of ``config["target_dataset_path"]``.
    """
    target_root = config["target_dataset_path"]
    # exist_ok=True makes the call safe to repeat and avoids a separate
    # exists() check that could race with another process creating the folder.
    os.makedirs(target_root, exist_ok=True)

    for split in ("train", "valid", "test"):
        generate_internal_structure(config, split)

    return target_root

In [5]:
def generate_target_yolo_data_file(config):
    """
    Write ``data.yaml`` into the target dataset folder, unless it already exists.

    Args:
        config: Configuration dict; ``target_dataset_path`` and ``target_classes``
            are read.

    Returns:
        Path of the ``data.yaml`` file, whether it was written by this call or
        was already present.

    Note:
        An existing ``data.yaml`` is never overwritten, so it is not refreshed
        when ``target_classes`` changes between runs.
    """
    import yaml
    import os

    data_yaml_path = os.path.join(config["target_dataset_path"], "data.yaml")

    # Kept for the notebook's progress output only.
    names = f"[{', '.join(config['target_classes'])}]"
    print(names)

    # Store the class names as a real list. Dumping the pre-formatted string
    # above would produce a quoted scalar ('[a, b]'), which cannot be indexed
    # per class and disagrees with `nc`.
    class_names = list(config['target_classes'])
    yolo_data = {
        'train': '../train/images',
        'val': '../valid/images',
        'test': '../test/images',
        'nc': len(class_names),
        'names': class_names,
    }

    if not os.path.exists(data_yaml_path):
        with open(data_yaml_path, "w") as file:
            yaml.dump(yolo_data, file)

    return data_yaml_path

In [6]:
from typing import Dict

def extract_origin_classes(dataset: DataMapping):
    """Return the ``names`` entry of the ``data.yaml`` found under ``dataset.origin``."""
    data_yaml_path = dataset.origin + "/data.yaml"
    with open(data_yaml_path, 'r') as stream:
        origin_metadata = yaml.safe_load(stream)
    return origin_metadata["names"]

def map_classes(config_classes: List[FieldMapping], target_classes: List[str], origin_classes_list: List[str]) -> Dict[int, int]:
    """
    Build the lookup {origin class index: target class index}.

    For every configured mapping, ``target_field`` is located in
    ``target_classes`` and ``source_field`` in ``origin_classes_list``; the two
    positions found become one key/value pair of the result. The result is meant
    to translate class indices of the origin dataset into those of the target
    dataset.

    Raises:
        KeyError: If a ``source_field`` or ``target_field`` is not among the
            corresponding class names.
    """
    def _position_by_name(names):
        # Items are read as names[0] .. names[len(names) - 1] on purpose, so
        # anything indexable by those positions is accepted.
        return {names[position]: position for position in range(len(names))}

    origin_positions = _position_by_name(origin_classes_list)
    target_positions = _position_by_name(target_classes)

    index_map = {}
    for mapping in config_classes:
        target_index = target_positions[mapping.target_field]
        origin_index = origin_positions[mapping.source_field]
        index_map[origin_index] = target_index
    return index_map

def copy_images(origin_path: str, target_dataset_path: str):
    """
    Copy every regular file of ``<origin_path>/images`` into ``<target_dataset_path>/images``.

    Args:
        origin_path: Folder that contains the source ``images`` sub-folder
        target_dataset_path: Folder that receives the ``images`` sub-folder

    Returns:
        True when every file was copied; False when the source folder does not
        exist or an error occurred while copying (the error is printed, not raised).
    """
    import os
    import shutil

    source_images_dir = os.path.join(origin_path, "images")
    target_images_dir = os.path.join(target_dataset_path, "images")

    # Nothing to do for splits the origin dataset does not ship.
    if not os.path.exists(source_images_dir):
        print(f"❌ Source images directory not found: {source_images_dir}")
        return False

    os.makedirs(target_images_dir, exist_ok=True)

    try:
        for filename in os.listdir(source_images_dir):
            candidate = os.path.join(source_images_dir, filename)
            if not os.path.isfile(candidate):
                # Sub-directories are skipped, not copied recursively.
                continue
            shutil.copy2(candidate, os.path.join(target_images_dir, filename))
            print(f"✅ Copied: {filename}")

        print(f"✅ Successfully copied all images from {source_images_dir} to {target_images_dir}")
        return True

    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"❌ Error copying images: {str(e)}")
        return False

def _remap_label_line(line: str, classes: Dict[int, int]):
    """Return one label line with its class index remapped, or None if the line is dropped."""
    line = line.strip()
    if not line:
        return None

    parts = line.split()
    # A usable line carries a class index followed by at least four values.
    if len(parts) < 5:
        return None

    try:
        origin_class = int(parts[0])
    except ValueError:
        # First token is not an integer class index.
        return None

    # Classes without a configured mapping are dropped from the output.
    if origin_class not in classes:
        return None

    target_class = classes[origin_class]
    return f"{target_class} {' '.join(parts[1:])}"


def transform_and_copy_labels(origin_path: str, target_dataset_path: str, classes: Dict[int, int]):
    """
    Rewrite the label files of ``<origin_path>/labels`` into ``<target_dataset_path>/labels``.

    Every ``.txt`` file is read line by line. The leading class index of a line
    is replaced using ``classes``; lines that are blank, have fewer than five
    fields, do not start with an integer, or whose class is not a key of
    ``classes`` are left out. A target file is written for every source file,
    even when no line survives.

    Args:
        origin_path: Path to the origin dataset directory
        target_dataset_path: Path to the target dataset directory
        classes: Dictionary mapping origin class indices to target class indices

    Returns:
        True when all files were processed; False when the source folder does
        not exist or an error occurred (the error is printed, not raised).
    """
    import os

    source_labels_dir = os.path.join(origin_path, "labels")
    target_labels_dir = os.path.join(target_dataset_path, "labels")

    if not os.path.exists(source_labels_dir):
        print(f"❌ Source labels directory not found: {source_labels_dir}")
        return False

    os.makedirs(target_labels_dir, exist_ok=True)

    try:
        for filename in os.listdir(source_labels_dir):
            label_path = os.path.join(source_labels_dir, filename)
            if not (os.path.isfile(label_path) and filename.endswith('.txt')):
                continue

            with open(label_path, 'r') as f:
                remapped = (_remap_label_line(raw_line, classes) for raw_line in f)
                transformed_lines = [entry for entry in remapped if entry is not None]

            with open(os.path.join(target_labels_dir, filename), 'w') as f:
                f.write('\n'.join(transformed_lines))

            print(f"✅ Transformed: {filename} ({len(transformed_lines)} lines kept)")

        print(f"✅ Successfully transformed all labels from {source_labels_dir} to {target_labels_dir}")
        return True

    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"❌ Error transforming labels: {str(e)}")
        return False

def process_dataset(dataset: str, classes: Dict[int, int], target_dataset_path: str):
    """
    Bring one split folder into the target dataset: images first, then labels.

    Args:
        dataset: Path of the origin split folder
        classes: Dictionary mapping origin class indices to target class indices
        target_dataset_path: Path of the matching target split folder

    The success flags returned by the two steps are not inspected; nothing is returned.
    """
    copy_images(dataset, target_dataset_path)
    transform_and_copy_labels(dataset, target_dataset_path, classes)

def process_origin(dataset, target_classes, target_dataset_path):
    """
    Harmonize one origin dataset into the target dataset.

    The class index mapping is computed once from the origin's class names and
    then applied to the "test", "valid" and "train" splits, in that order.
    """
    field_mappings = dataset.fields_map
    origin_class_names = extract_origin_classes(dataset)
    mapped_classes = map_classes(field_mappings, target_classes, origin_class_names)

    for split in ("test", "valid", "train"):
        process_dataset(
            dataset.origin + f"/{split}",
            mapped_classes,
            target_dataset_path + f"/{split}",
        )


def process_origins(config):
    """
    Harmonize every dataset listed in ``config["datasets"]``.

    Before each dataset is processed, a separator line, the dataset itself, the
    target classes and the target dataset path are printed.
    """
    for dataset in config["datasets"]:
        print("--------------------------------")
        print(dataset)

        target_classes = config["target_classes"]
        print(target_classes)

        target_dataset_path = config["target_dataset_path"]
        print(target_dataset_path)

        process_origin(dataset, target_classes, target_dataset_path)

In [7]:
# Run the whole pipeline; the order matters because each step relies on the previous one:
#   1. load the configuration ("config.yml" is resolved against the current working directory)
#   2. create the target dataset folder with its train/valid/test sub-folders
#   3. write the target data.yaml (skipped when the file already exists)
#   4. copy the images and remap the labels of every origin dataset
config = load_config_mappings("config.yml")
generate_target_dataset_path(config)
generate_target_yolo_data_file(config)
process_origins(config)

[container]
--------------------------------
origin='/home/emma/facultad/pps/datasets/containers/raw/container.v1i.yolov11' fields_map=[FieldMapping(source_field='0', target_field='container')]
['container']
./testing
❌ Source images directory not found: /home/emma/facultad/pps/datasets/containers/raw/container.v1i.yolov11/test/images
❌ Source labels directory not found: /home/emma/facultad/pps/datasets/containers/raw/container.v1i.yolov11/test/labels
❌ Source images directory not found: /home/emma/facultad/pps/datasets/containers/raw/container.v1i.yolov11/valid/images
❌ Source labels directory not found: /home/emma/facultad/pps/datasets/containers/raw/container.v1i.yolov11/valid/labels
✅ Copied: 20240526_233610-640_Camera_4_jpg.rf.b63c7ed014c3c785499042adef2dfd10.jpg
✅ Copied: 20240529_202717-223_Camera_5_jpg.rf.554f8f20f07888d873a9e2b792c88800.jpg
✅ Copied: 20240524_203220-523_Camera_9_jpg.rf.e30a7b39b1e24d55d5ce11eb60ceedbb.jpg
✅ Copied: 20240525_142707-659_Camera_5_jpg.rf.0ac005b0e