# Workflow for Processing and Simplifying Geospatial Data

This notebook details the steps involved in processing, vectorizing, filtering, and simplifying geospatial data from images, and subsequently grouping the data into a JSON file.

## Vectorization

In [20]:
import cv2
import os
import subprocess
from osgeo import gdal

def preprocess_image(input_path, output_path):
    """
    Preprocess the image: convert to grayscale and binarize.
    Save the result as a GeoTIFF file.
    """
    # Read the image
    img = cv2.imread(input_path)
    
    # Convert the image to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Invert the image so the background is black and the polygon is white
    inverted = cv2.bitwise_not(gray)
    
    # Apply binarization (using a fixed threshold)
    _, binary = cv2.threshold(inverted, 127, 255, cv2.THRESH_BINARY)
    
    # Save the binarized image as GeoTIFF
    driver = gdal.GetDriverByName('GTiff')
    out_dataset = driver.Create(output_path, binary.shape[1], binary.shape[0], 1, gdal.GDT_Byte)
    out_dataset.GetRasterBand(1).WriteArray(binary)
    out_dataset.SetProjection('EPSG:4326')  # WGS84 projection

def vectorize_image(input_raster, output_vector):
    """
    Uses GDAL to vectorize the binarized image and generate a Shapefile.
    """
    subprocess.run(['gdal_polygonize.py', input_raster, '-f', 'ESRI Shapefile', output_vector])

def process_directory(input_dir, output_dir):
    """
    Processes all JPG images in a directory.
    """
    for filename in os.listdir(input_dir):
        if filename.endswith('.jpg'):
            input_path = os.path.join(input_dir, filename)
            output_tiff_path = os.path.join(output_dir, f'{os.path.splitext(filename)[0]}.tif')
            output_shapefile_path = os.path.join(output_dir, f'{os.path.splitext(filename)[0]}.shp')

            preprocess_image(input_path, output_tiff_path)
            
            vectorize_image(output_tiff_path, output_shapefile_path)
            print(f'Procesado {filename}, resultado guardado en {output_shapefile_path}')



In [None]:
input_directory = 'data/processed/002/building_blocks/'
output_directory = 'data/processed/002/vectorized/'

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

process_directory(input_directory, output_directory)

ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00019.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00019.jpg, resultado guardado en data/processed/002/vectorized/component_00019.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00037.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00037.jpg, resultado guardado en data/processed/002/vectorized/component_00037.shp
Creating output data/processed/002/vectorized/component_00001.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.


ERROR 1: missing [
ERROR 1: missing [


Procesado component_00001.jpg, resultado guardado en data/processed/002/vectorized/component_00001.shp
Creating output data/processed/002/vectorized/component_00002.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00002.jpg, resultado guardado en data/processed/002/vectorized/component_00002.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00003.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00003.jpg, resultado guardado en data/processed/002/vectorized/component_00003.shp
Creating output data/processed/002/vectorized/component_00005.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00005.jpg, resultado guardado en data/processed/002/vectorized/component_00005.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00006.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00006.jpg, resultado guardado en data/processed/002/vectorized/component_00006.shp
Creating output data/processed/002/vectorized/component_00007.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00007.jpg, resultado guardado en data/processed/002/vectorized/component_00007.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00008.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00008.jpg, resultado guardado en data/processed/002/vectorized/component_00008.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00009.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00009.jpg, resultado guardado en data/processed/002/vectorized/component_00009.shp
Creating output data/processed/002/vectorized/component_00010.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.


ERROR 1: missing [
ERROR 1: missing [


Procesado component_00010.jpg, resultado guardado en data/processed/002/vectorized/component_00010.shp
Creating output data/processed/002/vectorized/component_00011.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00011.jpg, resultado guardado en data/processed/002/vectorized/component_00011.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00012.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00012.jpg, resultado guardado en data/processed/002/vectorized/component_00012.shp
Creating output data/processed/002/vectorized/component_00013.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00013.jpg, resultado guardado en data/processed/002/vectorized/component_00013.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00014.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00014.jpg, resultado guardado en data/processed/002/vectorized/component_00014.shp
Creating output data/processed/002/vectorized/component_00015.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00015.jpg, resultado guardado en data/processed/002/vectorized/component_00015.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00017.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00017.jpg, resultado guardado en data/processed/002/vectorized/component_00017.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00018.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00018.jpg, resultado guardado en data/processed/002/vectorized/component_00018.shp
Creating output data/processed/002/vectorized/component_00020.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00020.jpg, resultado guardado en data/processed/002/vectorized/component_00020.shp
Creating output data/processed/002/vectorized/component_00021.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00021.jpg, resultado guardado en data/processed/002/vectorized/component_00021.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00022.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00022.jpg, resultado guardado en data/processed/002/vectorized/component_00022.shp
Creating output data/processed/002/vectorized/component_00023.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00023.jpg, resultado guardado en data/processed/002/vectorized/component_00023.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00025.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00025.jpg, resultado guardado en data/processed/002/vectorized/component_00025.shp
Creating output data/processed/002/vectorized/component_00026.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00026.jpg, resultado guardado en data/processed/002/vectorized/component_00026.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00027.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00027.jpg, resultado guardado en data/processed/002/vectorized/component_00027.shp
Creating output data/processed/002/vectorized/component_00028.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00028.jpg, resultado guardado en data/processed/002/vectorized/component_00028.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00029.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00029.jpg, resultado guardado en data/processed/002/vectorized/component_00029.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00030.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00030.jpg, resultado guardado en data/processed/002/vectorized/component_00030.shp
Creating output data/processed/002/vectorized/component_00031.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00031.jpg, resultado guardado en data/processed/002/vectorized/component_00031.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00032.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00032.jpg, resultado guardado en data/processed/002/vectorized/component_00032.shp
Creating output data/processed/002/vectorized/component_00033.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00033.jpg, resultado guardado en data/processed/002/vectorized/component_00033.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00034.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00034.jpg, resultado guardado en data/processed/002/vectorized/component_00034.shp
Creating output data/processed/002/vectorized/component_00035.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00035.jpg, resultado guardado en data/processed/002/vectorized/component_00035.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00036.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00036.jpg, resultado guardado en data/processed/002/vectorized/component_00036.shp
Creating output data/processed/002/vectorized/component_00038.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00038.jpg, resultado guardado en data/processed/002/vectorized/component_00038.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00039.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00039.jpg, resultado guardado en data/processed/002/vectorized/component_00039.shp
Creating output data/processed/002/vectorized/component_00041.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00041.jpg, resultado guardado en data/processed/002/vectorized/component_00041.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00042.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00042.jpg, resultado guardado en data/processed/002/vectorized/component_00042.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00043.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00043.jpg, resultado guardado en data/processed/002/vectorized/component_00043.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00044.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00044.jpg, resultado guardado en data/processed/002/vectorized/component_00044.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00046.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00046.jpg, resultado guardado en data/processed/002/vectorized/component_00046.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00048.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00048.jpg, resultado guardado en data/processed/002/vectorized/component_00048.shp
Creating output data/processed/002/vectorized/component_00049.shp of format ESRI Shapefile.


ERROR 1: missing [
ERROR 1: missing [


0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00049.jpg, resultado guardado en data/processed/002/vectorized/component_00049.shp
Creating output data/processed/002/vectorized/component_00050.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00050.jpg, resultado guardado en data/processed/002/vectorized/component_00050.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00051.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00051.jpg, resultado guardado en data/processed/002/vectorized/component_00051.shp
Creating output data/processed/002/vectorized/component_00052.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00052.jpg, resultado guardado en data/processed/002/vectorized/component_00052.shp


ERROR 1: missing [
ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00053.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00053.jpg, resultado guardado en data/processed/002/vectorized/component_00053.shp
Creating output data/processed/002/vectorized/component_00054.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00054.jpg, resultado guardado en data/processed/002/vectorized/component_00054.shp


ERROR 1: missing [


Creating output data/processed/002/vectorized/component_00055.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00055.jpg, resultado guardado en data/processed/002/vectorized/component_00055.shp
Creating output data/processed/002/vectorized/component_00056.shp of format ESRI Shapefile.


ERROR 1: missing [
ERROR 1: missing [


0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00056.jpg, resultado guardado en data/processed/002/vectorized/component_00056.shp
Creating output data/processed/002/vectorized/component_00057.shp of format ESRI Shapefile.
0...10...20...30...40...50...60...70...80...90...100 - done.
Procesado component_00057.jpg, resultado guardado en data/processed/002/vectorized/component_00057.shp


## Simplifying Geospatial Data

In [19]:
import os
from pathlib import Path
import fiona
from shapely.geometry import shape, mapping, Polygon, MultiPolygon
from rdp import rdp

def filter_polygons(input_shapefile: str, output_shapefile: str):
    """
    Filters the main polygon (black in the binarized image) based on its area or position.
    """
    with fiona.open(input_shapefile, 'r') as source:
        schema = source.schema
        crs = source.crs
        polygons = []

        for feature in source:
            geom = shape(feature['geometry'])
            if geom.geom_type == 'Polygon' or geom.geom_type == 'MultiPolygon':
                polygons.append(geom)

        # Filter the largest polygon (you can adjust this logic if necessary)
        main_polygon = max(polygons, key=lambda x: x.area)

        # Create shapefile with only the polygon of interest
        with fiona.open(output_shapefile, 'w', driver=source.driver, schema=schema, crs=crs) as sink:
            feature = {
                'type': 'Feature',
                'geometry': mapping(main_polygon),
                'properties': {key: None for key in schema['properties']}  # Clear original properties
            }
            sink.write(feature)

def simplify_shapefile_with_rdp(input_path: str, output_path: str, tolerance: float):
    """
    Simplifies a shapefile using the Ramer-Douglas-Peucker (RDP) algorithm.
    """
    with fiona.open(input_path, 'r') as source:
        schema = source.schema
        crs = source.crs

        # Create output file
        with fiona.open(output_path, 'w', driver=source.driver, schema=schema, crs=crs) as sink:
            for feature in source:
                geom = shape(feature['geometry'])
                if geom.geom_type == 'Polygon':
                    simplified_coords = rdp(list(geom.exterior.coords), epsilon=tolerance)
                    simplified_geom = Polygon(simplified_coords)
                elif geom.geom_type == 'MultiPolygon':
                    simplified_geom = MultiPolygon([
                        Polygon(rdp(list(poly.exterior.coords), epsilon=tolerance))
                        for poly in geom.geoms
                    ])
                else:
                    continue  # Ignore unsupported geometries
                feature['geometry'] = fiona.Geometry.from_dict(mapping(simplified_geom))
                sink.write(feature)

def simplify_shapefile_with_shapely(input_path: str, output_path: str, tolerance: float):
    """
    Simplifies a shapefile using shapely.simplify.
    """
    with fiona.open(input_path, 'r') as source:
        schema = source.schema
        crs = source.crs

        # Create output file
        with fiona.open(output_path, 'w', driver=source.driver, schema=schema, crs=crs) as sink:
            for feature in source:
                geom = shape(feature['geometry'])
                if geom.geom_type in ['Polygon', 'MultiPolygon']:
                    simplified_geom = geom.simplify(tolerance, preserve_topology=True)
                else:
                    continue  # Ignore unsupported geometries
                feature['geometry'] = fiona.Geometry.from_dict(mapping(simplified_geom))
                sink.write(feature)

def process_shapefiles(input_directory: str, tolerance: float):
    """
    Processes all shapefiles in a directory: filters the polygon of interest and applies simplifications.
    """
    input_path = Path(input_directory)
    filtered_output_dir = input_path / 'filtered'
    rdp_output_dir = input_path / 'rdp'
    shapely_output_dir = input_path / 'simplify'

    filtered_output_dir.mkdir(parents=True, exist_ok=True)
    rdp_output_dir.mkdir(parents=True, exist_ok=True)
    shapely_output_dir.mkdir(parents=True, exist_ok=True)

    for shp_file in input_path.glob('*.shp'):
        filtered_output_path = filtered_output_dir / shp_file.name
        rdp_output_path = rdp_output_dir / shp_file.name
        shapely_output_path = shapely_output_dir / shp_file.name

        print(f"Filtering the main polygon of {shp_file.name}...")
        filter_polygons(str(shp_file), str(filtered_output_path))

        print(f"Simplifying {shp_file.name} with RDP...")
        simplify_shapefile_with_rdp(str(filtered_output_path), str(rdp_output_path), tolerance)

        print(f"Simplifying {shp_file.name} with Shapely...")
        simplify_shapefile_with_shapely(str(filtered_output_path), str(shapely_output_path), tolerance)

        print(f"Processed {shp_file.name}: saved in {rdp_output_dir} and {shapely_output_dir}")


Filtrando el polígono principal de component_00005.shp...
Simplificando component_00005.shp con RDP...


  feature['geometry'] = fiona.Geometry.from_dict(mapping(simplified_geom))
  feature['geometry'] = fiona.Geometry.from_dict(mapping(simplified_geom))


Simplificando component_00005.shp con Shapely...
Procesado component_00005.shp: guardado en data/processed/001/vectorized/rdp y data/processed/001/vectorized/simplify
Filtrando el polígono principal de component_00006.shp...
Simplificando component_00006.shp con RDP...
Simplificando component_00006.shp con Shapely...
Procesado component_00006.shp: guardado en data/processed/001/vectorized/rdp y data/processed/001/vectorized/simplify
Filtrando el polígono principal de component_00007.shp...
Simplificando component_00007.shp con RDP...
Simplificando component_00007.shp con Shapely...
Procesado component_00007.shp: guardado en data/processed/001/vectorized/rdp y data/processed/001/vectorized/simplify
Filtrando el polígono principal de component_00008.shp...
Simplificando component_00008.shp con RDP...
Simplificando component_00008.shp con Shapely...
Procesado component_00008.shp: guardado en data/processed/001/vectorized/rdp y data/processed/001/vectorized/simplify
Filtrando el polígono p

In [22]:
input_directory = 'data/processed/002/vectorized'
tolerance = 2.0

process_shapefiles(input_directory, tolerance)

Filtrando el polígono principal de component_00001.shp...
Simplificando component_00001.shp con RDP...
Simplificando component_00001.shp con Shapely...
Procesado component_00001.shp: guardado en data/processed/002/vectorized/rdp y data/processed/002/vectorized/simplify
Filtrando el polígono principal de component_00002.shp...
Simplificando component_00002.shp con RDP...
Simplificando component_00002.shp con Shapely...
Procesado component_00002.shp: guardado en data/processed/002/vectorized/rdp y data/processed/002/vectorized/simplify
Filtrando el polígono principal de component_00003.shp...
Simplificando component_00003.shp con RDP...


  feature['geometry'] = fiona.Geometry.from_dict(mapping(simplified_geom))
  feature['geometry'] = fiona.Geometry.from_dict(mapping(simplified_geom))


Simplificando component_00003.shp con Shapely...
Procesado component_00003.shp: guardado en data/processed/002/vectorized/rdp y data/processed/002/vectorized/simplify
Filtrando el polígono principal de component_00005.shp...
Simplificando component_00005.shp con RDP...
Simplificando component_00005.shp con Shapely...
Procesado component_00005.shp: guardado en data/processed/002/vectorized/rdp y data/processed/002/vectorized/simplify
Filtrando el polígono principal de component_00006.shp...
Simplificando component_00006.shp con RDP...
Simplificando component_00006.shp con Shapely...
Procesado component_00006.shp: guardado en data/processed/002/vectorized/rdp y data/processed/002/vectorized/simplify
Filtrando el polígono principal de component_00007.shp...
Simplificando component_00007.shp con RDP...
Simplificando component_00007.shp con Shapely...
Procesado component_00007.shp: guardado en data/processed/002/vectorized/rdp y data/processed/002/vectorized/simplify
Filtrando el polígono p

## Digitization

In [8]:
import os
import json
from pathlib import Path
import fiona
from shapely.geometry import shape, Polygon


def extract_polygon_data(shapefile_path: str) -> list:
    """
    Extracts the vertex coordinates of polygons in clockwise order from a shapefile.
    """
    polygons_data = []

    with fiona.open(shapefile_path, 'r') as shapefile:
        for feature in shapefile:
            geom = shape(feature['geometry'])
            if geom.geom_type == 'Polygon':
                # Ensure that the coordinates are in clockwise order
                if not geom.exterior.is_ccw:
                    coords = list(geom.exterior.coords)
                else:
                    coords = list(Polygon(geom.exterior.coords[::-1]).exterior.coords)
                polygons_data.append({"coordinates": coords})
            elif geom.geom_type == 'MultiPolygon':
                for polygon in geom.geoms:
                    # Ensure that the coordinates are in clockwise order
                    if not polygon.exterior.is_ccw:
                        coords = list(polygon.exterior.coords)
                    else:
                        coords = list(Polygon(polygon.exterior.coords[::-1]).exterior.coords)
                    polygons_data.append({"coordinates": coords})

    return polygons_data


def parse_components_info(components_info_path: str) -> dict:
    """
    Parses the components_info.txt file to extract data for each component.
    """
    components_data = {}

    with open(components_info_path, 'r') as file:
        lines = file.readlines()
        current_component = None

        for line in lines:
            line = line.strip()
            if line.startswith("Component"):
                current_component = line.split()[1][:-1]
                components_data[current_component] = {}
            elif "Top-left corner" in line:
                top_left = line.split(":")[1].strip()
                components_data[current_component]["top_left_corner"] = eval(top_left)  
    return components_data


def group_shapefile_data(input_directory: str, components_info_path: str, output_json_path: str):
    """
    Groups shapefile data with the same base name, adds data from the components_info.txt file,
    and saves the information in a JSON file.
    """
    input_dir = Path(input_directory)
    shapefiles = list(input_dir.glob("*.shp"))
    grouped_data = {}

    # Read data from the components_info.txt file
    components_data = parse_components_info(components_info_path)

    # Iterate over all .shp files in the directory
    for shapefile_path in shapefiles:
        base_name = shapefile_path.stem

        # Extract vertex coordinates
        vertices_data = extract_polygon_data(str(shapefile_path))

        # Add data to the corresponding group
        if base_name not in grouped_data:
            grouped_data[base_name] = []
        grouped_data[base_name].extend(vertices_data)

        # If the shapefile has a name like "component_0000x", add top_left_corner
        component_number = base_name.split("_")[-1].lstrip("0")  # Extract the component number
        if component_number in components_data:
            for item in grouped_data[base_name]:
                item["top_left_corner"] = components_data[component_number]["top_left_corner"]

    # Save the grouped data to a JSON file
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(grouped_data, json_file, ensure_ascii=False, indent=4)

    print(f"JSON file saved at: {output_json_path}")


input_directory = 'data/processed/001/vectorized/simplify'  
components_info_path = 'data/processed/001/components_info.txt'
output_json_path = "001_grouped_shapefiles_simplify.json"

group_shapefile_data(input_directory, components_info_path, output_json_path)


Archivo JSON guardado en: 001_grouped_shapefiles_rdp.json
