In [1]:
from dotenv import load_dotenv
load_dotenv()

import geopandas as gpd
import os
from pathlib import Path
workdir = Path(os.getenv("WORKDIR", '.'))
scratchdir = Path(os.getenv("SCRATCHDIR", '.'))

# Computing virtual tiles


In [2]:
import rasterio as rio
from math import floor
from itertools import product
from shapely.geometry import box

Finding the image CRS to compute appropriately the grid geometry using GeoPandas.

In [3]:
source_db = rio.open(workdir / 'data/GEE/one_image_per_month_2022/Sentinel_Apr.tif')
image_crs = source_db.crs
image_crs

CRS.from_epsg(32633)

Computing the virtual tiles geometry

In [4]:
overlap = 0.5
tile_width = 256
tile_height = 256

x_indices = range(0, source_db.width, floor(tile_width * (1 - overlap)))
y_indices = range(0, source_db.height, floor(tile_height * (1 - overlap)))
grid_coordinates = product(y_indices, x_indices) # Not a typo, the raster starts from the top-left coordinate. 

def tile_box_for_coordinates(grid_x, grid_y):
    tile_min = source_db.xy(grid_x, grid_y)
    tile_max = source_db.xy(grid_x + tile_width, grid_y + tile_height)
    tile_box = box(tile_min[0], tile_min[1], tile_max[0], tile_max[1])
    return tile_box
grid_boxes = [tile_box_for_coordinates(grid_xy[0], grid_xy[1]) for grid_xy in grid_coordinates]


In [5]:
raster_tiles_df = gpd.GeoDataFrame(geometry=grid_boxes, crs=image_crs)
raster_tiles_df["height"] = tile_height
raster_tiles_df["width"] = tile_width
raster_tiles_df.explore(style_kwds=dict(fill=False))

Filtering the train and test tiles.

In [7]:
test_set_df = gpd.read_file(workdir / 'data/AREA_TRAIN_TEST/TestSet_CLS_UTM.geojson')
train_set_df = gpd.read_file(workdir / 'data/AREA_TRAIN_TEST/TrainSet_CLS_UTM.geojson')
cls_area_df = gpd.read_file(workdir / 'data/AREA_TRAIN_TEST/Area_CLS_UTM.geojson')
test_area_map = test_set_df.explore(style_kwds=dict(fill=False, color='red'))

# The `unary_union` method is used bacause the used DataFrame methods expect 
# Series of the same length but work with broadcast semantics.
#
# We consider test tiles those that have more than 50% overlap with the test 
# area polygons.  
test_overlap_query = (raster_tiles_df.intersection(test_set_df.geometry.unary_union).area / raster_tiles_df.geometry.area) > 0.5
test_set_tiles = raster_tiles_df[test_overlap_query]

# We consider train tiles those that have more than 50% overlap with the train
# area polygons and do not intersect with the test area.
train_overlap_query = (raster_tiles_df.intersection(train_set_df.geometry.unary_union).area / raster_tiles_df.geometry.area) > 0.5
train_set_tiles = raster_tiles_df[train_overlap_query & ~(raster_tiles_df.contains(test_set_tiles.geometry.unary_union))]

print(f"Got {len(train_set_tiles)} training tiles and {len(test_set_tiles)} test tiles")
test_set_tiles.explore(m=test_area_map, style_kwds=dict(fill=False, color='blue'))
train_set_tiles.explore(m=test_area_map, style_kwds=dict(fill=False, color='green'))


Got 824 training tiles and 85 test tiles


Saving the tiles to file.

In [8]:
test_set_tiles.to_file(workdir / "data/AREA_TRAIN_TEST/TestTiles_CLS_UTM.geojson", driver='GeoJSON', index=False)
train_set_tiles.to_file(workdir / "data/AREA_TRAIN_TEST/TrainTiles_CLS_UTM.geojson", driver='GeoJSON', index=False)