This notebook was used to create the training and testing split of the dataset for the standalone (Planet-NICFI and Sentinel-1) models. The starting input is the set of ASM points previously obtained with the R script that leverages the blockCV package.

Note that this notebook was run in Colab, as the dataset was originally stored in Google Drive, so to execute it locally it might be necessary to install some packages in the environment (as well as to change the directories).

In [None]:
# !pip install rasterio -qqq

In [None]:
import geopandas as gpd
import rasterio
import shapely.geometry
import os
import numpy as np
import matplotlib.pyplot as plt
import shutil

from shapely.geometry import box
from sklearn.model_selection import train_test_split

In [None]:
# Colab only: mount Google Drive so that the '/content/drive/...' paths used
# in the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Load the training and testing point sets (GeoJSON) that drive the split;
# each image is later assigned to the set whose points dominate its footprint.
train_points = gpd.read_file('/content/drive/MyDrive/mgi_thesis/train_data.geojson')
test_points = gpd.read_file('/content/drive/MyDrive/mgi_thesis/test_data.geojson')

In [None]:
# Source folders: ground-truth rasters and the satellite images to be split.
gt_dir = '/content/drive/MyDrive/mgi_thesis/gt_binary_resampled_s1'
images_dir = '/content/drive/MyDrive/mgi_thesis/s1_images_nospecklefilter'

# Destination root of the split. The copy loop at the end of the notebook
# writes into <base_dir>/training_data/... and <base_dir>/testing_data/...
base_dir = '/content/drive/MyDrive/mgi_thesis/asm_dataset_split_0/s1'
# NOTE(review): train_dir and test_dir are not referenced by the cells shown
# here; the copy loop rebuilds the same paths from base_dir.
train_dir = os.path.join(base_dir, 'training_data')
test_dir = os.path.join(base_dir, 'testing_data')

In [None]:
def get_image_boundaries(image_path):
    """
    Builds the rectangular footprint of a raster image. The footprint is
    used afterwards to find out which ASM sites lie inside the area covered
    by each satellite image.

    Parameters:
    image_path (str): Path to the raster image file.

    Returns:
    shapely.geometry.Polygon: Rectangle spanning the bounds that rasterio
    reports for the image.
    """
    # only the bounds are needed, so the file is closed before building the polygon
    with rasterio.open(image_path) as src:
        extent = src.bounds

    min_x, min_y = extent.left, extent.bottom
    max_x, max_y = extent.right, extent.top
    return box(min_x, min_y, max_x, max_y)

In [None]:
# Collect one footprint polygon per image found in images_dir.
image_boundaries = []
# os.listdir returns entries in arbitrary order; sorting keeps the rows (and
# everything derived from them) reproducible between runs.
for image_filename in sorted(os.listdir(images_dir)):
    # keep GeoTIFFs only and leave out ground-truth rasters (prefixed 'gt_')
    if not image_filename.endswith('.tif') or image_filename.startswith('gt_'):
        continue

    image_path = os.path.join(images_dir, image_filename)
    # Strip only the trailing extension: str.replace('.tif', '') would also
    # remove any '.tif' occurring inside the name, and the id is later turned
    # back into a filename with f"{image_id}.tif".
    image_id = os.path.splitext(image_filename)[0]
    image_boundaries.append({'image_id': image_id, 'geometry': get_image_boundaries(image_path)})

# No CRS is attached here; it is assigned separately on the resulting frame.
images_gdf = gpd.GeoDataFrame(image_boundaries, geometry='geometry')

In [None]:
# The footprints were built from plain shapely boxes, so the GeoDataFrame has
# no CRS attached yet; declare it as EPSG:4326 so it can be spatially joined
# with the points.
# NOTE(review): this only labels the coordinates, it does not reproject them -
# it assumes the source rasters are already in EPSG:4326; confirm.
images_gdf = images_gdf.set_crs(4326)

In [None]:
# function to perform spatial join and categorize images
def categorize_images(images_gdf, train_points_gdf, test_points_gdf):
    """
    Assigns every image to the training or the testing set, depending on
    which set contributes more points inside the image footprint.

    Parameters:
    images_gdf (geopandas.GeoDataFrame): One row per image, with an
        'image_id' column and the image footprint as geometry.
    train_points_gdf (geopandas.GeoDataFrame): Points of the training set.
    test_points_gdf (geopandas.GeoDataFrame): Points of the testing set.

    Returns:
    dict: Maps each image_id to 'training' or 'testing'. An image is
    'training' only when it holds strictly more training than testing
    points; ties and images without any point are labelled 'testing'.
    """
    # Inner join: one row per (image, point) pair that intersects. A left join
    # would keep an image without points as a single NaN row, which size()
    # counts as 1 - so an image with one training point and no testing point
    # would compare 1 > 1 and wrongly end up in the testing set.
    # 'predicate' replaces the deprecated 'op' keyword (geopandas >= 0.10).
    # NOTE(review): assumes the three frames share the same CRS - confirm.
    train_overlap = gpd.sjoin(images_gdf, train_points_gdf, how='inner', predicate='intersects')
    test_overlap = gpd.sjoin(images_gdf, test_points_gdf, how='inner', predicate='intersects')

    # number of intersecting points per image; images without points are absent
    train_counts = train_overlap.groupby('image_id').size()
    test_counts = test_overlap.groupby('image_id').size()

    # majority vote; .get(..., 0) covers images that have no point of that set
    image_category = {}
    for image_id in images_gdf['image_id']:
        train_count = train_counts.get(image_id, 0)
        test_count = test_counts.get(image_id, 0)
        image_category[image_id] = 'training' if train_count > test_count else 'testing'

    return image_category

# categorize images: build the {image_id: 'training' | 'testing'} mapping
# that the copy loop iterates over
image_category = categorize_images(images_gdf, train_points, test_points)

In [None]:
def validate_and_copy(src_path, dest_path, file_type):
    """
    Copies a file to its destination if the source exists, creating the
    destination folder when needed.

    Parameters:
    src_path (str): Path of the file to copy.
    dest_path (str): Full path (including filename) of the copy.
    file_type (str): Human-readable label of the file (e.g. "Image"),
        used in the message printed when the source is missing.

    Returns:
    bool: True if the file was copied, False if the source was missing.
    """
    if not os.path.exists(src_path):
        # report the skip instead of dropping the file silently, so that
        # unpaired images / ground truths can be spotted in the cell output
        print(f"{file_type} file not found, skipped: {src_path}")
        return False

    # shutil.copy fails if the destination folder is missing
    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    shutil.copy(src_path, dest_path)
    return True

In [None]:
# Copy every image, together with its ground-truth raster, into the folder of
# the set ('training' or 'testing') it was assigned to.
gt_suffix = '_gt'  # constant part of the ground truth filenames

for image_id, category in image_category.items():
    # construct the filename of the image
    src_image_filename = f"{image_id}.tif"

    # the ground truth filename is built from the second '_'-separated token
    # of the image id
    parts = image_id.split('_')
    if len(parts) < 2:
        # without that token the ground truth cannot be located
        print(f"Skipping '{image_id}': cannot derive the ground truth filename from it")
        continue
    src_gt_filename = f"resampled_nicfi{gt_suffix}_{parts[1]}.tif"

    # full paths to the source image and ground truth files
    src_image_path = os.path.join(images_dir, src_image_filename)
    src_gt_path = os.path.join(gt_dir, src_gt_filename)

    # destination directories for images and ground truth; created on demand
    # because copying into a missing folder fails
    dest_image_dir = os.path.join(base_dir, category + '_data/images')
    dest_gt_dir = os.path.join(base_dir, category + '_data/gt')
    os.makedirs(dest_image_dir, exist_ok=True)
    os.makedirs(dest_gt_dir, exist_ok=True)

    dest_image_path = os.path.join(dest_image_dir, src_image_filename)
    dest_gt_path = os.path.join(dest_gt_dir, src_gt_filename)

    # NOTE(review): the image is copied even when its ground truth is missing
    # (and vice versa), which can leave unpaired samples - confirm this is intended.
    validate_and_copy(src_image_path, dest_image_path, "Image")
    validate_and_copy(src_gt_path, dest_gt_path, "Ground Truth")