# PyTorch EO Semantic Segmentation Example
## Step 1: Training data preparation
*Rob Knapen, Wageningen Environmental Research*
<br>
In this notebook a collection of image tiles is examined. All usable tiles (those that have both a feature image and a label image of the expected size) are copied to a temporary folder and split into training, validation, and test sets.

In [3]:
# connect to Google Drive for access to the training data
from google.colab import drive

# location in the Colab file system under which the Drive content appears
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [4]:
# Note: To see a folder that is shared with you, you need to create
#   a shortcut for it on your Google Drive before you can access
#   it here in Colab.

# expect a data / tiles folder with collections of training images;
# the listing of a collection should show a 'features' and a 'labels' folder
!ls '/content/drive/MyDrive/AI_WENR/data/tiles/collection_1/'

features  labels


In [6]:
# setup paths to the data
import os
import shutil
from glob import glob

# base names, shared by a tile folder and the tile files stored inside it
base_feature_image_name = 'sent2_2018_seasons_10m_7x4bands_224px'
base_label_image_name = 'parcels_2018_10m_coded_76_224px'

# width and height (in pixels) every tile is expected to have
expected_feature_image_size_pixels = 224
expected_label_image_size_pixels = 224

# root of the training data on the mounted Google Drive
content_root_folder = os.path.join('/content', 'drive', 'MyDrive', 'AI_WENR', 'data')
tiles_root_folder = os.path.join(content_root_folder, 'tiles')

# image tiles for training: one sub-folder per kind, named after the tiles
tiles_collection_folder = os.path.join(tiles_root_folder, 'collection_1')
features_folder = os.path.join(tiles_collection_folder, 'features', base_feature_image_name)
labels_folder = os.path.join(tiles_collection_folder, 'labels', base_label_image_name)

# file with information about the encoding used for the labels
codes_folder = os.path.join(content_root_folder, 'codes')
codes_file = os.path.join(codes_folder, 'cell_value_to_crop_info_76_classes.csv')

# temporary folders that receive the cleaned set of tiles, one per split
tiles_temp_folder = os.path.join(tiles_root_folder, 'temp')
tiles_train_folder, tiles_val_folder, tiles_test_folder = (
    os.path.join(tiles_temp_folder, split_name)
    for split_name in ('train', 'val', 'test')
)

In [7]:
# local install of the rasterio library
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.2.8-cp37-cp37m-manylinux1_x86_64.whl (19.3 MB)
[K     |████████████████████████████████| 19.3 MB 1.2 MB/s 
[?25hCollecting click-plugins
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting snuggs>=1.4.1
  Downloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Collecting affine
  Downloading affine-2.3.0-py2.py3-none-any.whl (15 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Installing collected packages: snuggs, cligj, click-plugins, affine, rasterio
Successfully installed affine-2.3.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.2.8 snuggs-1.4.7


In [8]:
# import some packages for GeoTIFF processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import rasterio

In [9]:
# gather every GeoTIFF in the feature and label folders, ordered by name
all_feature_files = glob(os.path.join(features_folder, '*.tif'))
all_feature_files.sort()

all_label_files = glob(os.path.join(labels_folder, '*.tif'))
all_label_files.sort()

# show how many files of each kind were found
(len(all_feature_files), len(all_label_files))

(250, 250)

In [10]:
# images that have a counterpart of the other kind end up in these lists
keep_feature_files = []
keep_label_files = []

# a feature image is kept when some label file name contains the label
# base name followed by the same tile number (second-to-last dotted part)
for feature_path in all_feature_files:
    tile_number = feature_path.split('.')[-2]
    wanted_label = '%s.%s.tif' % (base_label_image_name, tile_number)
    if any(wanted_label in label_path for label_path in all_label_files):
        keep_feature_files.append(feature_path)
    else:
        print('No matching label image for feature image %s, it will be skipped!' % (feature_path))

# the same check in the other direction: every label image needs a feature image
for label_path in all_label_files:
    tile_number = label_path.split('.')[-2]
    wanted_feature = '%s.%s.tif' % (base_feature_image_name, tile_number)
    if any(wanted_feature in feature_path for feature_path in all_feature_files):
        keep_label_files.append(label_path)
    else:
        print('No matching feature image for label image %s, it will be skipped!' % (label_path))

# show how many images of each kind are kept
len(keep_feature_files), len(keep_label_files)

(250, 250)

In [11]:
# lists to be filled with the names of the images that pass the size check
keep2_feature_files = []
keep2_label_files = []

# check if all feature images have the expected size; each dataset is opened
# in a `with` block so its file handle is released again right after the check
for file_name in keep_feature_files:
    with rasterio.open(file_name) as image:
        has_expected_size = (image.width == expected_feature_image_size_pixels
                             and image.height == expected_feature_image_size_pixels)
    if has_expected_size:
        keep2_feature_files.append(file_name)
    else:
        print('Feature image %s does not have the expected width or height, skipping it!' % (file_name))

# check if all label images have the expected size
for file_name in keep_label_files:
    with rasterio.open(file_name) as image:
        has_expected_size = (image.width == expected_label_image_size_pixels
                             and image.height == expected_label_image_size_pixels)
    if has_expected_size:
        keep2_label_files.append(file_name)
    else:
        print('Label image %s does not have the expected width or height, skipping it!' % (file_name))

# show how many images of each kind remain
len(keep2_feature_files), len(keep2_label_files)

Feature image /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_2018_seasons_10m_7x4bands_224px.124.tif does not have the expected width or height, skipping it!
Feature image /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_2018_seasons_10m_7x4bands_224px.149.tif does not have the expected width or height, skipping it!
Feature image /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_2018_seasons_10m_7x4bands_224px.174.tif does not have the expected width or height, skipping it!
Feature image /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_2018_seasons_10m_7x4bands_224px.199.tif does not have the expected width or height, skipping it!
Feature image /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_20

(216, 216)

In [12]:
# Pair every feature image with the label image that carries the same tile
# number (the second-to-last dotted part of the file name). The two lists were
# filtered independently, so pairing them by sorted position would shift all
# later pairs as soon as one tile was rejected on only one side.
labels_by_number = {label_file.split('.')[-2]: label_file for label_file in keep2_label_files}

all_files = []
paired_numbers = set()
for feature_file in sorted(keep2_feature_files):
    number = feature_file.split('.')[-2]
    if number in labels_by_number:
        all_files.append((feature_file, labels_by_number[number]))
        paired_numbers.add(number)
    else:
        print('No usable label image left for feature image %s, it will be skipped!' % (feature_file))

# report label images whose feature image did not survive the earlier checks
for number in sorted(set(labels_by_number) - paired_numbers):
    print('No usable feature image left for label image %s, it will be skipped!' % (labels_by_number[number]))

len(all_files)

216

In [13]:
# clean up previous data (BE CAREFUL!)
for split_folder in (tiles_train_folder, tiles_val_folder, tiles_test_folder):
    shutil.rmtree(split_folder, ignore_errors=True)

# create new dirs: a features and a labels folder for every split
for split_folder in (tiles_train_folder, tiles_val_folder, tiles_test_folder):
    for kind in ('features', 'labels'):
        folder = os.path.join(split_folder, kind)
        if not os.path.exists(folder):
            os.makedirs(folder)

In [14]:
# calculate split
train_ratio = 0.7
validation_ratio = 0.2
test_ratio = 0.1

# the three ratios are meant to cover the complete set of files
assert np.isclose(train_ratio + validation_ratio + test_ratio, 1.0), 'split ratios must sum to 1'

total_file_count = len(all_files)
train_file_count = int(np.round(total_file_count * train_ratio))
val_file_count   = int(np.round(total_file_count * validation_ratio))
# The copy step puts every file that is not used for training or validation
# into the test set, so the test count is the remainder. Rounding it
# separately could make the three counts add up to more or less than the total.
test_file_count  = total_file_count - train_file_count - val_file_count

In [15]:
# copy the files to the folders
import random

# Shuffle the file pairs by taking a random sample without replacement.
# A dedicated, seeded generator makes the split reproducible. (Assigning to
# `random.seed` would only overwrite the function and never seed anything.)
rng = random.Random(1234)
sample = rng.sample(all_files, len(all_files))

for index, (feature_src, label_src) in enumerate(sample):
    # the first files go to training, the next ones to validation, and
    # everything that is left over ends up in the test set
    if index < train_file_count:
        split_folder = tiles_train_folder
    elif index < (train_file_count + val_file_count):
        split_folder = tiles_val_folder
    else:
        split_folder = tiles_test_folder

    # keep the original file names inside the split folders
    feature_dst = os.path.join(split_folder, 'features', os.path.basename(feature_src))
    label_dst   = os.path.join(split_folder, 'labels',   os.path.basename(label_src))

    print('Copying %s to %s' % (feature_src, feature_dst))
    shutil.copy(feature_src, feature_dst)
    print('Copying %s to %s' % (label_src, label_dst))
    shutil.copy(label_src, label_dst)

Copying /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_2018_seasons_10m_7x4bands_224px.64.tif to /content/drive/MyDrive/AI_WENR/data/tiles/temp/train/features/sent2_2018_seasons_10m_7x4bands_224px.64.tif
Copying /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/labels/parcels_2018_10m_coded_76_224px/parcels_2018_10m_coded_76_224px.64.tif to /content/drive/MyDrive/AI_WENR/data/tiles/temp/train/labels/parcels_2018_10m_coded_76_224px.64.tif
Copying /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/features/sent2_2018_seasons_10m_7x4bands_224px/sent2_2018_seasons_10m_7x4bands_224px.212.tif to /content/drive/MyDrive/AI_WENR/data/tiles/temp/train/features/sent2_2018_seasons_10m_7x4bands_224px.212.tif
Copying /content/drive/MyDrive/AI_WENR/data/tiles/collection_1/labels/parcels_2018_10m_coded_76_224px/parcels_2018_10m_coded_76_224px.212.tif to /content/drive/MyDrive/AI_WENR/data/tiles/temp/train/labels/parcels_2018_10m_cod

In [16]:
# verify numbers of features and label files
tif_pattern = '*.tif'

# list what was actually written to each of the split folders
train_feature_files = glob(os.path.join(tiles_train_folder, 'features', tif_pattern))
train_label_files   = glob(os.path.join(tiles_train_folder, 'labels', tif_pattern))
val_feature_files   = glob(os.path.join(tiles_val_folder, 'features', tif_pattern))
val_label_files     = glob(os.path.join(tiles_val_folder, 'labels', tif_pattern))
test_feature_files  = glob(os.path.join(tiles_test_folder, 'features', tif_pattern))
test_label_files    = glob(os.path.join(tiles_test_folder, 'labels', tif_pattern))

# report the counts per split; feature and label counts should be equal
train_counts = (len(train_feature_files), len(train_label_files))
val_counts   = (len(val_feature_files), len(val_label_files))
test_counts  = (len(test_feature_files), len(test_label_files))
print('Training   : found %d feature and %d label files' % train_counts)
print('Validation : Found %d feature and %d label files' % val_counts)
print('Test       : Found %d feature and %d label files' % test_counts)

# zip into a list of tuples, sort on names to get matching pairs (requires consistent file naming!)
train_set = [pair for pair in zip(sorted(train_feature_files), sorted(train_label_files))]
val_set   = [pair for pair in zip(sorted(val_feature_files), sorted(val_label_files))]
test_set  = [pair for pair in zip(sorted(test_feature_files), sorted(test_label_files))]

Training   : found 151 feature and 151 label files
Validation : Found 43 feature and 43 label files
Test       : Found 22 feature and 22 label files
