# Introduction

Author: Austin Dibble

A short notebook used to build a subset of the full OMS2CD dataset, keeping only the image pairs that have been fully annotated (i.e. that have a mask file), and to create train/val/test splits for that subset.

# Load OMS2CD From Drive

In [None]:
# Mount Google Drive at /content/drive so the archives stored there can be read and written below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
import zipfile

# Copy the full archive from Drive onto the local disk, then unpack it into /content/.
drive_archive_path = '/content/drive/MyDrive/2023_dissertation/dataset_archives/OMS2CD_full.zip'
local_archive_path = '/content/OMS2CD.zip'
shutil.copy(drive_archive_path, local_archive_path)

with zipfile.ZipFile(local_archive_path, 'r') as archive:
    archive.extractall('/content/')

# Download Git Repo

In [None]:
# Clone the project repository and add its directory to the import path.
# NOTE(review): `$key` is not defined anywhere in this notebook -- presumably an access
# token set in a cell that was removed; confirm, and avoid embedding credentials in the clone URL.
!git clone https://dibz15:$key@github.com/Dibz15/OpenMineChangeDetection.git
import sys
sys.path.append('/content/OpenMineChangeDetection/')

Cloning into 'OpenMineChangeDetection'...
remote: Enumerating objects: 256, done.
remote: Counting objects: 100% (256/256), done.
remote: Compressing objects: 100% (115/115), done.
remote: Total 256 (delta 138), reused 255 (delta 137), pack-reused 0
Receiving objects: 100% (256/256), 6.93 MiB | 22.97 MiB/s, done.
Resolving deltas: 100% (138/138), done.


# Filter the Dataset for Annotated CD Files

Using the mapping file, we look at each change-detection (CD) triplet (imageA, imageB, and mask). If a triplet doesn't have a mask file, its images are removed from the dataset, unless they are also used by another triplet that does have a mask.

In [None]:
import os
import csv
import shutil
import re

source_dir = '/content/OMS2CD'  # root of the extracted dataset
mapping_file_path = os.path.join(source_dir, 'mapping.csv')
mask_dir = os.path.join(source_dir, 'mask')  # directory that holds the mask files
destination_dir = '/content/filtered_files'  # created below; not otherwise used in this cell

# Create the destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# good_files: images referenced by at least one mapping row whose mask file exists.
# all_files:  every image referenced by the mapping file.
good_files = set()
all_files = set()
with open(mapping_file_path, 'r', newline='') as mapping_file:
    for row in csv.DictReader(mapping_file):
        # Named pair_id (not `id`) so the built-in id() is not shadowed for later cells.
        pair_id = row['id']
        imageA = row['imageA']
        imageB = row['imageB']

        # Drop only a trailing '.tif' extension; a plain str.replace would also
        # remove '.tif' if it happened to occur elsewhere in the name.
        imageA_stem = imageA[:-len('.tif')] if imageA.endswith('.tif') else imageA

        # The name is split as '<prefix>_<rest>_<last part>'. The facility is the
        # leading portion of <rest>, up to the first digit or hyphen.
        prefix_and_rest, _last_part = imageA_stem.rsplit('_', 1)
        _prefix, rest = prefix_and_rest.split('_', 1)
        facility = re.split(r'[\d-]', rest)[0].strip('_')
        mask_file = f'{facility}_{pair_id}.tif'

        # A pair counts as annotated only when its mask file exists.
        if os.path.exists(os.path.join(mask_dir, mask_file)):
            good_files.update((imageA, imageB))
        else:
            print(f'No mask file found for {facility} with id {pair_id}')

        all_files.update((imageA, imageB))

# The mapping file is closed at this point. Delete every image that no annotated
# pair uses; sorting makes the log order repeatable between runs.
for image in sorted(all_files - good_files):
    try:
        os.remove(os.path.join(source_dir, image))
    except FileNotFoundError:
        # Report instead of claiming a removal that did not happen.
        print(f'Already missing, nothing to remove: {image}')
    else:
        print(f'Removed {image}')

No mask file found for Guizhou_Xiuwen with id 0022
No mask file found for Guizhou_Xiuwen with id 0026
No mask file found for Guizhou_Xiuwen with id 0027
No mask file found for Guizhou_Xiuwen with id 0028
No mask file found for Gunnedah_leard with id 0029
No mask file found for Gunnedah_leard with id 0032
No mask file found for Gunnedah_leard with id 0033
No mask file found for Gunnedah_leard with id 0034
No mask file found for Gunnedah_leard with id 0035
No mask file found for Gunnedah_leard with id 0036
No mask file found for Gunnedah_leard with id 0037
No mask file found for Gunnedah_leard with id 0038
No mask file found for Gunnedah_leard with id 0039
No mask file found for Gunnedah_leard with id 0040
No mask file found for Gunnedah_leard with id 0041
No mask file found for Gunnedah_leard with id 0042
No mask file found for Gunnedah_leard with id 0043
No mask file found for Gunnedah_leard with id 0044
No mask file found for Gunnedah_leard with id 0045
No mask file found for Gunnedah

# Fix RowsPerStrip with GDAL

In [None]:
# Install the GDAL command-line tools (gdal_translate is called below), the GDAL
# development package, and the Python bindings.
# NOTE(review): `pip install gdal` is unpinned; the bindings presumably need to match
# the libgdal version installed by apt -- confirm.
!apt update
!apt install -y gdal-bin libgdal-dev
!pip install gdal

In [None]:
import os
import subprocess
from shutil import copy2

def set_rows_per_strip(file_path, new_file_path, rows_per_strip):
    """Rewrite a raster with ``gdal_translate`` using the given block height.

    Runs ``gdal_translate -co TILED=YES -co BLOCKYSIZE=<rows_per_strip>`` to
    convert ``file_path`` into ``new_file_path``.

    Args:
        file_path: Path of the input raster.
        new_file_path: Path the converted raster is written to.
        rows_per_strip: Value passed as the ``BLOCKYSIZE`` creation option.

    Returns:
        The exit code of the ``gdal_translate`` process.
    """
    command = [
        'gdal_translate',
        '-co', 'TILED=YES',
        '-co', f'BLOCKYSIZE={rows_per_strip}',
        file_path,
        new_file_path,
    ]
    completed = subprocess.run(command)
    return completed.returncode

def update_tiffs_in_directory(src_directory, dst_directory, rows_per_strip):
    """Mirror ``src_directory`` into ``dst_directory``, converting every ``.tif``.

    The directory layout is reproduced under ``dst_directory``. Files ending in
    ``.tif`` are written through ``set_rows_per_strip``; every other file is
    copied unchanged with ``copy2``.

    Args:
        src_directory: Root of the tree to read from.
        dst_directory: Root of the tree to write to (created if missing).
        rows_per_strip: Forwarded to ``set_rows_per_strip`` for each ``.tif``.

    Returns:
        None. A message is printed for each ``.tif`` whose conversion reports a
        non-zero exit code.
    """
    os.makedirs(dst_directory, exist_ok=True)

    for root, dirs, files in os.walk(src_directory):
        # Map the current source folder onto the destination tree through its
        # path relative to the source root. A plain str.replace would also
        # rewrite later occurrences of the source path (e.g. 'data/data' would
        # become 'out/out'), sending files to a folder that does not exist.
        rel_root = os.path.relpath(root, src_directory)
        if rel_root == os.curdir:
            dst_root = dst_directory
        else:
            dst_root = os.path.join(dst_directory, rel_root)
        os.makedirs(dst_root, exist_ok=True)

        # Create sub-folders up front so empty directories are mirrored too.
        for dir_name in dirs:
            os.makedirs(os.path.join(dst_root, dir_name), exist_ok=True)

        for file_name in files:
            old_file_path = os.path.join(root, file_name)
            new_file_path = os.path.join(dst_root, file_name)

            if file_name.endswith('.tif'):
                return_code = set_rows_per_strip(old_file_path, new_file_path, rows_per_strip)
                # Surface failed conversions instead of silently skipping them.
                if return_code != 0:
                    print(f'gdal_translate failed (exit code {return_code}) for {old_file_path}')
            else:
                copy2(old_file_path, new_file_path)

# Convert the extracted dataset into a separate output tree, leaving the source untouched.
src_directory, dst_directory = '/content/OMS2CD', '/content/OMS2CD_rows'
rows_per_strip = 256  # block height handed to gdal_translate
update_tiffs_in_directory(
    src_directory=src_directory,
    dst_directory=dst_directory,
    rows_per_strip=rows_per_strip,
)


# Create Train/Val/Test Splits by No. of Chips

In [None]:
%%capture
# Install the Python packages needed by the cells below; %%capture hides the pip output.
# NOTE(review): only tiler is version-pinned; consider pinning the others for reproducibility.
!pip install rasterio
!pip install torchgeo
!pip install tiler==0.5.7
!pip install kornia

In [None]:
import tempfile
import os
import torch
import random
import numpy as np

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator with the same fixed value so that the
# random steps in this notebook are repeatable.
seed = 543
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [None]:
# Build the dataset over all samples (split='all') with area masks requested.
# NOTE(review): root='OMS2CD' is a relative path -- presumably resolved against /content,
# where the archive was extracted; confirm the working directory.
from OpenMineChangeDetection.datasets import OMS2CD
dataset = OMS2CD(root='OMS2CD', load_area_mask=True, split='all')

In [None]:
def get_num_chips(dataset, file_index):
    """Return how many tiles the dataset's tiler produces for one file.

    Args:
        dataset: Object exposing ``_get_tiler(file_index)``, which returns a
            pair whose first element is the tiler.
        file_index: Index of the file to query.

    Returns:
        The product of the first three entries of the tiler's mosaic shape
        (requested with the channel dimension included).
    """
    tiler = dataset._get_tiler(file_index)[0]
    mosaic_shape = tiler.get_mosaic_shape(with_channel_dim=True)
    return mosaic_shape[0] * mosaic_shape[1] * mosaic_shape[2]

def is_in_aoi(dataset, index):
    """Tell whether a sample's area mask covers at least 1% of its pixels.

    Args:
        dataset: Indexable collection whose items contain an ``'area_mask'``
            entry supporting ``sum()`` and ``numel()``.
        index: Index of the sample to check.

    Returns:
        The result of comparing the mask sum against 1% of the mask's element
        count (truthy when the sum reaches that threshold).
    """
    area_mask = dataset[index]['area_mask']
    minimum_pixels = 0.01 * area_mask.numel()
    return area_mask.sum() >= minimum_pixels

In [None]:
import os
import random
from collections import defaultdict
from typing import List, Tuple

# Target share of in-AOI chips for the train and validation splits; whatever
# does not fit into either goes to the test split.
TRAIN_FRACTION = 0.7
VAL_FRACTION = 0.2

# One entry per sample; the first element of each entry is the name that
# carries the geolocation.
triplets = dataset.file_list

# Group the entries by geolocation so that a location is never divided
# between two splits.
triplets_by_geolocation = defaultdict(list)
for i, triplet in enumerate(triplets):
    # split the first element of the triplet by underscores
    split_name = triplet[0].split("_")
    # the geolocation is everything except the first and the last part
    geolocation = "_".join(split_name[1:-1])
    triplets_by_geolocation[geolocation].append((i, triplet))

# Each chunk is the list of (index, triplet) tuples for one geolocation.
chunks = list(triplets_by_geolocation.values())
random.shuffle(chunks)

# Count the in-AOI chips of every chunk exactly once. Each count loads the
# sample from the dataset, so computing the total from these per-chunk values
# avoids reading every sample a second time.
chips_per_chunk = [
    sum(get_num_chips(dataset, index) for index, _ in chunk if is_in_aoi(dataset, index))
    for chunk in chunks
]
total_chips = sum(chips_per_chunk)

# Initialize empty lists for each dataset
train_data, val_data, test_data = [], [], []

# Initialize a counter for the number of chips in each dataset
num_train_chips, num_val_chips, num_test_chips = 0, 0, 0

# Greedy assignment in shuffled order: a chunk goes to train while it still
# fits the train budget, otherwise to val while it fits the val budget,
# otherwise to test.
for chunk, chunk_chips in zip(chunks, chips_per_chunk):
    if num_train_chips + chunk_chips <= TRAIN_FRACTION * total_chips:
        train_data.extend(triplet for index, triplet in chunk)
        num_train_chips += chunk_chips
    elif num_val_chips + chunk_chips <= VAL_FRACTION * total_chips:
        val_data.extend(triplet for index, triplet in chunk)
        num_val_chips += chunk_chips
    else:
        test_data.extend(triplet for index, triplet in chunk)
        num_test_chips += chunk_chips

# Number of samples per split, then number of in-AOI chips per split.
print(len(train_data))
print(len(val_data))
print(len(test_data))
print()
print(num_train_chips)
print(num_val_chips)
print(num_test_chips)

## Create sets of facility names and write out to .csv files

In [None]:
def get_facility(path):
    """Return the file name of ``path`` without its directory and final extension."""
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem

# For every split, collect the unique file stems taken from element 3 of each entry.
train_set = {get_facility(entry[3]) for entry in train_data}
val_set = {get_facility(entry[3]) for entry in val_data}
test_set = {get_facility(entry[3]) for entry in test_data}

In [None]:
# Show the facility names of the train, validation and test splits, in that order.
for facility_names in (train_set, val_set, test_set):
    print(facility_names)

In [None]:
import csv

def write_set_file(subset, file):
    """Write a collection of facility names to a one-column CSV file.

    The file starts with a ``facility`` header row, followed by one row per
    name. Names are written in sorted order so that the same subset always
    produces the same file (plain set iteration order can differ between runs).

    Args:
        subset: Iterable of facility names.
        file: Path of the CSV file to create or overwrite.
    """
    with open(file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['facility'])  # Write the header
        writer.writerows([facility] for facility in sorted(subset))

# Write one facility list per split into the dataset directory.
split_outputs = [
    (train_set, 'OMS2CD/train.csv'),
    (val_set, 'OMS2CD/val.csv'),
    (test_set, 'OMS2CD/test.csv'),
]
for facilities, csv_path in split_outputs:
    write_set_file(facilities, csv_path)

# Compress Dataset and save to Drive

In [None]:
import zipfile

directory_to_compress = '/content/OMS2CD/'
zip_filename = '/content/OMS2CD_subset.zip'

# Add every file below the dataset directory to a deflate-compressed archive.
# Entries are stored under their path relative to that directory, so the
# archive has no top-level OMS2CD/ folder.
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subfolders, file_names in os.walk(directory_to_compress):
        for file_name in file_names:
            absolute_path = os.path.join(folder, file_name)
            archive_name = os.path.relpath(absolute_path, directory_to_compress)
            archive.write(absolute_path, archive_name)

In [None]:
# Copy the zip archive from the Colab runtime to Google Drive so it persists.
destination_colab = '/content/drive/MyDrive/2023_dissertation/dataset_archives/OMS2CD_subset.zip'
shutil.copyfile(src=zip_filename, dst=destination_colab)

'/content/drive/MyDrive/2023_dissertation/dataset_archives/OMS2CD_subset.zip'