# Convert images to PNG and split into train, validation, and test sets

In [1]:
import pathlib
import cv2
from tqdm import tqdm
import random
import shutil
import numpy as np

## Data folders

In [2]:
def create_clean_folder(path):
    """Make sure ``path`` exists as a directory with no previous contents.

    Any existing directory tree at ``path`` is deleted (errors during the
    deletion are ignored) and the directory is then created again together
    with any missing parent directories.

    Args:
        path: Folder location, as a string or ``pathlib.Path``.
    """
    folder = pathlib.Path(path)
    if folder.exists():
        # Best-effort wipe of whatever an earlier run left behind.
        shutil.rmtree(folder, ignore_errors=True)
    folder.mkdir(parents=True, exist_ok=True)

# Root folders of the RID data, relative to the notebook's working directory.
data_path = pathlib.Path('../../data/')
rid_path = data_path / 'RID'
input_path = rid_path / 'input'
output_path = rid_path / 'output'
tmp_path = output_path / 'tmp'

# The raw dataset is taken to be the first sub-folder of tmp_path
# (presumably the extracted archive - TODO confirm). Only directories are
# considered and they are sorted, so the choice does not depend on the
# filesystem listing order, and a missing dataset gives a clear error instead
# of a bare IndexError.
dataset_tmp_candidates = sorted(p for p in tmp_path.glob('*') if p.is_dir())
if not dataset_tmp_candidates:
    raise FileNotFoundError(
        'No dataset folder found in {}'.format(tmp_path))
dataset_tmp_path = dataset_tmp_candidates[0]

# Raw data paths
# NOTE(review): split_tmp_path is not used by the cells visible in this
# notebook; the split below is drawn randomly instead.
split_tmp_path = dataset_tmp_path / 'filenames_train_val_test_split'
images_tmp_path = dataset_tmp_path / 'images_roof_centered_geotiff'
segments_tmp_path = dataset_tmp_path / 'masks_segments_reviewed'
superstructures_tmp_path = dataset_tmp_path / 'masks_superstructures_reviewed'

# Rooftop segmentation processed and split data paths
segment_dataset_path = output_path / segments_tmp_path.name

# One folder per split ...
segment_train_path = segment_dataset_path / 'train'
segment_val_path = segment_dataset_path / 'val'
segment_test_path = segment_dataset_path / 'test'

# ... each holding an 'image' and a 'label' sub-folder.
segment_train_image_path = segment_train_path / 'image'
segment_train_label_path = segment_train_path / 'label'
segment_val_image_path = segment_val_path / 'image'
segment_val_label_path = segment_val_path / 'label'
segment_test_image_path = segment_test_path / 'image'
segment_test_label_path = segment_test_path / 'label'

# Every output folder is wiped and recreated so each run starts empty.
for segment_folder in (
    segment_train_image_path,
    segment_train_label_path,
    segment_val_image_path,
    segment_val_label_path,
    segment_test_image_path,
    segment_test_label_path,
):
    create_clean_folder(segment_folder)

# Superstructure segmentation processed and split data paths
superstructures_dataset_path = output_path / superstructures_tmp_path.name

# One folder per split ...
superstructures_train_path = superstructures_dataset_path / 'train'
superstructures_val_path = superstructures_dataset_path / 'val'
superstructures_test_path = superstructures_dataset_path / 'test'

# ... each holding an 'image' and a 'label' sub-folder.
superstructures_train_image_path = superstructures_train_path / 'image'
superstructures_train_label_path = superstructures_train_path / 'label'
superstructures_val_image_path = superstructures_val_path / 'image'
superstructures_val_label_path = superstructures_val_path / 'label'
superstructures_test_image_path = superstructures_test_path / 'image'
superstructures_test_label_path = superstructures_test_path / 'label'

# Every output folder is wiped and recreated so each run starts empty.
for superstructures_folder in (
    superstructures_train_image_path,
    superstructures_train_label_path,
    superstructures_val_image_path,
    superstructures_val_label_path,
    superstructures_test_image_path,
    superstructures_test_label_path,
):
    create_clean_folder(superstructures_folder)

## Random data split

In [3]:
# Collect the sample names (file stems of the .tif images). Sorting gives the
# shuffle below the same starting sequence regardless of the order in which
# the filesystem lists the files.
path_list = sorted(images_tmp_path.glob('*.tif'))
name_list = [path.stem for path in path_list]
if not name_list:
    raise FileNotFoundError(
        'No .tif images found in {}'.format(images_tmp_path))

# Shuffle with a dedicated, seeded generator so the split is reproducible
# across kernel restarts and does not touch the global `random` state.
split_seed = 42
random.Random(split_seed).shuffle(name_list)

train_size = 0.8
val_size = 0.1
# Remainder of the data; implied by the two cut points below.
test_size = 1 - (train_size + val_size)

# Cut points: [0, train_end) -> train, [train_end, val_end) -> val, rest -> test.
train_end = int(len(name_list) * train_size)
val_end = int(len(name_list) * (train_size + val_size))
train_names, val_names, test_names = np.split(name_list, [train_end, val_end])

## Rooftop training data

In [4]:
# Convert every training sample: read the .tif image and its segment mask,
# then write both as PNG files under the rooftop train folders.
for name in tqdm(train_names):
    img_input_path = images_tmp_path / (name + '.tif')
    mask_input_path = segments_tmp_path / (name + '.png')

    # cv2.imread returns None instead of raising when a file cannot be read.
    img = cv2.imread(str(img_input_path))
    if img is None:
        raise FileNotFoundError(
            'Could not read image: {}'.format(img_input_path))
    mask = cv2.imread(str(mask_input_path))
    if mask is None:
        raise FileNotFoundError(
            'Could not read mask: {}'.format(mask_input_path))

    output_name = name + '.png'
    img_output_path = segment_train_image_path / output_name
    mask_output_path = segment_train_label_path / output_name

    # cv2.imwrite reports a failed write through its boolean return value.
    if not cv2.imwrite(str(img_output_path), img):
        raise OSError('Could not write image: {}'.format(img_output_path))
    if not cv2.imwrite(str(mask_output_path), mask):
        raise OSError('Could not write mask: {}'.format(mask_output_path))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1504/1504 [01:05<00:00, 22.79it/s]


## Rooftop validation data

In [5]:
# Convert every validation sample: read the .tif image and its segment mask,
# then write both as PNG files under the rooftop val folders.
for name in tqdm(val_names):
    img_input_path = images_tmp_path / (name + '.tif')
    mask_input_path = segments_tmp_path / (name + '.png')

    # cv2.imread returns None instead of raising when a file cannot be read.
    img = cv2.imread(str(img_input_path))
    if img is None:
        raise FileNotFoundError(
            'Could not read image: {}'.format(img_input_path))
    mask = cv2.imread(str(mask_input_path))
    if mask is None:
        raise FileNotFoundError(
            'Could not read mask: {}'.format(mask_input_path))

    output_name = name + '.png'
    img_output_path = segment_val_image_path / output_name
    mask_output_path = segment_val_label_path / output_name

    # cv2.imwrite reports a failed write through its boolean return value.
    if not cv2.imwrite(str(img_output_path), img):
        raise OSError('Could not write image: {}'.format(img_output_path))
    if not cv2.imwrite(str(mask_output_path), mask):
        raise OSError('Could not write mask: {}'.format(mask_output_path))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 188/188 [00:14<00:00, 12.58it/s]


## Rooftop test data

In [6]:
# Convert every test sample: read the .tif image and its segment mask,
# then write both as PNG files under the rooftop test folders.
for name in tqdm(test_names):
    img_input_path = images_tmp_path / (name + '.tif')
    mask_input_path = segments_tmp_path / (name + '.png')

    # cv2.imread returns None instead of raising when a file cannot be read.
    img = cv2.imread(str(img_input_path))
    if img is None:
        raise FileNotFoundError(
            'Could not read image: {}'.format(img_input_path))
    mask = cv2.imread(str(mask_input_path))
    if mask is None:
        raise FileNotFoundError(
            'Could not read mask: {}'.format(mask_input_path))

    output_name = name + '.png'
    img_output_path = segment_test_image_path / output_name
    mask_output_path = segment_test_label_path / output_name

    # cv2.imwrite reports a failed write through its boolean return value.
    if not cv2.imwrite(str(img_output_path), img):
        raise OSError('Could not write image: {}'.format(img_output_path))
    if not cv2.imwrite(str(mask_output_path), mask):
        raise OSError('Could not write mask: {}'.format(mask_output_path))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 188/188 [00:21<00:00,  8.94it/s]


## Superstructure training data

In [7]:
# Labels:
# ['pvmodule', 'dormer', 'window', 'ladder', 'chimney', 'shadow', 'tree', 'unknown', 'background']

# Class ids grouped for the optional remapping below (currently disabled).
obstacle_classes = [1, 2, 3, 4, 5, 6, 7]
pv_class = 0
background_class = 8

# Convert every training sample: read the .tif image and its superstructure
# mask, then write both as PNG files under the superstructure train folders.
for name in tqdm(train_names):
    img_input_path = images_tmp_path / (name + '.tif')
    mask_input_path = superstructures_tmp_path / (name + '.png')

    # cv2.imread returns None instead of raising when a file cannot be read.
    img = cv2.imread(str(img_input_path))
    if img is None:
        raise FileNotFoundError(
            'Could not read image: {}'.format(img_input_path))
    mask = cv2.imread(str(mask_input_path))
    if mask is None:
        raise FileNotFoundError(
            'Could not read mask: {}'.format(mask_input_path))

    # Disabled remapping to obstacle / pv / background ids; while these lines
    # stay commented out the mask is written exactly as it was read.
    #mask[(mask >= min(obstacle_classes)) & (mask <= max(obstacle_classes))] = 2
    #mask[mask == pv_class] = 1
    #mask[mask == background_class] = 0

    output_name = name + '.png'
    img_output_path = superstructures_train_image_path / output_name
    mask_output_path = superstructures_train_label_path / output_name

    # cv2.imwrite reports a failed write through its boolean return value.
    if not cv2.imwrite(str(img_output_path), img):
        raise OSError('Could not write image: {}'.format(img_output_path))
    if not cv2.imwrite(str(mask_output_path), mask):
        raise OSError('Could not write mask: {}'.format(mask_output_path))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1504/1504 [01:06<00:00, 22.55it/s]


## Superstructure validation data

In [8]:
# Convert every validation sample: read the .tif image and its superstructure
# mask, then write both as PNG files under the superstructure val folders.
for name in tqdm(val_names):
    img_input_path = images_tmp_path / (name + '.tif')
    mask_input_path = superstructures_tmp_path / (name + '.png')

    # cv2.imread returns None instead of raising when a file cannot be read.
    img = cv2.imread(str(img_input_path))
    if img is None:
        raise FileNotFoundError(
            'Could not read image: {}'.format(img_input_path))
    mask = cv2.imread(str(mask_input_path))
    if mask is None:
        raise FileNotFoundError(
            'Could not read mask: {}'.format(mask_input_path))

    # Disabled remapping to obstacle / pv / background ids; while these lines
    # stay commented out the mask is written exactly as it was read.
    #mask[(mask >= min(obstacle_classes)) & (mask <= max(obstacle_classes))] = 2
    #mask[mask == pv_class] = 1
    #mask[mask == background_class] = 0

    output_name = name + '.png'
    img_output_path = superstructures_val_image_path / output_name
    mask_output_path = superstructures_val_label_path / output_name

    # cv2.imwrite reports a failed write through its boolean return value.
    if not cv2.imwrite(str(img_output_path), img):
        raise OSError('Could not write image: {}'.format(img_output_path))
    if not cv2.imwrite(str(mask_output_path), mask):
        raise OSError('Could not write mask: {}'.format(mask_output_path))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 188/188 [00:04<00:00, 43.04it/s]


## Superstructure test data

In [9]:
# Convert every test sample: read the .tif image and its superstructure
# mask, then write both as PNG files under the superstructure test folders.
for name in tqdm(test_names):
    img_input_path = images_tmp_path / (name + '.tif')
    mask_input_path = superstructures_tmp_path / (name + '.png')

    # cv2.imread returns None instead of raising when a file cannot be read.
    img = cv2.imread(str(img_input_path))
    if img is None:
        raise FileNotFoundError(
            'Could not read image: {}'.format(img_input_path))
    mask = cv2.imread(str(mask_input_path))
    if mask is None:
        raise FileNotFoundError(
            'Could not read mask: {}'.format(mask_input_path))

    # Disabled remapping to obstacle / pv / background ids; while these lines
    # stay commented out the mask is written exactly as it was read.
    #mask[(mask >= min(obstacle_classes)) & (mask <= max(obstacle_classes))] = 2
    #mask[mask == pv_class] = 1
    #mask[mask == background_class] = 0

    output_name = name + '.png'
    img_output_path = superstructures_test_image_path / output_name
    mask_output_path = superstructures_test_label_path / output_name

    # cv2.imwrite reports a failed write through its boolean return value.
    if not cv2.imwrite(str(img_output_path), img):
        raise OSError('Could not write image: {}'.format(img_output_path))
    if not cv2.imwrite(str(mask_output_path), mask):
        raise OSError('Could not write mask: {}'.format(mask_output_path))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 188/188 [00:05<00:00, 34.91it/s]


## Removing tmp folder

In [10]:
# Delete the temporary raw-data folder. The existence check keeps this cell
# re-runnable: shutil.rmtree raises FileNotFoundError when the folder is
# already gone.
print('Removing tmp folder...')
if tmp_path.exists():
    shutil.rmtree(tmp_path)
print('Finished!')

Removing tmp folder...
Finished!
