# Solar Dataset Preparation

Run this notebook only once.

Downloads the WorldView-3 15 cm HD and 30 cm native imagery dataset of solar panels for Germany.

Prepares the data to be used by YOLOv8.

## Import Libraries

In [None]:
import os
import shutil
from urllib.request import urlretrieve
import glob
from sklearn.model_selection import train_test_split
# The google.colab helpers are optional: when they cannot be imported the
# notebook keeps going and only reports that some libraries were skipped.
try:
    from google.colab import files
    from google.colab import drive
except ImportError:
    # Only a failed import is tolerated here. A bare "except:" would also hide
    # unrelated problems such as a keyboard interrupt.
    print("Some libraries were ignored.")

## User Inputs

In [None]:
# Root folder that receives the downloads and the prepared datasets
datasets_folder = "datasets"

# Spatial resolution of the imagery to prepare
#   "native" -> native 30 cm chips
#   "hd"     -> HD 15 cm chips
spatial_res = "native"

# Percentages assigned to (Train, Validation, Test); they must add up to 100
split = (50, 25, 25)

## Code

In [None]:
# Create new folder (if does not exist) to store the downloaded dataset
# makedirs also creates missing parent folders (e.g. "data/solar"), and
# exist_ok=True turns the call into a no-op when the folder is already there.
os.makedirs(datasets_folder, exist_ok=True)

In [None]:
# Fetch the image chips archive
# Be patient... a 3.85GB download, roughly 4 min on a 900Mbps connection
dataset_images_url = "https://maxar-marketing.s3.amazonaws.com/product-samples/Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip"
dataset_images_path = os.path.join(
    datasets_folder,
    "Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip",
)
urlretrieve(url=dataset_images_url, filename=dataset_images_path)

In [None]:
# Unzip dataset (images_chips)
# Quiet and overwrite
# Be patient... Takes around 2 min to unzip
!unzip -q -o {dataset_images_path} -d {datasets_folder}
os.remove(dataset_images_path)
os.remove(os.path.join(datasets_folder, "README_ImageChips.txt"))

In [None]:
# Fetch the labels archive
# Be patient... this one is quick
dataset_labels_url = "https://figshare.com/ndownloader/files/39255599"
dataset_labels_path = os.path.join(
    datasets_folder,
    "labels.zip",
)
urlretrieve(url=dataset_labels_url, filename=dataset_labels_path)

In [None]:
# Unzip dataset (labels)
# Quiet and overwrite
!unzip -q -o {dataset_labels_path} -d {datasets_folder}
os.remove(dataset_labels_path)

In [None]:
# Select folders based on spatial resolution
# Every supported resolution maps to its (images, labels) sub-folder names
resolution_subfolders = {
    "hd": ("image_chips_hd", "labels_hd"),
    "native": ("image_chips_native", "labels_native"),
}
# Fail loudly on an unsupported value (for example the typo "HD") instead of
# silently falling back to the native imagery.
if spatial_res not in resolution_subfolders:
    raise ValueError(
        'spatial_res must be "native" or "hd", got ' + repr(spatial_res)
    )
images_subfolder, labels_subfolder = resolution_subfolders[spatial_res]
images_folder = os.path.join(datasets_folder, "image_chips", images_subfolder)
labels_folder = os.path.join(datasets_folder, "labels", labels_subfolder)

In [None]:
# Check if number of images files match labels files
# glob returns paths in an arbitrary, file-system dependent order. Sorting
# makes both lists deterministic, so the seeded train/validation/test split
# that is computed from images_list picks the same files on every machine.
images_list = sorted(glob.glob(os.path.join(images_folder, '*')))
images_number = len(images_list)
print("Number of images files: " + str(images_number))

labels_list = sorted(glob.glob(os.path.join(labels_folder, '*')))
labels_number = len(labels_list)
print("Number of labels files: " + str(labels_number))

# Every image is expected to come with exactly one labels file
assert images_number == labels_number, "The number of files do not match."

In [None]:
# Save to a list only the names of all images
# splitext removes just the final extension, so a name that itself contains
# dots (e.g. "tile.v2.tif" -> "tile.v2") is kept whole and still matches the
# "<name>.tif" / "<name>.txt" files that are copied later.
images_names_list = [
    os.path.splitext(os.path.basename(image_path))[0]
    for image_path in images_list
]

In [None]:
# First carve the training share out of the full list of names;
# everything that is not training goes to "other"
other_fraction = (100 - split[0]) / 100
x_train, x_other = train_test_split(
    images_names_list,
    test_size=other_fraction,
    random_state=42,
)

# Then divide "other" between validation and test;
# the test share is expressed relative to the size of "other"
test_fraction = split[2] / (100 - split[0])
x_valid, x_test = train_test_split(
    x_other,
    test_size=test_fraction,
    random_state=42,
)

In [None]:
# Build a clean train/validation/test folder tree
for split_name in ("train", "validation", "test"):
    split_dir = os.path.join(datasets_folder, split_name)
    # Drop whatever a previous run left behind so every split starts out empty
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.mkdir(split_dir)
    # Each split holds its images and its labels side by side
    for subfolder in ("images", "labels"):
        os.mkdir(os.path.join(split_dir, subfolder))

In [None]:
# Helper that copies one image/label pair
def copy_images_labels_files(images_folder, labels_folder, dst_folder, file_name):
    """Copy the image and the labels file named *file_name* into *dst_folder*.

    The image "<file_name>.tif" is read from *images_folder* and written to
    "<dst_folder>/images"; the labels file "<file_name>.txt" is read from
    *labels_folder* and written to "<dst_folder>/labels". Both destination
    sub-folders must already exist.
    """
    # (source folder, destination sub-folder, file extension); image goes first
    transfers = (
        (images_folder, "images", ".tif"),
        (labels_folder, "labels", ".txt"),
    )
    for src_folder, dst_subfolder, extension in transfers:
        base_name = file_name + extension
        shutil.copy(
            os.path.join(src_folder, base_name),
            os.path.join(dst_folder, dst_subfolder, base_name),
        )


In [None]:
# Distribute every image/label pair into the folder of its split
train_folder = os.path.join(datasets_folder, "train")
valid_folder = os.path.join(datasets_folder, "validation")
test_folder = os.path.join(datasets_folder, "test")

split_assignments = (
    (train_folder, x_train),  # Train
    (valid_folder, x_valid),  # Validation
    (test_folder, x_test),    # Test
)
for split_folder, split_file_names in split_assignments:
    for file_name in split_file_names:
        copy_images_labels_files(images_folder, labels_folder, split_folder, file_name)

In [None]:
# Verify that the copied splits hold as many images as labels
# Images: one count per split, then the total
train_images_number, valid_images_number, test_images_number = (
    len(glob.glob(os.path.join(datasets_folder, split_name, "images", "*")))
    for split_name in ("train", "validation", "test")
)
images_number = sum((train_images_number, valid_images_number, test_images_number))
print(f"Number of images files: {images_number}")

# Labels: one count per split, then the total
train_labels_number, valid_labels_number, test_labels_number = (
    len(glob.glob(os.path.join(datasets_folder, split_name, "labels", "*")))
    for split_name in ("train", "validation", "test")
)
labels_number = sum((train_labels_number, valid_labels_number, test_labels_number))
print(f"Number of labels files: {labels_number}")

assert images_number == labels_number, "The number of files do not match."

In [None]:
# Each label line starts with the confidence category of a solar panel object:
# 0 = high confidence, 1 = moderate confidence, 2 = low confidence.
# Rewrite every labels file so that only the high confidence (0) objects remain.
for labels_dir_name in ["train/labels", "validation/labels", "test/labels"]:
    for label_path in glob.glob(os.path.join(datasets_folder, labels_dir_name, "*")):
        # Load the whole file first; it is reopened for writing right after
        with open(label_path, 'r') as reader:
            original_lines = reader.readlines()

        # Keep only the lines whose first space-separated field is "0"
        kept_lines = []
        for entry in original_lines:
            if entry.strip().partition(' ')[0] == '0':
                kept_lines.append(entry)

        # Overwrite the file with the surviving lines
        with open(label_path, 'w') as writer:
            writer.writelines(kept_lines)

In [None]:
# Zip final datasets
# Be patient... Takes around 2 min to zip
# shutil.make_archive writes "<folder>.zip" next to each split folder, with the
# folder's content (images/ and labels/) at the root of the archive. Unlike
# "zip -r" on an existing archive, it starts from a fresh file, so a re-run
# leaves no stale entries behind. It also needs no external zip binary and
# copes with folder names that contain spaces.
for split_folder in (train_folder, valid_folder, test_folder):
    shutil.make_archive(split_folder, "zip", root_dir=split_folder)

In [None]:
# Save the zip datasets on your drive
try:
    drive.mount("/content/drive")
    save_driver_folder = "/content/drive/MyDrive/datasets"
    if not os.path.exists(save_driver_folder):
        os.mkdir(save_driver_folder)
    # Train
    !cp {train_folder+".zip"} {save_driver_folder}

    # Validation
    !cp {valid_folder+".zip"} {save_driver_folder}

    # Test
    !cp {test_folder+".zip"} {save_driver_folder}
except:
    print("Saving to drive ignored.")

In [None]:
# Download the final datasets to your local machine
# Best effort: if anything in this step raises (for instance "files" is not
# defined because the google.colab import failed), the step is skipped.
try:
    # Train
    files.download(train_folder+".zip")

    # Validation
    files.download(valid_folder+".zip")

    # Test
    files.download(test_folder+".zip")
except Exception:
    # "except Exception" keeps the step optional but, unlike a bare "except:",
    # still lets a keyboard interrupt stop the notebook.
    print("Download to local machine ignored.")

In [None]:
# Delete remaining intermediate files and folders
# The paths are built from datasets_folder so the cleanup follows the folder
# chosen in the user inputs instead of a hard-coded "datasets" directory.
for leftover_folder in ("image_chips", "labels"):
    leftover_path = os.path.join(datasets_folder, leftover_folder)
    if os.path.exists(leftover_path):
        shutil.rmtree(leftover_path)

# The images archive may still be around if the unzip step was skipped
leftover_zip = os.path.join(datasets_folder, "Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip")
if os.path.exists(leftover_zip):
    os.remove(leftover_zip)
