# Solar Dataset Preparation

Run only one time.

Downloads the WorldView-3 15 cm HD and 30 cm native imagery dataset of solar panels for Germany.

Prepares the data to be used by YOLOv8.

## Import Libraries

In [1]:
import os
import shutil
from urllib.request import urlretrieve
import zipfile
import glob
from sklearn.model_selection import train_test_split
from google.colab import files # Ignore if not using Colab


## User Inputs

In [2]:
# Datasets folder
# Folder where all the datasets will be saved
datasets_folder = "datasets"

# Dataset spatial resolution
# "native" - Native 30 cm
# "hd" - HD 15 cm
spatial_res = "native"

# Split
# Separate the dataset into Train, Validation and Test (total is 100%)
split = (50, 25, 25)

## Code

In [3]:
# Create a brand new folder to store the downloaded dataset
if not os.path.exists(datasets_folder):
  os.mkdir(datasets_folder)
else:
  shutil.rmtree(datasets_folder)
  os.mkdir(datasets_folder)

In [4]:
# Download dataset (images_chips)
# Be patient...
dataset_images_url = "https://maxar-marketing.s3.amazonaws.com/product-samples/Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip"
dataset_images_path = os.path.join(datasets_folder, "Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip")
urlretrieve(dataset_images_url, dataset_images_path)

('datasets/Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip',
 <http.client.HTTPMessage at 0x7fa90b47fc10>)

In [5]:
# Unzip dataset (images_chips)
!unzip {dataset_images_path} -d {datasets_folder}
os.remove(dataset_images_path)
os.remove(os.path.join(datasets_folder, "README_ImageChips.txt"))

Archive:  datasets/Maxar_HD_and_Native_Solar_Panel_Image_Chips.zip
   creating: datasets/image_chips/
   creating: datasets/image_chips/image_chips_hd/
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_3__x0_10633_y0_2873_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_1__x0_3245_y0_19601_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_3__x0_27454_y0_31936_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_3__x0_12051_y0_21940_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_3__x0_13934_y0_19354_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_2__x0_2131_y0_16854_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_3__x0_5153_y0_26076_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/solarpanels_hd_1__x0_2967_y0_19442_dxdy_832.tif  
  inflating: datasets/image_chips/image_chips_hd/sola

In [6]:
# Download dataset (labels)
# Be patient...
dataset_labels_url = "https://figshare.com/ndownloader/files/39255599"
dataset_labels_path = os.path.join(datasets_folder, "labels.zip")
urlretrieve(dataset_labels_url, dataset_labels_path)

('datasets/labels.zip', <http.client.HTTPMessage at 0x7fa90b4d3100>)

In [7]:
# Unzip dataset (labels)
!unzip {dataset_labels_path} -d {datasets_folder}
os.remove(dataset_labels_path)

Archive:  datasets/labels.zip
   creating: datasets/labels/
   creating: datasets/labels/labels_native/
  inflating: datasets/labels/labels_native/solarpanels_native_2__x0_2295_y0_4490_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_2__x0_6880_y0_9736_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_3__x0_12216_y0_11158_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_3__x0_5546_y0_1019_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_1__x0_1663_y0_7088_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_3__x0_9481_y0_15613_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_1__x0_9783_y0_10933_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_3__x0_1274_y0_13650_dxdy_416.txt  
  inflating: datasets/labels/labels_native/solarpanels_native_2__x0_4715_y0_5526_dxdy_416.txt  
  inflating: datasets/label

In [8]:
# Select folders based on spatial resolution
images_folder = os.path.join(datasets_folder, "image_chips")
labels_folder = os.path.join(datasets_folder, "labels")
if spatial_res == "hd":
  images_folder = os.path.join(images_folder, "image_chips_hd")
  labels_folder = os.path.join(labels_folder, "labels_hd")
else:
  images_folder = os.path.join(images_folder, "image_chips_native")
  labels_folder = os.path.join(labels_folder, "labels_native")

In [9]:
# Check if number of images files match labels files
images_list = glob.glob(os.path.join(images_folder, '*'))
images_number = len(images_list)
print("Number of images files: " + str(images_number))

labels_list = glob.glob(os.path.join(labels_folder, '*'))
labels_number = len(labels_list)
print("Number of labels files: " + str(labels_number))
assert images_number == labels_number, "The number of files do not match."

Number of images files: 2542
Number of labels files: 2542


In [10]:
# Save to a list only the names of all images
images_names_list = []
for image_path in images_list:
    images_names_list.append(os.path.basename(image_path).split('.')[0])

In [11]:
# Split into Train and Other
x_train, x_other = train_test_split(images_names_list, random_state=42, test_size=(100-split[0])/100)

# Split Other into Validation and Test
x_valid, x_test = train_test_split(x_other, random_state=42, test_size=split[2]/(100-split[0]))

In [12]:
# Create the Train-Validation-Test folders structure
for folder_name in ["train", "validation", "test"]:
    folder_path = os.path.join(datasets_folder, folder_name)
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    else:
        shutil.rmtree(folder_path)
        os.mkdir(folder_path)
    os.mkdir(os.path.join(folder_path, "images"))
    os.mkdir(os.path.join(folder_path, "labels"))

In [13]:
# Function to copy files
def copy_images_labels_files(images_folder, labels_folder, dst_folder, file_name):
  # Images
  image_src = os.path.join(images_folder, file_name+".tif")
  image_dst = os.path.join(dst_folder, "images", file_name+".tif")
  shutil.copy(image_src, image_dst)

  # Labels
  labels_src = os.path.join(labels_folder, file_name+".txt")
  labels_dst = os.path.join(dst_folder, "labels", file_name+".txt")
  shutil.copy(labels_src, labels_dst)


In [14]:
# Copy the images and labels files
# Train
train_folder = os.path.join(datasets_folder, "train")
for file_name in x_train:
  copy_images_labels_files(images_folder, labels_folder, train_folder, file_name)

# Validation
valid_folder = os.path.join(datasets_folder, "validation")
for file_name in x_valid:
  copy_images_labels_files(images_folder, labels_folder, valid_folder, file_name)

# Test
test_folder = os.path.join(datasets_folder, "test")
for file_name in x_test:
  copy_images_labels_files(images_folder, labels_folder, test_folder, file_name)

In [15]:
# Check if number of images files match labels files
# Images
train_images_number = len(glob.glob(os.path.join(datasets_folder, "train", "images", "*")))
valid_images_number = len(glob.glob(os.path.join(datasets_folder, "validation", "images", "*")))
test_images_number = len(glob.glob(os.path.join(datasets_folder, "test", "images", "*")))
images_number = train_images_number + valid_images_number + test_images_number
print("Number of images files: " + str(images_number))

# Labels
train_labels_number = len(glob.glob(os.path.join(datasets_folder, "train", "labels", "*")))
valid_labels_number = len(glob.glob(os.path.join(datasets_folder, "validation", "labels", "*")))
test_labels_number = len(glob.glob(os.path.join(datasets_folder, "test", "labels", "*")))
labels_number = train_labels_number + valid_labels_number + test_labels_number
print("Number of labels files: " + str(labels_number))

assert images_number == labels_number, "The number of files do not match."

Number of images files: 2542
Number of labels files: 2542


In [16]:
# The category for each solar panel object is 0 for objects identified with high confidence,
# 1 for objects identified with moderate confidence, and 2 for objects identified with low confidence.
# Consider only high confidence (0)
for folder in ["train/labels", "validation/labels", "test/labels"]:
    labels_list = glob.glob(os.path.join(datasets_folder, folder, "*"))
    for labels_file in labels_list:
        # Open the text file for reading
        with open(labels_file, 'r') as file:
            # Read all lines from the file
            lines = file.readlines()

        # Filter out lines where the first element is 0
        filtered_lines = [line for line in lines if line.strip().split(' ')[0] == '0']

        # Open the text file for writing
        with open(labels_file, 'w') as file:
            # Write the filtered lines to the output file
            file.writelines(filtered_lines)

In [46]:
# Zip and download the final datasets to your local machine
# Ignore if not using Colab
# Train
!zip -r {train_folder+".zip"} {train_folder}
files.download(train_folder+".zip")

# Validation
!zip -r {valid_folder+".zip"} {valid_folder}
files.download(valid_folder+".zip")

# Test
!zip -r {test_folder+".zip"} {test_folder}
files.download(test_folder+".zip")

  adding: datasets/train/ (stored 0%)
  adding: datasets/train/labels/ (stored 0%)
  adding: datasets/train/labels/solarpanels_native_2__x0_751_y0_10547_dxdy_416.txt (deflated 82%)
  adding: datasets/train/labels/solarpanels_native_3__x0_4413_y0_3877_dxdy_416.txt (deflated 70%)
  adding: datasets/train/labels/solarpanels_native_1__x0_7852_y0_10603_dxdy_416.txt (deflated 71%)
  adding: datasets/train/labels/solarpanels_native_1__x0_6720_y0_10987_dxdy_416.txt (deflated 73%)
  adding: datasets/train/labels/solarpanels_native_3__x0_11789_y0_9026_dxdy_416.txt (deflated 81%)
  adding: datasets/train/labels/solarpanels_native_1__x0_1881_y0_6837_dxdy_416.txt (deflated 74%)
  adding: datasets/train/labels/solarpanels_native_3__x0_5686_y0_0_dxdy_416.txt (deflated 52%)
  adding: datasets/train/labels/solarpanels_native_1__x0_98_y0_7130_dxdy_416.txt (deflated 74%)
  adding: datasets/train/labels/solarpanels_native_1__x0_8146_y0_12213_dxdy_416.txt (deflated 80%)
  adding: datasets/train/labels/sola

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: datasets/validation/ (stored 0%)
  adding: datasets/validation/labels/ (stored 0%)
  adding: datasets/validation/labels/solarpanels_native_1__x0_991_y0_8791_dxdy_416.txt (deflated 74%)
  adding: datasets/validation/labels/solarpanels_native_1__x0_6969_y0_12601_dxdy_416.txt (deflated 55%)
  adding: datasets/validation/labels/solarpanels_native_2__x0_2327_y0_4558_dxdy_416.txt (deflated 73%)
  adding: datasets/validation/labels/solarpanels_native_1__x0_6714_y0_12262_dxdy_416.txt (deflated 81%)
  adding: datasets/validation/labels/solarpanels_native_3__x0_9889_y0_14917_dxdy_416.txt (deflated 68%)
  adding: datasets/validation/labels/solarpanels_native_3__x0_7909_y0_9356_dxdy_416.txt (deflated 66%)
  adding: datasets/validation/labels/solarpanels_native_3__x0_8589_y0_14675_dxdy_416.txt (deflated 74%)
  adding: datasets/validation/labels/solarpanels_native_2__x0_5439_y0_5603_dxdy_416.txt (deflated 66%)
  adding: datasets/validation/labels/solarpanels_native_2__x0_286_y0_14590_dxdy_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: datasets/test/ (stored 0%)
  adding: datasets/test/labels/ (stored 0%)
  adding: datasets/test/labels/solarpanels_native_3__x0_5901_y0_9623_dxdy_416.txt (deflated 59%)
  adding: datasets/test/labels/solarpanels_native_1__x0_1559_y0_6448_dxdy_416.txt (deflated 72%)
  adding: datasets/test/labels/solarpanels_native_3__x0_6585_y0_13546_dxdy_416.txt (deflated 63%)
  adding: datasets/test/labels/solarpanels_native_1__x0_6464_y0_8579_dxdy_416.txt (deflated 68%)
  adding: datasets/test/labels/solarpanels_native_1__x0_5200_y0_12763_dxdy_416.txt (deflated 64%)
  adding: datasets/test/labels/solarpanels_native_2__x0_0_y0_14797_dxdy_416.txt (deflated 62%)
  adding: datasets/test/labels/solarpanels_native_3__x0_11754_y0_9161_dxdy_416.txt (deflated 81%)
  adding: datasets/test/labels/solarpanels_native_1__x0_4503_y0_11592_dxdy_416.txt (deflated 72%)
  adding: datasets/test/labels/solarpanels_native_1__x0_8141_y0_12142_dxdy_416.txt (deflated 80%)
  adding: datasets/test/labels/solarpanels_

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>