## Obtaining Baseline Results Using YOLO Models

### Data

#### Converting `.npy` Labels to YOLO Format

In [8]:
import os
import numpy as np
from glob import glob
from PIL import Image

def fix_path(path: str) -> str:
    """Return ``path`` with every backslash replaced by a forward slash."""
    backslash, forward_slash = "\\", "/"
    return forward_slash.join(path.split(backslash))

# Directories
# The project root defaults to the original workstation location, but it can be
# overridden with the MARINE_DEBRIS_ROOT environment variable so the notebook
# is not tied to a single machine. With the default, the three paths below
# resolve to exactly the same strings as before.
PROJECT_ROOT = fix_path(
    os.environ.get("MARINE_DEBRIS_ROOT", r"O:\Capstone_2\Marine-Debris-Detection")
).rstrip("/")
labels_dir = f"{PROJECT_ROOT}/NASA_Planet_Data/labels"
source_dir = f"{PROJECT_ROOT}/NASA_Planet_Data/source"
output_dir = f"{PROJECT_ROOT}/yolo_labels"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)


In [9]:

# Class mapping (class name -> integer id)
class_map = dict(
    marine_debris=0,  # Update with additional classes if needed
)

# Normalize bounding box coordinates
def normalize_bbox(bbox, img_width, img_height):
    """Convert a corner-style box into centre/size values scaled by the image size.

    Args:
        bbox: Four values ``(x_min, y_min, x_max, y_max)``.
        img_width: Divisor applied to the horizontal centre and to the box width.
        img_height: Divisor applied to the vertical centre and to the box height.

    Returns:
        Tuple ``(x_center, y_center, width, height)``.
    """
    left, top, right, bottom = bbox
    return (
        (left + right) / 2 / img_width,
        (top + bottom) / 2 / img_height,
        (right - left) / img_width,
        (bottom - top) / img_height,
    )

# Convert every .npy annotation file in labels_dir into a YOLO-style .txt file
npy_files = glob(os.path.join(labels_dir, "*.npy"))

for npy_file in npy_files:
    # NOTE(review): allow_pickle=True lets a crafted file run arbitrary code
    # while loading; only use this on label files from a trusted source.
    data = np.load(npy_file, allow_pickle=True).tolist()

    # The image sharing the annotation's stem supplies the pixel dimensions
    base_name = os.path.splitext(os.path.basename(npy_file))[0]
    image_path = os.path.join(source_dir, base_name + ".jpg")

    if os.path.exists(image_path):
        with Image.open(image_path) as img:
            img_width, img_height = img.size
    else:
        print(f"Image not found for {base_name}, skipping...")
        continue

    # One output line per annotation: "<class> <x_center> <y_center> <width> <height>"
    yolo_file_path = os.path.join(output_dir, base_name + ".txt")
    with open(yolo_file_path, "w") as yolo_file:
        for x_min, y_min, x_max, y_max, class_id in data:
            # Positive ids are shifted down by one; other ids are written as-is
            class_id = class_id - 1 if class_id > 0 else class_id

            x_center, y_center, width, height = normalize_bbox(
                (x_min, y_min, x_max, y_max), img_width, img_height
            )

            yolo_file.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

print("Conversion complete!")


Conversion complete!


### Splitting Dataset

In [20]:
# import os
# import shutil
# from sklearn.model_selection import train_test_split

# # Define paths
# labels_dir = r"O:/Capstone_2/Marine-Debris-Detection/yolo_labels"
# images_dir = r"O:/Capstone_2/Marine-Debris-Detection/NASA_Planet_Data/source"
# output_dir = r"O:/Capstone_2/Marine-Debris-Detection/dataset_splits"

# # Ratios
# train_ratio = 0.7
# val_ratio = 0.2
# test_ratio = 0.1

# # Ensure the ratios sum to 1
# assert round(train_ratio + val_ratio + test_ratio, 5) == 1.0, "Ratios must sum to 1."

# # Get list of label files
# label_files = [f for f in os.listdir(labels_dir) if f.endswith(".txt")]

# # Split into train, validation, and test
# train_files, temp_files = train_test_split(label_files, test_size=(1 - train_ratio), random_state=42, shuffle=True)
# val_files, test_files = train_test_split(temp_files, test_size=(test_ratio / (test_ratio + val_ratio)), random_state=42, shuffle=True)

# # Function to move files
# def move_files(files, dest_images, dest_labels):
#     os.makedirs(dest_images, exist_ok=True)
#     os.makedirs(dest_labels, exist_ok=True)
#     for label_file in files:
#         # Move label file
#         src_label = os.path.join(labels_dir, label_file)
#         dest_label = os.path.join(dest_labels, label_file)
#         shutil.copy(src_label, dest_label)

#         # Move corresponding image file
#         image_file = label_file.replace(".txt", ".jpg")  # Assuming images are .jpg
#         src_image = os.path.join(images_dir, image_file)
#         dest_image = os.path.join(dest_images, image_file)
#         if os.path.exists(src_image):
#             shutil.copy(src_image, dest_image)
#         else:
#             print(f"Image file {image_file} not found for label {label_file}")

# # Move files to respective directories
# move_files(train_files, os.path.join(output_dir, "train/images"), os.path.join(output_dir, "train/labels"))
# move_files(val_files, os.path.join(output_dir, "val/images"), os.path.join(output_dir, "val/labels"))
# move_files(test_files, os.path.join(output_dir, "test/images"), os.path.join(output_dir, "test/labels"))

# print("Dataset split completed.")
# print(f"Train files: {len(train_files)}")
# print(f"Validation files: {len(val_files)}")
# print(f"Test files: {len(test_files)}")


import os
import shutil
from sklearn.model_selection import train_test_split

# Define paths
labels_dir = "yolo_labels"
# Histogram-stretched .tif images; this folder is written by the
# "Histogram Stretching" section of this notebook and must exist before
# this cell runs.
images_dir = os.path.join("NASA_Planet_Data", "source_histogram")  # portable separator
output_dir = "hist_splits"

# Ratios
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Ensure the ratios sum to 1
assert round(train_ratio + val_ratio + test_ratio, 5) == 1.0, "Ratios must sum to 1."

# Get list of label files.
# os.listdir returns entries in arbitrary order, so the names are sorted first;
# otherwise random_state=42 would not give the same split on every machine.
label_files = sorted(f for f in os.listdir(labels_dir) if f.endswith(".txt"))
if not label_files:
    raise FileNotFoundError(f"No .txt label files found in {labels_dir!r}")

# Split into train, validation, and test.
# First carve off the training share, then divide the remainder between
# validation and test in proportion to their ratios.
train_files, temp_files = train_test_split(label_files, test_size=(1 - train_ratio), random_state=42, shuffle=True)
val_files, test_files = train_test_split(temp_files, test_size=(test_ratio / (test_ratio + val_ratio)), random_state=42, shuffle=True)

# Function to copy label files and their matching images into a split folder
def move_files(files, dest_images, dest_labels, src_labels_dir=None, src_images_dir=None, image_ext=".tif"):
    """Copy each label file and the image with the same stem into split folders.

    Despite the name, files are copied with ``shutil.copy``; the sources are
    left in place.

    Args:
        files: Label file names (not paths) to copy.
        dest_images: Destination folder for images; created if missing.
        dest_labels: Destination folder for labels; created if missing.
        src_labels_dir: Folder holding the label files. Defaults to the
            notebook-level ``labels_dir``.
        src_images_dir: Folder holding the images. Defaults to the
            notebook-level ``images_dir``.
        image_ext: Extension (including the dot) of the image that belongs to
            each label file.

    Returns:
        None. A message is printed for every label whose image is missing;
        the label itself is still copied.
    """
    if src_labels_dir is None:
        src_labels_dir = labels_dir
    if src_images_dir is None:
        src_images_dir = images_dir

    os.makedirs(dest_images, exist_ok=True)
    os.makedirs(dest_labels, exist_ok=True)
    for label_file in files:
        # Copy label file
        src_label = os.path.join(src_labels_dir, label_file)
        dest_label = os.path.join(dest_labels, label_file)
        shutil.copy(src_label, dest_label)

        # Copy corresponding image file. Only the final extension is swapped;
        # str.replace would also rewrite a ".txt" appearing inside the stem.
        image_file = os.path.splitext(label_file)[0] + image_ext
        src_image = os.path.join(src_images_dir, image_file)
        dest_image = os.path.join(dest_images, image_file)
        if os.path.exists(src_image):
            shutil.copy(src_image, dest_image)
        else:
            print(f"Image file {image_file} not found for label {label_file}")

# Copy every split into its own images/labels sub-folders under output_dir
for split_files, images_subdir, labels_subdir in (
    (train_files, "train/images", "train/labels"),
    (val_files, "val/images", "val/labels"),
    (test_files, "test/images", "test/labels"),
):
    move_files(
        split_files,
        os.path.join(output_dir, images_subdir),
        os.path.join(output_dir, labels_subdir),
    )

# Summary of how many label files ended up in each split
print(
    "Dataset split completed.",
    f"Train files: {len(train_files)}",
    f"Validation files: {len(val_files)}",
    f"Test files: {len(test_files)}",
    sep="\n",
)


Dataset split completed.
Train files: 517
Validation files: 148
Test files: 74


#### Augmentation

In [11]:
import os
import shutil
import random
from glob import glob
from PIL import Image, ImageOps

def fix_path(path: str) -> str:
    """Return ``path`` with every backslash replaced by a forward slash."""
    backslash, forward_slash = "\\", "/"
    return forward_slash.join(path.split(backslash))

# Input and output locations for the augmentation step
labels_dir = fix_path(r"yolo_labels")
source_dir = fix_path(r"NASA_Planet_Data\source")
output_dir = fix_path(r"aug")

# Augmented images and labels are written to sub-folders of output_dir
augmented_images_dir = os.path.join(output_dir, "images")
augmented_labels_dir = os.path.join(output_dir, "labels")

# Create the whole output tree up front
for directory in (output_dir, augmented_images_dir, augmented_labels_dir):
    os.makedirs(directory, exist_ok=True)

# Augmentation functions
def _transform_yolo_line(line, augmentation_type):
    """Return one YOLO label line with its box moved to follow the image transform."""
    fields = line.split()
    if len(fields) != 5:
        raise ValueError(f"Expected 5 fields in YOLO label line, got {len(fields)}: {line!r}")

    class_token = fields[0]
    x_center, y_center, width, height = (float(value) for value in fields[1:])

    if augmentation_type == "flip":
        # Horizontal mirror: only the horizontal centre changes.
        x_center = 1.0 - x_center
    elif augmentation_type == "rotate":
        # 90 degrees counter-clockwise with expand=True: the old top edge
        # becomes the new left edge and the old right edge the new top edge,
        # so the axes swap and the old x is measured from the opposite side.
        x_center, y_center = y_center, 1.0 - x_center
        width, height = height, width

    return f"{class_token} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n"


def augment_image(image, label_path, base_name, augmentation_type,
                  out_images_dir=None, out_labels_dir=None):
    """Save an augmented copy of ``image`` together with matching YOLO labels.

    Previously the label file was copied unchanged, which left every box in
    the wrong place for mirrored or rotated images. The box coordinates are
    now transformed the same way as the pixels.

    Args:
        image: Image object providing ``copy``, ``rotate`` and ``save``
            (a PIL image in this notebook).
        label_path: Path of the YOLO label file belonging to ``image``.
        base_name: File stem used to build the output file names.
        augmentation_type: ``"flip"`` (horizontal mirror) or ``"rotate"``
            (90 degrees counter-clockwise, canvas expanded). Any other value
            saves an unmodified copy of the image and of the label file.
        out_images_dir: Folder for the augmented image. Defaults to the
            notebook-level ``augmented_images_dir``.
        out_labels_dir: Folder for the augmented label file. Defaults to the
            notebook-level ``augmented_labels_dir``.

    Raises:
        ValueError: If a non-blank label line does not have exactly five
            whitespace-separated fields.
    """
    if out_images_dir is None:
        out_images_dir = augmented_images_dir
    if out_labels_dir is None:
        out_labels_dir = augmented_labels_dir

    if augmentation_type == "flip":
        augmented_image = ImageOps.mirror(image)
    elif augmentation_type == "rotate":
        augmented_image = image.rotate(90, expand=True)
    else:
        augmented_image = image.copy()

    # Save augmented image
    augmented_image_path = os.path.join(out_images_dir, f"{base_name}_{augmentation_type}.jpg")
    augmented_image.save(augmented_image_path)

    if not os.path.exists(label_path):
        print(f"Label file {label_path} missing for image {base_name}, skipping...")
        return

    augmented_label_path = os.path.join(out_labels_dir, f"{base_name}_{augmentation_type}.txt")
    if augmentation_type not in ("flip", "rotate"):
        # Pixels are unchanged, so the labels can be copied verbatim.
        shutil.copy(label_path, augmented_label_path)
        return

    # Rewrite every box so it follows the pixels it annotates; blank lines are dropped.
    with open(label_path) as label_in:
        transformed_lines = [
            _transform_yolo_line(line, augmentation_type)
            for line in label_in
            if line.strip()
        ]
    with open(augmented_label_path, "w") as label_out:
        label_out.writelines(transformed_lines)

# Walk over every source .jpg that has a label file and write the original
# plus its augmented variants into the output folders
image_files = glob(os.path.join(source_dir, "*.jpg"))
for image_path in image_files:
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    label_path = os.path.join(labels_dir, f"{base_name}.txt")

    if os.path.exists(label_path):
        with Image.open(image_path) as img:
            # Keep an unmodified copy of the image and its label
            img.save(os.path.join(augmented_images_dir, f"{base_name}.jpg"))
            shutil.copy(label_path, os.path.join(augmented_labels_dir, f"{base_name}.txt"))

            # One augmented copy per augmentation type
            for augmentation_type in ("flip", "rotate"):
                augment_image(img, label_path, base_name, augmentation_type)
    else:
        print(f"Label file missing for {base_name}, skipping...")

print("Dataset augmentation completed.")


Dataset augmentation completed.


In [13]:
import os

# Directories for augmented dataset.
# Built with os.path.join so the separator is correct on every OS; a literal
# backslash only works as a separator on Windows.
augmented_images_dir = os.path.join("aug", "images")
augmented_labels_dir = os.path.join("aug", "labels")

# Collect file stems so images and labels can be matched by name
image_stems = {os.path.splitext(f)[0] for f in os.listdir(augmented_images_dir) if f.endswith(".jpg")}
label_stems = {os.path.splitext(f)[0] for f in os.listdir(augmented_labels_dir) if f.endswith(".txt")}

# Count the number of image and label files
num_images = len(image_stems)
num_labels = len(label_stems)

# Print the counts
print(f"Number of augmented images: {num_images}")
print(f"Number of augmented labels: {num_labels}")

# Ensure the counts match, then make sure the files actually pair up:
# equal counts alone do not prove that every image has its own label.
unpaired_stems = sorted(image_stems ^ label_stems)
if num_images != num_labels:
    print("Warning: The number of images and labels do not match!")
elif unpaired_stems:
    print(f"Warning: Counts match but {len(unpaired_stems)} files have no partner, e.g. {unpaired_stems[:5]}")
else:
    print("Dataset is consistent: Images and labels match.")


Number of augmented images: 2217
Number of augmented labels: 2217
Dataset is consistent: Images and labels match.


## Histogram Stretching

In [17]:
import os
from tqdm import tqdm
import glob

base_dataset_dir = "NASA_Planet_Data"
source_dir = os.path.join(base_dataset_dir, "source")

# Collect every .tif file in the source folder
source_files = glob.glob(os.path.join(source_dir, "*.tif"))

# Dry run: print each file next to an index-prefixed, zero-padded name.
# Nothing is renamed because the os.rename call below is commented out.
for index, file in enumerate(tqdm(source_files)):
    print(file, f"{index:06d}_{os.path.basename(file)}")
    # NOTE(review): as originally written, the disabled rename would use an
    # index one higher than the name printed above.
    # os.rename(file, os.path.join(source_dir, f"{index + 1:06d}_{os.path.basename(file)}"))

# Total number of files listed
count = len(source_files)


100%|██████████| 739/739 [00:00<00:00, 34548.92it/s]

NASA_Planet_Data\source\20160928_153233_0e16_16816-29821-16.tif 000000_20160928_153233_0e16_16816-29821-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16816-29824-16.tif 000001_20160928_153233_0e16_16816-29824-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16816-29825-16.tif 000002_20160928_153233_0e16_16816-29825-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16816-29828-16.tif 000003_20160928_153233_0e16_16816-29828-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16816-29829-16.tif 000004_20160928_153233_0e16_16816-29829-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16816-29830-16.tif 000005_20160928_153233_0e16_16816-29830-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16816-29831-16.tif 000006_20160928_153233_0e16_16816-29831-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16817-29821-16.tif 000007_20160928_153233_0e16_16817-29821-16.tif
NASA_Planet_Data\source\20160928_153233_0e16_16817-29823-16.tif 000008_20160928_153233_0e16_16817-29823-16.tif
N




In [19]:
import os
from glob import glob
import rasterio
import numpy as np
from tqdm import tqdm


def histogram_stretch(image: np.ndarray) -> np.ndarray:
    """Linearly rescale every band of an image to the range 0..255.

    Each band ``image[:, :, i]`` is stretched independently so that its
    minimum maps to 0 and its maximum to 255.

    Args:
        image: Array indexed as ``[row, column, band]``.

    Returns:
        A new array with the same shape and dtype as ``image``. For integer
        dtypes the fractional part of each stretched value is dropped when it
        is stored. The input array is left untouched.
    """
    stretched = np.empty_like(image)
    for i in range(image.shape[2]):
        # Work in float64 so the subtraction and division cannot wrap around
        # or truncate in the input's integer dtype.
        band = image[:, :, i].astype(np.float64)
        band_min, band_max = band.min(), band.max()
        span = band_max - band_min
        if span == 0:
            # A constant band has no contrast to stretch; dividing by the zero
            # span would yield NaNs, so the band is set to 0 instead.
            stretched[:, :, i] = 0
        else:
            stretched[:, :, i] = (band - band_min) / span * 255

    return stretched
    

# Input folder with the raw .tif files and output folder for the stretched copies
base_dataset_dir = "NASA_Planet_Data"
source_dir = os.path.join(base_dataset_dir, "source")
histogram_dir = os.path.join(base_dataset_dir, "source_histogram")

# Make sure the output folder is there before anything is written
os.makedirs(histogram_dir, exist_ok=True)

# Every .tif file in the source folder
source_files = glob(os.path.join(source_dir, "*.tif"))

# Stretch each file and write it under the same name into histogram_dir
for file in tqdm(source_files, desc="Processing files"):
    img_path = os.path.join(histogram_dir, os.path.basename(file))

    with rasterio.open(file, driver="Gtiff") as src:
        metadata = src.meta

        # Reorder the axes from (C, H, W) as read to (H, W, C) for the stretch
        img = np.transpose(src.read(), (1, 2, 0))
        img = histogram_stretch(img).astype(np.uint8)

        # The output holds uint8 samples, so the copied metadata must say so
        metadata["dtype"] = "uint8"
        metadata["count"] = img.shape[2]

        with rasterio.open(img_path, "w", **metadata) as dst:
            # Back to (C, H, W) for writing
            dst.write(np.transpose(img, (2, 0, 1)))


Processing files:   0%|          | 0/739 [00:00<?, ?it/s]

Processing files: 100%|██████████| 739/739 [00:21<00:00, 34.27it/s]
