# Cropping images automatically

In [85]:
import os
import pandas as pandas
import cv2

In [86]:
# directories
# `dirs` lists the folders that are scanned for annotation files and searched
# for the matching images; `output_dir` receives the cropped patches.
# NOTE(review): both are absolute paths on a local E: drive -- edit them (or
# derive them from one configurable base directory) before running elsewhere.
dirs = [
    r"E:\DLSU\Coral 'yan!\Dataset\TUBBATAHA\ORIGINAL (aka MONITORING)\2024 (COMPLETE)\QUADRAT\IMAGE AND CPCE FILE\SHINE-1790_Min Ping Yu, Tubbataha, Cagayancillo\Q1",
]

output_dir = r"E:\DLSU\Coral 'yan!\Dataset\CROPPED-CORALS"

# create output directory if it doesn't exist
# (exist_ok=True makes re-running this cell harmless)
os.makedirs(output_dir, exist_ok=True)

In [87]:
# function to find full path of an image

def find_image(image_name, raw_dirs):
    """Return the full path of ``image_name`` in the first directory holding it.

    Args:
        image_name: File name of the image (no directory part), e.g. ``"A.JPG"``.
        raw_dirs: Iterable of directory paths, searched in order.

    Returns:
        The joined path of the first match that is a regular file, or ``None``
        when ``image_name`` is empty or no directory contains such a file.
    """
    # An empty name would join to the directory itself, which is never an image.
    if not image_name:
        return None

    for raw_dir in raw_dirs:
        image_path = os.path.join(raw_dir, image_name)
        # isfile (not exists): a sub-directory that happens to share the image's
        # name must not be handed to the image loader.
        if os.path.isfile(image_path):
            return image_path
    return None

In [88]:
# Side length, in image pixels, of the square patch cut around each point.
crop_size = 500
# Distance from the centre point to each crop edge (integer division, so an
# odd crop_size would yield a patch one pixel smaller than requested).
half_crop = crop_size // 2

In [89]:
# Process each CPCe annotation file: parse its points and labels, locate the
# matching image, and save one square crop per annotated point.

# Annotation files are recognised by this extension (compared case-insensitively).
ANNOTATION_EXT = ".cpc"

# The point list is located by the first line that consists solely of this
# count; that many coordinate lines follow it, then that many label lines.
# TODO: read the count from a fixed header line instead of hard-coding it.
POINT_COUNT_MARKER = "10"

# Extent of the annotation coordinate space; coordinates are scaled by
# image_size / extent to obtain pixels. Replace with the actual maximum
# coordinate values of your data.
# NOTE(review): 82080 x 54720 is exactly 15 x (5472 x 3648), which would match
# coordinates stored at 15 units per pixel -- confirm against the file format.
MAX_COORD_X = 82080
MAX_COORD_Y = 54720


def _parse_annotations(lines, source_name):
    """Extract ``(x, y, label)`` tuples from the lines of one annotation file.

    Args:
        lines: All lines of the annotation file, as returned by ``readlines()``.
        source_name: File name used in the diagnostic messages.

    Returns:
        A list of ``(x, y, label)`` tuples with ``x``/``y`` as floats, or
        ``None`` when the count marker is missing or the file is too short.
        Entries whose point or label line cannot be parsed are dropped as a
        pair, so coordinates and labels can never get out of step.
    """
    start_index = next(
        (idx for idx, line in enumerate(lines) if line.strip() == POINT_COUNT_MARKER),
        None,
    )
    if start_index is None:
        print(f"Skipping file {source_name}: '{POINT_COUNT_MARKER}' marker not found.")
        return None

    num_annotations = int(POINT_COUNT_MARKER)
    first_point = start_index + 1
    first_label = first_point + num_annotations
    if len(lines) < first_label + num_annotations:
        print(f"Error parsing annotation file {source_name}: file ends before all points and labels.")
        return None

    annotations = []
    for offset in range(num_annotations):
        point_line = lines[first_point + offset].strip()
        label_line = lines[first_label + offset].strip()
        try:
            # Floats, because the coordinates are scaled before being truncated.
            x, y = map(float, point_line.split(","))
            # The label is the second comma-separated, double-quoted field.
            label = label_line.split(",")[1].strip('"')
        except (ValueError, IndexError):
            print(f"Skipping invalid line in file {source_name}: {point_line} or {label_line}")
            continue
        # Append only after BOTH parsed, keeping each point with its own label.
        annotations.append((x, y, label))
    return annotations


def _crop_bounds(center_x, center_y, image_width, image_height, half):
    """Return ``(x_min, y_min, x_max, y_max)`` clamped to the image, or ``None`` if empty."""
    x_min = max(0, center_x - half)
    y_min = max(0, center_y - half)
    x_max = min(image_width, center_x + half)
    y_max = min(image_height, center_y + half)
    if x_min >= x_max or y_min >= y_max:
        return None
    return x_min, y_min, x_max, y_max


for annotation_dir in dirs:
    try:
        # Sorted so that repeated runs process files in the same order.
        file_names = sorted(os.listdir(annotation_dir))
    except OSError as e:
        print(f"Cannot list directory {annotation_dir}: {e}")
        continue

    for annotation_file in file_names:
        # Only annotation files are parsed; images and other files are ignored.
        if not annotation_file.lower().endswith(ANNOTATION_EXT):
            continue

        annotation_path = os.path.join(annotation_dir, annotation_file)
        print(f"Processing: {annotation_path}")

        try:
            with open(annotation_path, "r", encoding="ISO-8859-1") as file:
                lines = file.readlines()
        except OSError as e:
            print(f"Error parsing annotation file {annotation_file}: {e}")
            continue

        # The image shares the annotation file's base name.
        image_stem = os.path.splitext(annotation_file)[0]
        image_name = image_stem + ".JPG"
        print(f"Extracted image name: {image_name}")

        annotations = _parse_annotations(lines, annotation_file)
        if annotations is None:
            continue

        points = [(x, y) for x, y, _ in annotations]
        print(f"Extracted coordinates for {annotation_file}: {points}")
        if not annotations:
            # Nothing to crop, so do not bother loading the image.
            continue

        # Look next to the annotation file first, then in the other directories,
        # so identically named images in other folders are not picked by mistake.
        search_dirs = [annotation_dir] + [d for d in dirs if d != annotation_dir]
        image_path = find_image(image_name, search_dirs)
        if not image_path:
            print(f"Image not found: {image_name}")
            continue

        # cv2.imread signals failure by returning None rather than raising.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load image: {image_path}")
            continue

        image_height, image_width = image.shape[:2]

        scale_x = image_width / MAX_COORD_X
        scale_y = image_height / MAX_COORD_Y
        print(f"Scaling factors: scale_x={scale_x}, scale_y={scale_y}")

        for i, (x, y, label) in enumerate(annotations):
            # Scale the coordinates to pixel space
            scaled_x = int(x * scale_x)
            scaled_y = int(y * scale_y)

            bounds = _crop_bounds(scaled_x, scaled_y, image_width, image_height, half_crop)
            if bounds is None:
                print(f"Skipping invalid crop at ({scaled_x}, {scaled_y}) in image {image_name}. Crop is empty.")
                continue
            x_min, y_min, x_max, y_max = bounds

            # Rows are indexed by y and columns by x.
            cropped = image[y_min:y_max, x_min:x_max]

            output_path = os.path.join(output_dir, f"{label}_{image_stem}_{i}.jpg")
            # cv2.imwrite reports failure through its return value, not an exception.
            if cv2.imwrite(output_path, cropped):
                print(f"Saved: {output_path}")
            else:
                print(f"Failed to save: {output_path}")

print("Cropping complete!")

Processing: E:\DLSU\Coral 'yan!\Dataset\TUBBATAHA\ORIGINAL (aka MONITORING)\2024 (COMPLETE)\QUADRAT\IMAGE AND CPCE FILE\SHINE-1790_Min Ping Yu, Tubbataha, Cagayancillo\Q1\DSC07634.cpc
Extracted image name: DSC07634.jpg
Extracted coordinates for DSC07634.cpc: [(39387.0, 8589.0), (38577.0, 11350.0), (36456.0, 50259.0), (35583.0, 15394.0), (62174.0, 35688.0), (30169.0, 20224.0), (54171.0, 46772.0), (60841.0, 10629.0), (35625.0, 31865.0), (41825.0, 36839.0)]
Scaling factors: scale_x=0.06666666666666667, scale_y=0.06666666666666667
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\AA_DSC07634_0.jpg
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\R_DSC07634_1.jpg
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\S_DSC07634_2.jpg
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\R_DSC07634_3.jpg
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\AA_DSC07634_4.jpg
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\AA_DSC07634_5.jpg
Saved: E:\DLSU\Coral 'yan!\Dataset\cropped-corals\R_DSC07634_6