In [72]:
import json
import random
import boto3
from botocore.exceptions import ClientError

# === Where the source dataset lives in S3 ===
BUCKET_NAME = "lemonproject"  # bucket used for every S3 call in this notebook
JSON_KEY = "annotations/instances_default.json"  # key of the annotation JSON
ORIGINAL_IMAGE_PREFIX = "images/"  # key prefix prepended by get_source_key()

# Single S3 client shared by the helper functions below.
s3 = boto3.client("s3")

# === Helper Functions ===
def download_json_from_s3(bucket, key, local_path="annotations.json"):
    """Fetch the annotation JSON stored at ``key`` in ``bucket`` to local disk.

    Args:
        bucket: Name of the S3 bucket to read from.
        key: Object key of the JSON file inside the bucket.
        local_path: Local filesystem path the object is written to.

    Returns:
        ``local_path``, so the result can be passed straight to ``open``.
    """
    s3.download_file(Bucket=bucket, Key=key, Filename=local_path)
    return local_path

def delete_all_objects_with_prefix(bucket, prefix, client=None):
    """Delete every object in ``bucket`` whose key starts with ``prefix``.

    Objects are listed page by page and each non-empty page is removed with a
    single ``delete_objects`` call. The response of each call is inspected so
    that per-object failures are printed instead of being counted as deleted.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix selecting the objects to delete.
        client: Optional S3 client to use. Defaults to the notebook-level
            ``s3`` client when omitted.

    Returns:
        The number of objects for which S3 reported no deletion error.
    """
    client = s3 if client is None else client
    paginator = client.get_paginator("list_objects_v2")

    total_deleted = 0
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        objects_to_delete = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if not objects_to_delete:
            # A page without "Contents" means nothing matched the prefix.
            continue

        response = client.delete_objects(
            Bucket=bucket, Delete={"Objects": objects_to_delete}
        )

        # delete_objects can succeed as a request while individual keys fail;
        # those failures are listed under "Errors" in the response.
        errors = response.get("Errors") or []
        deleted_count = len(objects_to_delete) - len(errors)
        total_deleted += deleted_count

        print(f"Deleted {deleted_count} objects under {prefix}")
        for err in errors:
            print(
                f"Failed to delete {err.get('Key')} under {prefix}: "
                f"{err.get('Code')} {err.get('Message')}"
            )

    return total_deleted

def copy_s3_object(bucket, source_key, dest_key):
    """Copy ``source_key`` to ``dest_key``, both inside ``bucket``.

    A ``ClientError`` from the copy call is caught and printed rather than
    propagated, so one failed copy does not abort the caller's loop.
    Nothing is returned in either case.
    """
    try:
        s3.copy_object(
            Bucket=bucket,
            CopySource={"Bucket": bucket, "Key": source_key},
            Key=dest_key,
        )
    except ClientError as err:
        print(f"Error copying {source_key} to {dest_key}: {err}")
    else:
        print(f"Copied {source_key} to {dest_key}")

def get_source_key(file_name):
    """
    Build the S3 key of an image inside the original images folder.

    ``file_name`` is expected to be the bare name with the "images/" prefix
    already stripped, so the key is simply ORIGINAL_IMAGE_PREFIX followed by
    ``file_name``.
    """
    return "".join((ORIGINAL_IMAGE_PREFIX, file_name))


Subsetting the data like this lets me change the split without touching the original images, so if I make a mistake I only have to rewrite this code instead of re-uploading the entire dataset to S3.

In [73]:

SUBSET_ROOT = "image_subset/"

# Layout under SUBSET_ROOT:
#   allsubset/    -> all filtered images plus their annotations
#   train/, test/ -> one folder per split, each with images/ and annotations
ALLSUBSET_PREFIX = f"{SUBSET_ROOT}allsubset/"
ALLSUBSET_IMAGES_PREFIX = f"{ALLSUBSET_PREFIX}images/"

TRAIN_PREFIX = f"{SUBSET_ROOT}train/"
TRAIN_IMAGES_PREFIX = f"{TRAIN_PREFIX}images/"

TEST_PREFIX = f"{SUBSET_ROOT}test/"
TEST_IMAGES_PREFIX = f"{TEST_PREFIX}images/"


# Start from a clean slate: remove everything currently under the subset root.
print("Clearing existing objects under the subset root in S3...")
delete_all_objects_with_prefix(bucket=BUCKET_NAME, prefix=SUBSET_ROOT)


# Download the annotation file and parse it into `data`.
local_annotations_file = download_json_from_s3(
    BUCKET_NAME, JSON_KEY, local_path="annotations.json"
)
with open(local_annotations_file, "r") as f:
    data = json.load(f)

# Drop the leading "images/" from each image's file_name (edits `data` in place);
# names without that prefix are left untouched.
for img in data.get("images", []):
    file_name = img.get("file_name", "")
    if not file_name.startswith("images/"):
        continue
    img["file_name"] = file_name[len("images/"):]


# Keep a local copy of the annotations with the rewritten file names.
with open("annotations_updated.json", "w") as f:
    json.dump(data, f)
print("Stripped 'images/' prefix from all image file names in the annotations.")


local_allsubset_json = "allsubset_annotations.json"
with open(local_allsubset_json, 'w') as f:
    json.dump(allsubset_data, f)
allsubset_annotations_key = ALLSUBSET_PREFIX + "annotations.json"
s3.upload_file(local_allsubset_json, BUCKET_NAME, allsubset_annotations_key)
print(f"Uploaded allsubset annotations to s3://{BUCKET_NAME}/{allsubset_annotations_key}")


print("Copying all filtered images to allsubset/images/ in S3...")
for img in filtered_images:
    source_key = get_source_key(img["file_name"])
    dest_key = ALLSUBSET_IMAGES_PREFIX + img["file_name"]
    copy_s3_object(BUCKET_NAME, source_key, dest_key)

Filter images that having decent sized bounding boxes and mould or pedicel(stem) , will use these for a better train test split in the next box

In [77]:
# Identify category IDs for "mould" and "pedicel" (case-insensitive)
mould_cat_ids = [cat["id"] for cat in data["categories"] if cat["name"].lower() == "mould"]
pedicel_cat_ids = [cat["id"] for cat in data["categories"] if cat["name"].lower() == "pedicel"]

if not mould_cat_ids:
    raise ValueError("No category named 'mould' found in annotations.")
if not pedicel_cat_ids:
    print("Warning: No category named 'pedicel' found; only 'mould' annotations will be kept.")

# Filter annotations: keep those with area >= 150 and category in (mould + pedicel)
filtered_annotations = [
    ann for ann in data["annotations"]
    if ann["category_id"] in (mould_cat_ids + pedicel_cat_ids) and ann["area"] >= 150
]


filtered_image_ids = {ann["image_id"] for ann in filtered_annotations}
filtered_images = [img for img in data["images"] if img["id"] in filtered_image_ids]
used_cat_ids = {ann["category_id"] for ann in filtered_annotations}
new_categories = [cat for cat in data["categories"] if cat["id"] in used_cat_ids]

# Create the complete filtered dataset
allsubset_data = {
    "images": filtered_images,
    "annotations": filtered_annotations,
    "categories": new_categories
}

Randomly sample 20% of the data for testing. From the remaining 80%, keep every image that contains mould, then randomly sample an equal number of non-mould images so that the model doesn't overtrain on mouldy images.

In [80]:

# Use a dedicated, seeded RNG so the split is identical on every
# Restart & Run All instead of changing with each execution.
SPLIT_SEED = 42
TEST_FRACTION = 0.2
split_rng = random.Random(SPLIT_SEED)

all_image_ids = [img["id"] for img in filtered_images]
split_rng.shuffle(all_image_ids)

n_total = len(all_image_ids)
n_test = int(TEST_FRACTION * n_total)

# Step 1: Create test set (random 20%)
test_image_ids = set(all_image_ids[:n_test])


# Step 2: Remaining images
remaining_image_ids = set(all_image_ids[n_test:])
test_images = [img for img in filtered_images if img["id"] in test_image_ids]
remaining_images = [img for img in filtered_images if img["id"] in remaining_image_ids]

# Identify mould images from remaining
mould_image_ids = {ann["image_id"] for ann in filtered_annotations if ann["category_id"] in mould_cat_ids}
mould_train_ids = mould_image_ids.intersection(remaining_image_ids)
mould_train_images = [img for img in remaining_images if img["id"] in mould_train_ids]

# Randomly sample same number of non-mould images.
# The candidate list is built in filtered_images order (not set order) so the
# seeded shuffle always starts from the same sequence.
non_mould_ids = [img["id"] for img in remaining_images if img["id"] not in mould_train_ids]
split_rng.shuffle(non_mould_ids)
non_mould_sample_ids = set(non_mould_ids[:len(mould_train_ids)])
non_mould_train_images = [img for img in remaining_images if img["id"] in non_mould_sample_ids]

# Final training set
train_images = mould_train_images + non_mould_train_images

# Guard against leakage: no image id may appear in both train and test.
assert test_image_ids.isdisjoint(mould_train_ids | non_mould_sample_ids), \
    "train and test splits share image ids"

print(f"Final split: {len(train_images)} train, {len(test_images)} test images.")

Final split: 424 train, 216 test images.


In [81]:
# === Upload train/test splits to S3
# Each entry: (split name, images in the split, split folder prefix, images prefix).
split_specs = [
    ("train", train_images, TRAIN_PREFIX, TRAIN_IMAGES_PREFIX),
    ("test", test_images, TEST_PREFIX, TEST_IMAGES_PREFIX),
]

for split_name, split_images, split_prefix, split_images_prefix in split_specs:
    # NOTE(review): copy_split_images and create_split_annotation are not
    # defined in the cells visible here; confirm they are defined before
    # this cell runs.
    copy_split_images(split_images, split_images_prefix)
    split_data = create_split_annotation(split_images)

    # Write the split's annotations locally, then upload them next to its images folder.
    local_split_json = f"{split_name}_annotations.json"
    with open(local_split_json, "w") as f:
        json.dump(split_data, f)

    split_annotations_key = split_prefix + "annotations.json"
    s3.upload_file(
        Filename=local_split_json,
        Bucket=BUCKET_NAME,
        Key=split_annotations_key,
    )

Processing train split...
Copied image_subset/allsubset/images/0004_B_H_0_G.jpg to image_subset/train/images/0004_B_H_0_G.jpg
Copied image_subset/allsubset/images/0004_B_H_15_G.jpg to image_subset/train/images/0004_B_H_15_G.jpg
Copied image_subset/allsubset/images/0004_B_H_30_G.jpg to image_subset/train/images/0004_B_H_30_G.jpg
Copied image_subset/allsubset/images/0004_B_H_45_H.jpg to image_subset/train/images/0004_B_H_45_H.jpg
Copied image_subset/allsubset/images/0015_B_I_15_A.jpg to image_subset/train/images/0015_B_I_15_A.jpg
Copied image_subset/allsubset/images/0016_B_I_0_A.jpg to image_subset/train/images/0016_B_I_0_A.jpg
Copied image_subset/allsubset/images/0016_B_I_105_A.jpg to image_subset/train/images/0016_B_I_105_A.jpg
Copied image_subset/allsubset/images/0016_B_I_120_A.jpg to image_subset/train/images/0016_B_I_120_A.jpg
Copied image_subset/allsubset/images/0016_B_I_135_A.jpg to image_subset/train/images/0016_B_I_135_A.jpg
Copied image_subset/allsubset/images/0016_B_I_165_A.jp