### Import dependencies

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization

### GPU Testing

In [None]:
# Check if GPU is available.
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the replacement recommended by TensorFlow. An empty list means no GPU.
is_gpu_available = bool(tf.config.list_physical_devices("GPU"))

# Clear the cell output (hides the device-discovery log noise TensorFlow emits)
clear_output()

# Print GPU availability
print("GPU is available" if is_gpu_available else "GPU is not available")

### Load data

We rename the `img` column to `image_id` and the `rot` column to `is_rotten` for clarity. The `w` and `h` columns are renamed to `x2` and `y2`.

In [None]:
def load_train_df(path: str = "data/train.csv") -> pd.DataFrame:
    """Load the training annotations into a DataFrame.

    The first CSV column is used as the index. The columns ``img``, ``w``,
    ``h`` and ``rot`` are renamed to ``image_id``, ``x2``, ``y2`` and
    ``is_rotten`` respectively.

    Args:
        path: Location of the training CSV file. Defaults to the location
            that was previously hard-coded.

    Returns:
        The annotation table with the renamed columns.
    """
    # NOTE(review): "w"/"h" are described as width/height here, yet they are
    # renamed to "x2"/"y2" and crop_image uses them as the right/bottom edges.
    # Confirm which of the two the CSV actually stores.
    return pd.read_csv(path, dtype={
        "img": int, # Image ID
        "x": int, # X coordinate of the top-left corner of the bounding box
        "y": int, # Y coordinate of the top-left corner of the bounding box
        "w": int, # Width of the bounding box
        "h": int, # Height of the bounding box
        "rot": int, # Whether the image shows a rotten pear (1) or not (0)
    }, index_col=0).rename(columns={
        "img": "image_id",
        "w": "x2",
        "h": "y2",
        "rot": "is_rotten",
    })

In [None]:
def load_test_df(path: str = "data/test.csv") -> pd.DataFrame:
    """Load the test annotations into a DataFrame.

    The first CSV column is used as the index. The columns ``img``, ``w``
    and ``h`` are renamed to ``image_id``, ``x2`` and ``y2`` respectively.
    Unlike the training table, no ``rot`` label column is declared here.

    Args:
        path: Location of the test CSV file. Defaults to the location that
            was previously hard-coded.

    Returns:
        The annotation table with the renamed columns.
    """
    # NOTE(review): "w"/"h" are described as width/height here, yet they are
    # renamed to "x2"/"y2" and crop_image uses them as the right/bottom edges.
    # Confirm which of the two the CSV actually stores.
    return pd.read_csv(path, dtype={
        "img": int, # Image ID
        "x": int, # X coordinate of the top-left corner of the bounding box
        "y": int, # Y coordinate of the top-left corner of the bounding box
        "w": int, # Width of the bounding box
        "h": int, # Height of the bounding box
    }, index_col=0).rename(columns={
        "img": "image_id",
        "w": "x2",
        "h": "y2",
    })

In [None]:
def safe_create_directory(directory: str) -> None:
    """Create ``directory`` (and any missing parents) on a best-effort basis.

    An already existing directory is not an error. Any ``OSError`` raised
    while creating it is reported on stdout instead of being propagated.
    """
    try:
        os.makedirs(name=directory, exist_ok=True)
    except OSError as error:
        message = f"Error creating directory {directory}: {error}"
        print(message)

In [None]:
def get_image_path(image_id: int, directory: str) -> str:
    """Return the path of the raw PNG for ``image_id`` inside ``directory``.

    The file name is the image ID zero-padded to eight digits plus ``.png``.
    ``os.path.join`` is used so that ``directory`` works with or without a
    trailing path separator.
    """
    return os.path.join(directory, f"{image_id:08d}.png")

In [None]:
def load_image(image_id: int, directory: str) -> np.ndarray:
    """Read the raw image for ``image_id`` from ``directory``.

    Args:
        image_id: Numeric ID used to build the file name.
        directory: Directory that contains the raw images.

    Returns:
        The image array produced by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the image could not be read.
    """
    image_path = get_image_path(image_id, directory)
    image = cv2.imread(image_path)

    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising; fail here with the offending path instead of
    # letting a later slice on None raise an unrelated TypeError.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    return image

In [None]:
def crop_image(image: np.ndarray, row: pd.Series) -> np.ndarray:
    """Return the region of ``image`` selected by the box stored in ``row``.

    ``row`` must provide ``x``/``y`` (start column/row) and ``x2``/``y2``
    (stop column/row, exclusive); the result is a plain slice of ``image``.
    """
    x_start, y_start = row["x"], row["y"]
    x_stop, y_stop = row["x2"], row["y2"]

    # Rows come first when indexing the array, columns second
    return image[slice(y_start, y_stop), slice(x_start, x_stop)]

In [None]:
def determine_training_subdirectory(row: pd.Series) -> str:
    """Return the class subdirectory for ``row``.

    Rows without an ``is_rotten`` entry (unlabelled data) map to the empty
    string; labelled rows map to ``rotten/`` when the label equals 1 and to
    ``not_rotten/`` otherwise.
    """
    if "is_rotten" in row:
        if row["is_rotten"] == 1:
            return "rotten/"
        return "not_rotten/"

    return ""

In [None]:
def get_cropped_image_path(row: pd.Series, directory: str) -> str:
    """Return the output path for the crop described by ``row``.

    The file is named after the row's index label (``row.name``) and placed
    in the class subdirectory chosen by ``determine_training_subdirectory``
    (none for unlabelled rows). ``os.path.join`` is used so that
    ``directory`` works with or without a trailing path separator.
    """
    subdirectory = determine_training_subdirectory(row)
    return os.path.join(directory, subdirectory, f"{row.name}.png")

In [None]:
def is_dir_empty(dir: str) -> bool:
    """Report whether ``dir`` contains no entries.

    The directory is created first if it is missing, so a directory that did
    not exist yet counts as empty.
    """
    safe_create_directory(dir)

    entries = os.listdir(dir)
    return not entries

In [None]:
def crop_all_images(df: pd.DataFrame, raw_directory: str, cropped_directory: str) -> None:
    """Crop every annotated box out of its raw image and save it as a PNG.

    Nothing is done when ``cropped_directory`` already contains entries, so
    earlier results are never overwritten.

    Args:
        df: Annotation table with an ``image_id`` column plus the box
            columns read by ``crop_image``.
        raw_directory: Directory that holds the raw images.
        cropped_directory: Directory that receives the cropped images.
    """
    if not is_dir_empty(cropped_directory):
        print(f"The {cropped_directory} directory is not empty. Skipping.")
        return

    created_directories = set()

    # Group once instead of filtering the whole table per image ID.
    # sort=False keeps the IDs in order of first appearance.
    for image_id, rows in df.groupby("image_id", sort=False):
        # Each raw image is loaded only once, then cropped for all its rows
        image = load_image(image_id, raw_directory)

        for _, row in rows.iterrows():
            cropped_image = crop_image(image, row)
            cropped_image_path = get_cropped_image_path(row, cropped_directory)

            # Labelled rows are written into class subdirectories that nothing
            # else creates, and cv2.imwrite fails if the directory is missing.
            target_directory = os.path.dirname(cropped_image_path)
            if target_directory and target_directory not in created_directories:
                safe_create_directory(target_directory)
                created_directories.add(target_directory)

            # cv2.imwrite returns False on failure instead of raising
            if not cv2.imwrite(cropped_image_path, cropped_image):
                print(f"Failed to write cropped image {cropped_image_path}")

In [None]:
# Image locations, keyed first by processing stage ("raw" / "cropped")
# and then by dataset split ("train" / "test")
IMAGE_DATA_DIRECTORIES = dict(
    raw=dict(
        train="data/raw/all_images/",
        test="data/raw/all_images/",
    ),
    cropped=dict(
        train="data/cropped/train_images/",
        test="data/cropped/test_images/",
    ),
)

# Make sure every configured directory exists: raw ones first, then cropped
for directory in (
    *IMAGE_DATA_DIRECTORIES["raw"].values(),
    *IMAGE_DATA_DIRECTORIES["cropped"].values(),
):
    safe_create_directory(directory)

In [None]:
# Load the annotation tables; each row describes one bounding box in an image
train_df = load_train_df()
test_df = load_test_df()

In [None]:
# Write one cropped PNG per annotation row for both splits.
# Each call is skipped when its cropped directory already contains entries.
crop_all_images(train_df, IMAGE_DATA_DIRECTORIES["raw"]["train"], IMAGE_DATA_DIRECTORIES["cropped"]["train"])
crop_all_images(test_df, IMAGE_DATA_DIRECTORIES["raw"]["test"], IMAGE_DATA_DIRECTORIES["cropped"]["test"])

### TODO
- The cropped images all have different sizes; how should we handle this?
- Should we add a margin around each pear?

In [None]:
# NOTE: Draft of the training input pipeline, currently disabled (every line
# below is commented out). It would read the cropped training images from
# their class subdirectories in batches, resize them to a fixed target size,
# apply random augmentation, and hold out 20% of the images for validation.

# image_width = 500
# image_height = 500
# batch_size = 32

# train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
#     rescale=1./255,
#     rotation_range=15,
#     width_shift_range=0.1,
#     height_shift_range=0.1,
#     horizontal_flip=True,
#     vertical_flip=True,
#     validation_split=0.2,
# )

# train_generator = train_datagen.flow_from_directory(
#     directory="data/cropped/train_images/",
#     target_size=(image_width, image_height),
#     batch_size=batch_size,
#     class_mode="binary",
#     subset="training",
# )

# validation_generator = train_datagen.flow_from_directory(
#     directory="data/cropped/train_images/",
#     target_size=(image_width, image_height),
#     batch_size=batch_size,
#     class_mode="binary",
#     subset="validation",
# )