### Import dependencies

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import clear_output

### GPU Testing

In [None]:
# Check if GPU is available.
# `tf.test.is_gpu_available()` is deprecated; the documented replacement is to
# ask TensorFlow for the list of physical GPU devices (empty list -> no GPU).
is_gpu_available = bool(tf.config.list_physical_devices('GPU'))

# Clear the cell output (hides any log noise emitted while probing devices)
clear_output()

# Print GPU availability
print('GPU is available' if is_gpu_available else 'GPU is not available')

### Load data

We rename the `img` column to `image_id` for clarity, and the `rot` column to `is_rotten` for the same reason.

In [None]:
def load_train_df(path: str = 'data/train.csv') -> pd.DataFrame:
    """
    Load the training annotations into a DataFrame.

    The first CSV column is used as the index. The `img` and `rot` columns are
    renamed to `image_id` and `is_rotten` respectively.

    Args:
    - path (str): Path to the training CSV file. Defaults to 'data/train.csv'.

    Returns:
    - pd.DataFrame: The annotations, with integer-typed columns.
    """
    column_dtypes = {
        'img': int, # Image ID
        'x': int, # X coordinate of the top-left corner of the bounding box
        'y': int, # Y coordinate of the top-left corner of the bounding box
        'w': int, # Width of the bounding box
        'h': int, # Height of the bounding box
        'rot': int, # Whether the image shows a rotten pear (1) or not (0)
    }
    df = pd.read_csv(path, dtype=column_dtypes, index_col=0)

    return df.rename(columns={
        'img': 'image_id',
        'rot': 'is_rotten',
    })

def load_test_df(path: str = 'data/test.csv') -> pd.DataFrame:
    """
    Load the test annotations into a DataFrame.

    The first CSV column is used as the index. The `img` column is renamed to
    `image_id`.

    Args:
    - path (str): Path to the test CSV file. Defaults to 'data/test.csv'.

    Returns:
    - pd.DataFrame: The annotations, with integer-typed columns.
    """
    column_dtypes = {
        'img': int, # Image ID
        'x': int, # X coordinate of the top-left corner of the bounding box
        'y': int, # Y coordinate of the top-left corner of the bounding box
        'w': int, # Width of the bounding box
        'h': int, # Height of the bounding box
    }
    df = pd.read_csv(path, dtype=column_dtypes, index_col=0)

    return df.rename(columns={
        'img': 'image_id',
    })

def safe_create_directory(directory: str) -> None:
    """
    Create a directory (and any missing parents) without raising on failure.

    An already-existing directory is not an error. If the operating system
    refuses the request, the problem is printed and the function returns
    normally.

    Args:
    - directory (str): The path to the directory to create.

    Returns:
    - None
    """
    try:
        os.makedirs(directory, exist_ok=True)
    except OSError as e:
        # Best-effort: report the failure instead of propagating it.
        message = f'Error creating directory {directory}: {e}'
        print(message)

def get_image_path(image_id: int, directory: str) -> str:
    """
    Get the path to an image given its ID and the directory it is stored in.

    The file name is `<image_id>.jpg`. The directory may be given with or
    without a trailing path separator.

    Args:
    - image_id (int): The ID of the image.
    - directory (str): The directory the image is stored in.

    Returns:
    - str: The path to the image.
    """
    # os.path.join only inserts a separator when `directory` does not already
    # end with one, so both 'dir/' and 'dir' yield a valid path.
    return os.path.join(directory, f"{image_id}.jpg")

def load_image(image_id: int, directory: str) -> np.ndarray:
    """
    Load an image given its ID and the directory it is stored in.

    Args:
    - image_id (int): The ID of the image.
    - directory (str): The directory the image is stored in.

    Returns:
    - np.ndarray: The image as a NumPy array.

    Raises:
    - FileNotFoundError: If the image file is missing or cannot be read.
    """
    image_path = get_image_path(image_id, directory)
    image = cv2.imread(image_path)

    # cv2.imread reports failure by returning None rather than raising; fail
    # here with the offending path instead of letting None leak to callers.
    if image is None:
        raise FileNotFoundError(f'Image is missing or unreadable: {image_path}')

    return image

def get_cropped_image(image_id: int, df: pd.DataFrame, directory: str) -> np.ndarray:
    """
    Crop an image to its bounding box, looked up by image ID.

    Args:
    - image_id (int): The ID of the image. It selects both the image file and
      the row of `df` (by index label) that holds the bounding box.
    - df (pd.DataFrame): Table with `x`, `y`, `w` and `h` columns, where
      (`x`, `y`) is the top-left corner of the box and `w`/`h` its size.
    - directory (str): The directory the image is stored in.

    Returns:
    - np.ndarray: The cropped image as a NumPy array.

    Raises:
    - FileNotFoundError: If the image could not be loaded.
    - KeyError: If `image_id` is not present in the index of `df`.
    """
    image = load_image(image_id, directory)
    if image is None:
        raise FileNotFoundError(
            f'Image {image_id} could not be loaded from {directory}'
        )

    x, y, w, h = (int(value) for value in df.loc[image_id, ['x', 'y', 'w', 'h']])

    # Negative indices wrap around in NumPy slicing, so clamp the box to the
    # image's top/left edge. Overshoot past the bottom/right edge is already
    # truncated by slicing.
    left, top = max(x, 0), max(y, 0)
    right, bottom = max(x + w, 0), max(y + h, 0)

    return image[top:bottom, left:right]

In [None]:
# Image folders, keyed first by kind ('raw' / 'cropped') and then by split
# ('train' / 'test')
IMAGE_DATA_DIRECTORIES = {
    'raw': {
        'train': 'data/raw/train_images/',
        'test': 'data/raw/test_images/',
    },
    'cropped': {
        'train': 'data/cropped/train_images/',
        'test': 'data/cropped/test_images/',
    },
}

# Make sure every configured folder exists: raw folders first, then cropped
for kind in ('raw', 'cropped'):
    for directory in IMAGE_DATA_DIRECTORIES[kind].values():
        safe_create_directory(directory)

In [None]:
train_df = load_train_df()
test_df = load_test_df()

# Source and destination folders for the training images
raw_train_dir = IMAGE_DATA_DIRECTORIES['raw']['train']
cropped_train_dir = IMAGE_DATA_DIRECTORIES['cropped']['train']

# Crop the images with IDs 0..7 and write each crop to the cropped folder
for i in range(8):
    # TODO: temporary - the loop counter stands in for the image ID; the
    # intended source is train_df['image_id'][i]
    image_id = i
    image = get_cropped_image(image_id, train_df, raw_train_dir)
    output_path = get_image_path(image_id, cropped_train_dir)
    cv2.imwrite(output_path, image)