#start


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from multiprocessing.dummy import Pool as ThreadPool
import tqdm

In [12]:

# Define paths
csv_image_mapping = [
    {
        'csv_path': '/content/drive/MyDrive/Tomato_dataset/camera_ripe(batch2)/annotations/camera_ripe(batch2).csv',
        'image_dir': '/content/drive/MyDrive/Tomato_dataset/camera_ripe(batch2)'
    },
    {
        'csv_path': '/content/drive/MyDrive/Tomato_dataset/camera_ripe(batch3)/annotations/camera_ripe(batch3).csv',
        'image_dir': '/content/drive/MyDrive/Tomato_dataset/camera_ripe(batch3)'
    },

]
output_dir = '/content/drive/MyDrive/Tomato_dataset/cnn_crops'

In [13]:


output_dir = '/content/drive/MyDrive/Tomato_dataset/cnn_crops'

ripe_labels = ['R.healthy', 'R.lateblight', 'R.spots', 'R.pests', 'R.ber']
green_labels = ['G.healthy', 'G.lateblight', 'G.spots', 'G.pests', 'G.ber']

# Example mapping
# csv_image_mapping = [{'csv_path': 'path/to/csv1.csv', 'image_dir': 'path/to/images1'},
#                      {'csv_path': 'path/to/csv2.csv', 'image_dir': 'path/to/images2'}]

# -----------------------------
# LOAD AND FILTER DATA
# -----------------------------
dfs = [pd.read_csv(m['csv_path']).assign(image_dir=m['image_dir']) for m in csv_image_mapping]
data = pd.concat(dfs, ignore_index=True)[
    ['image_name','label_name','bbox_x','bbox_y','bbox_width','bbox_height','image_dir']
]
data = data[data['label_name'].isin(ripe_labels + green_labels)]

# -----------------------------
# CREATE OUTPUT DIRECTORIES IF MISSING
# -----------------------------
for ripeness in ['ripe', 'green']:
    labels = ripe_labels if ripeness == 'ripe' else green_labels
    for label in labels:
        dir_path = os.path.join(output_dir, ripeness, label.replace('.', '_'))
        os.makedirs(dir_path, exist_ok=True)

# -----------------------------
# TRAIN/VAL SPLIT
# -----------------------------
unique_images = data['image_name'].unique()
train_images, val_images = train_test_split(unique_images, test_size=0.2, random_state=42)
train_set, val_set = set(train_images), set(val_images)

# -----------------------------
# PREPARE ROWS FOR THREADING
# -----------------------------
rows = [(r['image_dir'], r['image_name'], r['label_name'], r['bbox_x'], r['bbox_y'], r['bbox_width'], r['bbox_height'])
        for r in data.to_dict(orient='records')]

# -----------------------------
# PROCESS FUNCTION
# -----------------------------
def process_row(row):
    image_dir, image_name, label_name, x, y, w, h = row
    img_path = os.path.join(image_dir, image_name)
    if not os.path.exists(img_path):
        return
    img = cv2.imread(img_path)
    if img is None:
        return
    x, y, w, h = map(int, (x, y, w, h))
    crop = img[y:y+h, x:x+w]
    if crop.size == 0:
        return
    crop = cv2.resize(crop, (224, 224))

    # Determine ripeness and split
    ripeness = 'ripe' if label_name.startswith('R') else 'green'
    split = 'train' if image_name in train_set else 'val'
    label_dir = label_name.replace('.', '_')

    # Check if split folder exists; create if missing
    final_dir = os.path.join(output_dir, ripeness, split, label_dir)
    os.makedirs(final_dir, exist_ok=True)

    # Save crop
    save_path = os.path.join(final_dir, f"{image_name.replace('.jpg', f'_{x}_{y}.jpg')}")
    cv2.imwrite(save_path, crop)

# -----------------------------
# PARALLEL EXECUTION
# -----------------------------
num_threads = 8  # Adjust depending on your CPU
with ThreadPool(num_threads) as pool:
    list(tqdm.tqdm(pool.imap(process_row, rows), total=len(rows)))

print(f"✅ Cropped datasets saved to {output_dir}")


100%|██████████| 748/748 [00:48<00:00, 15.32it/s]

✅ Cropped datasets saved to /content/drive/MyDrive/Tomato_dataset/cnn_crops





In [16]:
import os
import shutil

# Base dataset path
base_dir = "/content/drive/MyDrive/Tomato_dataset/cnn_crops"
backup_dir = os.path.join(base_dir, "removed_classes")

# Ensure backup folder exists
os.makedirs(backup_dir, exist_ok=True)

# Folders to move
to_move = [
    f"{base_dir}/ripe/train/R_lateblight",
    f"{base_dir}/ripe/val/R_lateblight",
    f"{base_dir}/green/train/G_pests",
    f"{base_dir}/green/val/G_pests",
]

# Move entire folders with prefix
for folder in to_move:
    if os.path.exists(folder):
        # Generate prefix from parent folders
        parts = folder.replace(base_dir + "/", "").split(os.sep)  # e.g., ["ripe", "train", "R_pests"]
        prefix = "_".join(parts[:-1])  # "ripe_train"
        folder_name = parts[-1]        # "R_pests"

        dst_folder_name = f"{prefix}_{folder_name}"
        dst_path = os.path.join(backup_dir, dst_folder_name)

        # If a folder with the same name exists, append a counter
        count = 1
        while os.path.exists(dst_path):
            dst_path = os.path.join(backup_dir, f"{dst_folder_name}_{count}")
            count += 1

        shutil.move(folder, dst_path)
        print(f"✅ Moved folder: {folder} → {dst_path}")
    else:
        print(f"⚠️ Folder not found: {folder}")

print("\n✨ Done! Folders moved to 'removed_classes' with prefixes to avoid name conflicts.")


✅ Moved folder: /content/drive/MyDrive/Tomato_dataset/cnn_crops/ripe/train/R_lateblight → /content/drive/MyDrive/Tomato_dataset/cnn_crops/removed_classes/ripe_train_R_lateblight
✅ Moved folder: /content/drive/MyDrive/Tomato_dataset/cnn_crops/ripe/val/R_lateblight → /content/drive/MyDrive/Tomato_dataset/cnn_crops/removed_classes/ripe_val_R_lateblight
⚠️ Folder not found: /content/drive/MyDrive/Tomato_dataset/cnn_crops/green/train/G_pests
⚠️ Folder not found: /content/drive/MyDrive/Tomato_dataset/cnn_crops/green/val/G_pests

✨ Done! Folders moved to 'removed_classes' with prefixes to avoid name conflicts.
