#start


In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used by the
# following cells all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os
import cv2
from sklearn.model_selection import train_test_split

# Annotation sources: each entry pairs one annotation CSV ('csv_path') with
# the folder that holds the images it refers to ('image_dir').
csv_image_mapping = [
    {'csv_path': csv_path, 'image_dir': image_dir}
    for csv_path, image_dir in (
        (
            '/content/drive/MyDrive/Tomato_dataset/phone1(ripe)/phone1(ripe).csv',
            '/content/drive/MyDrive/Tomato_dataset/phone1(ripe)',
        ),
        (
            '/content/drive/MyDrive/Tomato_dataset/Phone1(green)/phone1(green).csv',
            '/content/drive/MyDrive/Tomato_dataset/Phone1(green)',
        ),
        (
            '/content/drive/MyDrive/Tomato_dataset/phone2(batch1)/annotations/phone2(batch1).csv',
            '/content/drive/MyDrive/Tomato_dataset/phone2(batch1)',
        ),
        (
            '/content/drive/MyDrive/Tomato_dataset/phone2(batch2)/annotations/phone2(batch2).csv',
            '/content/drive/MyDrive/Tomato_dataset/phone2(batch2)',
        ),
    )
]

# Destination folder for the cropped images.
output_dir = '/content/drive/MyDrive/Tomato_dataset/cnn_crops'

In [5]:
import os
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split
from multiprocessing.dummy import Pool as ThreadPool
import tqdm

# -----------------------------
# CONFIG
# -----------------------------
# Folder the crops are written to (a relative path, so it resolves against
# the current working directory). This assignment replaces any output_dir
# set by an earlier cell.
output_dir = 'cropped_dataset'

# Labels kept from the annotation CSVs: the same set of conditions, once
# with the 'R.' prefix and once with the 'G.' prefix.
_conditions = ('healthy', 'lateblight', 'spots', 'pests', 'ber')
ripe_labels = ['R.' + condition for condition in _conditions]
green_labels = ['G.' + condition for condition in _conditions]

# csv_image_mapping must already be defined before this cell runs: a list of
# dictionaries, each mapping one annotation CSV to its image directory, e.g.
# csv_image_mapping = [{'csv_path': 'path/to/csv1.csv', 'image_dir': 'path/to/images1'},
#                      {'csv_path': 'path/to/csv2.csv', 'image_dir': 'path/to/images2'}]

# -----------------------------
# LOAD AND FILTER DATA
# -----------------------------
# Read each annotation CSV and tag its rows with the directory that holds
# the corresponding images.
dfs = []
for mapping in csv_image_mapping:
    annotations = pd.read_csv(mapping['csv_path'])
    dfs.append(annotations.assign(image_dir=mapping['image_dir']))

# Stack all sources into one frame and keep only the columns used below.
columns_needed = [
    'image_name', 'label_name',
    'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height',
    'image_dir',
]
data = pd.concat(dfs, ignore_index=True)[columns_needed]

# Discard annotations whose label is not one of the configured classes.
known_labels = ripe_labels + green_labels
data = data[data['label_name'].isin(known_labels)]

# -----------------------------
# CREATE OUTPUT DIRECTORIES
# -----------------------------
# Layout: <output_dir>/<ripeness>/<split>/<label with '.' replaced by '_'>
for ripeness, labels in (('ripe', ripe_labels), ('green', green_labels)):
    for split in ('train', 'val'):
        for label in labels:
            class_dir = os.path.join(output_dir, ripeness, split, label.replace('.', '_'))
            # exist_ok=True keeps the cell safe to re-run.
            os.makedirs(class_dir, exist_ok=True)

# -----------------------------
# TRAIN/VAL SPLIT
# -----------------------------
# The split is made over image names rather than over annotation rows, so
# every box cropped from one image name lands in the same split.
unique_images = data['image_name'].unique()
train_images, val_images = train_test_split(
    unique_images,
    test_size=0.2,
    random_state=42,  # fixed seed: the same split on every run
)

# Sets give constant-time membership checks when routing each crop.
train_set = set(train_images)
val_set = set(val_images)

# -----------------------------
# PREPARE ROWS FOR THREADING
# -----------------------------
# Flatten the frame into plain tuples, one per annotation, in the field
# order that process_row unpacks.
row_fields = (
    'image_dir', 'image_name', 'label_name',
    'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height',
)
rows = [
    tuple(record[field] for field in row_fields)
    for record in data.to_dict(orient='records')
]

# -----------------------------
# PROCESS FUNCTION
# -----------------------------
def process_row(row):
    """Crop one annotated bounding box, resize it to 224x224 and save it.

    Parameters
    ----------
    row : tuple
        ``(image_dir, image_name, label_name, x, y, w, h)``. ``x``/``y`` are
        used as the column/row where the box starts and ``w``/``h`` as its
        width/height, all in pixels of the source image.

    Returns
    -------
    None
        The crop is written to
        ``<output_dir>/<ripe|green>/<train|val>/<label>/<stem>_<x>_<y><ext>``.
        The row is skipped silently when the image file is missing or cannot
        be decoded, when a box value cannot be converted to ``int`` (e.g.
        NaN), or when the box does not overlap the image.
    """
    image_dir, image_name, label_name, x, y, w, h = row
    img_path = os.path.join(image_dir, image_name)
    if not os.path.exists(img_path):
        return
    img = cv2.imread(img_path)
    if img is None:
        return

    # A NaN/inf/None coordinate would otherwise raise and abort the whole
    # pool run; treat it like any other unusable annotation and skip it.
    try:
        x, y, w, h = map(int, (x, y, w, h))
    except (TypeError, ValueError, OverflowError):
        return

    # Clamp the box to the image. A negative start index would otherwise be
    # interpreted by NumPy as "count from the end" and select the wrong
    # region (or nothing at all).
    img_h, img_w = img.shape[:2]
    left, top = max(x, 0), max(y, 0)
    right, bottom = min(x + w, img_w), min(y + h, img_h)
    if right <= left or bottom <= top:
        return

    crop = cv2.resize(img[top:bottom, left:right], (224, 224))

    ripeness = 'ripe' if label_name.startswith('R') else 'green'
    label_dir = label_name.replace('.', '_')
    split = 'train' if image_name in train_set else 'val'

    # Build the crop name from the real extension, whatever it is. Replacing
    # the literal '.jpg' left names such as 'a.JPG' or 'a.png' unchanged, so
    # every box of such an image overwrote the same output file.
    stem, ext = os.path.splitext(image_name)
    crop_name = f"{stem}_{x}_{y}{ext or '.jpg'}"
    save_path = os.path.join(output_dir, ripeness, split, label_dir, crop_name)
    cv2.imwrite(save_path, crop)

# -----------------------------
# PARALLEL EXECUTION
# -----------------------------
num_threads = 8  # Adjust depending on your CPU

with ThreadPool(num_threads) as pool:
    # imap yields results in input order as they complete; draining the
    # iterator is what drives both the work and the progress bar.
    progress = tqdm.tqdm(pool.imap(process_row, rows), total=len(rows))
    for _finished in progress:
        pass

print(f"✅ Cropped datasets saved to {output_dir}")


100%|██████████| 4451/4451 [09:47<00:00,  7.57it/s]

✅ Cropped datasets saved to cropped_dataset



