Leaf model


In [6]:
!pip install ultralytics opencv-python -q
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import numpy as np
import os
import shutil
from PIL import Image as PILImage
from sklearn.model_selection import train_test_split



In [8]:
#CSV paths and their corresponding image directories (modify these as needed)
csv_image_mapping = [
    {
        'csv_path': '/content/drive/MyDrive/Tomato_dataset/Phone1_Leaves(batch3)/annotations/phone 1_leaves(batch3).csv',
        'image_dir': '/content/drive/MyDrive/Tomato_dataset/Phone1_Leaves(batch3)'
    },
    {'csv_path': '/content/drive/MyDrive/Tomato_dataset/Phone2_leaves/annotations/Phone2_leaves.csv',
        'image_dir': '/content/drive/MyDrive/Tomato_dataset/Phone2_leaves'
    },
     {'csv_path': '/content/drive/MyDrive/Tomato_dataset/phone1_leaves(batch1)/annotations/phone1_leaves(batch1).csv',
        'image_dir': '/content/drive/MyDrive/Tomato_dataset/phone1_leaves(batch1)'
    },
     {'csv_path': '/content/drive/MyDrive/Tomato_dataset/phone1_leaves(batch2)/annotations/phone1_leaves(batch2).csv',
        'image_dir': '/content/drive/MyDrive/Tomato_dataset/phone1_leaves(batch2)'
    }
    ]

In [9]:
# Get annotated image names from CSVs
annotated_images = set()
for mapping in csv_image_mapping:
    try:
        df = pd.read_csv(mapping['csv_path'])
        annotated_images.update(df['image_name'].unique())
    except Exception as e:
        print(f"Error reading {mapping['csv_path']}: {e}")

# Get all images from directories
all_images = set()
for mapping in csv_image_mapping:
    try:
        images = [f for f in os.listdir(mapping['image_dir']) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        all_images.update(images)
    except Exception as e:
        print(f"Error accessing {mapping['image_dir']}: {e}")

# Calculate annotated and unannotated counts
unannotated_images = all_images - annotated_images
print(f"Annotated images: {len(annotated_images)}")
print(f"Unannotated images: {len(unannotated_images)}")
if unannotated_images:
    print("Unannotated images:", unannotated_images)

Annotated images: 515
Unannotated images: 96
Unannotated images: {'IMG_20250906_125929.jpg', 'IMG_20250906_131711.jpg', 'IMG_20250906_142801.jpg', 'IMG_20250906_143945.jpg', 'IMG_20250906_124418.jpg', 'IMG_20250906_124435.jpg', 'IMG_20250906_125521.jpg', 'IMG_20250906_132738.jpg', 'IMG_20250906_130644.jpg', 'IMG_20250906_124244.jpg', 'IMG_20250906_124436_HDR.jpg', 'IMG_20250906_132850.jpg', 'IMG_20250906_125900.jpg', 'IMG_20250906_131014.jpg', 'IMG_20250906_125958.jpg', 'IMG_20250906_130451_HDR.jpg', 'IMG_20250906_130050.jpg', 'IMG_20250906_130331.jpg', 'IMG_20250906_130656.jpg', 'IMG_20250906_124904.jpg', 'IMG_20250906_125935.jpg', 'IMG_20250906_114110_HDR.jpg', 'IMG_20250906_131001.jpg', 'IMG_20250906_132820.jpg', 'IMG_20250906_125136.jpg', 'IMG_20250906_125544.jpg', 'IMG_20250906_131450.jpg', 'IMG_20250906_125938.jpg', 'IMG_20250906_125948.jpg', 'IMG_20250906_124906.jpg', 'IMG_20250906_130557.jpg', 'IMG_20250906_130304.jpg', 'IMG_20250906_124803.jpg', 'IMG_20250906_124448_HDR.jpg', 

In [10]:
# Check annotated vs unannotated per folder
for mapping in csv_image_mapping:
    csv_path = mapping['csv_path']
    image_dir = mapping['image_dir']
    folder_name = os.path.basename(image_dir)

    try:
        # Load annotated image names
        df = pd.read_csv(csv_path)
        annotated = set(df['image_name'].unique())
    except Exception as e:
        print(f"Error reading CSV for {folder_name}: {e}")
        annotated = set()

    try:
        # Get all images
        all_imgs = {f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))}
    except Exception as e:
        print(f"Error accessing images in {folder_name}: {e}")
        all_imgs = set()

    # Compare
    unannotated = all_imgs - annotated

    print(f"\n Folder: {folder_name}")
    print(f"  Total images:       {len(all_imgs)}")
    print(f"  Annotated images:   {len(annotated)}")
    print(f"  Unannotated images: {len(unannotated)}")

    if unannotated:
        print("   Unannotated samples:", list(unannotated)[:5], "..." if len(unannotated) > 5 else "")


 Folder: Phone1_Leaves(batch3)
  Total images:       113
  Annotated images:   96
  Unannotated images: 17
   Unannotated samples: ['IMG_20250906_124418.jpg', 'IMG_20250906_124503.jpg', 'IMG_20250906_124905.jpg', 'IMG_20250906_125223.jpg', 'IMG_20250906_124803.jpg'] ...

 Folder: Phone2_leaves
  Total images:       218
  Annotated images:   191
  Unannotated images: 27
   Unannotated samples: ['IMG_20250906_125948.jpg', 'IMG_20250906_124906.jpg', 'IMG_20250906_132222_HDR.jpg', 'IMG_20250906_142203.jpg', 'IMG_20250906_143119_HDR.jpg'] ...

 Folder: phone1_leaves(batch1)
  Total images:       150
  Annotated images:   139
  Unannotated images: 11
   Unannotated samples: ['IMG_20250906_132820.jpg', 'IMG_20250906_132126.jpg', 'IMG_20250906_131710.jpg', 'IMG_20250906_132738.jpg', 'IMG_20250906_131713.jpg'] ...

 Folder: phone1_leaves(batch2)
  Total images:       150
  Annotated images:   106
  Unannotated images: 44
   Unannotated samples: ['IMG_20250906_125929.jpg', 'IMG_20250906_125938.

In [11]:

dfs = []
for mapping in csv_image_mapping:
    try:
        df = pd.read_csv(mapping['csv_path'])
        df['image_dir'] = mapping['image_dir']  # Add image_dir column
        dfs.append(df)
    except Exception as e:
        print(f"Error reading {mapping['csv_path']}: {e}")
        raise

# Concatenate DataFrames in memory for processing (not saving)
data = pd.concat(dfs, ignore_index=True)

# Ensure correct columns
expected_columns = ['image_name', 'label_name', 'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height', 'image_width', 'image_height', 'image_dir']
data = data[expected_columns]

# Check for missing or invalid label_name values
print("Missing label_name values:", data['label_name'].isna().sum())
print("Unique label_name values:", data['label_name'].unique())
data = data.dropna(subset=['label_name'])
data = data[data['label_name'].apply(lambda x: isinstance(x, str))]

# Extract binary class (G or R)
data['binary_class'] = data['label_name'].apply(lambda x: x.split('.')[0])
print("Unique binary classes:", data['binary_class'].unique())
if not all(label in ['G', 'R'] for label in data['binary_class'].unique()):
    print("Warning: Unexpected binary classes:", data['binary_class'].unique())

# Check class balance
print("Class distribution:")
print(data['binary_class'].value_counts())

# Validate bounding boxes
invalid_bboxes = data[
    (data['bbox_width'] <= 0) |
    (data['bbox_height'] <= 0) |
    (data['bbox_x'] < 0) |
    (data['bbox_y'] < 0) |
    (data['bbox_x'] + data['bbox_width'] > data['image_width']) |
    (data['bbox_y'] + data['bbox_height'] > data['image_height'])
]
if not invalid_bboxes.empty:
    print("Invalid bounding boxes:", invalid_bboxes[['image_name', 'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height']])
    data = data[~data.index.isin(invalid_bboxes.index)]

# Get unique filenames and split into train/val
unique_filenames = data['image_name'].unique()
train_filenames, val_filenames = train_test_split(unique_filenames, test_size=0.2, random_state=42)

# Create dataset directories
dataset_dir = '/content/dataset'
images_train_dir = os.path.join(dataset_dir, 'images', 'train')
images_val_dir = os.path.join(dataset_dir, 'images', 'val')
labels_train_dir = os.path.join(dataset_dir, 'labels', 'train')
labels_val_dir = os.path.join(dataset_dir, 'labels', 'val')
os.makedirs(images_train_dir, exist_ok=True)
os.makedirs(images_val_dir, exist_ok=True)
os.makedirs(labels_train_dir, exist_ok=True)
os.makedirs(labels_val_dir, exist_ok=True)

# Class mapping
class_map = {'G': 0, 'R': 1}

Missing label_name values: 0
Unique label_name values: ['Flowers' 'Leaf_spot' 'Y.leaf_curl' 'Wilt' 'Early_blight' 'Late_blight'
 'Late _blight' 'Pests' 'Early-blight' 'Powdery_mildew' 'flowers']
Unique binary classes: ['Flowers' 'Leaf_spot' 'Y' 'Wilt' 'Early_blight' 'Late_blight'
 'Late _blight' 'Pests' 'Early-blight' 'Powdery_mildew' 'flowers']
 'Late _blight' 'Pests' 'Early-blight' 'Powdery_mildew' 'flowers']
Class distribution:
binary_class
Flowers           602
Y                 447
flowers           163
Powdery_mildew    121
Wilt              114
Early_blight      105
Late_blight        94
Leaf_spot          79
Late _blight       72
Pests              36
Early-blight       11
Name: count, dtype: int64


In [13]:
# Create YOLO label file
def create_yolo_label(filename, split_dir, is_train=True):
    annotations = data[data['image_name'] == filename]
    if annotations.empty:
        print(f"No annotations for {filename}")
        return False

    image_dir = annotations['image_dir'].iloc[0]
    image_path = os.path.join(image_dir, filename)
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        return False

    width = annotations['image_width'].iloc[0]
    height = annotations['image_height'].iloc[0]
    dest_image_dir = images_train_dir if is_train else images_val_dir
    dest_image_path = os.path.join(dest_image_dir, filename)
    shutil.copy(image_path, dest_image_path)

    label_path = os.path.join(labels_train_dir if is_train else labels_val_dir, filename.replace('.jpg', '.txt'))
    valid_annotations = 0
    with open(label_path, 'w') as f:
        for _, ann in annotations.iterrows():
            cls = class_map[ann['binary_class']]
            x_center = (ann['bbox_x'] + ann['bbox_width'] / 2) / width
            y_center = (ann['bbox_y'] + ann['bbox_height'] / 2) / height
            w = ann['bbox_width'] / width
            h = ann['bbox_height'] / height
            if 0 <= x_center <= 1 and 0 <= y_center <= 1 and 0 < w <= 1 and 0 < h <= 1:
                f.write(f"{cls} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n")
                valid_annotations += 1
            else:
                print(f"Skipping invalid bbox for {filename}: {x_center}, {y_center}, {w}, {h}")
    print(f"Created label file: {label_path} with {valid_annotations} annotations")
    return valid_annotations > 0

In [14]:
# Create labels and oversample minority class
train_success = []
val_success = []
for filename in train_filenames:
    train_success.append(create_yolo_label(filename, 'train', is_train=True))
ripe_images = data[data['binary_class'] == 'R']['image_name'].unique()
for filename in ripe_images:
    if filename in train_filenames:
        train_success.append(create_yolo_label(filename, 'train', is_train=True))
for filename in val_filenames:
    val_success.append(create_yolo_label(filename, 'val', is_train=False))

# Verify dataset
print(f"Train images: {len(os.listdir(images_train_dir))} in {images_train_dir}")
print(f"Val images: {len(os.listdir(images_val_dir))} in {images_val_dir}")
print(f"Train labels: {len(os.listdir(labels_train_dir))} in {labels_train_dir}")
print(f"Val labels: {len(os.listdir(labels_val_dir))} in {labels_val_dir}")
print(f"Dataset prepared with {sum(train_success)} train images and {sum(val_success)} val images with valid annotations.")

KeyError: 'Late_blight'