# Introduction
This file is used to train an image segmentation model for plant disease detection. The model is intended for use in an application that assesses plant health in real time and, if the plant is diseased, classifies which disease it has.

## Dataset used
The dataset used is the PlantVillage Dataset from Kaggle. The dataset can be found [here](https://www.kaggle.com/datasets/alexisbcook/plantvillage).

# 1. Dataset
In this section, we will create:
1. the Dataset class.
2. the DataLoaders for the training, validation and test sets.

We will also explore the dataset and visualize the size of each class in the training, validation and test sets.

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
import tqdm
import cv2

## 1.1. Creating a transformation pipeline
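The pipeline consists of the following transformations:
1. RandomRotate90.
2. HorizontalFlip.
3. Transpose.
4. GaussNoise.
5. OneOf (p=0.2): MotionBlur, MedianBlur or Blur.
6. ShiftScaleRotate (shift limit of 0.0625, scale limit of 0.2, rotate limit of 45 degrees, p=0.2).
7. OneOf (p=0.2): OpticalDistortion or GridDistortion.
8. OneOf (p=0.3): CLAHE or RandomBrightnessContrast.
9. HueSaturationValue (p=0.3).
10. Normalize the images.
11. Converting to tensor via ToTensorV2.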

In [None]:
# Albumentations augmentation pipeline, applied jointly to an image and its mask.
# Each randomised stage is built on its own and then composed in order.

# At most one blur variant is picked; the whole group fires with p=0.2.
blur_stage = A.OneOf(
    [
        A.MotionBlur(p=0.2),
        A.MedianBlur(blur_limit=3, p=0.1),
        A.Blur(blur_limit=3, p=0.1),
    ],
    p=0.2,
)

# Small random shift / zoom / rotation (up to 45 degrees).
affine_stage = A.ShiftScaleRotate(
    shift_limit=0.0625,
    scale_limit=0.2,
    rotate_limit=45,
    p=0.2,
)

# One of two lens/grid warps; the whole group fires with p=0.2.
distortion_stage = A.OneOf(
    [
        A.OpticalDistortion(p=0.3),
        A.GridDistortion(p=0.1),
    ],
    p=0.2,
)

# Local contrast equalisation or a global brightness/contrast change (p=0.3).
contrast_stage = A.OneOf(
    [
        A.CLAHE(clip_limit=2),
        A.RandomBrightnessContrast(),
    ],
    p=0.3,
)

# Per-channel standardisation of 0-255 pixel values (the mean/std are the
# commonly used ImageNet channel statistics).
normalize_stage = A.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
    max_pixel_value=255.0,
)

transform = A.Compose(
    [
        # Right-angle rotation, flip and axis swap, then additive noise; all
        # four use the library's default probabilities.
        A.RandomRotate90(),
        A.HorizontalFlip(),
        A.Transpose(),
        A.GaussNoise(),
        blur_stage,
        affine_stage,
        distortion_stage,
        contrast_stage,
        A.HueSaturationValue(p=0.3),
        normalize_stage,
        # Final step: hand the result over as torch tensors.
        ToTensorV2(),
    ],
    additional_targets={'mask': 'mask'},
)

## 1.2. Dataset Class & DataLoaders for the Dataset

In [3]:
import os
from pathlib import Path

In [4]:
# Defining the custom Dataset class
class PlantVillageSegmentationDataset(Dataset):
    """Paired image/mask dataset for binary segmentation.

    Every ``*.jpg`` file found recursively under ``image_dir`` is paired with
    the mask ``<mask_dir>/<same relative folder>/<stem>_final_masked.jpg``.
    Images whose mask file does not exist are left out, so ``image_paths[i]``
    and ``mask_paths[i]`` always refer to the same sample.

    Args:
        image_dir: Root directory that is searched recursively for images.
        mask_dir: Root directory of the masks; it must mirror the folder
            layout of ``image_dir``.
        transform: Optional callable invoked as
            ``transform(image=..., mask=...)`` that returns a mapping with
            ``"image"`` and ``"mask"`` entries (for example an Albumentations
            ``Compose``).
    """

    def __init__(self, image_dir, mask_dir, transform=None):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform

        image_root = Path(image_dir)
        mask_root = Path(mask_dir)

        # Build both lists together instead of removing entries from a list
        # that is being iterated: removal during iteration skips the element
        # that follows each removed one and leaves the two lists misaligned.
        self.image_paths = []
        self.mask_paths = []
        for img_path in sorted(image_root.rglob("*.jpg")):
            rel_dir = img_path.relative_to(image_root).parent
            mask_path = mask_root / rel_dir / f"{img_path.stem}_final_masked.jpg"
            if not mask_path.exists():
                # No mask for this image: skip the sample entirely.
                continue
            self.image_paths.append(img_path)
            self.mask_paths.append(mask_path)

    def __len__(self):
        """Return the number of image/mask pairs."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, mask)``.

        The image is read as RGB and the mask as single-channel greyscale.
        When a transform is set, both are passed through it. The mask is
        always returned as a float tensor holding 1.0 where the (transformed)
        mask pixel is greater than zero and 0.0 elsewhere. The image is
        returned as produced by the transform, or as the raw RGB NumPy array
        when no transform is set.
        """
        # Context managers release the file handles as soon as the pixel
        # data has been copied into NumPy arrays.
        with Image.open(self.image_paths[idx]) as img_file:
            image = np.array(img_file.convert("RGB"))
        with Image.open(self.mask_paths[idx]) as mask_file:
            mask = np.array(mask_file.convert("L"))

        if self.transform is not None:
            transformed = self.transform(image=image, mask=mask)
            image = transformed["image"]
            mask = transformed["mask"]

        # torch.as_tensor accepts both a NumPy array (no transform, or a
        # transform that returns arrays) and an existing tensor, so the
        # binarisation works in every case.
        # NOTE(review): the masks are JPEG files, so compression artefacts may
        # leave small non-zero background values that `> 0` counts as
        # foreground; confirm whether a higher threshold is needed.
        mask = (torch.as_tensor(mask) > 0).float()
        return image, mask


- Adding the dataset file locations

In [5]:
# Dataset locations on the local Windows machine.
# Raw strings keep every backslash literal: sequences such as "\W", "\A" and
# "\d" are invalid escapes in a normal string literal and trigger a
# SyntaxWarning on recent Python versions.
dataset_loc = r"D:\Work Projects\AI & ML Projects\Plants-Disease-Detection---Identification\data"
# Sub-folders of the dataset root, derived from it so the root is written once.
color_full_images_dir = rf"{dataset_loc}\color"
segmented_images_dir = rf"{dataset_loc}\segmented"

In [6]:
# Deterministic preprocessing for evaluation: the same normalisation and
# tensor conversion as the training pipeline, but without any random
# augmentation, so validation and test metrics are reproducible.
eval_transform = A.Compose([
    A.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
    ),
    ToTensorV2(),
])

# Dataset instances for the train, validation and test splits.
# Only the training split uses the random augmentation pipeline.
train_dataset = PlantVillageSegmentationDataset(
    image_dir=os.path.join(color_full_images_dir, 'train'),
    mask_dir=os.path.join(segmented_images_dir, 'train'),
    transform=transform
)

val_dataset = PlantVillageSegmentationDataset(
    image_dir=os.path.join(color_full_images_dir, 'valid'),
    mask_dir=os.path.join(segmented_images_dir, 'valid'),
    transform=eval_transform
)

test_dataset = PlantVillageSegmentationDataset(
    image_dir=os.path.join(color_full_images_dir, 'test'),
    mask_dir=os.path.join(segmented_images_dir, 'test'),
    transform=eval_transform
)

# Report the number of image/mask pairs in each split
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")

Train Dataset Size: 42473
Validation Dataset Size: 5296
Test Dataset Size: 5338


In [7]:
# One DataLoader per split: batches of 16 samples loaded by 4 worker
# processes. Only the training split is reshuffled every epoch.
# NOTE(review): on Windows, worker processes are spawned and must be able to
# import the dataset class; confirm that num_workers=4 works when the class is
# defined inside a notebook.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=16, shuffle=reshuffle, num_workers=4)
    for split, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [None]:
# Smoke test: fetch only the first batch from the test loader and show the
# shapes of its image and mask tensors. Nothing is printed for an empty loader.
first_batch = next(iter(test_loader), None)
if first_batch is not None:
    images, masks = first_batch
    print(
        f"Image batch shape: {images.shape}",
        f"Mask batch shape: {masks.shape}",
        sep="\n",
    )