# Plant Disease Dataset Creation


## Get the PlantVillage dataset

In [None]:
# Get the PlantVillage-Dataset from github repo
!git clone https://github.com/spMohanty/PlantVillage-Dataset

Cloning into 'PlantVillage-Dataset'...
remote: Enumerating objects: 163235, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 163235 (delta 2), reused 1 (delta 0), pack-reused 163229 (from 1)[K
Receiving objects: 100% (163235/163235), 2.00 GiB | 21.62 MiB/s, done.
Resolving deltas: 100% (101/101), done.
Updating files: 100% (182401/182401), done.


In [None]:
# Set up the folder that will hold the dataset on disk
from pathlib import Path

data_dir = Path("data/")
plantvillage_dir = data_dir.joinpath("PlantVillage-Dataset")
# parents=True also creates data/ when it is missing; exist_ok=True keeps re-runs from failing
plantvillage_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Move the color leaf images (the clone's raw/color folder) into the data directory,
# then delete everything else that came with the PlantVillage git clone.
# NOTE(review): this comment used to describe the images as "segmented", but the path
# moved below is raw/color - confirm which image variant is intended.
!mv PlantVillage-Dataset/raw/color/ data/PlantVillage-Dataset
!rm -r PlantVillage-Dataset/

## Create Train and Test Folders

In [None]:
import random
import os
color_dir = plantvillage_dir / "color/"
random.seed(42)

# Get the class folder names. os.listdir() returns entries in arbitrary,
# filesystem-dependent order, so sort them: the seeded sampling done later is only
# reproducible when the folders are always visited in the same order.
# Non-directory entries (stray files) are skipped so they are not treated as classes.
img_dirs = sorted(
    name for name in os.listdir(color_dir)
    if os.path.isdir(os.path.join(color_dir, name))
)
img_dirs[:5]


['Tomato___Early_blight',
 'Tomato___Target_Spot',
 'Apple___Apple_scab',
 'Potato___Early_blight',
 'Potato___Late_blight']

In [None]:
import shutil
# fraction of each class's images to hold out for testing
test_pct = 0.2

train_dir = plantvillage_dir / "train"
test_dir = plantvillage_dir / "test"
# rename the color directory to the train directory (skipped when already done)
if not train_dir.exists():
  os.rename(color_dir, train_dir)
# create the test directory; exist_ok makes this a no-op when it is already there
test_dir.mkdir(exist_ok=True, parents=True)

# Dedicated seeded generator: the split does not depend on how many random
# numbers other cells have already drawn from the global `random` state.
split_rng = random.Random(42)

# visit the classes in sorted order so the seeded sampling is reproducible
for folder_name in sorted(img_dirs):
  # create train and test image directory paths based on folder names
  train_image_dir = train_dir / folder_name
  test_image_dir = test_dir / folder_name
  # create a new folder for the images within the test directory
  test_image_dir.mkdir(exist_ok=True, parents=True)
  # Re-running this cell would otherwise move another test_pct of the remaining
  # training images into test, so skip classes that were already split.
  if any(test_image_dir.iterdir()):
    print(f"Skipping {folder_name}: test folder already contains images")
    continue
  # glob() order is filesystem-dependent; sort so the same images are picked on every run
  image_paths = sorted(p for p in train_image_dir.glob("*") if p.is_file())
  # number of images to hold out for this class (rounded down)
  num_samples = int(len(image_paths) * test_pct)
  print(f"Grabbing {num_samples} images from {len(image_paths)} available in {folder_name}")
  test_image_paths = split_rng.sample(image_paths, k=num_samples)
  # move sampled paths to destination
  for path in test_image_paths:
    # str() arguments work on every Python 3 version; path-like sources are only
    # handled reliably by shutil.move from Python 3.9 on
    shutil.move(str(path), str(test_image_dir))


Grabbing 200 images from 1000 available in Tomato___Early_blight
200
Grabbing 280 images from 1404 available in Tomato___Target_Spot
280
Grabbing 126 images from 630 available in Apple___Apple_scab
126
Grabbing 200 images from 1000 available in Potato___Early_blight
200
Grabbing 200 images from 1000 available in Potato___Late_blight
200
Grabbing 190 images from 952 available in Tomato___Leaf_Mold
190
Grabbing 30 images from 152 available in Potato___healthy
30
Grabbing 276 images from 1383 available in Grape___Esca_(Black_Measles)
276
Grabbing 72 images from 360 available in Peach___healthy
72
Grabbing 381 images from 1909 available in Tomato___Late_blight
381
Grabbing 1071 images from 5357 available in Tomato___Tomato_Yellow_Leaf_Curl_Virus
1071
Grabbing 1101 images from 5507 available in Orange___Haunglongbing_(Citrus_greening)
1101
Grabbing 55 images from 275 available in Apple___Cedar_apple_rust
55
Grabbing 238 images from 1192 available in Corn_(maize)___Common_rust_
238
Grabbing 

In [None]:
# While using this data from Hugging Face I found a PNG among the images,
# so make sure every image is saved as a JPEG here.
# (There was one PNG in the whole dataset :/ )
from PIL import Image


def convert_pngs_to_jpg(split_dir):
  """Re-encode every ``.png`` file found in ``split_dir/<class>/`` as an RGB JPEG.

  Args:
    split_dir: Directory whose sub-folders each hold the images of one class.

  Returns:
    The number of PNG files that were converted.
  """
  converted = 0
  for class_name in sorted(os.listdir(split_dir)):
    class_dir = os.path.join(split_dir, class_name)
    # ignore stray files sitting next to the class folders
    if not os.path.isdir(class_dir):
      continue
    for name in sorted(os.listdir(class_dir)):
      if not name.lower().endswith(".png"):
        continue
      print(f"PNG found in {class_dir}")
      img_path = os.path.join(class_dir, name)
      new_path = os.path.join(class_dir, os.path.splitext(name)[0] + ".jpg")
      # the context manager closes the source file once the JPEG has been written
      with Image.open(img_path) as img:
        img.convert("RGB").save(new_path, "JPEG")
      # delete the PNG only after the JPEG exists, so a failed save cannot lose the image
      os.remove(img_path)
      converted += 1
  return converted


# the same clean-up applies to both splits
for split_dir in (train_dir, test_dir):
  convert_pngs_to_jpg(split_dir)


PNG found in data/PlantVillage-Dataset/train/Pepper,_bell___healthy


In [None]:
# Build a Hugging Face dataset from the image folders on disk
from datasets import load_dataset

# "imagefolder" is pointed at the dataset root, which holds the train/ and test/ folders
dataset = load_dataset(
    "imagefolder",
    data_dir=plantvillage_dir,
)
dataset

Resolving data files:   0%|          | 0/43456 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/10849 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/43456 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/10849 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 43456
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10849
    })
})

In [None]:
# Interactive Hugging Face login (renders a login widget in the notebook);
# run this before pushing the dataset to the Hub.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the train and test splits to the Hugging Face Hub.
# repo_id carries no namespace prefix; in the recorded output the repo ended up at
# BrandonFors/Plant-Diseases-PlantVillage-Dataset.
dataset.push_to_hub(repo_id="Plant-Diseases-PlantVillage-Dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Map:   0%|          | 0/21728 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/218 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  10%|#         | 33.4MB /  321MB            

Map:   0%|          | 0/21728 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/218 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   7%|6         | 25.1MB /  362MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/10849 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/109 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  25%|##4       | 41.9MB /  170MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/BrandonFors/Plant-Diseases-PlantVillage-Dataset/commit/96bdc27bd804297b76f373cba1c935e52076e107', commit_message='Upload dataset', commit_description='', oid='96bdc27bd804297b76f373cba1c935e52076e107', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BrandonFors/Plant-Diseases-PlantVillage-Dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BrandonFors/Plant-Diseases-PlantVillage-Dataset'), pr_revision=None, pr_num=None)

## Download Images for Gradio HF Space

In [None]:
# Folder that collects the sample images for the Gradio instance
example_image_dir = Path("examples/")
# parents/exist_ok make the call succeed whether or not the folder is already there
example_image_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# set the random seed for reproducibility
random.seed(42)
# Iterate through the class names in sorted order: os.listdir() order is
# filesystem-dependent, which would make the seeded choice differ between machines.
for class_name in sorted(os.listdir(test_dir)):
  class_dir = os.path.join(test_dir, class_name)
  # ignore stray files sitting next to the class folders
  if not os.path.isdir(class_dir):
    continue
  # get the names of the images from the class directory (sorted for the same reason)
  names = sorted(os.listdir(class_dir))
  # random.sample raises ValueError on an empty list, so skip empty classes explicitly
  if not names:
    print(f"No images found in {class_dir}, skipping")
    continue
  # get a random name
  name = random.sample(names, k=1)[0]
  # create a path to the selected image
  image_path = test_dir / class_name / name
  # Copy straight to "<class_name>.JPG" so the class is identifiable from the file name.
  # Copying under the original name and renaming afterwards would leave a stray file
  # in the examples folder if the cell were interrupted between the two steps.
  shutil.copy(src=image_path, dst=example_image_dir / f"{class_name}.JPG")


In [None]:
# Zip the contents of examples/ into examples.zip next to the examples folder.
# zip is run from inside the folder, so archive entries carry no "examples/" prefix;
# the -x patterns exclude Python bytecode, notebooks and their cache/checkpoint folders.
!cd ./examples && zip -r ../examples.zip * -x "*.pyc" "*.ipynb" "*__pycache__*" "*ipynb_checkpoints*"

  adding: Apple___Apple_scab.JPG (deflated 1%)
  adding: Apple___Black_rot.JPG (deflated 1%)
  adding: Apple___Cedar_apple_rust.JPG (deflated 2%)
  adding: Apple___healthy.JPG (deflated 2%)
  adding: Blueberry___healthy.JPG (deflated 1%)
  adding: Cherry_(including_sour)___healthy.JPG (deflated 2%)
  adding: Cherry_(including_sour)___Powdery_mildew.JPG (deflated 2%)
  adding: Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot.JPG (deflated 1%)
  adding: Corn_(maize)___Common_rust_.JPG (deflated 2%)
  adding: Corn_(maize)___healthy.JPG (deflated 2%)
  adding: Corn_(maize)___Northern_Leaf_Blight.JPG (deflated 1%)
  adding: Grape___Black_rot.JPG (deflated 1%)
  adding: Grape___Esca_(Black_Measles).JPG (deflated 1%)
  adding: Grape___healthy.JPG (deflated 2%)
  adding: Grape___Leaf_blight_(Isariopsis_Leaf_Spot).JPG (deflated 1%)
  adding: Orange___Haunglongbing_(Citrus_greening).JPG (deflated 2%)
  adding: Peach___Bacterial_spot.JPG (deflated 2%)
  adding: Peach___healthy.JPG (deflated 1%)

In [None]:
# download the folder zip
from google.colab import files
files.download("examples.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>