# **Data Collection**

## Objectives

* Set working directory
* Fetch data from Kaggle and prepare it
* Clean data
* Split data


## Inputs

Kaggle JSON file - authentication token 

## Outputs

Generates a dataset that is split into train, validation and test folders

# Import

In [14]:
! pip install -r /workspace/ML5-Mildew-Detection/requirements.txt



In [1]:
import numpy
import os


# Change working directory

* The notebooks are located in a subfolder, therefore when running a notebook we need to change the working directory

Changing working directory to parent folder

In [2]:
current_dir = os.getcwd()
current_dir

'/workspace/ML5-Mildew-Detection/jupyter_notebooks'

Parent of this folder becomes new directory

In [3]:
# Move up to the project root only while we are still inside the notebooks
# sub-folder. Deriving the parent from the current location (instead of a
# hard-coded absolute path) keeps the cell working on any checkout location,
# and the guard makes the cell safe to re-run.
if os.path.basename(os.getcwd()) == 'jupyter_notebooks':
    os.chdir(os.path.dirname(os.getcwd()))
print("You set a new current directory")

You set a new current directory


Checking if we set up our new working directory

In [4]:
current_dir = os.getcwd()
current_dir

'/workspace/ML5-Mildew-Detection'

# Install Kaggle

In [5]:
!pip install kaggle



Change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON

In [6]:
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

Download dataset from Kaggle

In [7]:
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherryleaves_data/cherry-leaves/cut"   
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading cherry-leaves.zip to inputs/cherryleaves_data/cherry-leaves/cut
 98%|█████████████████████████████████████▎| 54.0M/55.0M [00:01<00:00, 31.1MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:02<00:00, 28.8MB/s]


Unzip downloaded file and delete zip file

In [8]:
import zipfile
# Unpack the downloaded archive into the destination folder, then delete
# the archive so only the extracted files remain.
zip_path = DestinationFolder + '/cherry-leaves.zip'
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(DestinationFolder)

os.remove(zip_path)

# Data Preparation


## Data Cleaning

Check and remove all non-image files

In [9]:
def remove_non_image_file(my_data_dir):
    """Delete every non-image file found in the class folders of a dataset.

    Each sub-folder of ``my_data_dir`` is scanned. A file is kept when its
    name ends (case-insensitively) with ``.png``, ``.jpg`` or ``.jpeg``;
    any other regular file is deleted. A two-line summary with the number
    of image and non-image files is printed for every folder.

    Args:
        my_data_dir: Path of the directory that contains one sub-folder
            per class label.
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        # A stray file next to the class folders (e.g. a leftover archive)
        # is not a class folder; listing it would raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        non_image_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            # os.remove cannot delete directories, so nested folders are
            # left untouched and are not counted.
            if not os.path.isfile(file_location):
                continue
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                os.remove(file_location)  # remove non image file
                non_image_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", non_image_count)

In [10]:
remove_non_image_file(my_data_dir='inputs/cherryleaves_data/cherry-leaves/cut/cherry-leaves')

Folder: healthy - has image file 2104
Folder: healthy - has non-image file 0
Folder: powdery_mildew - has image file 2104
Folder: powdery_mildew - has non-image file 0


In [15]:
# from PIL import Image
# import os

# # Set the directory path to the input images
# input_dir = "inputs/cherryleaves_data/cherry-leaves/cut/cherry-leaves/powdery_mildew"

# # Set the directory path to save the cropped images
# output_dir = "inputs/cherryleaves_data/cherry-leaves/cut/cherry-leaves/powdery_mildew_cut"

# # Create the output directory if it doesn't exist
# if not os.path.exists(output_dir):
#     os.mkdir(output_dir)

# # Get a list of all the image files in the input directory
# image_files = os.listdir(input_dir)

# # Iterate through each image file
# for image_file in image_files:
#     # Open the image file
#     image_path = os.path.join(input_dir, image_file)
#     image = Image.open(image_path)
    
#     # Get the size of the image
#     width, height = image.size
    
#     # Calculate the height of each piece
#     piece_height = height // 3
    
#     # Crop the image into three pieces
#     piece1 = image.crop((0, 0, width, piece_height))
#     piece2 = image.crop((0, piece_height, width, 2*piece_height))
#     piece3 = image.crop((0, 2*piece_height, width, height))
    
#     # Save the cropped pieces with new file names
#     piece1.save(os.path.join(output_dir, f"{image_file}_1"))
#     piece2.save(os.path.join(output_dir, f"{image_file}_2"))
#     piece3.save(os.path.join(output_dir, f"{image_file}_3"))

from PIL import Image
import os


def crop_images(input_dir, powdery_mildew_output_dir, healthy_output_dir):
    """Cut every image of both classes into three horizontal strips.

    Images are read from the ``powdery_mildew`` and ``healthy`` sub-folders
    of ``input_dir``. Each image is divided into a top, middle and bottom
    strip of ``height // 3`` rows; the bottom strip also receives the rows
    left over by the integer division. The strips are saved in the output
    folder of the matching class as ``<name>_1.jpg``, ``<name>_2.jpg`` and
    ``<name>_3.jpg``, where ``<name>`` is the source file name without its
    extension.

    Args:
        input_dir: Directory containing the ``powdery_mildew`` and
            ``healthy`` sub-folders.
        powdery_mildew_output_dir: Destination folder for the strips of
            the ``powdery_mildew`` images. Created if missing.
        healthy_output_dir: Destination folder for the strips of the
            ``healthy`` images. Created if missing.
    """
    output_dirs = {
        "powdery_mildew": powdery_mildew_output_dir,
        "healthy": healthy_output_dir,
    }

    # makedirs also creates missing parent folders and, with exist_ok,
    # tolerates folders that are already there (safe to re-run).
    for output_dir in output_dirs.values():
        os.makedirs(output_dir, exist_ok=True)

    # Modes Pillow's JPEG writer accepts; anything else (e.g. RGBA or
    # palette PNGs) has to be converted before it can be saved as .jpg.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    for subdir, subdir_output_dir in output_dirs.items():
        subdir_input_dir = os.path.join(input_dir, subdir)

        for image_file in os.listdir(subdir_input_dir):
            image_path = os.path.join(subdir_input_dir, image_file)
            if not os.path.isfile(image_path):
                continue  # nested folders cannot be opened as images

            # splitext handles extensions of any length; slicing off four
            # characters would turn "leaf.jpeg" into "leaf.".
            base_name = os.path.splitext(image_file)[0]

            # The context manager closes the file handle after each image,
            # so thousands of files do not pile up as open descriptors.
            with Image.open(image_path) as image:
                if image.mode in jpeg_modes:
                    source = image
                else:
                    source = image.convert("RGB")

                width, height = source.size
                piece_height = height // 3

                # (left, upper, right, lower) boxes of the three strips;
                # the last one extends to the real bottom edge.
                boxes = (
                    (0, 0, width, piece_height),
                    (0, piece_height, width, 2 * piece_height),
                    (0, 2 * piece_height, width, height),
                )
                for index, box in enumerate(boxes, start=1):
                    piece_path = os.path.join(
                        subdir_output_dir, f"{base_name}_{index}.jpg")
                    source.crop(box).save(piece_path)



In [16]:
# Folder holding the original "healthy" and "powdery_mildew" class folders
input_dir = "inputs/cherryleaves_data/cherry-leaves/cut/cherry-leaves"
# The cropped strips go into "<class>_cut" folders inside input_dir; the
# paths are derived from input_dir so the prefix is written only once.
powdery_mildew_output_dir = f"{input_dir}/powdery_mildew_cut"
healthy_output_dir = f"{input_dir}/healthy_cut"

crop_images(input_dir, powdery_mildew_output_dir, healthy_output_dir)


## Split train validation test set

In [19]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio,
                                       validation_set_ratio, test_set_ratio,
                                       seed=None):
    """Move the images of every class folder into train/validation/test sets.

    Every sub-folder of ``my_data_dir`` is treated as a class label. Its
    files are shuffled and moved to ``<my_data_dir>/train/<label>``,
    ``<my_data_dir>/validation/<label>`` and ``<my_data_dir>/test/<label>``;
    the emptied label folder is then removed. The train and validation
    sizes are ``int(file_count * ratio)``; the test set receives all
    remaining files.

    Nothing is done when ``my_data_dir`` already contains an entry named
    ``test`` (the data is considered split already). When the ratios do not
    add up to 1.0 a message is printed and nothing is moved.

    Args:
        my_data_dir: Directory containing one sub-folder per class label.
        train_set_ratio: Fraction of each class moved to the train set.
        validation_set_ratio: Fraction of each class moved to the
            validation set.
        test_set_ratio: Fraction of each class intended for the test set;
            only used to validate that the ratios sum to 1.0.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` generator is used.

    Returns:
        None.
    """
    import math  # local import keeps this definition self-contained

    # Float sums such as 0.6 + 0.3 + 0.1 are not exactly 1.0, so compare
    # with a tolerance instead of "!=".
    total_ratio = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(total_ratio, 1.0, abs_tol=1e-9):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        return  # already split

    split_names = ('train', 'validation', 'test')
    # Only real folders are class labels; stray files and any leftover
    # split folder are ignored.
    labels = [
        entry for entry in entries
        if entry not in split_names
        and os.path.isdir(os.path.join(my_data_dir, entry))
    ]

    # create train, validation and test folders with class label sub-folders
    for split_name in split_names:
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, split_name, label),
                        exist_ok=True)

    shuffler = random if seed is None else random.Random(seed)

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)

        # os.listdir order is arbitrary; sorting first makes a seeded
        # shuffle reproducible across machines.
        files = sorted(os.listdir(label_dir))
        shuffler.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)
        files_by_split = {
            'train': files[:train_end],
            'validation': files[train_end:validation_end],
            'test': files[validation_end:],
        }

        for split_name, file_names in files_by_split.items():
            for file_name in file_names:
                shutil.move(
                    os.path.join(label_dir, file_name),
                    os.path.join(my_data_dir, split_name, label, file_name))

        os.rmdir(label_dir)

* The training set receives 70% of the data (ratio 0.70).
* The validation set receives 10% of the data (ratio 0.10).
* The test set receives 20% of the data (ratio 0.20).

In [20]:
# Split every class folder into 70% train, 10% validation and 20% test.
# NOTE(review): every sub-folder of my_data_dir is treated as a class label,
# which includes the "healthy_cut" and "powdery_mildew_cut" folders written
# by the cropping cell - confirm that four labels are intended.
split_train_validation_test_images(my_data_dir="inputs/cherryleaves_data/cherry-leaves/cut/cherry-leaves",
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )