# **Data Collection**

## Objectives

* Collect the dataset from Kaggle and prepare it for further processing

## Inputs

* Kaggle JSON file (kaggle.json): the authentication token

## Outputs

* Generate Dataset: inputs/mildew_dataset





---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Record the directory the notebook is currently running in
current_dir = os.getcwd()
# Bare trailing expression: the notebook shows its value as the cell output
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Step up one level: the parent of the recorded directory becomes the working directory
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory so the change made by os.chdir() can be verified
current_dir = os.getcwd()
# Bare trailing expression: the notebook shows its value as the cell output
current_dir

# Install Kaggle

Install the kaggle package, which provides the command-line tool used to download the dataset

In [None]:
# Install the kaggle package; it provides the `kaggle` command-line tool
# that is used to download the dataset
%pip install kaggle

---

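# Fetch data from Kaggle

Point the Kaggle tool at the authentication token (kaggle.json in the current working directory) and restrict the token's file permissions, then download and unzip the dataset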

In [None]:
# Point KAGGLE_CONFIG_DIR at the current working directory, where kaggle.json is expected
os.environ["KAGGLE_CONFIG_DIR"] = os.getcwd()
# Mode 600: the token file is readable and writable by its owner only
! chmod 600 kaggle.json

* download the dataset and create a folder for the dataset

In [None]:
KaggleDatasetPath = "codeinstitute/cherry-leaves/code"
DestinationFolder = "inputs/mildew_dataset"
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

* Unzip the zip folder fetched from Kaggle
* This creates one healthy folder and one mildew folder

In [None]:
# Extract the downloaded archive into the destination folder; the zip file is
# deleted only if the extraction succeeded (&&)
! unzip {DestinationFolder}/*.zip -d {DestinationFolder} \
    && rm {DestinationFolder}/*.zip

---

Data Preparation

* Remove non image files.
* Split the data into train/test/validation
* Create a copy of the images converted to grayscale

---

Create grayscale copies of the original dataset

In [None]:
from PIL import Image
import os

# Full path of the directory containing the image folders
path = "/workspace/mildew-detection/inputs/mildew_dataset/cherry-leaves"

# The grayscale copies are written to a "gray_images" directory NEXT TO the
# source directory, not inside it:
#   * creating it inside `path` would make the loop below pick it up as one more
#     image folder and try to open its sub-directories as images;
#   * the cleaning and splitting cells read ".../mildew_dataset/gray_images".
gray_path = os.path.join(os.path.dirname(path), "gray_images")
os.makedirs(gray_path, exist_ok=True)

# Iterate through each image folder of the source dataset
for foldername in os.listdir(path):
    source_folder = os.path.join(path, foldername)
    if not os.path.isdir(source_folder):
        # Stray files at the top level are not image folders; skip them
        continue

    # Mirror the folder name under gray_images (safe to re-run)
    target_folder = os.path.join(gray_path, foldername)
    os.makedirs(target_folder, exist_ok=True)

    # Iterate through each image in the folder
    for file in os.listdir(source_folder):
        with Image.open(os.path.join(source_folder, file)) as im:
            # Mode "L" is Pillow's 8-bit single-channel (grayscale) mode;
            # the copy keeps the original file name
            im.convert("L").save(os.path.join(target_folder, file))


Data Cleaning

In [None]:
def remove_non_image_file(my_data_dir):
    """Delete every file that is not an image from the sub-folders of a dataset.

    Each sub-folder of ``my_data_dir`` is scanned (one level deep). Files whose
    name does not end in .png, .jpg or .jpeg (case-insensitive) are deleted.
    For every sub-folder, the number of image files kept and of non-image
    files removed is printed.

    Args:
        my_data_dir: Path of the directory that contains the image folders.

    Returns:
        None. The function works through side effects (file deletion, printing).
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray file next to the image folders cannot be listed like a
            # folder; leave it untouched instead of crashing on it
            continue

        image_files = 0
        removed_files = 0
        for given_file in os.listdir(folder_path):
            if given_file.lower().endswith(image_extension):
                image_files += 1
            else:
                os.remove(os.path.join(folder_path, given_file))  # remove non image file
                removed_files += 1

        print(f"Folder: {folder} - has image file", image_files)
        print(f"Folder: {folder} - has non-image file", removed_files)

Execute remove_non_image_file on both datasets

In [None]:
# Clean both copies of the dataset: the original images and the grayscale images
for data_dir in (
    "/workspace/mildew-detection/inputs/mildew_dataset/cherry-leaves",
    "/workspace/mildew-detection/inputs/mildew_dataset/gray_images",
):
    remove_non_image_file(my_data_dir=data_dir)

---

## Split dataset

---

In [None]:
import os
import shutil
import random
import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Split a folder-per-label image dataset into train/validation/test sets.

    ``my_data_dir`` is expected to contain one sub-folder per class label.
    The files of every label are shuffled and MOVED into
    ``<my_data_dir>/train/<label>``, ``<my_data_dir>/validation/<label>`` and
    ``<my_data_dir>/test/<label>``; the emptied label folder is then removed.

    Nothing is done when ``my_data_dir`` already contains an entry named
    ``test`` (the dataset is considered already split), or when the three
    ratios do not sum to 1.0 (a message is printed in that case).

    Args:
        my_data_dir: Path of the directory holding the label folders.
        train_set_ratio: Fraction of each label's files moved to the train set.
        validation_set_ratio: Fraction moved to the validation set.
        test_set_ratio: Fraction for the test set; the test set receives every
            file left over after the train and validation sets are filled.

    Returns:
        None.
    """
    import math  # local import so the function does not depend on other cells

    # Compare with a tolerance: sums of binary floats are rarely exact
    # (e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999).
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(my_data_dir)
    if 'test' in entries:
        # Already split on a previous run; leave the dataset untouched
        return

    # Only folders are class labels; stray files next to them are ignored
    labels = [entry for entry in entries
              if os.path.isdir(os.path.join(my_data_dir, entry))]

    # Create train, validation and test folders with one sub-folder per label
    for subset in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, subset, label))

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        # Slice boundaries: [0, train_end) -> train,
        # [train_end, validation_end) -> validation, the remainder -> test
        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)
        subsets = (
            ('train', files[:train_end]),
            ('validation', files[train_end:validation_end]),
            ('test', files[validation_end:]),
        )

        for subset, subset_files in subsets:
            for file_name in subset_files:
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, subset, label, file_name))

        # Every file has been moved out, so the label folder is empty now
        os.rmdir(label_dir)


In [None]:
# Apply the same 0.7 / 0.1 / 0.2 train/validation/test split to both datasets
for dataset_dir in (
    "inputs/mildew_dataset/cherry-leaves",  # RGB data set
    "inputs/mildew_dataset/gray_images",    # grayscale data set
):
    split_train_validation_test_images(
        my_data_dir=dataset_dir,
        train_set_ratio=0.7,
        validation_set_ratio=0.1,
        test_set_ratio=0.2,
    )

## Check how many pictures in each set

In [None]:
import os

# File extensions (lower-case, with the leading dot) that count as images
supported_formats = ['.jpg', '.jpeg', '.png', '.bmp']

# Root folder of the dataset to inspect
dataset_path = 'inputs/mildew_dataset/cherry-leaves'

# Maps a directory path to the number of image files found directly inside it
image_count = {}

# Walk the whole tree below dataset_path and tally the images per directory
for root, _subdirs, filenames in os.walk(dataset_path):
    for filename in filenames:
        extension = os.path.splitext(filename)[1].lower()
        if extension not in supported_formats:
            continue
        image_count[root] = image_count.get(root, 0) + 1

# Report the totals, one line per directory that holds at least one image
for folder in image_count:
    print(f'{folder} has {image_count[folder]} images')


# Push files to Repo

* If you don't need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:

# Print the ignore rules, to check what will be left out of the commit
!cat .gitignore
# Review the pending changes
!git status
# Stage everything in the working tree
!git add .
# Commit the changes with a message
!git commit -am "Devided pictures and data cleaning"
# Push the commit to the remote repository
!git push
