# Corals health monitoring project
## Data collection and cleaning
---

## Collect dataset from Kaggle

### Objectives:
* Collect data
* Clean data (remove files which are not images)

### Input:
* Kaggle authentication token (kaggle.json)

### Output:
* Generate dataset:<br>
inputs/corals-dataset/Dataset

---

### Import packages

In [None]:
%pip install -r /workspace/corals_health/requirements.txt

In [6]:
import numpy
import os

### Setting up directory

In [3]:
# Record the directory the notebook was started from; the bare expression
# on the last line displays it as the cell output.
current_dir = os.getcwd()
current_dir

'/workspace/corals_health/jupyter_notebooks'

In [8]:
# Switch to the project root so that relative paths used by later cells
# (e.g. kaggle.json) are resolved from there.
os.chdir('/workspace/corals_health')
print(f"Your current working directory is:\n {os.getcwd()}")

Your current working directory is:
 /workspace/corals_health


## Install Kaggle

In [None]:
%pip install --upgrade kaggle

In [15]:
# change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
! chmod 600 kaggle.json

* Get the dataset path from the [Kaggle URL](https://www.kaggle.com/datasets/sonainjamil/bhd-corals).
* Set your destination folder.

![Kaggle dataset summary page](../assets/images/kaggle-dataset.jpg)

### Set the Kaggle Dataset and Download it.

In [None]:
KaggleDatasetPath = "sonainjamil/bhd-corals"
DestinationFolder = "/workspace/corals_health/inputs/corals-dataset"   
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

#### Unzip the downloaded file, and delete the zip file.

In [21]:
import zipfile
# Extract the downloaded archive into the destination folder.
with zipfile.ZipFile(DestinationFolder + '/bhd-corals.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

# The archive is no longer needed once its contents are extracted.
os.remove(DestinationFolder + '/bhd-corals.zip')

## Data cleaning
### In the 'inputs/' folder, find and remove files that are not images

In [1]:
import os
def remove_non_image_file(my_data_dir):
    """Delete every non-image file found in the class folders of a dataset.

    Each sub-directory of ``my_data_dir`` is treated as one class folder.
    A file counts as an image when its name ends, case-insensitively, with
    ``.png``, ``.jpg`` or ``.jpeg``; any other regular file is printed and
    removed. A two-line summary (images kept / non-images removed) is
    printed per class folder.

    Args:
        my_data_dir: Path of the directory that contains the class folders.

    Returns:
        None. The function works through side effects (file deletion and
        printed output).
    """
    image_extension = ('.png', '.jpg', '.jpeg')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray top-level file (e.g. a leftover archive) is not a
            # class folder; listing it would raise NotADirectoryError.
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            if not os.path.isfile(file_location):
                # os.remove cannot delete directories, so nested folders
                # are left alone and not counted.
                continue
            if given_file.lower().endswith(image_extension):
                image_count += 1
            else:
                print(file_location)
                os.remove(file_location)  # remove non image file
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)


In [2]:
# Clean the extracted dataset in place: every sub-folder of Dataset is
# scanned and files without an image extension are deleted.
remove_non_image_file(my_data_dir='/workspace/corals_health/inputs/corals-dataset/Dataset')

Folder: Bleached - has image file 720
Folder: Bleached - has non-image file 0
Folder: Dead - has image file 150
Folder: Dead - has non-image file 0
Folder: Healthy - has image file 712
Folder: Healthy - has non-image file 0


## Split into train, validation and test sets

In [5]:
import os
import shutil
import random
# import joblib


def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio, seed=None):
    """Split a class-per-folder image dataset into train/validation/test.

    Every sub-directory of ``my_data_dir`` is taken as a class label. Its
    files are shuffled and moved into
    ``<my_data_dir>/{train,validation,test}/<label>/``; the emptied label
    folder is then removed. The first ``int(n * train_set_ratio)`` shuffled
    files go to train, the next ``int(n * validation_set_ratio)`` to
    validation, and the remainder to test.

    Args:
        my_data_dir: Directory that contains one sub-folder per class.
        train_set_ratio: Fraction of each class assigned to the train set.
        validation_set_ratio: Fraction assigned to the validation set.
        test_set_ratio: Fraction assigned to the test set. It is only used
            to validate that the three ratios sum to 1.0.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` generator is used.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and
        nothing is moved; if the directory already contains a train,
        validation or test folder the function does nothing.
    """
    split_names = ('train', 'validation', 'test')

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 evaluates to
    # 0.9999999999999999 in binary floating point, not exactly 1.0.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if abs(ratio_sum - 1.0) > 1e-9:
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    # Only directories are class labels; stray files are ignored.
    labels = [entry for entry in os.listdir(my_data_dir)
              if os.path.isdir(os.path.join(my_data_dir, entry))]
    print(labels)

    # The dataset has already been split (fully or partially): do not
    # treat the split folders themselves as class labels.
    if any(name in labels for name in split_names):
        print("Dataset already contains a train/validation/test folder; nothing to do.")
        return

    # Create train/validation/test folders with one sub-folder per label.
    for split_name in split_names:
        for label in labels:
            os.makedirs(os.path.join(my_data_dir, split_name, label), exist_ok=True)

    rng = random if seed is None else random.Random(seed)

    for label in labels:
        label_dir = os.path.join(my_data_dir, label)
        # Sort first so that a given seed yields the same split no matter
        # in which order the OS lists the directory.
        files = sorted(os.listdir(label_dir))
        rng.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)
        assignments = (
            ('train', files[:train_end]),
            ('validation', files[train_end:validation_end]),
            ('test', files[validation_end:]),
        )

        for split_name, split_files in assignments:
            for file_name in split_files:
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, split_name, label, file_name))

        # Every file has been moved, so the label folder is now empty.
        os.rmdir(label_dir)


In [6]:
# Split each class folder of the dataset: 70% train, 10% validation, 20% test.
split_train_validation_test_images(my_data_dir=f"/workspace/corals_health/inputs/corals-dataset/Dataset",
                                   train_set_ratio=0.7,
                                   validation_set_ratio=0.1,
                                   test_set_ratio=0.2
                                   )


['Bleached', 'Dead', 'Healthy']
