### Relevant Imports

In [None]:
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import warnings
from data_splitter import data_split

import numpy as np
import matplotlib.pyplot as plt

from face_detection import process_images
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from face_detection_2 import perform_face_detection


# Create a TensorFlow v1-compatibility ConfigProto object.
# NOTE(review): `config` is not referenced anywhere in the visible part of the
# notebook — confirm that a later cell actually consumes it.
config = tf.compat.v1.ConfigProto()
# Apply the "default" warning action to DeprecationWarning for warnings whose
# originating module name matches this module's __name__.
warnings.filterwarnings("default", category=DeprecationWarning, module=__name__)

In [None]:
# All dataset locations are expressed relative to the directory the notebook
# is launched from.
PATH = os.getcwd()

# Raw dataset directory and the directory that receives the train/val/test split.
input_folder = PATH + '/../datasets_directory/dataset'
output_folder = PATH + '/../datasets_directory/splitted_dataset'

### Data Cleaning & Preparation
<p>Preprocessing the image dataset before training a black and white face detection model is critical for several technical reasons that enhance model quality and performance:</p>
<ol>
<li>Data Quality Assurance: Removing low-quality or faceless images ensures the training dataset is high-quality, reducing the risk of overfitting to noisy data.
</li><br/>
<li>
Noise Reduction: Eliminating non-face images or low-quality faces reduces dataset noise, improving the model's ability to generalize.
</li><br/>
<li>
Bias Mitigation: Balancing the dataset representation mitigates bias, ensuring a more equitable model.
</li><br/>
<li>
Robust Feature Learning: High-quality data helps the model learn robust and discriminative features for accurate face detection.
</li><br/>
<li>
Alignment and Consistency:
Face detection during preprocessing may involve alignment and normalization, ensuring that all face regions are consistently positioned and scaled.
Consistent alignment helps the model learn invariant features, making it more adaptable to different poses and perspectives of black and white faces.
</li><br/>
<li>
Pose and Perspective Handling: Consistent alignment aids the model in handling different face poses and perspectives.
</li><br/>
<li>Training Efficiency: Removing irrelevant images speeds up training, benefiting model development.
</li><br/>
<li>
Enhanced Generalization: A clean dataset fosters better model generalization to real-world scenarios.
</li><br/>
<li>Ethical Compliance: Ensuring the dataset is free from inappropriate content aligns with ethical AI development.
</li>
</ol>
<br/><br/>
<p>
In summary, the process of exploring, cleaning, and filtering the dataset for black and white face detection using the DeepFace library is essential to enhance data quality, mitigate bias, improve model generalization, and ensure ethical AI development. It aligns with best practices in data preprocessing and contributes to the overall success of the model.</p>

### Process Images:

In [43]:
# Maps each raw class folder to the class folder that would receive its
# face-cropped images.
# NOTE(review): the mapping is crossed ('white_faces' -> 'black_faces' and vice
# versa). If the detection call below is re-enabled as written, every image
# would be stored under the opposite label — confirm whether this is intended.
folder_mapping = {
    'white_faces': 'black_faces',
    'black_faces': 'white_faces'
}

# Dedicated loop names are used on purpose: iterating with `input_folder` /
# `output_folder` would overwrite the module-level dataset paths of the same
# names with bare class-folder names.
for source_class, target_class in folder_mapping.items():
    input_path = f'{PATH}/../datasets_directory/dataset/{source_class}'
    # Face detection is currently disabled; re-enable the two lines below to
    # regenerate the detected-face dataset.
    # output_path = f'{PATH}/../datasets_directory/detected_face_dataset/{target_class}'
    # perform_face_detection(input_path, output_path)


### Data split:

In [66]:
# Source directory (face-cropped images) and destination directory for the
# train/val/test split.
main_data = f'{PATH}/../datasets_directory/detected_face_dataset'
split_data = f'{PATH}/../datasets_directory/splitted_dataset'

# Warn when the destination already holds content. In the saved outputs of this
# notebook the split reports 25111 copied files, while the per-subset counts
# printed afterwards total 38855 — possibly leftovers from an earlier run mixed
# into the subsets (verify). The split itself is still performed.
if os.path.isdir(split_data) and os.listdir(split_data):
    warnings.warn(
        f"Split directory '{split_data}' is not empty; files from a previous "
        "split may be mixed into the new train/val/test subsets."
    )

# The three fractions are passed positionally; presumably train / val / test —
# confirm against data_splitter.data_split.
data_split(main_data, split_data, 0.7, 0.15, 0.15)

Copying files: 25111 files [00:18, 1337.11 files/s]


In [69]:
# Class folders and dataset partitions expected inside the split directory.
categories = ['white_faces', 'black_faces']
subsets = ['train', 'val', 'test']
output_folder = f'{PATH}/../datasets_directory/splitted_dataset'

# Directory of each partition, keyed as '<partition>_dir'.
data_location_map = {
    f'{name}_dir': os.path.join(output_folder, name) for name in subsets
}

# Number of directory entries per partition and class:
# data_counts[partition][class] -> entry count.
data_counts = {
    part: {
        label: len(os.listdir(os.path.join(output_folder, part, label)))
        for label in categories
    }
    for part in subsets
}

# Per-partition, per-class report.
print("Data Counts:\n")
for subset, categories_data in data_counts.items():
    print(f"Total {subset.capitalize()} images:")
    for category, count in categories_data.items():
        print(f"\t{category.capitalize()}: {count}")

# Overall size of each partition (sum over its classes).
total_train, total_val, total_test = (
    sum(data_counts[part].values()) for part in ('train', 'val', 'test')
)

print("------------------------")
print("Total training images:", total_train)
print("Total validation images:", total_val)
print("Total test images:", total_test)


Data Counts:

Total Train images:
	White_faces: 12250
	Black_faces: 12099
Total Val images:
	White_faces: 3664
	Black_faces: 3580
Total Test images:
	White_faces: 3686
	Black_faces: 3576
------------------------
Total training images: 24349
Total validation images: 7244
Total test images: 7262


In [70]:
# Training hyperparameters: batch size and number of training epochs.
batch_size, epochs = 128, 10

# Image dimensions used for data preprocessing; height and width are both 86.
IMG_HEIGHT = IMG_WIDTH = 86