In [2]:
import os
import sys
import pandas as pd

# Silence warnings for the whole kernel session. `Warning` is the base class
# of every warning category, so this filter ignores all of them (including
# deprecation and runtime warnings) - remove it when debugging odd behavior.
import warnings
warnings.simplefilter(action='ignore', category = Warning)

# Append the parent of the current working directory to the module search path,
# presumably so the `utils` and `config` packages imported below resolve.
# NOTE(review): this depends on the kernel's working directory at run time -
# confirm it is the notebook's own folder when launching from elsewhere.
sys.path.append(os.path.dirname(os.getcwd()))

# import modules and components
from utils.preprocessing import load_images
from utils.validation import DataValidation
from config.pathfinder import IMG_DATA_PATH, VAL_PATH
import config.configurations as cnf

# Load and prepare data

In [None]:

validator = DataValidation()

# Collect the path of every file found (recursively) under IMG_DATA_PATH.
# os.walk yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted: without a stable order the seeded sampling below would
# select different files on different machines even with identical seeds.
images_paths = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(IMG_DATA_PATH)
    for file in files
)

# select a fraction of the data: enough rows to fill both train and test splits
total_samples = cnf.TRAIN_SAMPLES + cnf.TEST_SAMPLES
if total_samples > len(images_paths):
    # pandas would raise ValueError here as well, but with a message that does
    # not mention the data folder or the configured sample counts
    raise ValueError(
        f'Requested {total_samples} samples (TRAIN_SAMPLES + TEST_SAMPLES) '
        f'but only {len(images_paths)} files were found in {IMG_DATA_PATH}'
    )

df_images = pd.DataFrame(images_paths, columns=['images path'])
# NOTE(review): `cnf.seed` is lowercase while the other config constants used
# in this notebook are uppercase - confirm the attribute name in config.
df_images = df_images.sample(total_samples, random_state=cnf.seed)

# create train and test datasets (for validation): draw the test rows first,
# then keep every remaining sampled row as training data so the two splits
# never share an index label
test_data = df_images.sample(n=cnf.TEST_SAMPLES, random_state=cnf.SPLIT_SEED)
train_data = df_images.drop(test_data.index)

# 1. Data evaluation

### 1.1 Evaluation report

In [None]:
# report how many rows ended up in each split
n_train, n_test = len(train_data), len(test_data)
print(f'Number of train samples: {n_train}')
print(f'Number of test samples:  {n_test}')

### 1.2 Pixel intensity 

In [None]:
# Load both splits with identical loader options; per the flags, the images
# are requested un-normalized and not as tensors (numpy arrays, according to
# the original note). The target size is IMG_SHAPE without its last entry.
raw_load_options = {'as_tensor': False, 'normalize': False}
train_images = load_images(
    train_data['images path'], cnf.IMG_SHAPE[:-1], **raw_load_options
)
test_images = load_images(
    test_data['images path'], cnf.IMG_SHAPE[:-1], **raw_load_options
)

# compare the pixel intensity histograms of the two splits; VAL_PATH is handed
# to the validator together with the display names of the datasets
validator.pixel_intensity_histograms(
    train_images,
    test_images,
    VAL_PATH,
    names=['Train', 'Test'],
)