In [None]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split

# setting warnings
# NOTE: Warning is the base class of every warning category, so this filter
# silences warnings of all categories from here on (unless a later filter
# overrides it)
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# add parent folder path to the namespace
# The parent of the current working directory is appended to sys.path so the
# project imports that follow can be resolved; this must run before them.
# NOTE(review): assumes the notebook is launched from its own folder, one
# level below the project root - TODO confirm
sys.path.append(os.path.dirname(os.getcwd()))

# import modules and components
from utils.preprocessing import find_images_path
from utils.validation import DataValidation
import utils.global_paths as globpt
import configurations as cnf

# Resolve the images and validation folders under the global data path and
# make sure both exist. os.makedirs(..., exist_ok=True) also creates any
# missing parent folder and does not fail when the folder is already there,
# so this cell is safe to re-run and has no check-then-create race.
images_path = os.path.join(globpt.data_path, 'images')
val_path = os.path.join(globpt.data_path, 'validation')
os.makedirs(images_path, exist_ok=True)
os.makedirs(val_path, exist_ok=True)

# Load data

In [None]:
# Read the semicolon-separated csv from the global data folder, then pass the
# frame through find_images_path together with the images folder
# (presumably this adds the image paths to the dataset - see utils.preprocessing)
file_loc = os.path.join(globpt.data_path, 'XREP_dataset.csv')
dataset = pd.read_csv(
    file_loc,
    sep=';',
    encoding='utf-8',
    low_memory=False,
)
dataset = find_images_path(images_path, dataset)

# Draw a seeded random subset large enough to hold both the train and the
# test samples requested in the configurations
total_samples = cnf.num_train_samples + cnf.num_test_samples
subset = dataset.sample(
    n=total_samples,
    random_state=cnf.seed,
)

# Data validation

## 1. Analysis of XREP dataset

Analyze the entire XREPORT dataset (all loaded samples, not only the sampled subset).

### 1.1 Text analysis

In [None]:
# Collect the text of every sample in the full dataset. Missing entries (NaN)
# are dropped and the remaining values are cast to str, because calling
# .split() on a non-string value would raise.
total_text = dataset['text'].dropna().astype(str).to_list()

# flatten all texts into a single list of whitespace-separated words
words_list = [word for text in total_text for word in text.split()]

# Print a compact summary instead of the whole list: dumping every word of
# the dataset floods the cell output and bloats the saved notebook.
print(f'Number of texts: {len(total_text)}')
print(f'Total number of words: {len(words_list)}')
print(f'Number of unique words: {len(set(words_list))}')
print(f'First 20 words: {words_list[:20]}')

## 2. Comparison of train and test datasets

Analyze the XREPORT image dataset using different metrics, and compare the train and test datasets to explore possible differences between the two.

### 2.1 Pixel intensity histogram

Evaluate the average pixel intensity of the images in both the train and test datasets.

In [None]:
# Styling options for the pixel intensity plot. Presumably consumed by
# validator.pixel_intensity_histograms (see the disabled call below) - the
# exact meaning of each key is defined by utils.validation.
plot_properties = {
    'figsize': (10, 8),
    'fontsize_title': 16,
    'fontsize_labels': 12,
    'fontsize_ticks': 10,
    'xlabel': 'Feature',
    'ylabel': 'Value',
    'orientation': 'h',
    'xticks_rotation': 45,
    'xticks_ha': 'right',
    'xticks_va': 'center',
    'title': 'Pixel Intensity Histograms',
    'palette': 'viridis',
    'color': 'skyblue',
    'grid': True,
    'legend': True,
    'legend_loc': 'best',
    'filename': 'pixel_intensities.jpeg',
}

# Instantiate the validation helper and report the picture shape from the
# configurations, with its last dimension left out
validator = DataValidation()
print(f'\nLoading pictures from train and test dataset. Current picture shape is {cnf.picture_shape[:-1]}\n')

# NOTE(review): everything below is disabled and cannot be re-enabled as is:
#   - `preprocessor`, `train_data` and `test_data` are not defined in any
#     visible cell;
#   - the train/test split that would create `train_data`/`test_data` is
#     placed after their first use;
#   - `plot_properties` is passed positionally after keyword arguments in both
#     pixel_intensity_histograms(...) and train_test_split(...), which is a
#     SyntaxError.
# train_images = preprocessor.load_images(train_data['path'], cnf.picture_shape[:-1], 
#                                         as_tensor=False,  normalize=False)
# test_images = preprocessor.load_images(test_data['path'], cnf.picture_shape[:-1], 
#                                        as_tensor=False, normalize=False)

# # validate pixel intensity histograms for both datasets
# print('\nCalculating pixel intensity of images train and test sets\n')
# validator.pixel_intensity_histograms(train_images, test_images, val_path, names=['Train','Test'], plot_properties)

# # split data into train and test dataset and start preprocessor
# test_size = cnf.num_test_samples/total_samples
# train_data, test_data = train_test_split(dataset, test_size=test_size, random_state=cnf.seed, plot_properties)