In [None]:
# [SET KERAS BACKEND]
# The backend is selected through the KERAS_BACKEND environment variable and is
# set ahead of the project imports below (presumably because they import Keras,
# which reads this variable at import time - TODO confirm)
import os 
os.environ["KERAS_BACKEND"] = "torch"

# [IMPORT LIBRARIES]
import pandas as pd

# setting warnings
# NOTE: `Warning` is the base class of every warning category, so this filter
# silences all warnings for the rest of the session, not only deprecations
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and components
# project helpers: image path lookup and image loading, validation routines,
# train/validation splitting, shared configuration and paths, and the logger
from XREPORT.commons.utils.dataloader.serializer import get_images_from_dataset, DataSerializer
from XREPORT.commons.utils.validation import DataValidation
from XREPORT.commons.utils.preprocessing.splitting import DatasetSplit
from XREPORT.commons.constants import CONFIG, IMG_DATA_PATH, DATA_PATH, RESULTS_PATH
from XREPORT.commons.logger import logger

## Load data

In [None]:
# Locate the semicolon-separated dataset file and read how many samples the
# evaluation configuration asks for
file_loc = os.path.join(DATA_PATH, 'XREP_dataset.csv')
sample_size = CONFIG["evaluation"]["SAMPLE_SIZE"]

# Read the csv and hand it, together with the image folder, to the project
# helper (per the original comment, this adds the image paths to the table)
dataset = get_images_from_dataset(
    IMG_DATA_PATH,
    pd.read_csv(file_loc, sep=';', encoding='utf-8', low_memory=False),
    sample_size=sample_size,
)

# Divide the table into the train and validation subsets
splitter = DatasetSplit(dataset)
train_data, validation_data = splitter.split_data()

# Report the size of each subset
for split_name, split_frame in (('train', train_data), ('validation', validation_data)):
    logger.info(f'Number of {split_name} samples: {len(split_frame)}')

# Data validation

## 1. Analysis of XREP dataset

...

### 1.1 Text analysis

In [None]:
def collect_words(texts):
    """Flatten a sequence of report texts into a single list of words.

    Each text is split on whitespace. Entries that are not strings are skipped:
    pandas represents empty csv fields as float NaN, and calling `.split()` on
    such a value would raise AttributeError and abort the whole cell.

    Args:
        texts: iterable of report texts (possibly containing missing values).

    Returns:
        list of str: every word of every text, in order of appearance.
    """
    return [word for text in texts if isinstance(text, str) for word in text.split()]


# word lists for the whole dataset and for each split
total_words_list = collect_words(dataset['text'].to_list())
train_words_list = collect_words(train_data['text'].to_list())
validation_words_list = collect_words(validation_data['text'].to_list())

# log total and unique word counts for each subset; the trailing '\n' keeps a
# blank line between the groups in the log output
word_lists = {
    'entire': total_words_list,
    'training': train_words_list,
    'validation': validation_words_list,
}
for subset_name, words in word_lists.items():
    logger.info(f'Number of words in the {subset_name} dataset:        {len(words)}')
    logger.info(f'Number of unique words in the {subset_name} dataset: {len(set(words))}\n')

## 2. Comparison of train and validation datasets

Analyze the XREPORT image dataset with different metrics, comparing the train and validation datasets to explore possible differences between the two.

### 2.1 Pixel intensity histogram

Evaluate the average pixel intensity of images from both the train and validation datasets.

In [None]:
# picture shape taken from the model configuration; in this cell it is only
# reported in the log message
img_shape = CONFIG["model"]["IMG_SHAPE"]
# NOTE: the message says "test", but the second set loaded below comes from
# the validation split
logger.info(f'Loading pictures from train and test dataset. Current picture shape is {img_shape}')

# helpers that read the pictures and run the validation routines
serializer = DataSerializer()
validator = DataValidation()

# read every picture of each split with as_tensor=False (numpy arrays,
# according to the original comment), train first and validation second
train_paths = train_data['path'].to_list()
train_images = [serializer.load_image(image_path, as_tensor=False) for image_path in train_paths]
validation_paths = validation_data['path'].to_list()
validation_images = [serializer.load_image(image_path, as_tensor=False) for image_path in validation_paths]

# compare the pixel intensity histograms of the two splits; RESULTS_PATH is
# passed along, presumably as the output location - TODO confirm
images_dictionary = dict(Train=train_images, Validation=validation_images)
validator.pixel_intensity_histograms(images_dictionary, RESULTS_PATH)