In [None]:
import os
import pandas as pd

# suppress all warnings (every Warning subclass) to keep the notebook output clean
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and components
from XREPORT.commons.utils.dataloader.serializer import get_images_from_dataset, DataSerializer
from XREPORT.commons.utils.validation import DataValidation
from XREPORT.commons.utils.preprocessing.splitting import DatasetSplit
from XREPORT.commons.pathfinder import IMG_DATA_PATH, DATA_PATH, VAL_PATH
from XREPORT.commons.configurations import SAMPLE_SIZE, IMG_SHAPE

## Load data

In [None]:
# Read the semicolon-separated CSV and hand it to the project helper that
# adds the image paths (SAMPLE_SIZE is forwarded as sample_size)
file_loc = os.path.join(DATA_PATH, 'XREP_dataset.csv')
dataset = get_images_from_dataset(
    IMG_DATA_PATH,
    pd.read_csv(file_loc, sep=';', encoding='utf-8', low_memory=False),
    sample_size=SAMPLE_SIZE,
)

# Partition the samples into train, validation and test subsets
splitter = DatasetSplit(dataset)
train_data, validation_data, test_data = splitter.split_data()

# Report the size of each subset, one per line
print(
    f'Number of train samples: {len(train_data)}',
    f'Number of validation samples: {len(validation_data)}',
    f'Number of test samples:  {len(test_data)}',
    sep='\n',
)

# Data validation

## 1. Analysis of XREP dataset

...

### 1.1 Text analysis

In [None]:
# Collect the content of the 'text' column, one entry per dataset row
total_text = dataset['text'].to_list()

# Flatten all whitespace-separated tokens into a single list of words.
# Non-string entries (e.g. the NaN pandas yields for empty CSV cells) have no
# .split() method, so they are skipped instead of raising AttributeError.
words_list = [word for text in total_text if isinstance(text, str) for word in text.split()]

print(f'Number of detected words: {len(words_list)}')
print(f'Number of unique words: {len(set(words_list))}')

## 2. Comparison of train and validation datasets

Analyze the XREPORT images dataset with different metrics. Compare the train and validation datasets to explore possible differences between the two.

### 2.1 Pixel intensity histogram

Evaluate the average pixel intensity of images from both the train and validation datasets.

In [None]:
# The message names the splits that are actually loaded below (train and
# validation), so the notebook output matches the analysis being run
print(f'\nLoading pictures from train and validation datasets. Current picture shape is {IMG_SHAPE}\n')

# load train and validation images with as_tensor=False and normalize=False
# (presumably plain numpy arrays with un-normalized pixel values - confirm
# against DataSerializer.load_images)
serializer = DataSerializer()
validator = DataValidation()
train_images = serializer.load_images(train_data['path'], as_tensor=False, normalize=False)
validation_images = serializer.load_images(validation_data['path'], as_tensor=False, normalize=False)

# validate pixel intensity histograms for both datasets; the dictionary keys
# label each split and VAL_PATH is passed through to the validator
# (presumably the output location for the results - confirm)
images_dictionary = {'Train' : train_images,
                     'Validation' : validation_images}
validator.pixel_intensity_histograms(images_dictionary, VAL_PATH)