In [None]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split

# setting warnings
import warnings
warnings.simplefilter(action='ignore', category = Warning)

# add parent folder path to the namespace
sys.path.append(os.path.dirname(os.getcwd()))

# import modules and components
from utils.data_assets import PreProcessing, DataValidation
import utils.global_paths as globpt
import configurations as cnf

# specify relative paths from global paths and create subfolders
images_path = os.path.join(globpt.data_path, 'images') 
val_path = os.path.join(globpt.data_path, 'validation')
os.mkdir(images_path) if not os.path.exists(images_path) else None
os.mkdir(val_path) if not os.path.exists(val_path) else None  

# Load data

In [None]:
# initialize the preprocessing class
preprocessor = PreProcessing()

# load data from csv, add paths to images 
file_loc = os.path.join(globpt.data_path, 'XREP_dataset.csv') 
dataset = pd.read_csv(file_loc, encoding = 'utf-8', sep =';', low_memory=False)
dataset = preprocessor.find_images_path(images_path, dataset)

# select subset of data
total_samples = cnf.num_train_samples + cnf.num_test_samples
dataset = dataset.sample(n=total_samples, random_state=cnf.seed)

# split data into train and test dataset and start preprocessor
test_size = cnf.num_test_samples/total_samples
train_data, test_data = train_test_split(dataset, test_size=test_size, random_state=cnf.seed)

# Data validation

## 1. Images analysis

Analyze the XREPORT images dataset with different metrics. Compare the train and test datasets to explore possible differences between the two

### 1.1 Pixel intensity histogram

In [None]:
validator = DataValidation()

# load train and test images as numpy arrays
print(f'\nLoading pictures from train and test dataset. Current picture shape is {cnf.picture_shape[:-1]}\n')
train_images = preprocessor.load_images(train_data['path'], cnf.picture_shape[:-1], 
                                        as_tensor=False,  normalize=False)
test_images = preprocessor.load_images(test_data['path'], cnf.picture_shape[:-1], 
                                       as_tensor=False, normalize=False)

# validate pixel intensity histograms for both datasets
print('\nCalculating pixel intensity of images train and test sets\n')
validator.pixel_intensity_histograms(train_images, test_images, val_path, names=['Train','Test'])

### 2. Text analysis

In [9]:
total_text = dataset['text'].to_list()
words_list = [x.split() for x in total_text]
words_list = [item for sublist in words_list for item in sublist]
print(words_list)

['Moderate', 'cardiomegaly', 'is', 'stable.', 'Retrocardiac', 'atelectasis', 'has', 'improved.', 'Mild', 'vascular', 'congestion', 'has', 'improved.', 'There', 'is', 'no', 'pneumothorax.', 'Small', 'bilateral', 'effusions', 'have', 'improved', 'on', 'the', 'left.', 'ET', 'tube', 'is', 'in', 'standard', 'position.', 'Swans', 'Ganz', 'catheter', 'tip', 'is', 'in', 'the', 'right', 'pulmonary', 'artery.', 'NG', 'tube', 'tip', 'is', 'in', 'the', 'stomach.', 'Lung', 'volumes', 'lung.', 'Small', 'left', 'pleural', 'effusion', 'is', 'new.', 'There', 'is', 'a', 'new', 'mild', 'pulmonary', 'vascular', 'congestion.', 'Borderline', 'enlarged', 'cardiac', 'silhouette', 'is', 'exaggerated', 'by', 'low', 'lung', 'volumes.', 'Bibasilar', 'opacity', 'is', 'likely', 'secondary', 'to', 'atelectasis.', 'Cardiomegaly', 'is', 'mild', 'to', 'moderate.', 'Thoracic', 'aorta', 'is', 'generally', 'large', 'and', 'tortuous.', 'Lung', 'volumes', 'are', 'low.', 'Isolated', 'interstitial', 'abnormality', 'left', 'lo