# Imports and paths

In [None]:
import os
import sys
import pandas as pd

# suppress all warnings (every category derived from Warning is ignored)
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and classes
from SCADS.commons.utils.preprocessing import PreProcessing
from SCADS.commons.utils.validation import DataValidation
from SCADS.commons.pathfinder import DATA_PATH, VALIDATION_PATH
import SCADS.commons.configurations as cnf

## Load data

In [None]:
# instantiate the project preprocessing helper (not used within this cell)
preprocessor = PreProcessing()

# read the three source tables from DATA_PATH; all of them are
# semicolon-separated .csv files decoded as UTF-8
csv_options = dict(sep=';', encoding='utf-8')

# main dataset
file_loc = os.path.join(DATA_PATH, 'SCADS_dataset.csv')
dataset = pd.read_csv(file_loc, **csv_options)

# adsorbates table
file_loc = os.path.join(DATA_PATH, 'adsorbates_dataset.csv')
df_adsorbates = pd.read_csv(file_loc, **csv_options)

# adsorbents table
file_loc = os.path.join(DATA_PATH, 'adsorbents_dataset.csv')
df_adsorbents = pd.read_csv(file_loc, **csv_options)

# Preprocessing

# 1. Data validation

## 1.1 General validation

In [None]:
# print report with statistics and info about the non-grouped dataset
# explode the pressure column only once and reuse the resulting length
# (explode expands list-like entries into one row per element)
num_measurements = len(dataset["pressure_in_Pascal"].explode())
num_experiments = dataset.shape[0]
# the average is measurements divided by experiments; the reverse ratio
# floors to 0 whenever there are more measurements than experiments.
# An empty dataset is reported as 0 instead of raising ZeroDivisionError
avg_measurements = num_measurements // num_experiments if num_experiments else 0
print(f'\nNumber of adsorption measurements: {num_measurements}')
print(f'Number of unique experiments:        {num_experiments}')
print(f'Number of dataset features:          {dataset.shape[1]}')
print(f'Average measurements per experiment: {avg_measurements}')

# perform preliminary analysis on the grouped, unsplit dataset
# check columns with null values
validator = DataValidation()
print('\nChecking for missing values in the dataset:\n')
missing_values = validator.check_missing_values(dataset)

# generate histograms of the grouped dataset features
# (the pressure and uptake columns are excluded from the plots)
validator.plot_histograms(dataset, VALIDATION_PATH, exclude_cols=['pressure_in_Pascal', 'uptake_in_mol_g'])

## 1.2 Validation of dataset split

In [None]:
# search for the random seed giving the best train/test split;
# the validator is given the dataset, the configured test size and the value 500
print('\nValidation best random seed for data splitting\n')
min_diff, best_seed, best_split = validator.data_split_validation(dataset, cnf.TEST_SIZE, 500)

# summarize the winning seed together with its total difference (3 decimals)
rounded_diff = round(min_diff, 3)
print(f'''\nBest split found with split_seed of {best_seed}, with total difference equal to {rounded_diff}
Mean and standard deviation differences per features (X and Y):''')

# one line per entry of best_split: first item is printed as the mean
# difference, second item as the standard deviation difference
for feature, diffs in best_split.items():
    mean_diff = diffs[0]
    std_diff = diffs[1]
    print(f'{feature} ---> mean difference = {mean_diff}, STD difference = {std_diff}')