In [None]:
import os
import pandas as pd

# Silence every warning for the rest of the session: `Warning` is the base class
# of all warning categories, so this 'ignore' filter matches all of them.
# NOTE(review): this also hides deprecation notices raised by the libraries used
# below; narrow the category if those should remain visible.
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and components
from NISTADS.commons.utils.validation import DataValidation
from NISTADS.commons.constants import CONFIG, DATA_PATH, DATA_PATH
from NISTADS.commons.logger import logger

# Load and prepare data

In [None]:
# The four source tables live under DATA_PATH and share one CSV dialect
# (';' separator, UTF-8 text), so they are read in a single pass.
table_files = (
    'hosts_dataset.csv',
    'guests_dataset.csv',
    'single_component_adsorption.csv',
    'binary_mixture_adsorption.csv',
)

tables = []
for table_file in table_files:
    filepath = os.path.join(DATA_PATH, table_file)
    tables.append(pd.read_csv(filepath, sep=';', encoding='utf-8'))

# unpack in the same order as `table_files`
df_host, df_guest, single_component, binary_mixture = tables

# 1. Binary Mixture (BMADS) data analysis

The BMADS dataset is not analyzed in full, as it is of lesser interest. Some general statistics on BMADS are reported below:

In [None]:
# Rows are reported as measurements, distinct `filename` values as experiments.
num_BMADS_experiments = binary_mixture['filename'].nunique()
bmads_rows = len(binary_mixture)

print(f'Number of measurements: {bmads_rows}')
print(f'Number of actual experiments: {num_BMADS_experiments}')

# floor division: the average is reported as a whole number of measurements
bmads_rows_per_experiment = bmads_rows // num_BMADS_experiments
print(f'Average measurements per experiment: {bmads_rows_per_experiment}')

# 2. Single Component (SCADS) data analysis

The SCADS dataset (single component adsorption) is analysed and validated. The raw dataset, where each entry corresponds to a single measurement, is briefly explored below.

In [None]:
# Column names for the adsorbent and adsorbate identifiers (single-item lists).
ads_col = ['adsorbent_name']
sorb_col = ['adsorbate_name']

# Column names for pressure and uptake values, and for their unit labels.
P_col = 'pressure'
Q_col = 'uptake'
P_unit_col = 'pressureUnits'
Q_unit_col = 'adsorptionUnits'

# Uptake units treated as valid when filtering measurements.
valid_units = [
    'mmol/g',
    'mol/kg',
    'mol/g',
    'mmol/kg',
    'mg/g',
    'g/g',
    'cm3(STP)/g',
    'wt%',
    'g Adsorbate / 100g Adsorbent',
    'g/100g',
    'ml(STP)/g',
]

# Names of the numerical descriptors grouped under `parameters`.
parameters = [
    'temperature',
    'mol_weight',
    'complexity',
    'covalent_units',
    'H_acceptors',
    'H_donors',
    'heavy_atoms',
]

In [None]:
validator = DataValidation()

# Basic sanity checks on the raw SCADS table: count the rows whose pressure or
# uptake is below zero and the rows whose temperature is zero or below.
num_measurements = single_component.shape[0]
negative_press = single_component[single_component['pressure'] < 0].shape[0]
negative_uptake = single_component[single_component['adsorbed_amount'] < 0].shape[0]
# note: the temperature check uses <= 0, so a value of exactly zero is counted as well
negative_temp = single_component[single_component['temperature'] <= 0].shape[0]
# distinct `filename` values are reported as experiments
num_experiments = single_component['filename'].nunique()

print('\nSCADS dataset: single component adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {num_measurements}')
print(f'Number of actual experiments: {num_experiments}')
print(f'Average measurements per experiment: {num_measurements//num_experiments}')
print('-----------------------------------------------')
print(f'Negative pressure measurements: {negative_press} ({negative_press/num_measurements * 100:.2f}%)')
print(f'Negative uptake measurements: {negative_uptake} ({negative_uptake/num_measurements * 100:.2f}%)')
# fixed: this line used to print the uptake count and share instead of the temperature ones
print(f'Negative temperature measurements: {negative_temp} ({negative_temp/num_measurements * 100:.2f}%)')
print('-----------------------------------------------')
print(f'Number of null values in dataset: \n{single_component.isnull().sum()}')

## 2.1 Distribution of data

### 2.1.1 Distribution of experimental units (adsorption and pressure)

The distribution of units is considered for both the raw dataset and the preprocessed dataset. Beware that while each row of the raw dataset corresponds to a single measurement, the preprocessed dataset has been grouped by experiment name.

In [None]:
# Unit labels found in the raw table. `.unique()` keeps a missing value as an
# entry of its own, while `.nunique()` leaves missing values out of the count.
unique_P_units = single_component[P_unit_col].unique()
num_unique_P_units = single_component[P_unit_col].nunique()
unique_Q_units = single_component[Q_unit_col].unique()
num_unique_Q_units = single_component[Q_unit_col].nunique()

print(f'Number of unique pressure units: {num_unique_P_units}')
print(f'Number of unique uptake units: {num_unique_Q_units}')
print(f'List of valid uptake units: \n{sorted([str(x) for x in valid_units])}\n')

# Keep only the measurements whose uptake unit is listed in `valid_units`.
# Pressure units are not filtered and no unit conversion happens in this cell.
dataset = single_component[single_component[Q_unit_col].isin(valid_units)]

# Count the invalid units directly from the data. Subtracting len(valid_units)
# from the number of unique units is only right when every valid unit actually
# occurs in the table, and can otherwise even turn negative.
observed_Q_units = set(single_component[Q_unit_col].dropna().unique())
num_invalid_Q_units = len(observed_Q_units - set(valid_units))

print(f'Number of uptake valid units is {len(valid_units)} vs invalid units {num_invalid_Q_units}')
print(f'Number of measurements removed by sorting units: {single_component.shape[0] - dataset.shape[0]}')
print(f'Number of remaining measurements upon sorting units: {dataset.shape[0]}')

In [None]:
# Plot how the measurements are spread across adsorption units in the raw table
# (one row per measurement, every unit included).
validator = DataValidation()
print('\nRaw dataset (each row corresponds to a measurement')
validator.class_distribution(
    single_component,
    'adsorptionUnits',
    title='Adsorption units distribution',
    y_label='Value',
)