In [2]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML

# setting warnings
import warnings
warnings.simplefilter(action='ignore', category = Warning)

# add parent folder path to the namespace
sys.path.append(os.path.dirname(os.getcwd()))

# import modules and components
from utils.data_assets import PreProcessing, DataValidation
import utils.global_paths as globpt
import configurations as cnf

# specify relative paths from global paths and create subfolders
mat_path = os.path.join(globpt.data_path, 'materials') 
exp_path = os.path.join(globpt.data_path, 'experiments') 
os.mkdir(mat_path) if not os.path.exists(mat_path) else None
os.mkdir(exp_path) if not os.path.exists(exp_path) else None 

### Load and prepare data

In [3]:
filepath = os.path.join(mat_path, 'adsorbents_dataset.csv')  
df_adsorbents = pd.read_csv(filepath, sep=';', encoding='utf-8')  
filepath = os.path.join(mat_path, 'adsorbates_dataset.csv')  
df_adsorbates = pd.read_csv(filepath, sep=';', encoding='utf-8')  
filepath = os.path.join(exp_path, 'single_component_dataset.csv')  
df_SCADS = pd.read_csv(filepath, sep=';', encoding='utf-8')
filepath = os.path.join(exp_path, 'binary_mixture_dataset.csv')  
df_BMADS = pd.read_csv(filepath, sep=';', encoding='utf-8')

## 1. Data validation

BMADS dataset is not analyzed fully, as it is of less interest. Some general info on BMADS can be found here:

In [4]:
num_BMADS_experiments = df_BMADS['filename'].nunique()
print('\nBMADS dataset: binary mixture adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {df_BMADS.shape[0]}')
print(f'Number of actual experiments: {num_BMADS_experiments}')
print(f'Average measurements per experiment: {df_BMADS.shape[0]//num_BMADS_experiments}')


BMADS dataset: binary mixture adsorption
-----------------------------------------------
Number of measurements: 20403
Number of actual experiments: 2372
Average measurements per experiment: 8


### 1.1 Analyze raw dataset

SCADS dataset (single component adsorption) is analyzed and validate. The major aim is to provide a fully preprocessed dataset for machine learning. General info on the dataset are provided here:

In [5]:
preprocessor = PreProcessing()
validator = DataValidation()
valid_units = preprocessor.valid_units

negative_press = df_SCADS[df_SCADS['pressure'] < 0].shape[0]
negative_uptake = df_SCADS[df_SCADS['adsorbed_amount'] < 0].shape[0]
negative_temp = df_SCADS[df_SCADS['temperature'] <= 0].shape[0]
num_experiments = df_SCADS['filename'].nunique()

print('\nSCADS dataset: single component adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {df_SCADS.shape[0]}')
print(f'Number of actual experiments: {num_experiments}')
print(f'Average measurements per experiment: {df_SCADS.shape[0]//num_experiments}')
print('-----------------------------------------------')
print(f'Negative pressure measurements: {negative_press} ({negative_press/df_SCADS.shape[0] * 100:.2f}%)')
print(f'Negative uptake measurements: {negative_uptake} ({negative_uptake/df_SCADS.shape[0] * 100:.2f}%)')
print(f'Negative temperature measurements: {negative_uptake} ({negative_uptake/df_SCADS.shape[0] * 100:.2f}%)')
print('-----------------------------------------------')
print(f'Number of null values in dataset: \n{df_SCADS.isnull().sum()}')



SCADS dataset: single component adsorption
-----------------------------------------------
Number of measurements: 610667
Number of actual experiments: 28606
Average measurements per experiment: 21
-----------------------------------------------
Negative pressure measurements: 737 (0.12%)
Negative uptake measurements: 1654 (0.27%)
Negative temperature measurements: 1654 (0.27%)
-----------------------------------------------
Number of null values in dataset: 
filename               0
temperature            0
adsorptionUnits      160
pressureUnits          0
compositionType        0
num_of_adsorbates      0
adsorbent_name         0
adsorbates_name        0
pressure               0
adsorbed_amount        0
composition            0
dtype: int64


Distribution of experiments by units

In [9]:
unique_P_units = df_SCADS['pressureUnits'].unique()
num_unique_P_units = df_SCADS['pressureUnits'].nunique()
unique_Q_units = df_SCADS['adsorptionUnits'].unique()
num_unique_Q_units = df_SCADS['adsorptionUnits'].nunique()

print(f'Number of unique pressure units: {num_unique_P_units}')
print(f'List of unique pressure units: \n{sorted([str(x) for x in unique_P_units])}\n')
print(f'Number of unique uptake units: {num_unique_Q_units}')
print(f'List of unique uptake units: \n{sorted([str(x) for x in unique_Q_units])}\n')
print(f'List of valid uptake units: \n{sorted([str(x) for x in valid_units])}\n')

# filter experiments leaving only valid uptake and pressure units, then convert 
# pressure and uptake to Pa (pressure) and mol/kg (uptake)
# filter experiments by pressure and uptake units 
dataset = df_SCADS[df_SCADS[preprocessor.Q_unit_col].isin(preprocessor.valid_units)]
print(f'Number of uptake valid units is {len(valid_units)} vs invalid units {num_unique_Q_units-len(valid_units)}')
print(f'Number of measurements removed by sorting units: {df_SCADS.shape[0] - dataset.shape[0]}')
print(f'Number of remaining measurements upon sorting units: {dataset.shape[0]}')


Number of unique pressure units: 1
List of unique pressure units: 
['bar']

Number of unique uptake units: 66
List of unique uptake units: 
['% Volume Adsorbed', 'Conc./uc', 'Fractional Coverage', 'Fractional Loading', 'Fractional Occupancy', 'Guest molecules per Cu ion', 'Molecules Adsorbed', 'Site Occupancy', 'a.u.', 'a.u./g', 'ang3(STP)/unit cell', 'atoms/unit cell', 'cm3(STP)/cm3', 'cm3(STP)/g', 'cm3(STP)/m2', 'cm3(STP)/mol', 'cm3/m2', 'coverage', 'g Adsorbate / 100g Adsorbent', 'g/100g', 'g/cm3', 'g/g', 'g/l', 'g/ml', 'kg/mol', 'kg/ton', 'kmol/m3', 'mg/g', 'mg/m2', 'micromoles/m2', 'ml(STP)/g', 'ml/g', 'mmol adsorbed', 'mmol/cm3', 'mmol/g', 'mmol/kg', 'mmol/m2', 'mol gas/mol Ni', 'mol/cm3', 'mol/formula unit', 'mol/g', 'mol/l', 'mol/m2', 'mol/m3', 'mol/mol', 'molecules*box^-1', 'molecules/8 unit cells', 'molecules/Cu2 unit', 'molecules/Rh2 unit', 'molecules/Zn2', 'molecules/cage', 'molecules/cavity', 'molecules/cm2', 'molecules/formula unit', 'molecules/fundamental unit', 'molecul

### 1.2 Preprocess SCADS dataset

In [None]:
# filter experiments leaving only valid uptake and pressure units, then convert 
# pressure and uptake to Pa (pressure) and mol/kg (uptake)
# filter experiments by pressure and uptake units 
dataset = df_SCADS[df_SCADS[preprocessor.Q_unit_col].isin(preprocessor.valid_units)]

# convert pressures to Pascal
dataset[preprocessor.P_col] = dataset.progress_apply(lambda x : preprocessor.pressure_converter(x[preprocessor.P_unit_col], 
                                                                                                x['pressure']), 
                                                                                                axis = 1)
# convert uptakes to mol/g
dataset[preprocessor.Q_col] = dataset.progress_apply(lambda x : preprocessor.uptake_converter(x[preprocessor.Q_unit_col], 
                                                                                              x['adsorbed_amount'], 
                                                                                              x['mol_weight']), 
                                                                                              axis = 1)

# further filter the dataset to remove experiments which values are outside desired boundaries, 
# such as experiments with negative temperature, pressure and uptake values
#------------------------------------------------------------------------------ 
dataset = dataset[dataset['temperature'].astype(int) > 0]
dataset = dataset[dataset[preprocessor.P_col].astype(float).between(0.0, cnf.max_pressure)]
dataset = dataset[dataset[preprocessor.Q_col].astype(float).between(0.0, cnf.max_uptake)]


# add molecular properties based on PUGCHEM API data
print('Adding physicochemical properties from guest species dataset\n')
dataset = preprocessor.add_guest_properties(df_SCADS, df_adsorbates)
dataset = dataset.dropna()

In [None]:

# filter experiments leaving only valid uptake and pressure units, then convert 
# pressure and uptake to Pa (pressure) and mol/kg (uptake)

# filter experiments by pressure and uptake units 
dataset = dataset[dataset[preprocessor.Q_unit_col].isin(preprocessor.valid_units)]

# convert pressures to Pascal
dataset[preprocessor.P_col] = dataset.progress_apply(lambda x : preprocessor.pressure_converter(x[preprocessor.P_unit_col], 
                                                                                                x['pressure']), 
                                                                                                axis = 1)
# convert uptakes to mol/g
dataset[preprocessor.Q_col] = dataset.progress_apply(lambda x : preprocessor.uptake_converter(x[preprocessor.Q_unit_col], 
                                                                                              x['adsorbed_amount'], 
                                                                                              x['mol_weight']), 
                                                                                              axis = 1)

# further filter the dataset to remove experiments which values are outside desired boundaries, 
# such as experiments with negative temperature, pressure and uptake values
#------------------------------------------------------------------------------ 
dataset = dataset[dataset['temperature'].astype(int) > 0]
dataset = dataset[dataset[preprocessor.P_col].astype(float).between(0.0, cnf.max_pressure)]
dataset = dataset[dataset[preprocessor.Q_col].astype(float).between(0.0, cnf.max_uptake)]

## 2. Data analysis

### 1.1 Explore materials datasets

In [None]:
unique_adsorbents = df_adsorbents['name'].nunique()
unique_adsorbates = df_adsorbates['name'].nunique()

print(f'\nNumber of adsorbents in dataset: {unique_adsorbents}')
print(f'Number of adsorbates in dataset: {unique_adsorbates}')

### 1.2 Explore experiments datasets