In [None]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML

# setting warnings
import warnings
warnings.simplefilter(action='ignore', category = Warning)

# add parent folder path to the namespace
sys.path.append(os.path.dirname(os.getcwd()))

# import modules and components
from utils.data_assets import PreProcessing, DataValidation
import utils.global_paths as globpt
import configurations as cnf

# Build the materials/experiments subfolder paths under the global data path
# and make sure both folders exist before any file is read from them.
mat_path = os.path.join(globpt.data_path, 'materials')
exp_path = os.path.join(globpt.data_path, 'experiments')
# makedirs with exist_ok=True also creates a missing parent data folder and
# avoids the check-then-create race of `os.path.exists` followed by `os.mkdir`
os.makedirs(mat_path, exist_ok=True)
os.makedirs(exp_path, exist_ok=True)

# Load and prepare data

In [None]:
# All tables are stored as ';'-separated, UTF-8 encoded csv files
read_options = dict(sep=';', encoding='utf-8')

# materials tables
filepath = os.path.join(mat_path, 'adsorbents_dataset.csv')
df_adsorbents = pd.read_csv(filepath, **read_options)
filepath = os.path.join(mat_path, 'adsorbates_dataset.csv')
df_adsorbates = pd.read_csv(filepath, **read_options)

# experiments tables (raw single component, raw binary mixture, preprocessed single component)
filepath = os.path.join(exp_path, 'single_component_dataset.csv')
df_SCADS = pd.read_csv(filepath, **read_options)
filepath = os.path.join(exp_path, 'binary_mixture_dataset.csv')
df_BMADS = pd.read_csv(filepath, **read_options)
filepath = os.path.join(exp_path, 'preprocessed_SC_dataset.csv')
df_processed_SCADS = pd.read_csv(filepath, **read_options)

# the series columns hold whitespace-separated numbers as text:
# parse every cell into a list of floats
for series_col in ('pressure_in_Pascal', 'uptake_in_mol_g'):
    df_processed_SCADS[series_col] = df_processed_SCADS[series_col].apply(
        lambda text: [float(token) for token in text.split()])

# 1. BMADS data analysis

The BMADS dataset (binary mixture adsorption) is not analyzed in full, as it is of less interest. Only some general statistics are reported below:

In [None]:
# Summary of the binary mixture dataset: rows are counted as measurements,
# distinct 'filename' values are counted as experiments
num_BMADS_experiments = df_BMADS['filename'].nunique()
num_BMADS_measurements = df_BMADS.shape[0]
# floor division: the average is reported as a whole number
avg_BMADS_measurements = num_BMADS_measurements // num_BMADS_experiments

print('\nBMADS dataset: binary mixture adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {num_BMADS_measurements}')
print(f'Number of actual experiments: {num_BMADS_experiments}')
print(f'Average measurements per experiment: {avg_BMADS_measurements}')

# 2. SCADS data analysis

The SCADS dataset (single component adsorption) is analyzed and validated. The raw dataset, where each entry corresponds to a single measurement, is briefly explored below.

In [None]:
# Helpers from the project utilities; valid_units is reused by later cells
preprocessor = PreProcessing()
validator = DataValidation()
valid_units = preprocessor.valid_units

# Count suspicious measurements in the raw single component dataset
# (temperature is counted when it is zero or below)
negative_press = df_SCADS[df_SCADS['pressure'] < 0].shape[0]
negative_uptake = df_SCADS[df_SCADS['adsorbed_amount'] < 0].shape[0]
negative_temp = df_SCADS[df_SCADS['temperature'] <= 0].shape[0]
# distinct 'filename' values are counted as experiments
num_experiments = df_SCADS['filename'].nunique()

print('\nSCADS dataset: single component adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {df_SCADS.shape[0]}')
print(f'Number of actual experiments: {num_experiments}')
print(f'Average measurements per experiment: {df_SCADS.shape[0]//num_experiments}')
print('-----------------------------------------------')
print(f'Negative pressure measurements: {negative_press} ({negative_press/df_SCADS.shape[0] * 100:.2f}%)')
print(f'Negative uptake measurements: {negative_uptake} ({negative_uptake/df_SCADS.shape[0] * 100:.2f}%)')
# report the temperature count here (the uptake count was printed by mistake)
print(f'Negative temperature measurements: {negative_temp} ({negative_temp/df_SCADS.shape[0] * 100:.2f}%)')
print('-----------------------------------------------')
print(f'Number of null values in dataset: \n{df_SCADS.isnull().sum()}')

## 2.1 Distribution of data

### 2.1.1 Distribution of experimental units (adsorption and pressure)

The distribution of units is considered for both the raw dataset and the preprocessed dataset. Beware that while the rows of the raw dataset correspond to single measurements, the preprocessed dataset has been grouped by experiment name.

In [None]:
# Unique pressure and uptake units found in the raw dataset
unique_P_units = df_SCADS['pressureUnits'].unique()
num_unique_P_units = df_SCADS['pressureUnits'].nunique()
unique_Q_units = df_SCADS['adsorptionUnits'].unique()
num_unique_Q_units = df_SCADS['adsorptionUnits'].nunique()

print(f'Number of unique pressure units: {num_unique_P_units}')
print(f'Number of unique uptake units: {num_unique_Q_units}')
print(f'List of valid uptake units: \n{sorted([str(x) for x in valid_units])}\n')

# keep only the measurements whose uptake unit is among the valid ones
# (no unit conversion is performed in this cell)
dataset = df_SCADS[df_SCADS[preprocessor.Q_unit_col].isin(preprocessor.valid_units)]

# Invalid units are the units observed in the data that are not valid units.
# Subtracting len(valid_units) from the number of observed units is only right
# when every valid unit occurs in the data, and can even turn negative.
observed_Q_units = df_SCADS['adsorptionUnits'].dropna().drop_duplicates()
num_invalid_Q_units = int((~observed_Q_units.isin(valid_units)).sum())

print(f'Number of uptake valid units is {len(valid_units)} vs invalid units {num_invalid_Q_units}')
print(f'Number of measurements removed by sorting units: {df_SCADS.shape[0] - dataset.shape[0]}')
print(f'Number of remaining measurements upon sorting units: {dataset.shape[0]}')

In [None]:
# Options handed to the validator plotting method for the units distribution figure
plot_properties = dict(
    figsize=(10, 8),
    fontsize_title=16,
    fontsize_labels=12,
    fontsize_ticks=10,
    xlabel='Feature',
    ylabel='Value',
    orientation='h',
    xticks_rotation=45,
    xticks_ha='right',
    xticks_va='center',
    title='Adsorption units distribution',
    palette='viridis',
    color='blue',
    grid=True,
    legend=True,
    legend_loc='best',
)

# Distribution of the adsorption units over the raw dataset,
# where every row is a single measurement
validator = DataValidation()
print('\nRaw dataset (each row corresponds to a measurement')
validator.class_distribution(df_SCADS, 'adsorptionUnits', plot_properties)

### 2.1.2 Distribution of adsorption and experimental properties

In [None]:
# Options handed to the validator plotting method for the features boxplot.
# The title now describes this figure (it was copied from the units
# distribution plot and mislabeled the boxplot).
plot_properties = {'figsize': (10, 8),
                   'fontsize_title': 16,
                   'fontsize_labels': 12,
                   'fontsize_ticks': 10,
                   'xlabel': 'Feature',
                   'ylabel': 'Value',
                   'orientation' : 'h',
                   'xticks_rotation': 45,
                   'xticks_ha': 'right',
                   'xticks_va': 'center',
                   'title': 'Distribution of adsorption and experimental properties',
                   'palette': 'viridis',
                   'color' : 'skyblue',
                   'grid': True,
                   'legend': True,
                   'legend_loc': 'best',
                   # NOTE(review): the same generic filename is used by other cells;
                   # if the figure is saved under it, later plots overwrite it - confirm
                   'filename' : 'file.jpeg'}

# boxplot of the experimental and molecular columns of the preprocessed dataset
features = ['temperature', 'complexity', 'mol_weight', 'covalent_units', 'H_acceptors', 'H_donors', 'heavy_atoms']
validator.features_boxplot(df_processed_SCADS, features, plot_properties)

### 2.1.3 Distribution of pressure and uptake

In [None]:
# Options shared by the pressure and uptake boxplots. The title is a generic
# one for this section (it was copied from the units distribution plot) and is
# overridden per figure below.
plot_properties = {'figsize': (10, 8),
                   'fontsize_title': 16,
                   'fontsize_labels': 12,
                   'fontsize_ticks': 10,
                   'xlabel': 'Feature',
                   'ylabel': 'Value',
                   'orientation' : 'h',
                   'xticks_rotation': 45,
                   'xticks_ha': 'right',
                   'xticks_va': 'center',
                   'title': 'Distribution of pressure and uptake',
                   'palette': 'viridis',
                   'color' : 'skyblue',
                   'grid': True,
                   'legend': True,
                   'legend_loc': 'best',
                   # NOTE(review): both plots below share this filename; if the
                   # figures are saved under it, the second overwrites the first - confirm
                   'filename' : 'file.jpeg'}

# each row of the preprocessed dataset holds a list of floats per series column
SCADS_pressures = df_processed_SCADS['pressure_in_Pascal'].to_list()
SCADS_uptakes = df_processed_SCADS['uptake_in_mol_g'].to_list()

# flatten the per-experiment lists into one long list of single values
flat_pressures = [item for sublist in SCADS_pressures for item in sublist]
flat_uptakes = [item for sublist in SCADS_uptakes for item in sublist]

# the DataFrame constructor requires both flattened lists to have the same length
df_series = pd.DataFrame({'Pressure' : flat_pressures, 'Uptake' : flat_uptakes})

# one boxplot per column, each with a title naming the plotted quantity
# NOTE(review): the feature is passed as a string here, while the features
# boxplot cell passes a list of names - confirm both forms are accepted
validator.features_boxplot(df_series, 'Pressure', {**plot_properties, 'title': 'Pressure distribution'})
validator.features_boxplot(df_series, 'Uptake', {**plot_properties, 'title': 'Uptake distribution'})

## 2.2 Explore materials datasets

In [None]:
# Report how many distinct names each materials table contains
num_adsorbents = df_adsorbents["name"].nunique()
print(f'\nNumber of adsorbents from adsorbent dataset: {num_adsorbents}')
num_adsorbates = df_adsorbates["name"].nunique()
print(f'Number of adsorbates from adsorbates dataset: {num_adsorbates}')