In [None]:
import os
import sys
import pandas as pd

# silence all warnings (Warning is the base class of every warning category)
# so that they do not clutter the cell outputs
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# add parent folder path to the namespace, so that the project packages
# imported right below (utils, config) can be resolved from this folder
sys.path.append(os.path.dirname(os.getcwd()))

# import modules and components
from utils.validation import DataValidation
from config.pathfinder import DATA_EXP_PATH, DATA_MAT_PATH
import config.configurations as cnf 

# Load and prepare data

In [None]:
# every project CSV file is read with the same delimiter and encoding
csv_options = {'sep': ';', 'encoding': 'utf-8'}

# materials datasets
df_adsorbents = pd.read_csv(os.path.join(DATA_MAT_PATH, 'adsorbents_dataset.csv'), **csv_options)
df_adsorbates = pd.read_csv(os.path.join(DATA_MAT_PATH, 'adsorbates_dataset.csv'), **csv_options)

# experiments datasets (raw single component, raw binary mixture, preprocessed single component)
df_SCADS = pd.read_csv(os.path.join(DATA_EXP_PATH, 'single_component_dataset.csv'), **csv_options)
df_BMADS = pd.read_csv(os.path.join(DATA_EXP_PATH, 'binary_mixture_dataset.csv'), **csv_options)
df_processed_SCADS = pd.read_csv(os.path.join(DATA_EXP_PATH, 'preprocessed_SC_dataset.csv'), **csv_options)


def _to_float_list(text):
    """Turn a whitespace-separated string of numbers into a list of floats."""
    return list(map(float, text.split()))


# the series columns are read back from CSV as plain strings: parse them into float lists
for column in ('pressure_in_Pascal', 'uptake_in_mol_g'):
    df_processed_SCADS[column] = df_processed_SCADS[column].apply(_to_float_list)

# 1. BMADS data analysis

The BMADS dataset (binary mixture adsorption) is not fully analyzed, as it is of lesser interest. Some general information on BMADS is reported in the cell below:

In [None]:
# an experiment is identified by its 'filename', while each row is a single measurement
num_BMADS_experiments = df_BMADS['filename'].nunique()
num_BMADS_measurements = len(df_BMADS)

print('\nBMADS dataset: binary mixture adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {num_BMADS_measurements}')
print(f'Number of actual experiments: {num_BMADS_experiments}')
# floor division: the average is reported as a whole number of measurements
print(f'Average measurements per experiment: {num_BMADS_measurements//num_BMADS_experiments}')

# 2. SCADS data analysis

The SCADS dataset (single component adsorption) is analyzed and validated. The raw dataset (where each entry corresponds to a single measurement) is briefly explored below.

In [None]:
# names of the adsorbent and adsorbate columns, each wrapped in a single-item list
ads_col = ['adsorbent_name']
sorb_col = ['adsorbates_name']

# names of the pressure and uptake series columns
P_col = 'pressure_in_Pascal'
Q_col = 'uptake_in_mol_g'

# names of the columns holding the measurement units
P_unit_col = 'pressureUnits'
Q_unit_col = 'adsorptionUnits'

# uptake units accepted when filtering the raw measurements
valid_units = [
    'mmol/g',
    'mol/kg',
    'mol/g',
    'mmol/kg',
    'mg/g',
    'g/g',
    'cm3(STP)/g',
    'wt%',
    'g Adsorbate / 100g Adsorbent',
    'g/100g',
    'ml(STP)/g',
]

# names of the numerical features (same set as the boxplot features used further down)
parameters = [
    'temperature',
    'mol_weight',
    'complexity',
    'covalent_units',
    'H_acceptors',
    'H_donors',
    'heavy_atoms',
]

In [None]:
validator = DataValidation()

# count suspicious entries in the raw measurements; note that the temperature
# filter is <= 0, so it also counts temperatures that are exactly zero
negative_press = df_SCADS[df_SCADS['pressure'] < 0].shape[0]
negative_uptake = df_SCADS[df_SCADS['adsorbed_amount'] < 0].shape[0]
negative_temp = df_SCADS[df_SCADS['temperature'] <= 0].shape[0]
# an experiment is identified by its 'filename', while each row is a single measurement
num_experiments = df_SCADS['filename'].nunique()

print('\nSCADS dataset: single component adsorption')
print('-----------------------------------------------')
print(f'Number of measurements: {df_SCADS.shape[0]}')
print(f'Number of actual experiments: {num_experiments}')
print(f'Average measurements per experiment: {df_SCADS.shape[0]//num_experiments}')
print('-----------------------------------------------')
print(f'Negative pressure measurements: {negative_press} ({negative_press/df_SCADS.shape[0] * 100:.2f}%)')
print(f'Negative uptake measurements: {negative_uptake} ({negative_uptake/df_SCADS.shape[0] * 100:.2f}%)')
# report the temperature counter here (this line used to repeat the uptake counter)
print(f'Negative temperature measurements: {negative_temp} ({negative_temp/df_SCADS.shape[0] * 100:.2f}%)')
print('-----------------------------------------------')
print(f'Number of null values in dataset: \n{df_SCADS.isnull().sum()}')

## 2.1 Distribution of data

### 2.1.1 Distribution of experimental units (adsorption and pressure)

The distribution of units is considered for both the raw and the preprocessed datasets. Beware that, while each row of the raw dataset corresponds to a single measurement, the rows of the preprocessed dataset have been grouped by experiment name.

In [None]:
unique_P_units = df_SCADS['pressureUnits'].unique()
num_unique_P_units = df_SCADS['pressureUnits'].nunique()
unique_Q_units = df_SCADS['adsorptionUnits'].unique()
num_unique_Q_units = df_SCADS['adsorptionUnits'].nunique()

print(f'Number of unique pressure units: {num_unique_P_units}')
print(f'Number of unique uptake units: {num_unique_Q_units}')
print(f'List of valid uptake units: \n{sorted([str(x) for x in valid_units])}\n')

# uptake units found in the data that are not in the accepted list. This is an
# explicit set difference: subtracting len(valid_units) from the number of unique
# units is only correct when every accepted unit actually occurs in the data.
# Missing values are dropped first, consistently with nunique() above.
invalid_Q_units = set(df_SCADS[Q_unit_col].dropna().unique()) - set(valid_units)

# keep only the measurements whose uptake unit is accepted; pressure units are
# not filtered here and no unit conversion is performed in this cell
dataset = df_SCADS[df_SCADS[Q_unit_col].isin(valid_units)]
print(f'Number of uptake valid units is {len(valid_units)} vs invalid units {len(invalid_Q_units)}')
print(f'Number of measurements removed by sorting units: {df_SCADS.shape[0] - dataset.shape[0]}')
print(f'Number of remaining measurements upon sorting units: {dataset.shape[0]}')

In [None]:
# plot how the adsorption units are distributed over the raw measurements
# (only the unfiltered raw dataset is plotted in this cell)
validator = DataValidation()
print('\nRaw dataset (each row corresponds to a measurement')
units_plot_options = {'title': 'Adsorption units distribution', 'y_label': 'Value'}
validator.class_distribution(df_SCADS, 'adsorptionUnits', **units_plot_options)

### 2.1.2 Distribution of experimental properties

In [None]:
# numerical features of the preprocessed dataset to show as boxplots
features = ['temperature', 'complexity', 'mol_weight', 'covalent_units', 'H_acceptors', 'H_donors', 'heavy_atoms']
# the title describes the plotted features (it used to repeat the title of the
# adsorption units plot from the previous cell)
validator.features_boxplot(df_processed_SCADS, features, title='Experimental properties distribution')

### 2.1.3 Distribution of pressure and uptake

In [None]:
# each row of the preprocessed dataset holds one list of pressures and one list of uptakes
SCADS_pressures = df_processed_SCADS['pressure_in_Pascal'].to_list()
SCADS_uptakes = df_processed_SCADS['uptake_in_mol_g'].to_list()

# flatten the per-row lists into a single list of points for each variable
flat_pressures, flat_uptakes = [], []
for pressure_series in SCADS_pressures:
    flat_pressures.extend(pressure_series)
for uptake_series in SCADS_uptakes:
    flat_uptakes.extend(uptake_series)

df_series = pd.DataFrame({'Pressure': flat_pressures, 'Uptake': flat_uptakes})

# one boxplot per variable: (column, plot title, x axis label)
boxplot_specs = (
    ('Pressure', 'Pressure distribution', 'Pressure (Pa)'),
    ('Uptake', 'Uptake distribution', 'uptake (mol/g)'),
)
for column, plot_title, axis_label in boxplot_specs:
    validator.features_boxplot(df_series, column, title=plot_title, x_label=axis_label)

## 2.2 Relationship between variables

The relationship between variables is explored in this section. At this stage, the scatter plot of uptake versus pressure points is shown.

In [None]:
# scatter plot over the columns of the flattened pressure/uptake dataframe
scatter_options = {
    'title': 'Pressure versus Uptake',
    'x_label': 'Pressure (Pa)',
    'y_label': 'Uptake (mol/g)',
}
validator.features_scatterplot(df_series, df_series.columns, **scatter_options)

### 2.2.2 Density-based spatial clustering (DBSCAN)

Use DBSCAN clustering to generate a cluster scatter plot of various features. In this section, clustering is performed between pressure (in Pa) and uptake (in mol/g)

In [None]:
# DBSCAN clustering on the flattened pressure/uptake dataframe
dbscan_options = {
    'min_samples': 5,
    'title': 'DBSCAN clustering Pressure versus Uptake',
    'x_label': 'Pressure (Pa)',
    'y_label': 'Uptake (mol/g)',
}
validator.DBSCAN_clustering(df_series, **dbscan_options)

## 2.3 Explore materials datasets

In [None]:
# count the distinct material names listed in each materials dataset
num_adsorbents = df_adsorbents['name'].nunique()
print(f'\nNumber of adsorbents from adsorbent dataset: {num_adsorbents}')
num_adsorbates = df_adsorbates['name'].nunique()
print(f'Number of adsorbates from adsorbates dataset: {num_adsorbates}')