### Paths to the databases of the different properties

In [1]:
import pandas as pd

# Structure-level table: one row per DES, holding the computed descriptors
# that are looked up by DES row index further down.
src_database = pd.read_csv('../../../Database_Structure/DES.csv', keep_default_na = False)

# Single place where the path of every property table is declared; job_dict
# below refers back to these entries instead of repeating the literals.
source_dict = {
    'density': '../../../Database/density_database.csv',
    'viscosity': '../../../Database/viscosity_database.csv',
    'mp': '../../../Database/mp_database.csv',
    'tg': '../../../Database/tg_database.csv',
    'td': '../../../Database/td_database.csv',
    'ph': '../../../Database/pH_database.csv',
    'surface_tension': '../../../Database/surface_tension_database.csv',
    'xTB_DFT': '../../../Database_Structure/structure/xTB_DFT/data.csv'
}

# Column of src_database that supplies the hydrogen-bond count feature.
hbond_type = 'Pauling'

# Parse each property table once and reuse it for all of its columns.
# keep_default_na = False keeps the raw cell text instead of converting
# NA-like cells to NaN.
_density_table = pd.read_csv(source_dict['density'], keep_default_na = False)
_viscosity_table = pd.read_csv(source_dict['viscosity'], keep_default_na = False)

# Per-job configuration:
#   'source'  - path of the property table
#   'feature' - 'exp' columns are aligned with the rows of the property table,
#               'calc' columns are aligned with the rows of src_database
#   'target'  - property values, aligned with the rows of the property table
#   'output'  - path of the CSV written for the job
# Only 'density' and 'viscosity' define 'feature' and 'target' so far.
job_dict = {
    'density': {
        'source': source_dict['density'],
        'feature': {
            'exp': {
                'water_content': _density_table['water content (wt.%)'].tolist(),
                'temperature': _density_table['Tmeasure (K)'].tolist(),
                },
            'calc':{
                'num_hbond': src_database[hbond_type].tolist(),
                'interaction_energy': src_database['Interaction_energy(kcal/mol)'].tolist()
                },
            },
        'target': _density_table['density (g cm-3)'].tolist(),
        'output': '../../database/density/attr_train.csv'
        },
    'viscosity': {
        'source': source_dict['viscosity'],
        'feature': {
            'exp': {
                'water_content': _viscosity_table['water content (wt.%)'].tolist(),
                'temperature': _viscosity_table['Tmeasure (K)'].tolist(),
                },
            'calc': {
                'num_hbond': src_database[hbond_type].tolist(),
                },
            },
        'target': _viscosity_table['viscosity (mPa s)'].tolist(),
        'output': '../../database/viscosity/attr_train.csv'
        },
    'mp': {
        'source': source_dict['mp'],
        'output': '../../database/mp/attr_train.csv'
        },
    'tg': {
        'source': source_dict['tg'],
        'output': '../../database/tg/attr_train.csv'
        },
    'td': {
        'source': source_dict['td'],
        'output': '../../database/td/attr_train.csv'
        },
    'ph': {
        'source': source_dict['ph'],
        'output': '../../database/ph/attr_train.csv'
        },
    'surface_tension': {
        'source': source_dict['surface_tension'],
        'output': '../../database/surface_tension/attr_train.csv'
        },
    'xTB_DFT': {
        'source': source_dict['xTB_DFT'],
        'output': '../../database/xTB_DFT/attr_train.csv'
        },
}

### Determine the type of job

In [2]:
# Key of job_dict selecting which property table is processed below.
# Only the 'density' and 'viscosity' entries of job_dict define the
# 'feature' and 'target' fields that gen_attr reads.
job_type = 'viscosity'

### Match each entry with the index of its corresponding DES

In [3]:
# Identify every DES of the structure database by a composite key:
# HBA SMILES + HBD SMILES + molar ratio. DES_list keeps the keys in row
# order; DES_index maps each key to the row of its FIRST occurrence, which is
# the row list.index() would return, but with constant-time lookups.
DES_list = []
DES_index = {}
for i, (hba, hbd, ratio) in enumerate(zip(src_database['HBA_smiles'],
                                          src_database['HBD_smiles'],
                                          src_database['molar_ratio'])):
    text = hba + hbd + str(ratio)
    if text in DES_index:
        # Duplicate DES definition: report the row number of the repeat.
        print(i)
    else:
        DES_index[text] = i
    DES_list.append(text)

# For every row of the selected property table, look up the row of the
# matching DES in the structure database.
# NOTE(review): this table is read with pandas' default NA handling, unlike
# the keep_default_na = False reads used elsewhere - confirm this is intended.
index_list = []
job_source = job_dict[job_type]['source']
job_database = pd.read_csv(job_source)
for row, (hba, hbd, ratio) in enumerate(zip(job_database['HBA_smiles'],
                                            job_database['HBD_smiles'],
                                            job_database['molar_ratio (HBA:HBD)'])):
    DES_type = hba + hbd + str(ratio)
    if DES_type not in DES_index:
        # Same exception type as list.index(), but naming the offending row.
        raise ValueError(f'row {row} of {job_source} matches no DES in the structure database: {DES_type!r}')
    index_list.append(DES_index[DES_type])

In [4]:
import os

def _parse_exp_value(raw):
    """Convert one experimental-condition entry to a float.

    Returns a ``(value, recognised)`` pair. ``recognised`` is False when the
    entry matched none of the supported notations and 0.0 was substituted.

    Supported notations:
      * anything ``float()`` accepts (NaN is replaced by 0.0),
      * text containing ``'<'``, mapped to 0.0,
      * a range ``'a-b'``, mapped to the mean of its first two parts.
    """
    try:
        value = float(raw)
    except ValueError:
        pass
    else:
        # NaN is the only float that is not equal to itself.
        return (0.0 if value != value else value), True

    if '<' in raw:
        return 0.0, True
    if '-' in raw:
        bounds = raw.split('-')
        try:
            return (float(bounds[0]) + float(bounds[1])) * 0.5, True
        except ValueError:
            # e.g. a lone '-' placeholder: not a usable range.
            return 0.0, False
    return 0.0, False


def gen_attr(job_type: str, DES_dir: str):
    """Assemble the feature and target columns of one property job.

    Reads the module-level ``job_dict`` and ``index_list``. A row of the
    property table is kept only when the file ``DES_{index + 1}_trj.xyz`` of
    its matched DES exists in ``DES_dir``.

    Args:
        job_type: key of ``job_dict``; the entry must define 'feature'
            (with 'exp' and 'calc') and 'target'.
        DES_dir: directory that is searched for the ``DES_*_trj.xyz`` files.

    Returns:
        dict mapping every 'exp' feature name, every 'calc' feature name and
        ``job_type`` (the target) to a list of values. All lists describe the
        same kept rows in the same order. Unrecognised feature entries become
        0.0 and unparsable targets become NaN; both are reported with
        ``print`` so the row alignment between columns is never lost.
    """
    job = job_dict[job_type]

    # One filesystem check per distinct DES instead of one per row per column.
    trj_exists = {
        index: os.path.exists(os.path.join(DES_dir, f'DES_{index + 1}_trj.xyz'))
        for index in set(index_list)
    }

    attr_dict = {}

    # Experimental conditions: aligned with the rows of the property table.
    for feature, values in job['feature']['exp'].items():
        feature_list = []
        for i, (ft, index) in enumerate(zip(values, index_list)):
            if not trj_exists[index]:
                continue
            value, recognised = _parse_exp_value(ft)
            if not recognised:
                print(f'{feature}: {i}; {ft}')
            feature_list.append(value)
        attr_dict[feature] = feature_list

    # Computed descriptors: aligned with the structure database, so they are
    # looked up through the matched DES row index.
    for feature, values in job['feature']['calc'].items():
        attr_dict[feature] = [values[index] for index in index_list if trj_exists[index]]

    target_list = []
    for i, (tg, index) in enumerate(zip(job['target'], index_list)):
        if not trj_exists[index]:
            continue
        try:
            target_list.append(float(tg))
        except ValueError:
            print(f'target: {i}; {tg}')
            # Keep a placeholder so the target stays aligned with the features.
            target_list.append(float('nan'))
    attr_dict[job_type] = target_list

    return attr_dict

In [9]:
# Sanity check: build the attribute columns for the selected job and show how
# many 'num_hbond' entries remain after gen_attr's trajectory-file filter.
attr_dict = gen_attr(job_type, '../../../Database_Structure/MD_simulation/xTB/xTB_MDS_002/result_dir/')
len(attr_dict['num_hbond'])

1910

In [5]:
# Build the attribute columns for the selected job and write them to the
# job's output CSV, one column per feature plus the target column.
attr_dict = gen_attr(job_type, '../../../Database_Structure/MD_simulation/xTB/xTB_MDS_002/result_dir/')
output_path = job_dict[job_type]['output']
output_dir = os.path.dirname(output_path)
if output_dir:
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok = True)
pd.DataFrame(attr_dict).to_csv(output_path, index = False)