# Prepare merged SDF file for GCN training

In [1]:
# Maps each job type to the CSV file holding its property database.
# All paths are relative to the directory this notebook is run from.
job_dict = dict(
    density='../../../Database/density_database.csv',
    viscosity='../../../Database/viscosity_database.csv',
    mp='../../../Database/mp_database.csv',
    tg='../../../Database/tg_database.csv',
    td='../../../Database/td_database.csv',
    ph='../../../Database/pH_database.csv',
    surface_tension='../../../Database/surface_tension_database.csv',
    xTB_DFT='../../../Database_Structure/structure/xTB_DFT/data.csv',
)

## 1. DES

### Get the index of DES in database

In [2]:
import pandas as pd

job_type = 'density'

# Structure table: a row's position (1-based) is the DES index used in the
# `DES_{i}.sdf` file names built by the merge cell.
src_database = pd.read_csv('../../../Database_Structure/DES.csv')

DES_list = []            # identity string of every row, in file order
des_first_index = {}     # identity string -> 1-based index of its first occurrence
for i, (hba, hbd, ratio) in enumerate(zip(src_database['HBA_smiles'],
                                          src_database['HBD_smiles'],
                                          src_database['molar_ratio'])):
    # A DES is identified by HBA SMILES + HBD SMILES + molar ratio.
    text = hba + hbd + str(ratio)
    if text in des_first_index:
        # Duplicate row in DES.csv: report its 0-based position; the first
        # occurrence keeps the index.
        print(i)
    else:
        des_first_index[text] = i + 1
    DES_list.append(text)

# For every row of the property database, look up the 1-based DES index.
# A dict lookup replaces the per-row linear scan of `DES_list.index(...)`.
index_list = []
job_database = pd.read_csv(job_dict[job_type])
for row, (hba, hbd, ratio) in enumerate(zip(job_database['HBA_smiles'],
                                            job_database['HBD_smiles'],
                                            job_database['molar_ratio (HBA:HBD)'])):
    DES_type = hba + hbd + str(ratio)
    if DES_type not in des_first_index:
        # Same exception type as list.index(), but says which row is at fault.
        raise ValueError(
            f'Row {row} of {job_dict[job_type]}: DES {DES_type!r} not found in DES.csv'
        )
    index_list.append(des_first_index[DES_type])

### Convert XYZ to SDF

In [6]:
import subprocess
import os

# XYZ inputs and SDF outputs live in the same directory; `sdf_dir` is reused
# by the merge cell below.
xyz_dir = sdf_dir =  '../../../Database_Structure/MD_simulation/xTB/xTB_MDS_002/result_dir/lowest_E_frame/'

failed_files = []
for file in os.listdir(xyz_dir):
    if not file.endswith('.xyz'):
        continue
    xyz_path = os.path.join(xyz_dir, file)
    # Swap only the extension; str.replace would also rewrite any other
    # '.xyz' occurrence in the path.
    sdf_path = os.path.splitext(xyz_path)[0] + '.sdf'
    if os.path.exists(sdf_path):
        # Already converted on a previous run.
        continue
    # Run Open Babel and wait for it to finish, so every SDF exists before the
    # merge cell runs. An argument list (no shell) keeps paths with spaces or
    # shell metacharacters intact. Raises FileNotFoundError if `obabel` is not
    # on PATH.
    result = subprocess.run(['obabel', '-ixyz', xyz_path, '-osdf', '-O', sdf_path],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
    if result.returncode != 0 or not os.path.exists(sdf_path):
        failed_files.append(file)

# Report failed conversions instead of dropping them silently.
if failed_files:
    print(f'obabel failed for {len(failed_files)} file(s): {failed_files}')


### Merge SDF

In [7]:
import os

# One candidate SDF path per database row, named after the row's 1-based DES index.
candidate_paths = (os.path.join(sdf_dir, f'DES_{des_index}.sdf') for des_index in index_list)

# Keep only the files that exist on disk.
# NOTE(review): missing files are skipped silently, so `sdf_list` can be shorter
# than `index_list` — confirm downstream code does not assume row alignment.
sdf_list = [path for path in candidate_paths if os.path.exists(path)]

In [8]:
from xyz2sdf import merge_sdf

# Destination of the merged SDF for the current job type.
merged_sdf_path = f'../../database/{job_type}/merged_mol.sdf'

# Left as the last expression so the notebook displays the call's return value.
merge_sdf(sdf_list, merged_sdf_path)

total sdf: 2599


'../../database/density/merged_mol.sdf'

## 2. xTB~DFT

### Convert XYZ to SDF

In [2]:
import subprocess
import os

# XYZ inputs and SDF outputs live in the same directory; `sdf_dir` is reused
# by the merge cells below.
# NOTE(review): this path starts with '../../' while job_dict uses '../../../'
# for Database_Structure — confirm which working directory this section expects.
xyz_dir = sdf_dir =  '../../Database_Structure/structure/xTB_DFT/xTB_structure/'

failed_files = []
for file in os.listdir(xyz_dir):
    if not file.endswith('.xyz'):
        continue
    xyz_path = os.path.join(xyz_dir, file)
    # Swap only the extension; str.replace would also rewrite any other
    # '.xyz' occurrence in the path.
    sdf_path = os.path.splitext(xyz_path)[0] + '.sdf'
    if os.path.exists(sdf_path):
        # Already converted on a previous run.
        continue
    # Run Open Babel and wait for it to finish, so every SDF exists before the
    # merge cell runs. An argument list (no shell) keeps paths with spaces or
    # shell metacharacters intact. Raises FileNotFoundError if `obabel` is not
    # on PATH.
    result = subprocess.run(['obabel', '-ixyz', xyz_path, '-osdf', '-O', sdf_path],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
    if result.returncode != 0 or not os.path.exists(sdf_path):
        failed_files.append(file)

# Report failed conversions instead of dropping them silently.
if failed_files:
    print(f'obabel failed for {len(failed_files)} file(s): {failed_files}')

### Merge SDF

In [7]:
import os

import pandas as pd

job_type = 'xTB_DFT'
# keep_default_na=False reads empty cells as '' rather than NaN, which is what
# the empty-string check on the DFT column below relies on.
job_database = pd.read_csv(job_dict[job_type], keep_default_na = False)

# Collect the SDF path of every substance whose DFT value is present.
# Iterating both columns together avoids rebuilding the DFT list on each row.
sdf_list = []
for file_name, dft_value in zip(job_database['Substance'], job_database['DFT']):
    if dft_value == '':
        # No DFT value for this substance: leave it out of the merged set.
        continue
    sdf_path = os.path.join(sdf_dir, f'{file_name}.sdf')
    sdf_list.append(sdf_path)

In [11]:
from xyz2sdf import merge_sdf

# The merged file is written next to the individual SDF files.
merged_sdf_path = os.path.join(sdf_dir, 'merged_mol.sdf')

# Left as the last expression so the notebook displays the call's return value.
merge_sdf(sdf_list, merged_sdf_path)

total sdf: 1772


'../../Database_Structure/structure/xTB_DFT/xTB_structure/merged_mol.sdf'