### Paths to the databases of the different properties

In [46]:
# Maps each job type to the CSV file holding its data
# (paths are relative to this notebook's directory).
job_dict = dict(
    density='../../../Database/density_database.csv',
    viscosity='../../../Database/viscosity_database.csv',
    mp='../../../Database/mp_database.csv',
    tg='../../../Database/tg_database.csv',
    td='../../../Database/td_database.csv',
    ph='../../../Database/pH_database.csv',
    surface_tension='../../../Database/surface_tension_database.csv',
    xTB_DFT='../../../Database_Structure/structure/xTB_DFT/data.csv',
)

### Determine the type of job

In [47]:
# job_type selects the entry of job_dict to load; TARGET is the column of that
# database that is read as the target value.
job_type, TARGET = 'viscosity', 'viscosity (mPa s)'

### Match each entry with the index of its corresponding DES

In [48]:
import pandas as pd

src_database = pd.read_csv('../../../Database_Structure/DES.csv')

# Identify every DES by the concatenation "HBA SMILES + HBD SMILES + molar ratio".
# DES_list keeps one key per row of DES.csv (row order preserved), while
# DES_position maps each key to the row of its FIRST occurrence so that lookups
# are O(1) instead of a linear `list.index` scan per entry.
DES_list = []
DES_position = {}
for i, (hba, hbd, ratio) in enumerate(zip(src_database['HBA_smiles'],
                                          src_database['HBD_smiles'],
                                          src_database['molar_ratio'])):
    text = hba + hbd + str(ratio)
    if text in DES_position:
        # Duplicate DES definition: report the row index of the repeated entry.
        print(i)
    else:
        DES_position[text] = i
    DES_list.append(text)

# For every row of the job database, look up the (0-based) row of the matching DES.
index_list = []
job_database = pd.read_csv(job_dict[job_type])
for i, (hba, hbd, ratio) in enumerate(zip(job_database['HBA_smiles'],
                                          job_database['HBD_smiles'],
                                          job_database['molar_ratio (HBA:HBD)'])):
    DES_type = hba + hbd + str(ratio)
    if DES_type not in DES_position:
        # Same exception type as `list.index` raised before, but with enough
        # context to locate the offending row.
        raise ValueError(
            f'Row {i} of {job_dict[job_type]} has no matching DES in DES.csv: {DES_type!r}'
        )
    index_list.append(DES_position[DES_type])

In [49]:
import os

DES_dir = '../../../Database_Structure/MD_simulation/xTB/xTB_MDS_002/result_dir/'

# Keep only the targets whose DES has a trajectory file in DES_dir.
# File names use index + 1, while index_list holds 0-based positions.
target_list = []
for target, index in zip(job_database[TARGET], index_list):
    trajectory_path = os.path.join(DES_dir, f'DES_{index + 1}_trj.xyz')
    if os.path.exists(trajectory_path):
        target_list.append(target)

### Split `target_list` into fixed-width target bins, identifying each sample by its index

In [51]:
from typing import List
import random

# Seed Python's global `random` generator, which `random.sample` below draws from,
# so that repeated runs produce the same sequence of samples.
seed = 42
random.seed(seed)

def get_index_randomly(_list: List, floor: float, ceiling: float, split: List[float], rng=None) -> List[List[int]]:
    """Randomly partition the indices of the values lying in ``[floor, ceiling)``.

    Args:
        _list: Sequence of numeric values; the returned integers are positions
            in this sequence.
        floor: Inclusive lower bound of the value range.
        ceiling: Exclusive upper bound of the value range.
        split: Fractions of the selected indices assigned to each part. Every
            part except the last gets ``int(fraction * n)`` indices; the last
            part receives all remaining indices regardless of its fraction.
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used.

    Returns:
        One list of indices per entry of ``split``. The lists are pairwise
        disjoint and together contain every index whose value is in range.

    Raises:
        ValueError: Propagated from ``sample`` when a part would need more
            indices than remain (e.g. the fractions sum to well over 1).
    """
    sampler = random if rng is None else rng
    remaining = [index for index, num in enumerate(_list) if floor <= num < ceiling]
    tot_len = len(remaining)

    split_list = []
    sum_len = 0
    last = len(split) - 1
    for i, percentage in enumerate(split):
        # The last part absorbs whatever the truncating int() left over.
        length = tot_len - sum_len if i == last else int(percentage * tot_len)
        sum_len += length
        sampled_list = sampler.sample(remaining, length)
        split_list.append(sampled_list)
        # Set membership keeps the removal linear instead of quadratic while
        # preserving the order of the remaining indices.
        taken = set(sampled_list)
        remaining = [element for element in remaining if element not in taken]
    return split_list

In [52]:
step = 10          # width of each target bin
max_target = 1000  # bins whose upper edge would exceed this value are not considered
split = [0.6, 0.2, 0.2]
split_list = [[] for _ in split]

# Walk over the bins [0, step), [step, 2 * step), ... and split the indices of
# each bin separately, so every part of the split covers the target range.
for i in range(max_target // step):
    floor = i * step
    ceiling = floor + step  # was hard-coded as (i + 1) * 10, ignoring `step`
    sub_index_list = get_index_randomly(target_list, floor, ceiling, split)
    # A bin is used only if it contributes at least one sample to every part;
    # otherwise the whole bin is skipped and its samples appear in no part.
    if min(len(part) for part in sub_index_list) == 0:
        continue
    for sub_split_list, _sub_index_list in zip(split_list, sub_index_list):
        sub_split_list.extend(_sub_index_list)

### Save the split to `.npy` file

In [53]:
import numpy as np

# Store the parts as a 1-D object array holding one int64 index array per part.
# Filling a pre-allocated object array element by element keeps this layout even
# when all parts happen to have the same length; `np.array([...], dtype='object')`
# would collapse that case into a single 2-D object array.
split_array = np.empty(len(split_list), dtype=object)
for position, indices in enumerate(split_list):
    split_array[position] = np.array(indices, dtype=np.int64)

# Write next to the database of the selected job instead of always overwriting
# the viscosity split (identical path while job_type == 'viscosity').
np.save(f'../../database/{job_type}/split.npy', split_array)