Use the code below to generate the training scripts for each case. There are 7 datasets:

qh9_occ: Valence occupied orbitals from QH9 (Def2SVP) (>-1.75 Ha)

qh9_virt: All virtual orbitals from QH9 (Def2SVP)

qh9_v12: Valence virtual orbitals from QH9 (Def2SVP) (<1.2 Ha)

sto3g_occ: Valence occupied orbitals from QM9@STO-3G (>-1.75 Ha)

sto3g_virt: All (valence) virtual orbitals from QM9@STO-3G

tm_occ: Valence occupied orbitals from the chargeless (zero-charge) subset of TMQM@STO-3G (>-1.75 Ha)

tm_virt: All (valence) virtual orbitals from the chargeless (zero-charge) subset of TMQM@STO-3G

In [4]:
import os

# Training-set size: vary NUM_TRAIN to trace a learning curve whose
# performance is measured on NUM_VAL samples.
NUM_TRAIN = 51200
NUM_VAL = 5000

# Baseline settings applied to every generated script; per-dataset overrides
# are layered on top of these. All lengths are in bohr (cutoff = 7.6 bohr).
default_params = {
    "CUTOFF": 7.6,
    "LOMAX": 2,
    "NC": 16,
    "LAYERS": 2,
    "N_RBF": 16,
    "N_RSAMPLES": 16,
    "BATCH_SIZE": 128,
    "IN_MEMORY": True,
    "NUM_TRAIN": NUM_TRAIN,
    "NUM_VAL": NUM_VAL,
    "LR": 0.001,
    "MAX_STEPS": 300000,
    "STACKING": True,
    "IRREP_MIXING": False,
    "CHARGE_EMBEDDING": False,
}

# Per-dataset [mean, std] pairs; these are good estimates of the true values.
mean_std = {
    "qh9_occ": [-0.5144, 0.2229],
    "qh9_virt": [1.4346, 0.9854],
    "qh9_v12": [0.5353, 0.3168],
    "sto3g_occ": [-0.6605, 0.2801],
    "sto3g_virt": [0.6892, 0.1825],
    "tm_occ": [-0.6154, 0.2862],
    "tm_virt": [0.6896, 0.2229],
}

# Per-dataset overrides: one script is generated for each key in this dict.
# Notes:
#   * batch_size & num_layers had to be lowered for the tm runs to cut memory.
#   * Adjust the DATA_NAME entries below for the new total # of data.
#   * Setting IN_MEMORY to False for all of these is recommended, because you
#     have enough cpus.
script_gen_dct = {
    "sto3g_occ": {"DATA_NAME": "sto3g_5000_occ.h5", "LINMAX": 1, "IN_MEMORY": True},
    "sto3g_virt": {"DATA_NAME": "sto3g_5000_virt.h5", "LINMAX": 1, "IN_MEMORY": True},
    "tm_occ": {
        "DATA_NAME": "tm_5000_occ.h5",
        "LINMAX": 2,
        "LAYERS": 1,
        "BATCH_SIZE": 32,
        "MAX_STEPS": 1200000,
        "IN_MEMORY": False,
    },
    "tm_virt": {
        "DATA_NAME": "tm_5000_virt.h5",
        "LINMAX": 2,
        "LAYERS": 1,
        "BATCH_SIZE": 32,
        "MAX_STEPS": 1200000,
        "IN_MEMORY": False,
    },
    "qh9_occ": {"DATA_NAME": "qh9_5000_occ.h5", "LINMAX": 2, "IN_MEMORY": True},
    "qh9_virt": {"DATA_NAME": "qh9_5000_virt.h5", "LINMAX": 2, "IN_MEMORY": False},
    "qh9_v12": {"DATA_NAME": "qh9_5000_v12.h5", "LINMAX": 2, "IN_MEMORY": True},
}

def _render_line(line: str, params: dict) -> str:
    """Return `line`, rewritten if it assigns one of the keys in `params`.

    A line counts as an assignment to a key when the text before its first
    "=" (ignoring surrounding whitespace) is exactly that key. Such a line is
    replaced by ``KEY = value`` using the value from `params`; string values
    are wrapped in double quotes. Every other line is returned unchanged.

    The line's leading indentation is kept, so an assignment nested inside a
    block stays syntactically valid after replacement. Anything else on the
    original line (old value, trailing comment) is dropped.
    """
    target, sep, rest = line.partition("=")
    keyword = target.strip()
    # No "=" at all, or "==" (a comparison), means this is not an assignment.
    if not sep or rest.startswith("=") or keyword not in params:
        return line
    value = params[keyword]
    indent = line[: len(line) - len(line.lstrip())]
    rendered = f'"{value}"' if isinstance(value, str) else f"{value}"
    return f"{indent}{keyword} = {rendered}\n"


# Read the template once; every generated script is a line-by-line copy of it
# with the parameter assignments substituted.
with open("template_script.py", "r") as f:
    lines = f.readlines()

for name, overrides in script_gen_dct.items():
    mean, std = mean_std[name]
    # Precedence (lowest to highest): defaults, per-dataset overrides, then
    # the per-dataset log name and normalisation constants. Building a fresh
    # dict leaves the entries of script_gen_dct untouched.
    dct = {
        **default_params,
        **overrides,
        "LOGS_NAME": name,
        "AVGE0": mean,
        "SIGMA": std,
    }
    script_name = f"../../scripts_to_run/{name}_{NUM_TRAIN}.py"
    # Create the output directory on first use instead of failing on open().
    os.makedirs(os.path.dirname(script_name), exist_ok=True)
    # Mode "w" truncates an existing script, so no separate delete is needed.
    with open(script_name, "w") as f:
        f.writelines(_render_line(line, dct) for line in lines)
