# Download interaction energies

Here we extract the energies for the previously located dataset entries — the total QM energy of each entry, parsed from its `FHIaims.out` file in the NOMAD repository:

In [1]:
# imports
import requests
import re, os, time
import pandas as pd
from tqdm import tqdm

In [2]:
# load the previously located entries and append an empty column
# that the download loop fills with the energies
df = pd.read_csv('data/entry_IDs.csv').assign(Total_energy=None)
df

Unnamed: 0,Mainfile,Formula,Entry_ID,Upload_ID,Total_energy
0,AminoAcids/glu/Dipeptide/Ba/conf_0047/FHIaims.out,BaC8H13N2O4,---sV-GhvU9x9aLIxYIaSaRRMYb7,OhCdHFfSTLu8g6CGBISk4w,
1,AminoAcids/argH/Dipeptide/bare/conf_0855/FHIai...,C9H20N5O2,--6kkazavrym5kv8a9t9i9xFsVUY,MTSFEw9oS1GRiY0LVMgOYw,
2,AminoAcids/lysH/Dipeptide/bare/conf_0528/FHIai...,C9H20N3O2,--DqMR6pHAuouHktSJGR58iTBYmm,SXdit6b0RtibOeWIk3dulw,
3,AminoAcids/lys/Dipeptide/bare/conf_1802/FHIaim...,C9H19N3O2,--GsPYTZbOQDIzkGgdrzmr_40vc4,hSvqxrr0RjOhzwYU0stsOg,
4,AminoAcids/met/Dipeptide/Ca/conf_0159/FHIaims.out,C8CaH16N2O2S,--PlSeBOD6yCTDHBSdgPE5cEdzof,oHhmKWH2RHyyq7I2_Zdtkg,
...,...,...,...,...,...
23238,AminoAcids/arg/Dipeptide/bare/conf_2884/FHIaim...,C9H19N5O2,zymyALFYVstb5DXBA3b6I5I2MBOI,N-T_ymXOSk-DXMIjvGyFrQ,
23239,AminoAcids/arg/Dipeptide/Mg/conf_0285/FHIaims.out,C9H19MgN5O2,zzFI-iLg4I6FyTMYfzo0Xdk-N37Z,NIDpEFBaRuiPEiQKAUxB7g,
23240,AminoAcids/arg/Dipeptide/Ca/conf_0122/FHIaims.out,C9CaH19N5O2,zzVz_UTmBQoZeE-S0qsB6sz1F-Ja,_H1mhAjDTQm8tfuJmf_h2g,
23241,AminoAcids/arg/Dipeptide/bare/conf_0654/FHIaim...,C9H19N5O2,zzXvZ2GGD4UgvReG6X_Nrc9Uq6gt,kPLqJx0rQOePWx7CIOW6LQ,


In [3]:
# settings: name of the file requested for every calculation, and where results are saved
path_out = 'data/total_energies.csv'
mainfile = 'FHIaims.out'

# NOMAD API endpoints, all derived from the common base URL
nomad_app = 'https://repository.nomad-coe.eu/app/api'
nomad_raw = f'{nomad_app}/raw'     # raw (unprocessed) calculation files
nomad_rep = f'{nomad_app}/repo'    # metadata of a calculation

In [None]:
def download_energy(upload_id, entry_id, timeout=60):
    '''Returns total QM energy for the entry with given upload and entry IDs

    The entry's metadata is requested first to find the directory of the
    calculation. The file named by the module-level `mainfile` is then
    downloaded from that directory and searched for the total-energy line.

    Parameters
    ----------
    upload_id : str
        Upload ID, inserted into both the metadata and the raw-file URL.
    entry_id : str
        Entry ID, inserted into the metadata URL.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up. Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns
    -------
    float or None
        The value captured from the first matching total-energy line, or
        None when the downloaded file contains no such line.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    KeyError
        If the metadata has no 'mainfile' key.
    '''
    # request entry
    url = f'{nomad_rep}/{upload_id}/{entry_id}'
    response = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of parsing an error payload
    response.raise_for_status()
    metadata = response.json()

    # get mainfile
    calc_dir = os.path.dirname(metadata['mainfile'])
    path_mainfile = f'{calc_dir}/{mainfile}'

    # request the mainfile
    url = f'{nomad_raw}/{upload_id}/{path_mainfile}'
    response = requests.get(url, timeout=timeout)
    # an error page must not be mistaken for an output file without an energy line
    response.raise_for_status()
    # the energy line is plain ASCII, so a stray undecodable byte elsewhere
    # in the file is replaced rather than aborting the whole entry
    content = response.content.decode('utf-8', errors='replace')

    # get energy
    # NOTE(review): re.search returns the FIRST match; if a file can contain
    # several total-energy lines, confirm that the first one is the wanted value.
    pattern = r'Total energy of the DFT / Hartree-Fock s\.c\.f\. calculation\s*:\s*(-?\d+\.\d+)'
    match = re.search(pattern, content)
    E = float(match.group(1)) if match else None

    return E

# loading energies
n_entries = len(df)
pause_on_error = 30      # seconds to wait after a failed download before moving on
checkpoint_every = 500   # write partial results to disk every this many entries
failed = []              # (entry_id, error) of every entry that could not be downloaded

rows = zip(df.index, df['Upload_ID'], df['Entry_ID'])
for n_done, (i, upload_id, entry_id) in enumerate(tqdm(rows, total = n_entries), start = 1):
    try:
        E = download_energy(upload_id, entry_id)
    except Exception as err:
        # Exception excludes KeyboardInterrupt / SystemExit, so the run can
        # still be interrupted; the entry keeps its empty Total_energy value.
        failed.append((entry_id, repr(err)))
        # tqdm.write prints without breaking the progress bar
        tqdm.write(f'Entry {entry_id} failed: {err!r}')
        time.sleep(pause_on_error)
    else:
        df.loc[i, 'Total_energy'] = E

    # the full download takes many hours: save intermediate results so an
    # interruption does not lose everything fetched so far
    if n_done % checkpoint_every == 0:
        df.to_csv(path_out, index=False)

# final message
print(f'\nTotal of {df["Total_energy"].isna().sum()} NAs')
print(f'{len(failed)} entries failed to download')
print('Download is completed.')

# save data
df.to_csv(path_out, index=False)
df

 34%|█████████████████████████                                                | 7981/23243 [4:06:27<7:47:26,  1.84s/it]