In [16]:
import json
import os
from collections.abc import Mapping
from time import sleep

import pandas as pd
import requests

def check_nested_keys(d, keys):
    """Report whether `keys` can be followed, one after another, through nested mappings.

    Starting at `d`, each key is looked up in the current level and the walk
    descends into the value found there.

    Args:
        d: The outermost mapping (for example a decoded JSON object).
        keys: Iterable of keys describing the path to follow, outermost first.

    Returns:
        True if every key is present along the path (also when `keys` is
        empty), False as soon as a key is missing or the current level is not
        a mapping.
    """
    for key in keys:
        # Only mappings are descended into: on a str/list the `in` operator is
        # a substring/membership test and the following lookup would raise
        # TypeError, and on None or a number the `in` test itself would raise.
        if not isinstance(d, Mapping) or key not in d:
            return False
        d = d[key]
    return True

# Endpoint that the crawl loop POSTs its archive queries to.
url = 'http://nomad-lab.eu/prod/v1/api/v1/entries/archive/query'

# Elements referenced only by the optional 'not' filter that is commented out
# inside `query` below; the list has no effect until that filter is enabled.
# NOTE(review): a second "Rn" was listed here previously and has been dropped;
# confirm that no other element (e.g. "Ra") was intended in its place.
excluded_elements = [
    "He", "Ne", "Ar", "Kr", "Xe", "Rn", "U", "Th", "Tc", "Po", "Pu", "Pa",
    ]

# Search filter sent as the `query` field of the request body.
query = {
    # Optional filters, currently disabled:
    # "owner": "visible",
    # 'not':{
    #     'results.material.elements': {
    #         'any': excluded_elements
    #     }
    # }
    # Program name must match any of the listed codes.
    "results.method.simulation.program_name:any": [
        "VASP"
    ],
    # All of the listed quantities must be present on an entry.
    "quantities:all": [
        "results.properties.structures",
        "results.properties.structures.structure_original",
        "results.properties.structures.structure_conventional",
        "results.properties.structures.structure_primitive"
    ]
}

# Sent as the `required` field of the request body.
# NOTE(review): presumably restricts which archive sections the API returns
# ("*" = everything below that key) - confirm against the NOMAD API docs.
required = {
    "results": {
        "material": {
            "chemical_formula_reduced": "*"
        },
        "properties": {
            "structures": "*"
        }

    }
}

# --- Crawl settings ----------------------------------------------------------
COLUMNS = ['entry_id', 'chemical_formula', 'structure_original', 'structure_primitive', 'structure_conventional', 'json']
PAGE_SIZE = 100          # entries requested per API call
MAX_PAGES = 1            # stop after this many successful pages (1 = single-page smoke test); None = crawl until the API runs out
SAVE_EVERY = 100000      # write a CSV checkpoint once more than this many entries were kept since the last one
OUTPUT_DIR = 'data'      # checkpoint directory, created on demand
REQUEST_TIMEOUT = 60     # seconds before a stalled request is abandoned
RETRY_DELAY = 30         # seconds to wait before retrying a failed request
MAX_RETRIES = 10         # consecutive failed requests tolerated before the crawl stops
STRUCTURE_PATH = ['archive', 'results', 'properties', 'structures']

# Placeholder so `df` always exists; it is rebuilt from `records` when the crawl ends.
df = pd.DataFrame(columns=COLUMNS)

records = []             # one dict per kept entry; turned into a DataFrame once, not per row
page_after_value = None  # pagination cursor; None requests the first page
cnt = 0                  # entries kept so far
save_cnt = 0             # entries kept since the last checkpoint
total = 0                # entries returned by the API so far
pages_fetched = 0
failures = 0             # consecutive failed requests

try:
    while MAX_PAGES is None or pages_fetched < MAX_PAGES:
        # Fetch one page. HTTP error statuses and undecodable bodies are
        # handled like network errors so an error payload never reaches the
        # parsing code below.
        try:
            response = requests.post(
                url,
                json=dict(
                    query=query,
                    required=required,
                    pagination=dict(page_size=PAGE_SIZE, page_after_value=page_after_value),
                ),
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            failures += 1
            print(f"Error: {e}")
            if failures >= MAX_RETRIES:
                # Stop instead of retrying forever (e.g. on a permanently
                # rejected query); everything collected so far is kept.
                print(f"Giving up after {MAX_RETRIES} consecutive failed requests")
                break
            sleep(RETRY_DELAY)
            continue

        failures = 0
        pages_fetched += 1

        entries = data.get('data') or []
        if len(entries) == 0:
            break

        # A missing/None cursor means there is no further page. Requesting
        # again with None would start over at the first page.
        page_after_value = data.get('pagination', {}).get('next_page_after_value')

        for entry in entries:
            # Skip entries that carry no structure information at all.
            if not check_nested_keys(entry, STRUCTURE_PATH):
                continue

            results = entry['archive']['results']
            structures = results['properties']['structures']
            if not isinstance(structures, dict):
                continue
            material = results.get('material')
            if not isinstance(material, dict):
                material = {}

            # Individual fields that are absent are stored as None instead of
            # aborting the whole crawl with a KeyError.
            records.append({
                'entry_id': entry.get('entry_id'),
                'chemical_formula': material.get('chemical_formula_reduced'),
                'structure_original': structures.get('structure_original'),
                'structure_primitive': structures.get('structure_primitive'),
                'structure_conventional': structures.get('structure_conventional'),
                'json': json.dumps(entry),
            })

            cnt += 1
            save_cnt += 1

        total += len(entries)
        print(f"Processed {cnt}/{total} entries")

        # Periodic checkpoint containing everything collected so far.
        if save_cnt > SAVE_EVERY:
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            checkpoint_path = os.path.join(OUTPUT_DIR, f'{page_after_value}.csv')
            pd.DataFrame(records, columns=COLUMNS).to_csv(checkpoint_path, index=False)
            print("Saved to csv")
            save_cnt = 0

        if page_after_value is None:
            break
finally:
    # Build the frame in one go, also when the cell is interrupted, so that
    # already downloaded entries remain available as `df`.
    df = pd.DataFrame(records, columns=COLUMNS)

Processed 34/100 entries


In [17]:
# Show the collected entries; as the cell's last expression the frame is rendered by the notebook.
df

Unnamed: 0,entry_id,chemical_formula,structure_original,structure_primitive,structure_conventional,json
0,----9KNOtIZc9bDFEWxgjeSRsJrC,CaFe2Re,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""----9KNOtIZc9bDFEWxgjeSRsJrC"", ""..."
1,---K_oamb0brcfuzk7XeBGT1l6pp,Ag2LaNi,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---K_oamb0brcfuzk7XeBGT1l6pp"", ""..."
2,---NRSI3eJ3cEV97kfl9eKZ38L5M,NaNi2V,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---NRSI3eJ3cEV97kfl9eKZ38L5M"", ""..."
3,---OWGSfJGTXSTY71gcX0l_HFO9A,LiSr2Zr,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---OWGSfJGTXSTY71gcX0l_HFO9A"", ""..."
4,---SR2t2-lkZvBWVaF-eCLkf7BQh,OsSbY,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---SR2t2-lkZvBWVaF-eCLkf7BQh"", ""..."
5,---mWpSogZteOYvPrpyT6PUZfSQJ,MnNaPm2,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---mWpSogZteOYvPrpyT6PUZfSQJ"", ""..."
6,---n93Np1-HZAuVg04MKVYobIt2m,As2F12I2Se12,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---n93Np1-HZAuVg04MKVYobIt2m"", ""..."
7,---nb1Zrg9nx-S-SZgvXiB959vtd,Ir2ReSi,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---nb1Zrg9nx-S-SZgvXiB959vtd"", ""..."
8,---vOlFiwCCU4JeZ0QjGTKQZYbXS,AgNiZr2,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---vOlFiwCCU4JeZ0QjGTKQZYbXS"", ""..."
9,---x0dhqdpEkEjP-AT6KK_5q_E8D,ErNaO3,"{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{'dimension_types': [1, 1, 1], 'lattice_vector...","{""entry_id"": ""---x0dhqdpEkEjP-AT6KK_5q_E8D"", ""..."
