# Fetching data from NOMAD

This notebook collects the perovskite solar cell data for all 43 119 cells from the NOMAD repository. For each entry it stores the ETL, the HTL and the publication reference. The query also requests further properties (e.g. the PCE), which can be extracted in the same way if necessary.

In [1]:
import numpy as np
import pandas as pd
import requests
import pickle

# Fetching data

In [2]:
# Function to extract values from the downloaded entries
def extract_values(entry):
    """Append one entry's HTL, ETL and references to the global result lists.

    The values are looked up in ``entry`` and appended to the module-level
    lists ``htl``, ``etl`` and ``ref`` (which must already exist). If a value
    cannot be looked up, the string ``'None'`` is appended instead, so the
    three lists always grow by exactly one element per call and stay aligned.

    Args:
        entry: One entry of the ``data`` list of a NOMAD query response
            (a nested dict).

    Returns:
        The tuple ``(htl, etl, ref)`` of the (mutated) module-level lists.
    """
    solar_cell_path = ('results', 'properties', 'optoelectronic', 'solar_cell')
    lookups = (
        (htl, solar_cell_path + ('hole_transport_layer',)),
        (etl, solar_cell_path + ('electron_transport_layer',)),
        (ref, ('references',)),
    )

    for target, key_path in lookups:
        value = entry
        try:
            for key in key_path:
                value = value[key]
        except (KeyError, TypeError, IndexError):
            # Only lookup failures (missing key, or a level that is not a
            # mapping) mean "value not available". Anything else, e.g. a
            # KeyboardInterrupt during the long download, must propagate.
            value = 'None'
        target.append(value)

    return htl, etl, ref

In [3]:
# gets all ~43 119 PSCs from NOMAD
#
# The entries are downloaded page by page from the NOMAD `entries/query`
# endpoint. After each page, the cursor returned by the API in
# `next_page_after_value` is sent back as `page_after_value` to request the
# following page; the loop ends when the API returns no data or no cursor.

# initialize empty lists where collected values will be stored
htl = []
etl = []
ref = []

page_after_value = None
base_url = 'https://nomad-lab.eu/prod/v1/api/v1/'

# access NOMAD API and query for all cells with the property SolarCell that
# have information in the mentioned sections.
# The request body is identical for every page except for the pagination
# cursor, so it is built once and only the cursor is updated inside the loop.
query_body = {
    "owner": "visible",
    "aggregations": {},
    "query": {
        "and": [
            {"sections:all": ["nomad.datamodel.results.SolarCell"]},
        ]},
    "required": {
        "results": {
            "material": {
                "chemical_formula_reduced": "*",
                "structural_type": "*"},
            "properties": {
                "optoelectronic": {
                    "band_gap": "*",
                    "solar_cell": {
                        "open_circuit_voltage": "*",
                        "short_circuit_current_density": "*",
                        "fill_factor": "*",
                        "efficiency": "*",
                    }}}},
    },
    "pagination": {"page_size": 10,
                   "page_after_value": page_after_value},
}

while True:
    query_body["pagination"]["page_after_value"] = page_after_value

    # The timeout keeps a stalled connection from hanging the cell forever.
    response = requests.post(f'{base_url}entries/query', json=query_body,
                             timeout=120)
    # Fail with a clear HTTP error instead of a KeyError on the JSON below.
    response.raise_for_status()
    data = response.json()

    if not data['data']:
        print('debug: no data found')
        break

    # extract the values from current page
    # Every page, including the last one, is filtered the same way: only
    # entries without a 'results' section are skipped.
    for entry in data['data']:
        if 'results' not in entry:
            continue
        extract_values(entry)

    # A missing (or null) cursor means this was the last page. Continuing
    # with a null cursor would restart the download from the first page.
    page_after_value = data['pagination'].get('next_page_after_value')
    if page_after_value is None:
        break

In [None]:
# put result of query into a pandas dataframe
df_all_ctls = pd.DataFrame({'etl': etl, 'htl': htl, 'ref': ref})


def _first_reference(refs):
    """Return the first reference of an entry, or None if there is none.

    Entries without references carry the placeholder string 'None' in the
    `ref` list. Indexing that string with [0] would yield the character 'N'
    as the "reference", so strings are handled before any indexing.
    """
    if isinstance(refs, str):
        # Placeholder -> no reference; any other bare string is kept as is.
        return None if refs == 'None' else refs
    return refs[0] if refs else None


# clean up ref so it shows only the reference to the paper
df_all_ctls['ref'] = df_all_ctls['ref'].apply(_first_reference)

# Pickle the result
Fetching all those entries took about 30 minutes, so the resulting DataFrame is pickled here.

In [5]:
# Write the collected DataFrame to disk.
from pathlib import Path

output_path = Path('data_RO1/df_all_ctls.pkl')
# Create the output folder if needed, so the write does not fail with
# FileNotFoundError when 'data_RO1' does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump(df_all_ctls, f)