In [4]:
import pickle
import pandas as pd
import numpy as np
import ast
import requests
import shutil

# Data preparation for Research Objective 2
Requires:

- df_all_ctls_identified_v2.pkl (created with "Identification_results.ipynb")
- dict_all_cells_v2.pkl (created with "Identify_CID_v2.ipynb")

Outputs a dataframe for the ML tasks.

In [7]:
# Execute to make sure that you have the newest version of the RO1 results.
# WARNING: This overwrites the files if they already exist.
# Both pickles are copied from the Research Objective 1 folder into data_RO2/,
# which is where the cells further down open them.
# NOTE(review): shutil.copyfile does not create the destination folder - data_RO2/ presumably has to exist already.
shutil.copyfile("../ResObj_1/data_RO1/df_all_ctls_identified_v2.pkl", "data_RO2/df_all_ctls_identified_v2.pkl")
shutil.copyfile("../ResObj_1/data_RO1/dict_all_cells_v2.pkl", "data_RO2/dict_all_cells_v2.pkl")

'data_RO2/dict_all_cells_v2.pkl'

# Download from NOMAD API
Data are downloaded again for Research Objective 2 because additional properties are needed, e.g. PCE.

In [2]:
# Root URL of the NOMAD API; endpoint paths such as 'entries/query' are appended to it.
base_url = 'https://nomad-lab.eu/prod/v1/api/v1/'

def extract_values(entry):
    """
    Pull the properties of one NOMAD entry into the module-level result lists.

    For every property exactly one value is appended to its list, so all lists
    stay the same length: the value found in ``entry``, or the string 'None'
    when any key/index on the way to it is missing.

    Argument: entry - one element of the 'data' list of an API response
    Value: tuple of the (mutated) module-level lists, in the order
           bandgaps, reduced_formulas, pce, device_stack, htl, etl, da, ill_int
    """
    solar_cell = ('results', 'properties', 'optoelectronic', 'solar_cell')
    # (target list, path of keys / list indices leading to the value)
    lookups = (
        (bandgaps, ('results', 'properties', 'electronic',
                    'band_structure_electronic', 0, 'band_gap', 0, 'value')),
        (reduced_formulas, ('results', 'material', 'chemical_formula_reduced')),
        (pce, solar_cell + ('efficiency',)),
        (device_stack, solar_cell + ('device_stack',)),
        (htl, solar_cell + ('hole_transport_layer',)),
        (etl, solar_cell + ('electron_transport_layer',)),
        (da, solar_cell + ('device_area',)),
        (ill_int, solar_cell + ('illumination_intensity',)),
    )

    for target, path in lookups:
        value = entry
        try:
            for step in path:
                value = value[step]
        except (KeyError, IndexError, TypeError):
            # Only "value not present" is tolerated (missing key, empty list,
            # None where a container was expected); anything else propagates.
            value = 'None'
        target.append(value)

    return bandgaps, reduced_formulas, pce, device_stack, htl, etl, da, ill_int

# Result lists that extract_values() appends to. They are re-created here so that
# re-running this cell starts from scratch instead of duplicating entries.
bandgaps =[]
pce = []
reduced_formulas = []
htl = []
etl = []
device_stack = []
da = []
ill_int = []

page_after_value = None
count = 0

# Number of entries requested per page and seconds to wait for the server per request.
PAGE_SIZE = 1000
REQUEST_TIMEOUT = 120

while True:
    count = count + 1
    response = requests.post(f'{base_url}entries/query', json={
        "owner": "visible",
        "aggregations": {},
        "query": {
            "and": [
                {"sections:all": ["nomad.datamodel.results.SolarCell"]},
                ]},
        "required": {
            "results":{
                "material": {
                    "chemical_formula_reduced":"*",
                    "structural_type":"*"},
                "properties": {
                   "optoelectronic":{
                      "band_gap":"*",
                      "solar_cell":{
                          "open_circuit_voltage":"*",
                          "short_circuit_current_density":"*",
                          "fill_factor":"*",
                          "efficiency":"*",
                          }}},},
        },
        "pagination": {"page_size": PAGE_SIZE,
                       "page_after_value": page_after_value}
        }, timeout=REQUEST_TIMEOUT)
    # Stop with the HTTP status of a failed request instead of an obscure
    # JSON/KeyError further down.
    response.raise_for_status()
    data = response.json()

    if not data['data']:
        break

    # The number of pages is derived from the response rather than hard-coded,
    # because the database keeps growing.
    # NOTE(review): 'total' is presumed to be part of the pagination block -
    # without it only the running page number is printed.
    total_entries = data['pagination'].get('total')
    if total_entries:
        print(f"Progress: {count}/{-(-total_entries // PAGE_SIZE)}")
    else:
        print(f"Progress: {count}")

    # Every page, including the last one, is processed the same way.
    for entry in data['data']:
        if 'results' in entry:
            extract_values(entry)

    # A missing (or empty) cursor marks the last page.
    page_after_value = data['pagination'].get('next_page_after_value')
    if page_after_value is None:
        break

df = pd.DataFrame({
    'reduced_formulas': reduced_formulas,
    'bandgap': bandgaps,
    'pce': pce,
    'device_stack': device_stack,
    'htl': htl,
    'etl': etl,
    'ill_int': ill_int,
    'device_area': da,
    })

df.to_csv('data_RO2/df_pce_prediction.csv', index=False)

Progress: 1/43
Progress: 2/43
Progress: 3/43
Progress: 4/43
Progress: 5/43
Progress: 6/43
Progress: 7/43
Progress: 8/43
Progress: 9/43
Progress: 10/43
Progress: 11/43
Progress: 12/43
Progress: 13/43
Progress: 14/43
Progress: 15/43
Progress: 16/43
Progress: 17/43
Progress: 18/43
Progress: 19/43
Progress: 20/43
Progress: 21/43
Progress: 22/43
Progress: 23/43
Progress: 24/43
Progress: 25/43
Progress: 26/43
Progress: 27/43
Progress: 28/43
Progress: 29/43
Progress: 30/43
Progress: 31/43
Progress: 32/43
Progress: 33/43
Progress: 34/43
Progress: 35/43
Progress: 36/43
Progress: 37/43
Progress: 38/43
Progress: 39/43
Progress: 40/43
Progress: 41/43
Progress: 42/43
Progress: 43/43
Progress: 44/43


# Data preparation

In [8]:
# Thresholds and conversion factors used in this cell
STANDARD_ILLUMINATION = 1000      # W/m^2, see the message printed for this filter
MAX_DEVICE_AREA = 0.000025        # corresponds to the 25 mm^2 named in the printed message
MIN_PCE = 2                       # cells at or below this PCE are discarded
# Number of eV per joule. NOTE(review): presumably the API reports band gaps in joules - confirm.
JOULE_TO_EV = 6.24150974e18


def _parse_ctl_list(raw):
    """
    Turn the CSV text of an etl/htl cell into a flat list of layer names.
    Names that contain several materials separated by ';' are split into
    separate list elements.
    """
    layers = ast.literal_eval(raw)
    return [part for layer in layers for part in layer.split(";")]


# load data
df = pd.read_csv('data_RO2/df_pce_prediction.csv')
print(f"Raw data length: {len(df)}.")

# NOTE(review): pickle.load can execute arbitrary code - only load pickles created by the RO1 notebooks.
with open('data_RO2/df_all_ctls_identified_v2.pkl', 'rb') as f:
    df_id = pickle.load(f)
print(f"Length of CTL identification info: {len(df_id)}.")

# The identification flags are attached row by row, so both tables have to stem from
# the same download. A different length is a sign that they are out of sync.
if len(df) != len(df_id):
    print("WARNING: raw data and CTL identification info differ in length; "
          "'both_identified' may end up on the wrong rows.")

# Only keep fully identified cells
df['both_identified'] = df_id['both_identified']
df_all_identified = df[df['both_identified'] == True]
print(f"After keeping only fully identified CTLs: {len(df_all_identified)}.")

# drop cells without a device stack information
# The download cell stores a missing stack as the text "None"; depending on how
# read_csv treats that text it arrives here either as "None" or as NaN, so both are removed.
stack_text = df_all_identified['device_stack']
df_all_identified = df_all_identified[stack_text.notna() & (stack_text != "None")]
print(f"After dropping NAs in device stack: {len(df_all_identified)}.")

# drop cells without reduced_formulas
df_all_identified = df_all_identified[df_all_identified['reduced_formulas'] != "None"]
print(f"After dropping None strings in reduced_formulas: {len(df_all_identified)}.")

df_all_identified = df_all_identified.dropna(subset=['reduced_formulas'])
print(f"After dropping NAs in reduced_formulas: {len(df_all_identified)}.")

# Column updates below use .assign(), which returns a new frame, so nothing is
# written into a filtered slice of df (no SettingWithCopyWarning).

# eliminate cells that were measured at an illumination intensity other than 1000 W/m^2
df_all_identified = df_all_identified.assign(
    ill_int=pd.to_numeric(df_all_identified['ill_int'], errors='coerce'))
df_all_identified = df_all_identified[df_all_identified['ill_int'] == STANDARD_ILLUMINATION]
print(f"After dropping cells that were measured at illumination intensity other than 1000 W/m^2: {len(df_all_identified)}.")

# eliminate rows with large device areas
df_all_identified = df_all_identified.assign(
    device_area=pd.to_numeric(df_all_identified['device_area'], errors='coerce'))
df_all_identified = df_all_identified[df_all_identified['device_area'] <= MAX_DEVICE_AREA]
print(f"After dropping cells with device area greater than 25 mm^2: {len(df_all_identified)}.")


# drop everything that is not further needed
df_all_identified = df_all_identified.drop(columns=['both_identified',
                                                    'ill_int',
                                                    'device_area',
                                                   ])

# transform etl and htl to lists and split entries separated with semicolons
# into proper list elements
df_all_identified = df_all_identified.assign(
    etl=df_all_identified['etl'].apply(_parse_ctl_list),
    htl=df_all_identified['htl'].apply(_parse_ctl_list),
)

# transform pce to numeric
df_all_identified = df_all_identified.assign(
    pce=pd.to_numeric(df_all_identified['pce'], errors='coerce'))
df_all_identified = df_all_identified.dropna(subset=['pce'])
print(f"After dropping cells without pce information: {len(df_all_identified)}.")

# drop cells with very low PCE
df_all_identified = df_all_identified[df_all_identified['pce'] > MIN_PCE]
print(f"After dropping cells with pce lower than 2: {len(df_all_identified)}.")

# transform bandgaps to numbers; entries that cannot be parsed become NaN
bandgap_raw = pd.to_numeric(df_all_identified['bandgap'], errors='coerce')

# check how many bandgaps are none
print(f"How many nones in bandgap: {int(bandgap_raw.isna().sum())}")

# missing bandgaps are encoded as 0, the rest is rescaled with JOULE_TO_EV
df_all_identified = df_all_identified.assign(bandgap=bandgap_raw.fillna(0) * JOULE_TO_EV)

# Lists cannot be used as group keys, so each CTL list is joined into one string;
# the bandgap is rounded so that values differing only by float noise share a group.
df_all_identified = df_all_identified.assign(
    etl_key=df_all_identified['etl'].apply(';'.join),
    htl_key=df_all_identified['htl'].apply(';'.join),
    bandgap_key=df_all_identified['bandgap'].round(decimals=4),
)

# one row per (absorber, ETL, HTL, bandgap) combination, averaging the PCE of its cells
df_all_identified = df_all_identified.groupby(['reduced_formulas', 'etl_key', 'htl_key', 'bandgap_key']).agg({
                       'reduced_formulas': 'first',
                       'etl': 'first',
                       'htl': 'first',
                       'bandgap': 'mean',
                       'device_stack': 'first',
                       'pce': 'mean',
                      }).reset_index(drop=True)

df_all_identified

Raw data length: 43108.
Length of CTL identification info: 43108.
After keeping only fully identified CTLs: 37713.
After dropping NAs in device stack: 37713.
After dropping None strings in reduced_formulas: 37160.
After dropping NAs in reduced_formulas: 37160.
After dropping cells that were measured at illumination intensity other than 1000 W/m^2: 36741.
After dropping cells with device area greater than 25 mm^2: 34103.
After dropping cells without pce information: 33382.
After dropping cells with pce lower than 2: 31635.
How many nones in bandgap: 7423


Unnamed: 0,reduced_formulas,etl,htl,bandgap,device_stack,pce
0,Ag20Bi20CsI60,"[TiO2-c, TiO2-mp]",[P3HT],1.86,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",3.530000
1,Ag20Bi20CsI60,"[TiO2-c, TiO2-mp]",[PTB7-th],1.86,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",3.530000
2,Ag2BiI5,"[TiO2-c, TiO2-mp]",[PTAA],2.22,"['SLG', 'ITO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",2.600000
3,Ag3BiI6,"[TiO2-c, TiO2-mp]",[P3HT],1.80,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",2.320000
4,Ag3BiI6,"[TiO2-c, TiO2-mp]",[PTAA],0.00,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",4.300000
...,...,...,...,...,...,...
5444,CsI3Sn,[TiO2-c],[PTAA],0.00,"['SLG', 'FTO', 'TiO2-c', 'Perovskite', 'PTAA',...",3.866667
5445,CsI3Sn,"[TiO2-c, TiO2-mp]",[PTAA],0.00,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",3.790000
5446,CsI3Sn,"[TiO2-c, TiO2-mp]",[PTAA],1.30,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",3.655000
5447,CsI3Sn,"[TiO2-c, TiO2-mp]",[Spiro-MeOTAD],0.00,"['SLG', 'FTO', 'TiO2-c', 'TiO2-mp', 'Perovskit...",2.230000


# Prepare SMILES dictionary

The following code:
- transforms the CID dictionary into a SMILES dictionary (or loads the SMILES dictionary if it already exists)
- adds the SMILES from the dictionary to the dataframe
- removes all rows where the ETLs or HTLs are not completely identified

In [16]:
# Transform the dictionary entries to SMILES

def CID_to_SMILES(CID):
    '''
    This asks PubChem for the SMILES string that belongs to a compound ID.
    Argument: CID - the PubChem compound ID of the material
    Value: SMILES (str) - the SMILES string found in the response
    Raises: RuntimeError if the request is not answered with status 200 or
            the response contains no SMILES property
    '''
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{CID}/property/CanonicalSMILES/JSON"

    # a timeout keeps one unanswered request from blocking the notebook forever
    response = requests.get(url, timeout=60)

    if response.status_code != 200:
        raise RuntimeError(f"Error: Could not retrieve SMILES from this CID. Status code: {response.status_code}")

    properties = response.json()['PropertyTable']['Properties'][0]
    # NOTE(review): PubChem has reportedly renamed this property, answering
    # CanonicalSMILES requests under the key 'ConnectivitySMILES' - confirm
    # against the current PUG REST documentation. Both keys are accepted here.
    for key in ('CanonicalSMILES', 'ConnectivitySMILES'):
        if key in properties:
            return properties[key]
    raise RuntimeError(f"Error: Could not retrieve SMILES from this CID. No SMILES property in response for CID {CID}.")


try:
    # Re-use the cached dictionary if it exists.
    # NOTE(review): pickle.load can execute arbitrary code - only load files created by this project.
    with open('data_RO2/SMILES_dictionary.pkl', 'rb') as f:
        SMILES_dict = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # No readable cache: build the dictionary from the CID dictionary.
    with open('data_RO2/dict_all_cells_v2.pkl', 'rb') as f:
        CID_dict = pickle.load(f)

    #initialize new dictionary where the SMILES will be written
    SMILES_dict = {}

    # populate the SMILES dictionary; names without a CID keep None as their entry
    for position, (name, CID) in enumerate(CID_dict.items(), start=1):
        print(f"{position}/{len(CID_dict)}. Next up: {name}")
        SMILES_dict[name] = CID_to_SMILES(CID) if CID is not None else None

    with open('data_RO2/SMILES_dictionary.pkl', 'wb') as f:
        pickle.dump(SMILES_dict, f)


# Write SMILES into the dataframe
df_all_identified['pce'] = df_all_identified['pce'].astype(float)

# the SMILES_dict contains no entry for "no ctl" yet
SMILES_dict['none'] = "no_ctl"


def _names_to_smiles(layer_names):
    """Look up every CTL name of one cell in SMILES_dict (KeyError for unknown names)."""
    return [SMILES_dict[layer_name] for layer_name in layer_names]


# The lists are stored directly. This also creates both columns when the frame
# is empty, and no detour through str() / ast.literal_eval is needed.
df_all_identified['etl_SMILES'] = df_all_identified['etl'].apply(_names_to_smiles)
df_all_identified['htl_SMILES'] = df_all_identified['htl'].apply(_names_to_smiles)

# drop rows where the etl_SMILES or the htl_SMILES contain None
def has_all_non_none_elements(data_list):
    """Tell whether data_list is free of None entries (an empty list counts as free)."""
    for item in data_list:
        if item is None:
            return False
    return True

# Check: keep only the rows whose etl_SMILES and htl_SMILES lists contain no None
rows_before = len(df_all_identified)
print(f"Length before filtering: {rows_before}")

etl_complete = df_all_identified['etl_SMILES'].apply(has_all_non_none_elements)
htl_complete = df_all_identified['htl_SMILES'].apply(has_all_non_none_elements)
df_all_identified = df_all_identified[etl_complete & htl_complete]

rows_after = len(df_all_identified)
print(f"Length after filtering out elements where etl or htl smiles is None: {rows_after}")
print(f"The two above numbers should be the same. Otherwise something in the name to SMILES conversion did not work.")

Length before filtering: 5449
Length after filtering out elements where etl or htl smiles is None: 5449
The two above numbers should be the same. Otherwise something in the name to SMILES conversion did not work.


# Save

In [18]:
# Write the prepared table for the ML tasks; the dataframe index is not written.
# NOTE(review): the list-valued columns (etl, htl, *_SMILES) presumably end up as text in the CSV
# and need to be parsed again when the file is read, e.g. with ast.literal_eval.
df_all_identified.to_csv('data_RO2/df_ml_ready.csv', index=False)