In [12]:
import shutil

import pandas as pd
from monty.json import MontyDecoder
from pymatviz import spacegroup_sunburst
from tqdm import tqdm

from aviary import ROOT
from aviary.wren.utils import get_aflow_label_aflow, get_aflow_label_spglib
from examples.mat_bench import DATA_PATHS, MODULE_DIR

# Notebook provenance metadata (author and creation date).
__author__ = "Janosh Riebesell"
__date__ = "2022-05-17"


In [14]:
# Load the matbench perovskites data, index it by matbench ID and rename the
# existing "wyckoff" column so its origin (spglib) is explicit next to the
# aflow-derived labels used further down.
decode = MontyDecoder().decode
df_perovskites = (
    pd.read_json(DATA_PATHS["matbench_perovskites"])
    .set_index("mbid")
    .rename(columns={"wyckoff": "spglib_wyckoff"})
)
# Replace the serialized structures with their decoded objects.
decoded_structures = [decode(serialized) for serialized in df_perovskites.structure]
df_perovskites["structure"] = decoded_structures


In [24]:
# Resolve the aflow binary from PATH so the cell is not tied to one machine;
# fall back to the location used for the original run.
aflow_executable = shutil.which("aflow") or "/Users/janosh/bin/aflow"

# On a fresh kernel the label column may not exist yet. Create it empty so the
# resume check below works instead of raising AttributeError.
if "aflow_wyckoff" not in df_perovskites:
    df_perovskites["aflow_wyckoff"] = None

# takes ~6h when running uninterrupted (the recorded run shown below took ~18h
# wall-clock). Only structures still missing a label are processed, so an
# interrupted run can be resumed by re-executing this cell.
for idx, struct in tqdm(df_perovskites.structure.items(), total=len(df_perovskites)):
    if pd.isna(df_perovskites.at[idx, "aflow_wyckoff"]):
        df_perovskites.at[idx, "aflow_wyckoff"] = get_aflow_label_aflow(
            struct, aflow_executable
        )


100%|██████████| 18928/18928 [18:04:55<00:00,  3.44s/it]     


In [None]:
# Timing reference only: the spglib-based labels are computed and discarded.
# takes ~30 sec
for struct in tqdm(df_perovskites.structure, total=len(df_perovskites)):
    get_aflow_label_spglib(struct)


In [25]:
# Rows (with no missing values) where the spglib and aflow labels disagree.
# The loading cell renames "wyckoff" to "spglib_wyckoff", so the query has to
# use the renamed column to work on a fresh kernel.
df_perovskites.dropna().query("spglib_wyckoff != aflow_wyckoff")


Unnamed: 0_level_0,structure,e_form,composition,wyckoff,aflow_wyckoff
mbid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mb-perovskites-00003,"[[0.60790913 0. 0. ] Re, [2.186...",1.48,Re1 As1 O2 F1,ABC2D_oP5_25_a_b_ac_d:As-F-O-Re,ABC2D_oP5_25_a_c_ab_d:As-F-O-Re
mb-perovskites-00005,"[[0.00518937 0. 0. ] Bi, [2.172...",0.62,Hf1 Bi1 O2 F1,ABCD2_tP5_123_c_a_b_e:Bi-F-Hf-O,ABCD2_oP5_25_d_c_a_ab:Bi-F-Hf-O
mb-perovskites-00006,"[[0.00247188 0. 0. ] Y, [1.9109...",1.48,Li1 Y1 N1 O1 F1,ABCDE_oP5_47_b_a_c_e_h:F-Li-N-O-Y,ABCDE_oP5_25_c_a_b_a_d:F-Li-N-O-Y
mb-perovskites-00007,"[[0.00124498 0. 0. ] Be, [2.053...",1.56,Be1 W1 O2 F1,ABC2D_tP5_123_c_a_e_b:Be-F-O-W,ABC2D_oP5_25_d_c_ab_a:Be-F-O-W
mb-perovskites-00008,"[[0.01070352 0. 0. ] Sc, [1.929...",1.20,Sc1 As1 N1 O2,ABC2D_tP5_123_b_a_e_c:As-N-O-Sc,ABC2D_oP5_25_a_c_ab_d:As-N-O-Sc
...,...,...,...,...,...
mb-perovskites-18919,"[[0.00799946 0. 0. ] Pd, [1.854...",1.04,Si1 Pd1 O2 F1,AB2CD_tP5_123_a_e_c_b:F-O-Pd-Si,AB2CD_oP5_25_c_ab_d_a:F-O-Pd-Si
mb-perovskites-18920,[[3.41465312e-03 0.00000000e+00 7.00937721e-20...,1.52,Rb1 Cu1 N1 O2,ABC2D_tP5_123_b_a_e_c:Cu-N-O-Rb,ABC2D_oP5_25_a_c_ab_d:Cu-N-O-Rb
mb-perovskites-18922,"[[0.02606092 0. 0. ] Tl, [2.195...",0.88,Tl1 Ir1 O2 F1,ABC2D_oP5_25_b_a_ac_d:F-Ir-O-Tl,ABC2D_oP5_25_c_a_ab_d:F-Ir-O-Tl
mb-perovskites-18923,"[[0.00376942 0. 0. ] Si, [2.291...",1.78,Si1 Bi1 O2 F1,ABC2D_tP5_123_b_a_e_c:Bi-F-O-Si,ABC2D_oP5_25_a_c_ab_d:Bi-F-O-Si


In [26]:
# Share of fully populated rows whose spglib and aflow labels differ.
# Uses the renamed "spglib_wyckoff" column (the loading cell renames "wyckoff")
# and drops incomplete rows only once.
df_labeled = df_perovskites.dropna()
n_mismatch = len(df_labeled.query("spglib_wyckoff != aflow_wyckoff"))
print(
    "Percentage of materials with spglib label != aflow label: "
    f"{n_mismatch / len(df_labeled):.0%}"
)


Percentage of materials with spglib label != aflow label: 52%


In [16]:
# Reload the cached labels from disk instead of recomputing them. Only the two
# label columns are kept, indexed by matbench ID.
# The cache was written with:
# df_perovskites.drop('structure', axis=1).to_csv(f"{ROOT}/datasets/matbench_perovskites_aflow_labels.csv")
aflow_labels_csv = f"{ROOT}/datasets/matbench_perovskites_aflow_labels.csv"
label_columns = ["aflow_wyckoff", "spglib_wyckoff"]
df_perovskites = pd.read_csv(aflow_labels_csv).set_index("mbid")[label_columns]


In [17]:
# Labels look like "ABC2D_oP5_25_a_b_ac_d:As-F-O-Re". The third
# underscore-separated field is stored as an integer in a "<source>_spg" column.
for label_source in ("aflow", "spglib"):
    wyckoff_labels = df_perovskites[f"{label_source}_wyckoff"]
    spg_numbers = wyckoff_labels.str.split("_").str[2].astype(int)
    df_perovskites[f"{label_source}_spg"] = spg_numbers


In [20]:
# Space group distribution according to the spglib-derived labels.
# (`spacegroup_sunburst` is imported in the notebook's top import cell.)
fig = spacegroup_sunburst(df_perovskites.spglib_spg)
fig.update_layout(title=dict(text="Spglib Spacegroups", x=0.5, y=0.93))
# Optional export; the file name matches the label source plotted here.
# fig.write_image(f"{MODULE_DIR}/plots/matbench_perovskites_spglib_sunburst.png", scale=2)


In [24]:
# Space group distribution according to the aflow-derived labels.
# The title is set once via update_layout, as in the spglib plot; a separate
# `title` keyword on the constructor would be overridden here anyway.
fig = spacegroup_sunburst(df_perovskites.aflow_spg)
fig.update_layout(title=dict(text="Aflow Spacegroups", x=0.5, y=0.85))
# Optional export; the file name matches the label source plotted here.
# fig.write_image(f"{MODULE_DIR}/plots/matbench_perovskites_aflow_sunburst.png", scale=2)
