In [1]:
# Load libraries.
import pandas as pd
import numpy as np
import time
import pickle
from matminer.featurizers.site import CrystalNNFingerprint  
from matminer.featurizers.structure import SiteStatsFingerprint
from KmdPlus import StatsDescriptor, formula_to_composition 
from pymatgen.core.composition import Composition
import matplotlib.pyplot as plt
from collections import Counter
# For parallel calculation.
import joblib

# Load the two pickled inputs used throughout the notebook.
# NOTE(review): pd.read_pickle unpickles the file contents, which can execute
# arbitrary code - only load these files from a trusted source.
MP_data = pd.read_pickle("data_set/paper_used_mp_data_20211107.pd.xz") # All crystal data from Materials Project.
test_data = pd.read_pickle("data_set/all_searching_targets_20211107_with_predictions.pd.xz") # Preselected crystal data for testing.

In [2]:
# Exclude every formula that appears in the test data from the MP data,
# so that no test formula leaks into the stable set.
MP_data_left = MP_data[~MP_data.pretty_formula.isin(test_data.pretty_formula)]

# Keep only the stable entries (energy above hull exactly zero).
MP_stable = MP_data_left[MP_data_left.e_above_hull.values == 0]

# Formulas that occur more than once among the stable entries (most frequent first).
formula_counts = Counter(MP_stable.pretty_formula)
overlapping_formulas = np.array(
    [formula for formula, n_entries in formula_counts.most_common() if n_entries > 1]
)

# For each formula keep only the entry with the lowest final_energy_per_atom.
# A single stable sort followed by `duplicated` marks every higher-energy entry
# of a formula in one pass; the stable sort makes ties deterministic (the entry
# that comes first in MP_stable wins). This also works when nothing is
# duplicated, where np.concatenate([]) would raise a ValueError.
ranked_by_energy = MP_stable.sort_values("final_energy_per_atom", kind="stable")
is_higher_energy_duplicate = ranked_by_energy.duplicated(subset="pretty_formula", keep="first")
excl_ids = ranked_by_energy.index[is_higher_energy_duplicate]

# Drop the excluded ids while preserving the original row order.
MP_stable = MP_stable[~MP_stable.index.isin(excl_ids)]

# Sanity check: every remaining formula is unique.
assert MP_stable.pretty_formula.is_unique, "duplicate formulas remain in MP_stable"

MP_stable

Unnamed: 0_level_0,full_formula,composition,composition_ratio,total_atoms,elements,n_elements,space_group_num,space_group,wy_cfg,wy_reformat,...,efermi,final_energy_per_atom,formation_energy_per_atom,has_bandstructure,is_ordered,oxide_type,point_group,pretty_formula,total_magnetization,volume
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mp-1006278,Ac1Eu1Au2,"{'Ac': 1.0, 'Eu': 1.0, 'Au': 2.0}","(1.0, 1.0, 2.0)",4.0,"(Ac, Au, Eu)",3,225,Fm-3m,"{'Ac': {'b': 4}, 'Eu': {'a': 4}, 'Au': {'c': 8}}","{'Ac': ('b',), 'Eu': ('a',), 'Au': ('c',)}",...,4.883417,-6.019130,-0.776843,True,True,,m-3m,AcEuAu2,1.627705,117.080578
mp-1017985,Ti2Ag2,"{'Ti': 2.0, 'Ag': 2.0}","(2.0, 2.0)",4.0,"(Ag, Ti)",2,129,P4/nmm,"{'Ti': {'c': 2}, 'Ag': {'c': 2}}","{'Ti': ('c',), 'Ag': ('c',)}",...,4.709549,-5.429487,-0.065696,True,True,,4/mmm,TiAg,0.000432,70.460966
mp-1018128,Sc1Ag2,"{'Sc': 1.0, 'Ag': 2.0}","(1.0, 2.0)",3.0,"(Ag, Sc)",2,139,I4/mmm,"{'Sc': {'a': 2}, 'Ag': {'e': 4}}","{'Sc': ('a',), 'Ag': ('e',)}",...,3.934398,-4.301338,-0.302162,True,True,,4/mmm,ScAg2,0.003584,57.497334
mp-1018131,Lu1Ag2,"{'Lu': 1.0, 'Ag': 2.0}","(1.0, 2.0)",3.0,"(Ag, Lu)",2,139,I4/mmm,"{'Lu': {'a': 2}, 'Ag': {'e': 4}}","{'Lu': ('a',), 'Ag': ('e',)}",...,3.456485,-3.736455,-0.341119,True,True,,4/mmm,LuAg2,0.004292,62.417938
mp-1025059,La2Ag4,"{'La': 2.0, 'Ag': 4.0}","(2.0, 4.0)",6.0,"(Ag, La)",2,74,Imma,"{'La': {'e': 4}, 'Ag': {'h': 8}}","{'La': ('e',), 'Ag': ('h',)}",...,5.563882,-3.832468,-0.298780,True,True,,mmm,LaAg2,0.000054,150.182757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mp-945077,Y2Te6,"{'Y': 2.0, 'Te': 6.0}","(2.0, 6.0)",8.0,"(Te, Y)",2,63,Cmcm,"{'Y': {'c': 4}, 'Te': {'c': 12}}","{'Y': ('c',), 'Te': ('c', 'c', 'c')}",...,5.933911,-4.907222,-0.933227,True,True,,mmm,YTe3,0.000283,245.997038
mp-972256,Xe1,{'Xe': 1.0},"(1.0,)",1.0,"(Xe,)",1,166,R-3m,{'Xe': {'a': 1}},"{'Xe': ('a',)}",...,-6.965177,-0.036174,0.000000,True,True,,-3m,Xe,0.000000,85.786507
mp-972364,Yb3,{'Yb': 3.0},"(3.0,)",3.0,"(Yb,)",1,166,R-3m,"{'Yb': {'a': 1, 'c': 2}}","{'Yb': ('a', 'c')}",...,1.419946,-1.539595,0.000000,True,True,,-3m,Yb,0.000007,123.042457
mp-977585,Zr3Tl1,"{'Zr': 3.0, 'Tl': 1.0}","(1.0, 3.0)",4.0,"(Tl, Zr)",2,221,Pm-3m,"{'Zr': {'c': 3}, 'Tl': {'a': 1}}","{'Zr': ('c',), 'Tl': ('a',)}",...,5.632566,-7.113048,-0.111859,True,True,,m-3m,Zr3Tl,0.002025,90.588661


In [3]:
# Calculate the local order parameter fingerprints for all stable structures (DOI: 10.3389/fmats.2017.00034.).
# `structures` is the array of values in the `structure` column of MP_stable;
# it is what the fingerprint loop below iterates over.
structures = MP_stable.structure.values

# Site featurizer.
# Built from matminer's 'ops' preset; distance_cutoffs=None and x_diff_weight=0
# are passed through as keyword arguments.
# NOTE(review): presumably these disable the distance-cutoff and
# electronegativity-difference weighting of the neighbor finding - confirm
# against the matminer/pymatgen documentation.
cnnf = CrystalNNFingerprint.from_preset('ops', distance_cutoffs=None, x_diff_weight=0)

def parallel_cnnf(featurizer, str_x, n_jobs=-1):
    """Featurize every site of one structure in parallel.

    Parameters
    ----------
    featurizer : callable
        Called as ``featurizer(str_x, site_index)`` once per site.
    str_x : object
        Structure exposing a ``sites`` sequence; its length sets the number
        of calls.
    n_jobs : int, default -1
        Forwarded to ``joblib.Parallel``. The default ``-1`` keeps the
        previous behavior (joblib uses all available CPUs).

    Returns
    -------
    numpy.ndarray
        The per-site results collected into one array, in site order.
    """
    n_sites = len(str_x.sites)
    site_tasks = (joblib.delayed(featurizer)(str_x, site_idx) for site_idx in range(n_sites))
    return np.array(joblib.Parallel(n_jobs=n_jobs)(site_tasks))

# SiteStats.
def SiteStats(site_fgps):
    """Collapse per-site fingerprints into one flat vector of statistics.

    The mean, standard deviation, minimum and maximum are taken along axis 0
    of ``site_fgps``. The output interleaves them per feature:
    ``[mean_0, std_0, min_0, max_0, mean_1, std_1, ...]``.
    """
    feature_mean = site_fgps.mean(axis=0)
    feature_std = site_fgps.std(axis=0)
    feature_min = site_fgps.min(axis=0)
    feature_max = site_fgps.max(axis=0)

    # Rows are the four statistics; reading the stack in column-major order
    # yields the four statistics of feature 0, then of feature 1, and so on.
    stats_by_row = np.stack([feature_mean, feature_std, feature_min, feature_max])
    return stats_by_row.flatten(order="F")

In [4]:
# Calculate structure fingerprints for all stable data.
n_iter = len(structures)

strfgp_stable = []  # one fingerprint vector per structure (None while a failure is pending)
errors_i = []       # positions in `structures` whose featurization failed

s = time.time()

for i in range(n_iter):
    str_x = structures[i] # ith str.

    try:
        # Per-site fingerprints reduced to one fixed-length vector for the ith str.
        strfgp_stable.append(SiteStats(parallel_cnnf(cnnf.featurize, str_x)))

    except Exception as err:
        # Catch Exception rather than using a bare except so that a keyboard or
        # kernel interrupt can still stop this long-running loop.
        strfgp_stable.append(None)
        errors_i.append(i)
        print(f"error at {i}")
        print(f"    {type(err).__name__}: {err}")

e = time.time()
print(f"time: {e-s}")
# max(..., 1) avoids a ZeroDivisionError when there is nothing to featurize.
print(f"time per iteration: {(e-s)/max(n_iter, 1)}")

# Failed structures become NaN rows of the same length as the successful ones.
# A string placeholder would make the list ragged, and np.array would then
# raise (or silently build an object array on older NumPy versions).
n_features = next((len(fgp) for fgp in strfgp_stable if fgp is not None), 0)
strfgp_stable = [np.full(n_features, np.nan) if fgp is None else fgp for fgp in strfgp_stable]

# Save results.
strfgp_stable_array = np.array(strfgp_stable, dtype=float).reshape(n_iter, n_features)

print(strfgp_stable_array.shape)

np.save('data_set/strfgp_stable_20211107', strfgp_stable_array)

time: 4228.889889001846
time per iteration: 0.12790012971817827
(33064, 244)


In [5]:
# Composition fingerprints for the stable data: StatsDescriptor is applied to
# the formulas together with the element-level feature table (per the original
# note, five statistics of element_features).

# Element-level descriptors of shape (94, 58), indexed by the first CSV column.
element_features = pd.read_csv("data_set/element_features.csv", index_col=0)

stable_formulas = MP_stable.pretty_formula.values
cmpfgp_stable_array = StatsDescriptor(stable_formulas, element_features)

# Report the shape, then persist the array.
print(cmpfgp_stable_array.shape)

np.save('data_set/cmpfgp_stable_20211107', cmpfgp_stable_array)

(33064, 290)


In [6]:
# Calculate structure fingerprints for test data.
# NOTE: this cell rebinds `structures`, `strfgp_stable`, `errors_i` and
# `strfgp_stable_array`, which the stable-data cell also uses; from here on
# they refer to the test data. The names are kept so that any later code
# relying on them continues to work.
structures = test_data.structure.values

n_iter = len(structures)

strfgp_stable = []  # one fingerprint vector per test structure (None while a failure is pending)
errors_i = []       # positions in `structures` whose featurization failed

s = time.time()

for i in range(n_iter):
    str_x = structures[i] # ith str.

    try:
        # Per-site fingerprints reduced to one fixed-length vector for the ith str.
        strfgp_stable.append(SiteStats(parallel_cnnf(cnnf.featurize, str_x)))

    except Exception as err:
        # Catch Exception rather than using a bare except so that a keyboard or
        # kernel interrupt can still stop the loop.
        strfgp_stable.append(None)
        errors_i.append(i)
        print(f"error at {i}")
        print(f"    {type(err).__name__}: {err}")

e = time.time()
print(f"time: {e-s}")
# max(..., 1) avoids a ZeroDivisionError when there is nothing to featurize.
print(f"time per iteration: {(e-s)/max(n_iter, 1)}")

# Failed structures become NaN rows of the same length as the successful ones.
# A string placeholder would make the list ragged, and np.array would then
# raise (or silently build an object array on older NumPy versions).
n_features = next((len(fgp) for fgp in strfgp_stable if fgp is not None), 0)
strfgp_stable = [np.full(n_features, np.nan) if fgp is None else fgp for fgp in strfgp_stable]

# Save results.
strfgp_stable_array = np.array(strfgp_stable, dtype=float).reshape(n_iter, n_features)

print(strfgp_stable_array.shape)

np.save('data_set/strfgp_test_20211107', strfgp_stable_array)

time: 20.505083799362183
time per iteration: 0.22783426443735758
(90, 244)
