In [1]:
import pymatgen.core as mg
from pymatgen.io import cif
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty
import glob
import pandas as pd
import numpy as np

# Read the filtered entries CSV produced by the previous step

In [2]:
# Load the filtered entries written out by the preceding filter step.
filtered_entries_csv = '../2_filter/filtered_entries_for_featurization.csv'
data = pd.read_csv(filtered_entries_csv)

# For each entry, parse the formula and take its first two elements in order of increasing stoichiometric coefficient

In [3]:
# Split every formula into its elements, ordered by stoichiometric coefficient
# (smallest first). 'Element 1' is the first element in that ordering and
# 'Element 2' the second.
# Alternative ordering that could be used instead: electronegativity (e.X).
element_1_list = []
element_2_list = []

# Only the 'Formula' column is needed, so iterate over it directly rather than
# materialising every row with iterrows().
for formula in data['Formula']:
    mg_composition = mg.Composition(formula)

    # sorted() is guaranteed stable, so elements with equal coefficients keep
    # the order in which the composition lists them. np.argsort with its
    # default sort kind gives no such guarantee, which left the
    # Element 1 / Element 2 assignment for ties implementation-defined.
    sorted_elements = sorted(
        mg_composition.elements,
        key=lambda element: mg_composition[element],
    )

    # An IndexError on the next line means the formula has fewer than two elements.
    e1, e2 = str(sorted_elements[0]), str(sorted_elements[1])

    element_1_list.append(e1)
    element_2_list.append(e2)

data['Element 1'] = element_1_list
data['Element 2'] = element_2_list

# perform featurization on each element separately

In [4]:
# One "magpie"-preset element-property featurizer per element column.
e1_feat = ElementProperty.from_preset("magpie")
e2_feat = ElementProperty.from_preset("magpie")

# Convert the element strings of each column with StrToComposition; the
# result is read back below through the "composition" column.
e1_comps = StrToComposition().featurize_dataframe(data, col_id='Element 1')
e2_comps = StrToComposition().featurize_dataframe(data, col_id='Element 2')

# Compute the element-property features for each element separately.
e1_features = e1_feat.featurize_dataframe(e1_comps, "composition")
e2_features = e2_feat.featurize_dataframe(e2_comps, "composition")



StrToComposition:   0%|          | 0/8758 [00:00<?, ?it/s]

StrToComposition:   0%|          | 0/8758 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/8758 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/8758 [00:00<?, ?it/s]

In [5]:
# Show the 'Element 1' feature table as the cell output (rendered truncated).
e1_features

Unnamed: 0,Collection_Code,Formula,Prototype,Element 1,Element 2,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,159910,TiO2,Anatase#TiO2,Ti,O,(Ti),22.0,22.0,0.0,22.0,...,0.0,0.000023,0.0,0.000023,194.0,194.0,0.0,194.0,0.0,194.0
1,180903,SiO2,Cristobalite#SiO2,Si,O,(Si),14.0,14.0,0.0,14.0,...,0.0,0.000000,0.0,0.000000,227.0,227.0,0.0,227.0,0.0,227.0
2,246888,MnO2,Rutile#TiO2,Mn,O,(Mn),25.0,25.0,0.0,25.0,...,0.0,0.000310,0.0,0.000310,217.0,217.0,0.0,217.0,0.0,217.0
3,108587,MgZn2,Laves(2H)#MgZn2,Mg,Zn,(Mg),12.0,12.0,0.0,12.0,...,0.0,0.000000,0.0,0.000000,194.0,194.0,0.0,194.0,0.0,194.0
4,89278,SiO2,Quartz(low)#SiO2,Si,O,(Si),14.0,14.0,0.0,14.0,...,0.0,0.000000,0.0,0.000000,227.0,227.0,0.0,227.0,0.0,227.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8753,56165,Ag2F,Ag2F,F,Ag,(F),9.0,9.0,0.0,9.0,...,0.0,0.000000,0.0,0.000000,15.0,15.0,0.0,15.0,0.0,15.0
8754,88619,SbO2,HgMoO4,Sb,O,(Sb),51.0,51.0,0.0,51.0,...,0.0,0.000000,0.0,0.000000,166.0,166.0,0.0,166.0,0.0,166.0
8755,638612,HfMo2,Laves(cub)#MgCu2,Hf,Mo,(Hf),72.0,72.0,0.0,72.0,...,0.0,0.000000,0.0,0.000000,194.0,194.0,0.0,194.0,0.0,194.0
8756,99714,MoO2,Rutile#TiO2,Mo,O,(Mo),42.0,42.0,0.0,42.0,...,0.0,0.000000,0.0,0.000000,229.0,229.0,0.0,229.0,0.0,229.0
