In [15]:
import pymatgen.core as mg
from pymatgen.core.periodic_table import Species
from pymatgen.io import cif
from matminer.featurizers.conversions import StrToComposition, CompositionToOxidComposition
from matminer.featurizers.composition import ElementProperty, IonProperty, ElectronAffinity,\
                                                BandCenter, TMetalFraction, ElectronegativityDiff
import glob
import pandas as pd
import numpy as np


def prettify_features(df, discard_terms=('minimum', 'maximum', 'avg_dev', 'mode', 'range'), prefix=''):
    """Drop redundant feature columns and normalise the remaining column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are strings.
    discard_terms : iterable of str
        A column is dropped when its name contains any of these substrings.
    prefix : str
        Text prepended to every cleaned column name.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the retained columns, in their original
        order. Each name has "MagpieData " and "mean " removed, spaces
        replaced by underscores, is lower-cased and then prefixed.
        The input frame is not modified.
    """
    # Materialise once so a generator argument can be scanned for every column.
    discard_terms = tuple(discard_terms)

    # Single pass over the columns; this also works for a frame without
    # columns, where inverting an empty (float) numpy mask would fail.
    kept = [name for name in df.columns
            if not any(term in name for term in discard_terms)]

    rename_dict = {}
    for col in kept:
        pretty = col.replace("MagpieData ", "").replace("mean ", "").replace(" ", "_").lower()
        rename_dict[col] = f'{prefix}{pretty}'

    return df[kept].rename(columns=rename_dict)

# Read the CSV produced by the previous step

In [16]:
# Load the entries written out by the filtering step (path is relative to this notebook)
df = pd.read_csv(filepath_or_buffer='../2_filter/filtered_entries_for_featurization.csv')

# Composition-based featurization

In [17]:
# Build the 'composition' column from the 'Formula' strings
df = StrToComposition().featurize_dataframe(df, 'Formula')

# Guess oxidation states (adds the 'composition_oxid' column)
df = CompositionToOxidComposition().featurize_dataframe(df, 'composition')

### Count entries in which at least one species carries an oxidation state of 0
has_0_oxi = [
    any(spec.oxi_state == 0 for spec in comp.elements)
    for comp in df.composition_oxid
]
print(f'{sum(has_0_oxi)} entries with +0 oxidation states')

# Composition-based features, all computed from the oxidation-state-decorated
# composition; the last two featurizers are told to tolerate failures.
composition_featurizers = [
    (IonProperty(), {}),
    (ElementProperty.from_preset(preset_name="magpie"), {}),
    (ElectronAffinity(), {'ignore_errors': True}),
    (BandCenter(), {'ignore_errors': True}),
]
for featurizer, options in composition_featurizers:
    df = featurizer.featurize_dataframe(df, 'composition_oxid', **options)

# Flag compounds which tend to charge-disproportionate:
# more than two distinct species in what is otherwise a two-element compound
df['chg_dispro'] = [n_species > 2 for n_species in map(len, df.composition_oxid)]
df

StrToComposition:   0%|          | 0/8758 [00:00<?, ?it/s]

CompositionToOxidComposition:   0%|          | 0/8758 [00:00<?, ?it/s]



4573 entries with +0 oxidation states


IonProperty:   0%|          | 0/8758 [00:00<?, ?it/s]



ElementProperty:   0%|          | 0/8758 [00:00<?, ?it/s]

ElectronAffinity:   0%|          | 0/8758 [00:00<?, ?it/s]

BandCenter:   0%|          | 0/8758 [00:00<?, ?it/s]

Unnamed: 0,Collection_Code,Formula,Prototype,composition,composition_oxid,compound possible,max ionic char,avg ionic char,MagpieData minimum Number,MagpieData maximum Number,...,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,avg anion electron affinity,band center,chg_dispro
0,159910,TiO2,Anatase#TiO2,"(Ti, O)","(Ti4+, O2-)",True,0.594445,0.132099,8.0,22.0,...,0.0,12.0,194.0,182.0,72.666667,80.888889,12.0,-282000.0,5.812101,False
1,180903,SiO2,Cristobalite#SiO2,"(Si, O)","(Si4+, O2-)",True,0.447278,0.099395,8.0,14.0,...,0.0,12.0,227.0,215.0,83.666667,95.555556,12.0,-282000.0,6.471920,False
2,246888,MnO2,Rutile#TiO2,"(Mn, O)","(Mn4+, O2-)",True,0.590585,0.131241,8.0,25.0,...,0.0,12.0,217.0,205.0,80.333333,91.111111,12.0,-282000.0,5.956463,False
3,108587,MgZn2,Laves(2H)#MgZn2,"(Mg, Zn)","(Mg0+, Zn0+)",True,0.028486,0.006330,12.0,30.0,...,0.0,194.0,194.0,0.0,194.000000,0.000000,194.0,,4.385703,False
4,89278,SiO2,Quartz(low)#SiO2,"(Si, O)","(Si4+, O2-)",True,0.447278,0.099395,8.0,14.0,...,0.0,12.0,227.0,215.0,83.666667,95.555556,12.0,-282000.0,6.471920,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8753,56165,Ag2F,Ag2F,"(Ag, F)","(Ag0+, F0+)",True,0.650281,0.144507,9.0,47.0,...,0.0,15.0,225.0,210.0,155.000000,93.333333,225.0,,5.898090,False
8754,88619,SbO2,HgMoO4,"(Sb, O)","(Sb0+, O0+)",True,0.383086,0.085130,8.0,51.0,...,0.0,12.0,166.0,154.0,63.333333,68.444444,12.0,,6.511768,False
8755,638612,HfMo2,Laves(cub)#MgCu2,"(Hf, Mo)","(Hf0+, Mo0+)",True,0.168813,0.037514,42.0,72.0,...,0.0,194.0,229.0,35.0,217.333333,15.555556,229.0,,3.742333,False
8756,99714,MoO2,Rutile#TiO2,"(Mo, O)","(Mo4+, O2-)",True,0.336084,0.074685,8.0,42.0,...,0.0,12.0,229.0,217.0,84.333333,96.444444,12.0,-282000.0,6.062248,False


# Element-wise featurization

In [18]:
### Separate an AB2 composition into A and B and their respective oxidation states
### Charge-disproportionated elements are assigned their average oxidation state

def separate_composition(oxid_comp):
    """Split a composition with oxidation states into per-element symbols and charges.

    Elements are ordered by ascending total stoichiometric amount (ties are
    broken alphabetically by symbol). An element that occurs with several
    oxidation states (charge disproportionation) is collapsed into a single
    entry whose charge is the amount-weighted mean of its oxidation states.

    Parameters
    ----------
    oxid_comp : mapping of species -> amount
        Anything exposing ``.items()`` that yields ``(species, amount)`` pairs,
        where each species has ``.symbol`` and ``.oxi_state`` attributes
        (e.g. an oxidation-state-decorated pymatgen ``Composition``).

    Returns
    -------
    (list, list)
        Element symbols and the matching (average) oxidation states, in the
        same order.
    """
    # Collect every (oxidation state, amount) pair under its element symbol.
    per_element = {}
    for species, amount in oxid_comp.items():
        per_element.setdefault(species.symbol, []).append((species.oxi_state, amount))

    # Order by how much of each element is present, not by how many distinct
    # oxidation states it shows up with.
    totals = {sym: sum(amt for _, amt in pairs) for sym, pairs in per_element.items()}
    symbols = sorted(per_element, key=lambda sym: (totals[sym], sym))

    oxis = []
    for sym in symbols:
        pairs = per_element[sym]
        if len(pairs) == 1:
            # Single oxidation state: pass it through untouched (no float round-off).
            oxis.append(pairs[0][0])
        else:
            # Charge disproportionation: weight each state by its amount.
            oxis.append(sum(oxi * amt for oxi, amt in pairs) / totals[sym])
    return symbols, oxis


# One (symbols, oxidation states) pair per row of df
sym_oxis = list(map(separate_composition, df.composition_oxid))
symbols = [syms for syms, _ in sym_oxis]
avg_oxis = [oxis for _, oxis in sym_oxis]

# Per-element table keyed by Collection_Code: e1_* is the first returned element, e2_* the second
elem_df = pd.DataFrame({'Collection_Code': df.Collection_Code})
for idx in (1, 2):
    elem_df[f'e{idx}_symbol'] = [entry[idx - 1] for entry in symbols]
    elem_df[f'e{idx}_avg_oxi'] = [entry[idx - 1] for entry in avg_oxis]

In [19]:
### Get element-based features

for i in (1, 2):
    symbol_col = f'e{i}_symbol'

    # Featurize the i-th element on its own
    elem_feats = StrToComposition().featurize_dataframe(elem_df[['Collection_Code', symbol_col]], symbol_col)
    elem_feats = ElementProperty.from_preset(preset_name="magpie").featurize_dataframe(elem_feats, 'composition')

    # Only the mean is kept: drop every column whose name mentions another statistic
    kept_cols = [
        name for name in elem_feats.columns
        if not any(term in name for term in ['minimum','maximum','avg_dev','mode','range'])
    ]
    elem_feats = elem_feats[kept_cols]

    # Prefix the new columns with the element slot; the merge keys keep their names
    rename_dict = {}
    for col in elem_feats.columns:
        if col == 'Collection_Code' or col[:2] == f'e{i}':
            continue
        rename_dict[col] = f'e{i}_{col}'.replace('mean ','')
    elem_feats = elem_feats.rename(columns=rename_dict)

    elem_df = elem_df.merge(elem_feats, on=['Collection_Code', symbol_col])

elem_df.columns.to_numpy()

StrToComposition:   0%|          | 0/8758 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/8758 [00:00<?, ?it/s]

StrToComposition:   0%|          | 0/8758 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/8758 [00:00<?, ?it/s]

array(['Collection_Code', 'e1_symbol', 'e1_avg_oxi', 'e2_symbol',
       'e2_avg_oxi', 'e1_composition', 'e1_MagpieData Number',
       'e1_MagpieData MendeleevNumber', 'e1_MagpieData AtomicWeight',
       'e1_MagpieData MeltingT', 'e1_MagpieData Column',
       'e1_MagpieData Row', 'e1_MagpieData CovalentRadius',
       'e1_MagpieData Electronegativity', 'e1_MagpieData NsValence',
       'e1_MagpieData NpValence', 'e1_MagpieData NdValence',
       'e1_MagpieData NfValence', 'e1_MagpieData NValence',
       'e1_MagpieData NsUnfilled', 'e1_MagpieData NpUnfilled',
       'e1_MagpieData NdUnfilled', 'e1_MagpieData NfUnfilled',
       'e1_MagpieData NUnfilled', 'e1_MagpieData GSvolume_pa',
       'e1_MagpieData GSbandgap', 'e1_MagpieData GSmagmom',
       'e1_MagpieData SpaceGroupNumber', 'e2_composition',
       'e2_MagpieData Number', 'e2_MagpieData MendeleevNumber',
       'e2_MagpieData AtomicWeight', 'e2_MagpieData MeltingT',
       'e2_MagpieData Column', 'e2_MagpieData Row',
       

In [20]:
### Merge element features into main df
# Attach the per-element columns to the composition-level table, matching rows on Collection_Code
df = pd.merge(df, elem_df, on='Collection_Code')

# Clean up and write features

In [21]:
### Clean up feature names
# Substitutions are applied in this order to every column name
name_substitutions = (
    ("MagpieData ", ""),
    ('minimum', 'min'),
    ('maximum', 'max'),
    (" ", "_"),
)
rename_dict = {}
for col in df.columns:
    cleaned = col
    for old, new in name_substitutions:
        cleaned = cleaned.replace(old, new)
    rename_dict[col] = cleaned
df = df.rename(columns=rename_dict)

### Drop features we don't want
spacegroup_cols = [col for col in df.columns if 'SpaceGroupNumber' in col]
df = df.drop(columns=['compound_possible'] + spacegroup_cols)
df.columns.to_numpy()

array(['Collection_Code', 'Formula', 'Prototype', 'composition',
       'composition_oxid', 'max_ionic_char', 'avg_ionic_char',
       'min_Number', 'max_Number', 'range_Number', 'mean_Number',
       'avg_dev_Number', 'mode_Number', 'min_MendeleevNumber',
       'max_MendeleevNumber', 'range_MendeleevNumber',
       'mean_MendeleevNumber', 'avg_dev_MendeleevNumber',
       'mode_MendeleevNumber', 'min_AtomicWeight', 'max_AtomicWeight',
       'range_AtomicWeight', 'mean_AtomicWeight', 'avg_dev_AtomicWeight',
       'mode_AtomicWeight', 'min_MeltingT', 'max_MeltingT',
       'range_MeltingT', 'mean_MeltingT', 'avg_dev_MeltingT',
       'mode_MeltingT', 'min_Column', 'max_Column', 'range_Column',
       'mean_Column', 'avg_dev_Column', 'mode_Column', 'min_Row',
       'max_Row', 'range_Row', 'mean_Row', 'avg_dev_Row', 'mode_Row',
       'min_CovalentRadius', 'max_CovalentRadius', 'range_CovalentRadius',
       'mean_CovalentRadius', 'avg_dev_CovalentRadius',
       'mode_CovalentRadius'

In [22]:
### Write
# index=False is the documented way to omit the row index from the file
# (index=None only had that effect because None is falsy).
df.to_csv('raw_combined_features.csv', index=False)

In [23]:
# Show the final feature table as this cell's output
df

Unnamed: 0,Collection_Code,Formula,Prototype,composition,composition_oxid,max_ionic_char,avg_ionic_char,min_Number,max_Number,range_Number,...,e2_NfValence,e2_NValence,e2_NsUnfilled,e2_NpUnfilled,e2_NdUnfilled,e2_NfUnfilled,e2_NUnfilled,e2_GSvolume_pa,e2_GSbandgap,e2_GSmagmom
0,159910,TiO2,Anatase#TiO2,"(Ti, O)","(Ti4+, O2-)",0.594445,0.132099,8.0,22.0,14.0,...,0.0,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.000,0.0
1,180903,SiO2,Cristobalite#SiO2,"(Si, O)","(Si4+, O2-)",0.447278,0.099395,8.0,14.0,6.0,...,0.0,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.000,0.0
2,246888,MnO2,Rutile#TiO2,"(Mn, O)","(Mn4+, O2-)",0.590585,0.131241,8.0,25.0,17.0,...,0.0,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.000,0.0
3,108587,MgZn2,Laves(2H)#MgZn2,"(Mg, Zn)","(Mg0+, Zn0+)",0.028486,0.006330,12.0,30.0,18.0,...,0.0,12.0,0.0,0.0,0.0,0.0,0.0,13.960,0.000,0.0
4,89278,SiO2,Quartz(low)#SiO2,"(Si, O)","(Si4+, O2-)",0.447278,0.099395,8.0,14.0,6.0,...,0.0,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8753,56165,Ag2F,Ag2F,"(Ag, F)","(Ag0+, F0+)",0.650281,0.144507,9.0,47.0,38.0,...,0.0,11.0,1.0,0.0,0.0,0.0,1.0,16.330,0.000,0.0
8754,88619,SbO2,HgMoO4,"(Sb, O)","(Sb0+, O0+)",0.383086,0.085130,8.0,51.0,43.0,...,0.0,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.000,0.0
8755,638612,HfMo2,Laves(cub)#MgCu2,"(Hf, Mo)","(Hf0+, Mo0+)",0.168813,0.037514,42.0,72.0,30.0,...,0.0,6.0,1.0,0.0,5.0,0.0,6.0,15.690,0.000,0.0
8756,99714,MoO2,Rutile#TiO2,"(Mo, O)","(Mo4+, O2-)",0.336084,0.074685,8.0,42.0,34.0,...,0.0,6.0,0.0,2.0,0.0,0.0,2.0,9.105,0.000,0.0
