In [None]:
import pymatgen.core as mg
from pymatgen.core.periodic_table import Species
from pymatgen.io import cif
from matminer.featurizers.conversions import StrToComposition, CompositionToOxidComposition
from matminer.featurizers.composition import ElementProperty, IonProperty, ElectronAffinity,\
                                                BandCenter, TMetalFraction, ElectronegativityDiff
import glob
import pandas as pd
import numpy as np


def prettify_features(df, discard_terms=('minimum','maximum','avg_dev','mode','range'), prefix=''):
    """Drop redundant feature columns and normalise the remaining column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.
    discard_terms : iterable of str
        A column is removed when its name contains any of these substrings.
    prefix : str
        Text prepended to every surviving column name.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the surviving columns. Each name has
        "MagpieData " and "mean " removed, spaces replaced by underscores,
        is lower-cased, and carries ``prefix``. The input frame is not modified.
    """
    # Single pass over the columns; also works when the frame has no columns,
    # where inverting an empty numpy mask would raise a TypeError.
    kept = [col for col in df.columns
            if not any(term in col for term in discard_terms)]
    df = df[kept]

    rename_dict = {
        col: prefix + col.replace("MagpieData ", "").replace("mean ", "").replace(" ", "_").lower()
        for col in kept
    }
    return df.rename(columns=rename_dict)

# Read the CSV from the previous step

In [None]:
# Load the entries that passed the filtering step of the pipeline.
df = pd.read_csv(filepath_or_buffer='../2_filter/filtered_entries_for_featurization.csv')

# Composition-based featurization

In [None]:
# Parse the formula strings into composition objects
formula_parser = StrToComposition()
df = formula_parser.featurize_dataframe(df, 'Formula')

# Decorate each composition with guessed oxidation states
oxi_guesser = CompositionToOxidComposition()
df = oxi_guesser.featurize_dataframe(df, 'composition')

### Report how many entries contain a species with a zero oxidation state
has_0_oxi = [
    any(spec.oxi_state == 0 for spec in comp.elements)
    for comp in df.composition_oxid
]
print(f'{sum(has_0_oxi)} entries with +0 oxidation states')

# Composition-level descriptors, all computed from the oxidation-decorated composition
ion_featurizer = IonProperty()
df = ion_featurizer.featurize_dataframe(df, 'composition_oxid')

magpie_featurizer = ElementProperty.from_preset(preset_name="magpie")
df = magpie_featurizer.featurize_dataframe(df, 'composition_oxid')

# These two featurizers are run with ignore_errors=True
affinity_featurizer = ElectronAffinity()
df = affinity_featurizer.featurize_dataframe(df, 'composition_oxid', ignore_errors=True)

band_center_featurizer = BandCenter()
df = band_center_featurizer.featurize_dataframe(df, 'composition_oxid', ignore_errors=True)

# Flag compounds which tend to charge-disproportionate: more than two species
# in the oxidation-decorated composition
df['chg_dispro'] = [len(comp) > 2 for comp in df.composition_oxid]
df

# Element-wise featurization

In [None]:
### Separate each AB2 composition into elements A and B with their respective oxi states
### (charge-disproportionated species are assigned their average oxi state)

def separate_composition(oxid_comp):
    """Split an oxidation-state-decorated composition into symbols and charges.

    Parameters
    ----------
    oxid_comp
        Composition-like object providing ``as_dict()`` (species string ->
        amount), ``elements`` (species with ``symbol`` and ``oxi_state``) and
        item access by species returning that species' amount.

    Returns
    -------
    (symbols, oxis) : tuple of two lists
        Element symbols sorted by ascending stoichiometric amount (ties broken
        alphabetically) and the matching oxidation states. An element present
        with several oxidation states (charge disproportionation) is collapsed
        to one entry carrying its amount-weighted average oxidation state.
    """
    compdict = oxid_comp.as_dict()

    # Case of charge disproportionation: more species than the two elements
    if len(compdict) > 2:
        total_amount = {}   # element symbol -> summed amount over its species
        total_charge = {}   # element symbol -> summed amount * oxidation state
        lone_oxi = {}       # oxidation state of elements seen with one species only
        for spec in oxid_comp.elements:
            sym = spec.symbol
            amt = oxid_comp[spec]
            if sym in total_amount:
                # A second species of the same element: it needs averaging
                lone_oxi.pop(sym, None)
            else:
                lone_oxi[sym] = spec.oxi_state
            total_amount[sym] = total_amount.get(sym, 0) + amt
            total_charge[sym] = total_charge.get(sym, 0) + amt * spec.oxi_state

        # Sort by the element's total stoichiometric amount, as in the branch
        # below, not by how many distinct species the element has
        symbols = [sym for _, sym in sorted((amt, sym) for sym, amt in total_amount.items())]
        # Weight by amount so the averaged charges still balance; elements with
        # a single species keep their exact oxidation state
        oxis = [
            lone_oxi[sym] if sym in lone_oxi else total_charge[sym] / total_amount[sym]
            for sym in symbols
        ]
        return symbols, oxis

    # Case of no charge disproportionation
    ordered_names = [name for _, name in sorted((amt, name) for name, amt in compdict.items())]
    parsed = [Species.from_string(name) for name in ordered_names]
    symbols = [spec.symbol for spec in parsed]
    oxis = [spec.oxi_state for spec in parsed]
    return symbols, oxis


# One (symbols, oxidation states) pair per entry, in the row order of df
sym_oxis = [separate_composition(oxid_comp) for oxid_comp in df.composition_oxid]
symbols = [pair[0] for pair in sym_oxis]
avg_oxis = [pair[1] for pair in sym_oxis]

elem_df = pd.DataFrame({'Collection_Code': df.Collection_Code})
for i in range(2):
    slot = f'e{i+1}'
    elem_df[f'{slot}_symbol'] = [entry_symbols[i] for entry_symbols in symbols]
    # Positive and negative oxidation states go into separate, non-negative
    # columns so that zeros can be handled for dimensionless features later
    slot_oxis = [entry_oxis[i] for entry_oxis in avg_oxis]
    elem_df[f'{slot}_avg_oxi_pos'] = [oxi if oxi > 0 else 0 for oxi in slot_oxis]
    elem_df[f'{slot}_avg_oxi_neg'] = [-oxi if oxi < 0 else 0 for oxi in slot_oxis]

In [None]:
### Compute Magpie features for each of the two elements and attach them to elem_df

for i in [1, 2]:
    symbol_col = f'e{i}_symbol'

    # Featurize the single-element "composition" built from the symbol column
    elem_feats = StrToComposition().featurize_dataframe(elem_df[['Collection_Code', symbol_col]], symbol_col)
    elem_feats = ElementProperty.from_preset(preset_name="magpie").featurize_dataframe(elem_feats, 'composition')

    # For a single element only the mean statistic is informative, so drop the rest
    kept_columns = [
        name for name in elem_feats.columns
        if not any(term in name for term in ['minimum','maximum','avg_dev','mode','range'])
    ]
    elem_feats = elem_feats[kept_columns]

    # Prefix every column except the merge keys with the element slot
    rename_dict = {}
    for col in elem_feats.columns:
        if col[:2] != f'e{i}' and col != 'Collection_Code':
            rename_dict[col] = f'e{i}_{col}'.replace('mean ','')
    elem_feats = elem_feats.rename(columns=rename_dict)

    elem_df = elem_df.merge(elem_feats, on=['Collection_Code', symbol_col])

elem_df.columns.to_numpy()

In [None]:
### Attach the per-element features to the main frame, matching on Collection_Code
df = pd.merge(df, elem_df, on='Collection_Code')

# Clean up and write features

In [None]:
### Shorten feature names: strip the Magpie prefix, abbreviate min/max, no spaces
rename_dict = {}
for col in df.columns:
    short_name = col.replace("MagpieData ", "")
    short_name = short_name.replace('minimum','min').replace('maximum','max')
    rename_dict[col] = short_name.replace(" ","_")
df = df.rename(columns=rename_dict)

### Remove features we don't want (looked up on the already-renamed columns)
space_group_cols = [col for col in df.columns if 'SpaceGroupNumber' in col]
df = df.drop(columns=['compound_possible'] + space_group_cols)
df.columns.to_numpy()

In [None]:
### Save the combined feature table without the row index
df.to_csv('raw_combined_features.csv', index=False)

# Fix oxidation states (split into positive and negative columns)

In [42]:
# Alternative target: file_to_fix = 'final_feature_array.csv'
file_to_fix = 'cropped_final_feature_array.csv'
df = pd.read_csv(file_to_fix)

# Split each signed average oxidation state into two non-negative columns
for i in range(2):
    signed_col = f'e{i+1}_avg_oxi'
    signed_values = df[signed_col]
    df[f'{signed_col}_pos'] = [value if value > 0 else 0 for value in signed_values]
    df[f'{signed_col}_neg'] = [-value if value < 0 else 0 for value in signed_values]

# Preview the column set once the signed columns are removed
df.drop(columns=['e1_avg_oxi','e2_avg_oxi']).columns.to_numpy()

array(['Collection_Code', 'Formula', 'max_ionic_char', 'avg_ionic_char',
       'chg_dispro', 'e1_Number', 'e1_MendeleevNumber', 'e1_AtomicWeight',
       'e1_MeltingT', 'e1_Column', 'e1_Row', 'e1_CovalentRadius',
       'e1_Electronegativity', 'e1_NsValence', 'e1_NpValence',
       'e1_NdValence', 'e1_NfValence', 'e1_NValence', 'e1_NsUnfilled',
       'e1_NpUnfilled', 'e1_NdUnfilled', 'e1_NfUnfilled', 'e1_NUnfilled',
       'e1_GSvolume_pa', 'e1_GSbandgap', 'e1_GSmagmom', 'e2_Number',
       'e2_MendeleevNumber', 'e2_AtomicWeight', 'e2_MeltingT',
       'e2_Column', 'e2_Row', 'e2_CovalentRadius', 'e2_Electronegativity',
       'e2_NsValence', 'e2_NpValence', 'e2_NdValence', 'e2_NfValence',
       'e2_NValence', 'e2_NsUnfilled', 'e2_NpUnfilled', 'e2_NdUnfilled',
       'e2_NfUnfilled', 'e2_NUnfilled', 'e2_GSvolume_pa', 'e2_GSbandgap',
       'e2_GSmagmom', 'e1_avg_oxi_pos', 'e1_avg_oxi_neg',
       'e2_avg_oxi_pos', 'e2_avg_oxi_neg'], dtype=object)

In [43]:
# Overwrite the source file with the signed columns removed (no row index)
df_split_only = df.drop(columns=['e1_avg_oxi','e2_avg_oxi'])
df_split_only.to_csv(file_to_fix, index=False)