In [1]:
from multiprocessing import set_start_method
# set_start_method("spawn")

import os
import pickle
import numpy as np
from numpy import nan as Nan
import pandas as pd

from ase import atoms
from ase.io import read, write
from dscribe.descriptors import SOAP
import matminer.featurizers.composition as mm_composition
import matminer.featurizers.structure as mm_structure
import pymatgen as mg
from pymatgen.io import ase
from pymatgen.io.cif import CifParser
from pymatgen.io.cif import CifWriter

# NOTE: `tqdm` is bound to the `tqdm.notebook` submodule here (not the tqdm class),
# so progress bars are created below as `tqdm.tqdm(...)`.
from tqdm import notebook as tqdm
from tqdm.auto import tqdm as tqdm_pandas
# Enables the `progress_apply` calls used on pandas objects by the featurizer methods below.
tqdm_pandas.pandas()

# Short alias for the pymatgen <-> ASE converter. `ase` refers to `pymatgen.io.ase`
# at this point (the import above), not to the top-level `ase` package.
AAA = ase.AseAtomsAdaptor

# # CAVD imports, comment out when using 3.7
# from numpy import nan as NaN
# from monty.io import zopen
# from cavd.channel import Channel
# from cavd.netstorage import AtomNetwork, connection_values_list
# from cavd.local_environment import CifParser_new, LocalEnvirCom
# import re

### 3a. Load the structures_df from the saved pickle

The structures_df already contains the eight simplifications plus the original structure. Thus there are nine unique representations. 

In [2]:
# Load the dataframe that holds the original structures and their simplified representations.
# The path is resolved relative to the current working directory.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you created yourself or otherwise trust.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/structures_df_3p8_post_sanitize_w_simplifications.pkl')
# The context manager closes the file even if unpickling raises.
with open(save_path, 'rb') as open_file:
    structures_df = pickle.load(open_file)

In [3]:
# Preview the first rows: the metadata columns plus one column per structure representation.
structures_df.head()

Unnamed: 0,index,structure,ICSD_ID,MP_ID,pretty_formula,spacegroup,bandgap,e_hull,ase_structure,composition,replacement,structure_A,structure_AM,structure_CAN,structure_CAMN,structure_A40,structure_AM40,structure_CAN40,structure_CAMN40
0,0,"[[0. 0. 2.6255595] Li0+, [0. ...",180565,mp-1001790,LiO3,Imm2,0.0854,0.22542,"(Atom('Li', [0.0, 0.0, 2.6255594986], index=0)...",Li1 O3,,"[[0. 0. 5.35128645] S0+, [5.55...",[[0. 0. 2.6255595] Li0+],"[[0. 0. 5.35128645] Mg0+, [5.5...","[[0. 0. 2.6255595] Li0+, [0. ...","[[0. 0. 7.5441549] S0+, [-1.1102...",[[0. 0. 3.70147024] Li0+],"[[0. 0. 7.5441549] Mg0+, [-1.110...","[[0. 0. 3.70147024] Li0+, [0. ..."
1,1,"[[2.79072525 1.34013453 0.79373764] Li0+, [0.9...",188829,mp-1001825,LiBe,P2_1/m,0.0,0.166972,"(Atom('Li', [2.79072525, 1.34013452726, 0.7937...",Li2 Be2,,"[[2.79072525 1.17476284 3.09734748] S0+, [0.93...","[[2.79072525 1.34013453 0.79373764] Li0+, [0.9...","[[2.79072525 1.17476284 3.09734748] Mg0+, [0.9...","[[2.79072525 1.34013453 0.79373764] Li0+, [0.9...","[[3.39085782 1.42739017 3.76341775] S0+, [1.13...","[[3.39085782 1.62832427 0.96442725] Li0+, [1.1...","[[3.39085782 1.42739017 3.76341775] Mg0+, [1.1...","[[3.39085782 1.62832427 0.96442725] Li0+, [1.1..."
2,2,"[[2.412716 2.412716 2.412716] Li0+, [3.619074 ...",236959,mp-1001831,LiB,Fd-3m,1.4331,0.386054,"(Atom('Li', [2.412716, 2.412716, 2.412716], in...",Li2 B2,,"[[1.206358 1.206358 1.206358] S0+, [0. 0. 0.] ...","[[2.412716 2.412716 2.412716] Li0+, [3.619074 ...","[[1.206358 1.206358 1.206358] Mg0+, [0. 0. 0.]...","[[2.412716 2.412716 2.412716] Li0+, [3.619074 ...","[[1.70997595 1.70997595 1.70997595] S0+, [0. 0...","[[3.41995189 3.41995189 3.41995189] Li0+, [5.1...","[[1.70997595 1.70997595 1.70997595] Mg0+, [0. ...","[[3.41995189 3.41995189 3.41995189] Li0+, [5.1..."
3,3,"[[0. 0. 0.] Li+, [1.2797665 1.2797665 1.279766...",184904,mp-1009009,LiF,Pm-3m,7.5195,0.287542,"(Atom('Li', [0.0, 0.0, 0.0], index=0), Atom('F...",Li1 F1,,[[1.2797665 1.2797665 1.2797665] S-],"[[0. 0. 0.] Li+, [1.2797665 1.2797665 1.279766...",[[1.2797665 1.2797665 1.2797665] S-],"[[0. 0. 0.] Li+, [1.2797665 1.2797665 1.279766...",[[1.70997595 1.70997595 1.70997595] S-],"[[0. 0. 0.] Li+, [1.70997595 1.70997595 1.7099...",[[1.70997595 1.70997595 1.70997595] S-],"[[0. 0. 0.] Li+, [1.70997595 1.70997595 1.7099..."
4,4,"[[1.4805505 1.9928345 2.448045 ] Li0+, [0. 0. ...",180561,mp-1018789,LiO2,Pnnm,0.0,0.084218,"(Atom('Li', [1.4805505, 1.9928345, 2.448045], ...",Li2 O4,,"[[1.4805505 2.51837684 0.41921791] S0+, [1.48...","[[1.4805505 1.9928345 2.448045 ] Li0+, [0. 0. ...","[[1.4805505 2.51837684 0.41921791] Mg0+, [1.4...","[[1.4805505 1.9928345 2.448045 ] Li0+, [0. 0. ...","[[2.07903566 3.53638411 0.588679 ] S0+, [2.07...","[[2.07903566 2.798401 3.43762192] Li0+, [0. ...","[[2.07903566 3.53638411 0.588679 ] Mg0+, [2.0...","[[2.07903566 2.798401 3.43762192] Li0+, [0. ..."


In [4]:
class Feature_Creator:
    """
    A class to handle calculation of the various features/descriptors.

    Every ``run_*`` method selects one structure representation (a "mode", i.e. one
    column of ``structures_df``), computes a descriptor for every row and writes the
    result with ``np.save`` into ``output_dir``. Nothing is returned: the calculated
    representations can be quite large, so they are saved directly to disk.

    Attributes
    ----------
    structures_df : pd.DataFrame
        The dataframe that all the structures and simplified representations are stored in

    mode_list : list
        A list containing the 9 modes that the class can use. If a different mode is passed then an error is thrown.

    mode : str
        The mode string signifies what mode the class is operating in

    unique_atoms : list
        The SOAP featurizer requires knowledge of all unique atoms in the structure. This list stores the unique atoms.

    n_jobs : int
        The number of CPU cores that will be used for featurizers that support parallel processing

    output_dir : str
        The directory the feature files are written to. It is created on first use if it does not exist.

    Methods
    -------
    set_mode(mode):
        Set the mode that the class operates in. Each mode corresponds to one of the nine structure representations.

    calculate_unique_atoms(mode):
        Calculate the unique atoms in the structure. Used for the SOAP representation.

    run_atomic_packing_efficiency_featurizer(mode):
        Calculate the atomic packing efficiency using matminer.featurizers.composition.AtomicPackingEfficiency()

    run_band_center_featurizer(mode):
        Calculate the band centers using matminer.featurizers.composition.BandCenter()

    run_bond_fraction_featurizer(mode):
        Calculate the bond fractions using matminer.featurizers.structure.BondFractions()

    run_chemical_ordering_featurizer(mode):
        Calculate the chemical ordering using matminer.featurizers.structure.ChemicalOrdering()

    run_density_featurizer(mode):
        Calculate features related to density using matminer.featurizers.structure.DensityFeatures(("density", "vpa", "packing fraction"))

    end_featurizer_helper(structure, end_featurizer):
        A helper for the run_electron_negativity_difference_featurizer

    run_electron_negativity_difference_featurizer(mode):
        Calculates the electron negativity difference for atoms in the composition using matminer.featurizers.composition.ElectronegativityDiff()

    run_ewald_energy_featurizer(mode):
        Calculates the Ewald energy using matminer.featurizers.structure.EwaldEnergy()

    run_global_instability_index_featurizer(mode, rcut_list):
        Calculates the global instability index using matminer.featurizers.structure.GlobalInstabilityIndex(r_cut=rcut)

    run_jarvis_cfid_featurizer(mode):
        A jarvis CFID calculation using matminer.featurizers.structure.JarvisCFID()

    run_maximum_packing_efficiency_featurizer(mode):
        Calculates the packing efficiency using matminer.featurizers.structure.MaximumPackingEfficiency()

    run_meredig_featurizer(mode):
        Calculates Meredig features using matminer.featurizers.composition.Meredig()

    run_orbital_field_matrix_featurizer(mode):
        Calculates the orbital field matrix using matminer.featurizers.structure.OrbitalFieldMatrix(period_tag=True)

    run_oxidation_states_featurizer(mode):
        Grabs the oxidation states from the composition using matminer.featurizers.composition.OxidationStates()

    run_rdf_featurizer(mode, cutoff_list, bin_size_list):
        Calculates a radial distribution function matminer.featurizers.structure.RadialDistributionFunction(cutoff=cutoff, bin_size=bin_size)

    run_sine_coulomb_featurizer(mode):
        Calculates the Sine Coulomb matrix using matminer.featurizers.structure.SineCoulombMatrix()

    run_SOAP(mode, rcut_list, nmax_list, lmax_list, average, n_jobs=31):
        Calculates a Smooth Overlap of Atomic Positions representation using dscribe.descriptors.SOAP()

    run_structural_complexity_featurizer(mode):
        Calculates the structural complexity using matminer.featurizers.structure.StructuralComplexity()

    run_structural_heterogeneity_featurizer(mode):
        Calculates the structural heterogeneity using matminer.featurizers.structure.StructuralHeterogeneity()

    run_valence_orbital_featurizer(mode):
        Calculates valence orbital information using matminer.featurizers.composition.ValenceOrbital()

    run_XRD_featurizer(mode, pattern_length_list):
        Calculates a powder X-ray diffraction pattern using matminer.featurizers.structure.XRDPowderPattern(pattern_length=pattern_length)

    run_yang_solid_solution_featurizer(mode):
        Calculates the yang solid solution information from matminer.featurizers.composition.YangSolidSolution()
    """

    def __init__(self, structures_df, n_jobs=63, output_dir='features'):
        """
        Parameters
        ----------
        structures_df : pd.DataFrame
            Dataframe with one column per entry of ``mode_list``.

        n_jobs : int, optional
            Number of worker processes handed to the matminer structure featurizers (default 63).

        output_dir : str, optional
            Directory the ``.npy`` feature files are written to (default 'features').
        """
        self.structures_df = structures_df
        self.mode_list = ['structure', 'structure_A', 'structure_AM', 'structure_CAN', 'structure_CAMN', 'structure_A40', 'structure_AM40', 'structure_CAN40', 'structure_CAMN40']
        self.mode = 'structure'
        self.unique_atoms = []
        self.n_jobs = n_jobs
        self.output_dir = output_dir

    def set_mode(self, mode):
        """
        Function to set the operating mode for the class. This mode tells the class which representation of the structure to use
        (i.e., 'structure', 'structure_A', 'structure_AM', 'structure_CAN', 'structure_CAMN', 'structure_A40',
        'structure_AM40', 'structure_CAN40', 'structure_CAMN40')

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.

        Raises
        ------
        ValueError
            If the mode is not supported. (ValueError is a subclass of Exception, so callers
            that catch Exception keep working.)
        """
        if mode not in self.mode_list:
            raise ValueError('The mode \'{}\' is not supported.'.format(mode))
        self.mode = mode

    def calculate_unique_atoms(self, mode):
        """
        Function to identify the unique atoms that exist for a given mode. Then sets the
        unique_atoms attribute (a sorted numpy array of element symbols) so that it can be
        used by the SOAP featurizer.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        # A set gives O(1) membership checks; the final sort makes the result order-independent.
        found = set()
        for structure in tqdm.tqdm(self.structures_df[mode]):
            found.update(structure.symbol_set)
        self.unique_atoms = np.sort(list(found))

    # ------------------------------------------------------------------
    # Private helpers shared by the featurizer functions.
    # ------------------------------------------------------------------
    def _save_features(self, file_stem, data):
        """Save `data` as `<output_dir>/<file_stem>` with np.save, creating the directory if needed."""
        os.makedirs(self.output_dir, exist_ok=True)
        np.save(os.path.join(self.output_dir, file_stem), data)

    def _run_composition_featurizer(self, featurizer, file_stem):
        """Apply a composition featurizer to every structure's composition (current mode) and save the array."""
        column = self.structures_df[self.mode]
        rows = column.progress_apply(lambda structure: featurizer.featurize(structure.composition))
        self._save_features(file_stem, np.array(rows.values.tolist()))

    def _featurize_structures(self, featurizer):
        """Fit a structure featurizer on the current mode's column and featurize it in parallel."""
        structures = self.structures_df[self.mode]
        featurizer.fit(structures)
        # set_n_jobs is a method; assigning to it (featurizer.set_n_jobs = n) would silently
        # replace the method and leave the featurizer's parallelism at its default.
        featurizer.set_n_jobs(self.n_jobs)
        return featurizer.featurize_many(structures, ignore_errors=True)

    def _run_structure_featurizer(self, featurizer, file_stem):
        """Featurize the current mode's column with a structure featurizer and save the result."""
        self._save_features(file_stem, self._featurize_structures(featurizer))

    @staticmethod
    def _flatten_rdf_rows(rows, cutoff, bin_size):
        """
        Flatten the per-structure RDF results and replace failed rows with zeroes.

        Returns a tuple ``(flattened_rows, number_of_failed_rows)``. The zero rows have the
        same length as the successful rows; if every row failed, the length falls back to
        ``round(cutoff / bin_size)``.
        """
        flattened = []
        failed_positions = []
        n_bins = None
        for position, row in enumerate(rows):
            try:
                distribution = row[0]['distribution'].flatten()
            except (TypeError, KeyError, IndexError, AttributeError):
                # A row that does not have the expected [{'distribution': array}] layout
                # (e.g. the NaN placeholder produced when ignore_errors=True) is zero-filled later.
                failed_positions.append(position)
                flattened.append(None)
                continue
            if n_bins is None:
                n_bins = len(distribution)
            flattened.append(distribution)
        if n_bins is None:
            # cutoff / bin_size is a float; a list can only be repeated an integer number of times.
            n_bins = int(round(cutoff / bin_size))
        for position in failed_positions:
            flattened[position] = [0.] * n_bins
        return flattened, len(failed_positions)

    # ------------------------------------------------------------------
    # Featurizer functions (and any helpers) are listed below this line in alphabetical order.
    # Because calculated feature representations can be quite large they are directly saved
    # into the 'features' repository (see output_dir).
    # ------------------------------------------------------------------
    def run_atomic_packing_efficiency_featurizer(self, mode):
        """
        Function to run the atomic packing efficiency featurizer.
        Saves the files with the prefix "ape" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_composition_featurizer(mm_composition.AtomicPackingEfficiency(),
                                         'ape_features_mode-{}'.format(self.mode))

    def run_band_center_featurizer(self, mode):
        """
        Function to run the band center featurizer.
        Saves the files with the prefix "bc" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_composition_featurizer(mm_composition.BandCenter(),
                                         'bc_features_mode-{}'.format(self.mode))

    def run_bond_fraction_featurizer(self, mode):
        """
        Function to run the bond fraction featurizer.
        Saves the files with the prefix "bf" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.BondFractions(),
                                       'bf_features_mode-{}'.format(self.mode))

    def run_chemical_ordering_featurizer(self, mode):
        """
        Function to run the chemical ordering featurizer.
        Saves the files with the prefix "co" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.ChemicalOrdering(),
                                       'co_features_mode-{}'.format(self.mode))

    def run_density_featurizer(self, mode):
        """
        Function to run the density featurizer.
        Saves the files with the prefix "density" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.DensityFeatures(("density", "vpa", "packing fraction")),
                                       'density_features_mode-{}'.format(self.mode))

    def end_featurizer_helper(self, structure, end_featurizer):
        """
        A helper function for the run_electron_negativity_difference_featurizer() function.
        The helper catches errors and returns a usable representation instead of raising.
        This function is intended to be run using the pandas.DataFrame.apply() method

        Parameters
        ----------
        structure : pymatgen.core.structure
            A pymatgen structure

        end_featurizer : matminer.featurizers.composition.ElectronegativityDiff()
            The featurizer from matminer.

        Returns
        -------
        list
            The featurizer output; five zeroes if the featurizer raised ValueError;
            five NaNs if it raised any other Exception.
        """
        try:
            return end_featurizer.featurize(structure.composition)
        except ValueError:
            # NOTE(review): ValueError is treated as "no electronegativity difference" and
            # encoded as zeroes - confirm this matches the intended meaning.
            return [0, 0, 0, 0, 0]
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt can still stop a long run.
            return [Nan, Nan, Nan, Nan, Nan]

    def run_electron_negativity_difference_featurizer(self, mode):
        """
        Function to run the electron negativity difference featurizer.
        Saves the files with the prefix "end" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        end_featurizer = mm_composition.ElectronegativityDiff()
        rows = self.structures_df[self.mode].progress_apply(self.end_featurizer_helper, end_featurizer=end_featurizer)
        self._save_features('end_features_mode-{}'.format(self.mode), np.array(rows.values.tolist()))

    def run_ewald_energy_featurizer(self, mode):
        """
        Function to run the Ewald energy featurizer.
        Saves the files with the prefix "ee" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.EwaldEnergy(),
                                       'ee_features_mode-{}'.format(self.mode))

    def run_global_instability_index_featurizer(self, mode, rcut_list):
        """
        Function to run the global instability index featurizer.
        The function will generate a feature for each radial cutoff that is passed in the list: rcut_list
        Saves the files with the prefix "gii" and a suffix indicating the cutoff and the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.

        rcut_list : list
            A list containing one or more different radial cutoff (in angstroms)
        """
        self.set_mode(mode)
        for rcut in rcut_list:
            self._run_structure_featurizer(mm_structure.GlobalInstabilityIndex(r_cut=rcut),
                                           'gii_features_rcut-{}_mode-{}'.format(rcut, self.mode))

    def run_jarvis_cfid_featurizer(self, mode):
        """
        Function to run the Jarvis CFID featurizer.
        Saves the files with the prefix "jcfid" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.JarvisCFID(),
                                       'jcfid_features_mode-{}'.format(self.mode))

    def run_maximum_packing_efficiency_featurizer(self, mode):
        """
        Function to run the maximum packing efficiency featurizer.
        Saves the files with the prefix "mpe" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.MaximumPackingEfficiency(),
                                       'mpe_features_mode-{}'.format(self.mode))

    def run_meredig_featurizer(self, mode):
        """
        Function to run the Meredig featurizer.
        Saves the files with the prefix "md" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_composition_featurizer(mm_composition.Meredig(),
                                         'md_features_mode-{}'.format(self.mode))

    def run_orbital_field_matrix_featurizer(self, mode):
        """
        Function to run the orbital field matrix featurizer.
        Saves the files with the prefix "ofm" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.OrbitalFieldMatrix(period_tag=True),
                                       'ofm_features_mode-{}'.format(self.mode))

    def run_oxidation_states_featurizer(self, mode):
        """
        Function to run the oxidation states featurizer.
        Saves the files with the prefix "os" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_composition_featurizer(mm_composition.OxidationStates(),
                                         'os_features_mode-{}'.format(self.mode))

    def run_rdf_featurizer(self, mode, cutoff_list, bin_size_list):
        """
        Function to run the radial distribution function featurizer.
        This function will generate an rdf feature for all the combinations of radial cutoffs and bin sizes that are passed
        into cutoff_list and bin_size_list. Rows that could not be featurized are filled with zeroes
        and the number of such rows is printed.
        Saves the files with the prefix "rdf" and a suffix indicating the cutoff, the bin size and the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.

        cutoff_list : list
            A list containing all the desired radial cutoffs.

        bin_size_list : list
            A list containing all the desired bin sizes.
        """
        self.set_mode(mode)

        # iterate over the cutoff_list and the bin_size_list
        for cutoff in cutoff_list:
            for bin_size in bin_size_list:
                rdf_featurizer = mm_structure.RadialDistributionFunction(cutoff=cutoff, bin_size=bin_size)
                rdf_featurizer_result = self._featurize_structures(rdf_featurizer)

                # capture errors and fill with zeroes
                radial_recreate, error = self._flatten_rdf_rows(rdf_featurizer_result, cutoff, bin_size)
                print("There were {} errors when using mode: {} with cutoff={} and bin_size-{}. Filling those rows with zeroes.".format(error, self.mode, cutoff, bin_size))
                self._save_features('rdf_features_cutoff-{}_binsize-{}_mode-{}'.format(cutoff, bin_size, self.mode), radial_recreate)

    def run_sine_coulomb_featurizer(self, mode):
        """
        Function to run the sine coulomb featurizer.
        Saves the files with the prefix "scm" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.SineCoulombMatrix(),
                                       'scm_features_mode-{}'.format(self.mode))

    def run_SOAP(self, mode, rcut_list, nmax_list, lmax_list, average, n_jobs=31):
        """
        Function to run the smooth overlap of atomic position featurizer.
        The function will automatically generate and save representations for all possible combinations
        of the integers in rcut_list, nmax_list, and lmax_list. Only the descriptor columns that
        pair "S" with each unique atom are kept, so "S" must be present in the chosen mode.
        Saves the files with the prefix "SOAP" and a suffix indicating the parameters and the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.

        rcut_list : list
            A list containing all the desired radial cutoffs in angstroms.

        nmax_list : list
            A list containing all the desired radial basis functions.

        lmax_list : list
            A list containing all the desired values for maximum degree of spherical harmonics.

        average : str
            The averaging strategy used. Either 'inner' or 'outer'.

        n_jobs : int, optional
            Number of parallel jobs passed to SOAP.create (default 31, the previously hard-coded value).
        """
        self.calculate_unique_atoms(mode)

        # The ASE conversion does not depend on rcut/nmax/lmax, so do it once up front.
        ase_structures = self.structures_df[mode].progress_apply(AAA.get_atoms).to_numpy()

        # iterate over all of the rcut, nmax and lmax values
        for rcut in rcut_list:
            for nmax in nmax_list:
                for lmax in lmax_list:
                    average_soap = SOAP(
                        species=self.unique_atoms,
                        rcut=rcut,
                        nmax=nmax,
                        lmax=lmax,
                        periodic=True,
                        average=average,
                        sparse=True
                    )
                    average_soap_data = average_soap.create(ase_structures, n_jobs=n_jobs, verbose=False)
                    # Column indices of every ("S", element) pairing in the SOAP output.
                    pairings = np.concatenate([np.r_[average_soap.get_location(("S", x))] for x in self.unique_atoms])
                    self._save_features('SOAP_features_partialS_{}_rcut-{}_nmax-{}_lmax-{}_mode-{}'.format(average, rcut, nmax, lmax, mode), average_soap_data[:,pairings])

    def run_structural_complexity_featurizer(self, mode):
        """
        Function to run the structural complexity featurizer.
        Saves the files with the prefix "sc" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.StructuralComplexity(),
                                       'sc_features_mode-{}'.format(self.mode))

    def run_structural_heterogeneity_featurizer(self, mode):
        """
        Function to run the structural heterogeneity featurizer.
        Saves the files with the prefix "sh" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_structure_featurizer(mm_structure.StructuralHeterogeneity(),
                                       'sh_features_mode-{}'.format(self.mode))

    def run_valence_orbital_featurizer(self, mode):
        """
        Function to run the valence orbital featurizer.
        Saves the files with the prefix "vo" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_composition_featurizer(mm_composition.ValenceOrbital(),
                                         'vo_features_mode-{}'.format(self.mode))

    def run_XRD_featurizer(self, mode, pattern_length_list):
        """
        Function to run the powder XRD featurizer.
        The function will automatically generate and save representations for every pattern length
        value that is contained in the list: pattern_length_list.
        Saves the files with the prefix "xrd" and a suffix indicating the pattern length and the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.

        pattern_length_list : list
            A list containing all the desired values for the pattern length.
        """
        self.set_mode(mode)
        for pattern_length in pattern_length_list:
            self._run_structure_featurizer(mm_structure.XRDPowderPattern(pattern_length=pattern_length),
                                           'xrd_features_pattern_length-{}_mode-{}'.format(pattern_length, self.mode))

    def run_yang_solid_solution_featurizer(self, mode):
        """
        Function to run the yang solid solution featurizer.
        Saves the files with the prefix "yss" and a suffix indicating the mode.

        Parameters
        ----------
        mode : str
            The class can operate in 9 modes. This str is used to set the mode attribute.
        """
        self.set_mode(mode)
        self._run_composition_featurizer(mm_composition.YangSolidSolution(),
                                         'yss_features_mode-{}'.format(self.mode))

## 3a. Initialize the Feature_Creator class

In [5]:
# Full run: featurize every row of structures_df.
fc = Feature_Creator(structures_df)

# The full run takes a long time. For testing/debugging, swap in the line below
# to work on the first 500 rows only.
# fc = Feature_Creator(structures_df[0:500])

## 3b. Make sure the 9 modes are working correctly
Check to see that the correct atoms are in each mode. 

In [6]:
# Sanity check: for each of the nine modes, collect the element symbols that occur
# in that column and print them. Each call also switches fc.mode, so fc is left in
# the last mode of mode_list afterwards.
for mode in fc.mode_list:
    fc.calculate_unique_atoms(mode)
    print("{} contains {}".format(mode, fc.unique_atoms))

  0%|          | 0/25193 [00:00<?, ?it/s]

structure contains ['Ac' 'Ag' 'Al' 'As' 'Au' 'B' 'Ba' 'Be' 'Bi' 'Br' 'C' 'Ca' 'Cd' 'Ce' 'Cl'
 'Co' 'Cr' 'Cs' 'Cu' 'Dy' 'Er' 'Eu' 'F' 'Fe' 'Ga' 'Gd' 'Ge' 'H' 'Hf' 'Hg'
 'Ho' 'I' 'In' 'Ir' 'K' 'La' 'Li' 'Lu' 'Mg' 'Mn' 'Mo' 'N' 'Na' 'Nb' 'Nd'
 'Ni' 'Np' 'O' 'Os' 'P' 'Pa' 'Pb' 'Pd' 'Pm' 'Pr' 'Pt' 'Pu' 'Rb' 'Re' 'Rh'
 'Ru' 'S' 'Sb' 'Sc' 'Se' 'Si' 'Sm' 'Sn' 'Sr' 'Ta' 'Tb' 'Tc' 'Te' 'Th' 'Ti'
 'Tl' 'Tm' 'U' 'V' 'W' 'Xe' 'Y' 'Yb' 'Zn' 'Zr']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_A contains ['S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_AM contains ['Li' 'S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_CAN contains ['Al' 'Mg' 'S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_CAMN contains ['Al' 'Li' 'Mg' 'S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_A40 contains ['S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_AM40 contains ['Li' 'S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_CAN40 contains ['Al' 'Mg' 'S']


  0%|          | 0/25193 [00:00<?, ?it/s]

structure_CAMN40 contains ['Al' 'Li' 'Mg' 'S']


##  3c. Run featurizers as needed
***
All featurizers require the user to specify which 'mode' is used. The modes tell the class which column of the dataframe to apply the featurizer to. The valid modes are:

* structure
* structure_A
* structure_AM
* structure_CAN
* structure_CAMN
* structure_A40
* structure_AM40
* structure_CAN40
* structure_CAMN40
***
A few of the featurizers require additional parameters. The parameters are discussed in more detail below.

* __Global Instability Index__: rcut_list
* __Radial Distribution Function__: cutoff_list, bin_size_list
* __Smooth Overlap of Atomic Positions (SOAP)__: rcut_list, nmax_list, lmax_list, average
* __X-ray Diffraction__: pattern_length_list

### Featurizer: Atomic Packing Efficiency 

In [None]:
fc.run_atomic_packing_efficiency_featurizer('structure_CAMN')

### Featurizer: Band Center

In [None]:
fc.run_band_center_featurizer('structure_CAMN')

### Featurizer: Bond Fraction

In [None]:
fc.run_bond_fraction_featurizer('structure_CAMN')

### Featurizer: Chemical Ordering

In [None]:
fc.run_chemical_ordering_featurizer('structure_CAMN')

### Featurizer: Density

In [None]:
fc.run_density_featurizer('structure_CAMN')

### Featurizer: Electron Negativity Difference

In [None]:
fc.run_electron_negativity_difference_featurizer('structure')

### Featurizer: Ewald Energy

In [None]:
fc.run_ewald_energy_featurizer('structure_CAMN')

### Featurizer: Global Instability Index

Pass in a list of rcut values. The function will iterate over the list generating a feature file for each entry. 

In [None]:
fc.run_global_instability_index_featurizer('structure', rcut_list=[20])

### Featurizer: Jarvis CFID

In [None]:
fc.run_jarvis_cfid_featurizer('structure')

### Featurizer: Maximum Packing Efficiency

In [None]:
fc.run_maximum_packing_efficiency_featurizer('structure_CAMN')

### Featurizer: MereDig 

In [None]:
fc.run_meredig_featurizer('structure_CAMN')

### Featurizer: Orbital Field Matrix

In [None]:
fc.run_orbital_field_matrix_featurizer('structure')

### Featurizer: Oxidation States

In [None]:
fc.run_oxidation_states_featurizer('structure_CAMN')

### Featurizer: Radial Distribution Function

Pass in a list of cutoff values and a list of bin_size values. The function will iterate over the lists, generating a feature file for each combination. 

In [None]:
fc.run_rdf_featurizer('structure_CAMN', cutoff_list=[10], bin_size_list=[0.1])

### Featurizer: Sine Coulomb Matrix

In [None]:
fc.run_sine_coulomb_featurizer('structure')

### Featurizer: Smooth Overlap of Atomic Positions (SOAP)

Pass in the following parameters:

* __rcut_list__:  a list of rcut values for the dscribe SOAP class
* __nmax_list__:  a list of nmax values for the dscribe SOAP class
* __lmax_list__:  a list of lmax values for the dscribe SOAP class
* __average__: the averaging strategy for SOAP. Either 'outer' or 'inner'

The function will create a feature file for every unique combination of the above parameters. 

In [8]:
fc.run_SOAP('structure_CAN', rcut_list=[3], nmax_list=[5], lmax_list=[3], average='outer')

  0%|          | 0/25193 [00:00<?, ?it/s]

  0%|          | 0/25193 [00:00<?, ?it/s]

### Featurizer: Structural Complexity

In [None]:
fc.run_structural_complexity_featurizer('structure')

### Featurizer: Structural Heterogeneity

In [None]:
fc.run_structural_heterogeneity_featurizer('structure')

### Featurizer: Valence Orbital

In [None]:
fc.run_valence_orbital_featurizer('structure')

### Featurizer: X-ray Diffraction Pattern

Pass in a list of pattern lengths. The function will iterate over the list, saving a feature representation for each pattern length. 

In [None]:
fc.run_XRD_featurizer('structure_CAMN', pattern_length_list=[451])

### Featurizer: Yang Solid Solution

In [None]:
fc.run_yang_solid_solution_featurizer('structure')

### Featurizer: CAVD - requires python 3.7 for cavd library

In [None]:
# Dataframe column whose structures are exported to CIF for CAVD below;
# the same value is used in the name of the saved CAVD feature file.
mode = 'structure'

In [None]:
# Export every structure in the selected `mode` column to its own CIF file
# (named by row number) so the CAVD tools can read it back from disk.
cif_dir = './Li Cifs CAVD'
# Make sure the target directory exists before any file is written into it.
os.makedirs(cif_dir, exist_ok=True)

# The dataframe is named `structures_df` everywhere else in this notebook;
# the previous `structuresDF` spelling raised NameError on a fresh kernel.
for i in tqdm.tqdm(np.arange(0, len(structures_df), 1)):
    structure = structures_df.loc[i, mode]

    # Tag each site with a label built from its species formula, both as an
    # attribute and as a site property.
    for site in structure.sites:
        try:
            site_label = site.species.alphabetical_formula
            site._atom_site_label = site_label
            site.properties.update({'_atom_site_label': site_label})
        except Exception as e:
            print(e)

    cif_path = '{}/{}.cif'.format(cif_dir, str(i))
    try:
        # NOTE(review): symprec is normally a numeric tolerance; `True` is kept
        # exactly as in the original call — confirm this is the intended value.
        w = CifWriter(structure, symprec=True)
        w.write_file(cif_path)
    except Exception:
        # Symmetry-aware writing failed: fall back to a plain writer.
        # `except Exception` (not a bare except) so Ctrl-C still interrupts the loop.
        w = CifWriter(structure)
        w.write_file(cif_path)
        

In [None]:
def cavd_calc(filename, migrant, ntol=0.02, lower=0.0, upper=10.0):
    """
    Compute the CAVD connection values for a single CIF file.

    Parameters
    ----------
    filename : str
        Path to the CIF file to analyse.
    migrant : str
        Element symbol of the migrating ion (e.g. "Li"). It must be present
        in the structure read from ``filename``.
    ntol : float
        Tolerance forwarded to ``perform_voronoi_decomposition``.
    lower, upper : float
        Not used by this function; kept so the signature stays compatible.

    Returns
    -------
    The value returned by ``connection_values_list`` for the Voronoi network
    of the structure.

    Raises
    ------
    ValueError
        If ``migrant`` is not among the elements of the parsed structure.
    """
    import os  # local import keeps this cell self-contained

    with zopen(filename, "rt") as f:
        input_string = f.read()
    parser = CifParser_new.from_string(input_string)
    stru = parser.get_structures(primitive=False)[0]

    # Reduce each species to its bare element symbol (letters only) so the
    # migrant can be compared against it.
    species = [str(sp).replace("Specie ", "") for sp in stru.species]
    elements = [re.sub('[^a-zA-Z]', '', sp) for sp in species]
    if migrant not in elements:
        raise ValueError("The input migrant ion not in the input structure! Please check it.")

    # Only the effective radii are used below; the other outputs are ignored.
    effec_radii, _migrant_radius, _migrant_alpha, _nei_dises, _coordination_list = LocalEnvirCom(stru, migrant)

    atmnet = AtomNetwork.read_from_RemoveMigrantCif(filename, migrant, effec_radii, True)
    vornet, _edge_centers, _fcs, _faces = atmnet.perform_voronoi_decomposition(True, ntol)

    # Output prefix = file name without directory or extension.
    # The previous code re-derived the prefix from `filename` a second time,
    # which threw away the ".cif" strip, and tried to remove "./Li Cifs/",
    # which is not the directory the CIFs are written to. The result was a
    # path like "./cavdoutputs/./Li Cifs CAVD/0.cif".
    prefixname = os.path.splitext(os.path.basename(filename))[0]
    output_dir = "./cavdoutputs"
    # NOTE(review): connection_values_list presumably writes the .resex file;
    # create the directory up front so that write cannot fail on a missing folder.
    os.makedirs(output_dir, exist_ok=True)
    newpath = output_dir + "/" + prefixname

    # compute the R_T
    conn_val = connection_values_list(newpath + ".resex", vornet)
    return conn_val

In [None]:
# Run CAVD on every exported CIF and collect the sorted connection values,
# one entry per row of structures_df.
cavd_features = []
non_working = []  # row numbers whose CAVD calculation raised an error
for i in tqdm.tqdm(np.arange(0, len(structures_df), 1)):
    try:
        cavd_features.append(sorted(cavd_calc("./Li Cifs CAVD/{}.cif".format(str(i)), "Li")))
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C can still stop this long loop.
        # np.nan is used because the bare name `NaN` is only defined when the optional
        # CAVD import block is enabled; otherwise this handler itself raised NameError.
        # NOTE(review): three placeholders assume cavd_calc yields three values — confirm.
        cavd_features.append([np.nan, np.nan, np.nan])
        non_working.append(i)

In [None]:
# Persist the collected CAVD features; the file name carries the current `mode`.
np.save('features/cavd2_{}'.format(mode), cavd_features)