## Overview of the basic ways to create and fit a Cluster Expansion

In [1]:
import json

import numpy as np
from pymatgen.core.structure import Structure
from pymatgen.io.cif import CifParser
from sklearn.linear_model import LinearRegression
from smol.cofe import ClusterSubspace, StructureWrangler, ClusterExpansion
from smol.cofe.configspace import EwaldTerm
from smol.cofe.regression import constrain_dielectric
from smol.cofe.regression import CVXEstimator

In [2]:
# Read the primitive structure that the expansion is built on
with open('data/lno_prim.json', 'r') as prim_file:
    prim_dict = json.load(prim_file)
lno_prim = Structure.from_dict(prim_dict)

# Read the fitting data as (structure, energy) pairs; the energy comes from
# each record's 'toten' entry
with open('data/lno_fitting_data.json', 'r') as data_file:
    entries = json.load(data_file)
lno_data = [(Structure.from_dict(entry['s']), entry['toten']) for entry in entries]

## Create a CE using convenience functions and most defaults

In [3]:
# One call builds the expansion from the prim and the fitting data.
# Options that are not listed (basis, estimator, weights, ...) are left at
# whatever the library uses by default.
ce = ClusterExpansion.from_radii(
    structure=lno_prim,
    radii={2: 5, 3: 4.1},  # presumably cluster size -> cutoff radius; confirm units
    ltol=0.15,
    stol=0.2,
    angle_tol=5,
    supercell_size='O2-',
    data=lno_data,
)

#### Fit the CE

In [4]:
# Fit the ECIs; no arguments are passed, so the fit uses whatever the
# expansion was configured with when it was created.
ce.fit()

#### Check the fit

In [5]:
# Report the fitted coefficients, the fit error, and how many coefficients
# have a magnitude above 1e-3
ce_ecis = ce.ecis
ce_nonzero = sum(1 for coef in ce_ecis if np.abs(coef) > 1e-3)
print(f"ECIS: {ce_ecis}")
print(f"RMSE: {ce.root_mean_squared_error} eV/prim")
print(f"Number non zero ECIs: {ce_nonzero}/{len(ce_ecis)}")

ECIS: [-3.13615254e+01 -3.12526438e+00 -3.12526438e+00 -4.58945826e-01
  6.54404413e-01  3.01576392e-01 -3.32867834e-01 -4.93529265e-01
  7.12819855e-01 -2.66649892e-01  2.45976878e-02]
RMSE: 0.009973676733307091 eV/prim
Number non zero ECIs: 11/11


### Now change some of the defaults in the convenience function

In [6]:
# Linear estimator from scikit-learn, fitted without an intercept
est = LinearRegression(fit_intercept=False)
ce2 = ClusterExpansion.from_radii(
    structure=lno_prim,
    radii={2: 5, 3: 4.1},
    ltol=0.15,
    stol=0.2,
    angle_tol=5,
    supercell_size='O2-',
    basis='indicator',
    orthonormal=True,
    external_terms=[EwaldTerm, {'eta': None}],  # add an Ewald term to the CE
    estimator=est,  # use the scikit-learn estimator
    data=lno_data,
    weights='hull',  # weigh data by energy above the hull
    verbose=True,  # print out the structures that fail to match the prim
)
n_matched_ce2 = len(ce2.property_vector)
print(f'Total structures that match {n_matched_ce2}/{len(lno_data)}')

Unable to match Ni4+6 O2-12 with energy -188.28833 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+2 Ni4+4 Ni3+2 O2-12 with energy -200.13866 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+2 Ni3+2 Ni4+4 O2-12 with energy -200.42049 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+3 Ni4+4 Ni2+1 Ni3+1 O2-12 with energy -206.70884 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+3 Ni3+3 Ni4+3 O2-12 with energy -206.85997 to supercell_structure. Throwing out.
Error Message: Mapping could not be found from structure.
Unable to match Li+5 Ni4+1 Ni3+5 O2-12 with energy -218.82429 to supercell_structure. Throwing out.
Error Message: Mapping could not be found from structure.
Total structures that match 25/31


#### Fit and check
**Note:** We now have 12 ECIs because we added an Ewald term.

In [7]:
# Fit the second expansion, then report coefficients, error and the number
# of coefficients with magnitude above 1e-3
ce2.fit()
ce2_ecis = ce2.ecis
ce2_nonzero = sum(1 for coef in ce2_ecis if np.abs(coef) > 1e-3)
print(f"ECIS: {ce2_ecis}")
print(f"RMSE: {ce2.root_mean_squared_error} eV/prim")
print(f"Number non zero ECIs: {ce2_nonzero}/{len(ce2_ecis)}")

ECIS: [-2.38315283e+01 -1.95956016e+00 -1.95956016e+00  1.27031258e-01
  1.78397878e-02 -3.45071381e-02 -2.21122794e-02 -2.33723559e-02
  4.05710781e-02 -1.75255495e-02 -1.02496234e-03  9.62584056e-02]
RMSE: 0.007473460838698365 eV/prim
Number non zero ECIs: 12/12


#### Previous functionality to constrain the dielectric constant
**Note:** The CE needs an Ewald term for this to make sense. In this case it makes the fit a lot worse. I copied this verbatim from the old code. I'm not entirely sure how people use this, so if you do, let me know.

In [8]:
# constrain_dielectric is already imported in the first cell, so it is not
# imported again here.
# The call modifies ce2 in place: in the recorded output the last (Ewald) ECI
# changes after this call.
# NOTE(review): the recorded RMSE after this call is identical to the
# unconstrained one even though the Ewald ECI changed. Confirm whether
# root_mean_squared_error is recomputed once the constraint is applied.
constrain_dielectric(ce2, 0.25)
print(f"ECIS: {ce2.ecis}")
print(f"RMSE: {ce2.root_mean_squared_error} eV/prim")
print(f"Number non zero ECIs: {len([eci for eci in ce2.ecis if np.abs(eci) > 1e-3])}/{len(ce2.ecis)}")

ECIS: [-2.38315283e+01 -1.95956016e+00 -1.95956016e+00  1.27031258e-01
  1.78397878e-02 -3.45071381e-02 -2.21122794e-02 -2.33723559e-02
  4.05710781e-02 -1.75255495e-02 -1.02496234e-03  4.00000000e+00]
RMSE: 0.007473460838698365 eV/prim
Number non zero ECIs: 12/12


## Creating all the objects by hand
#### First create a cluster subspace that defines the orbits and type of functions on them

In [9]:
# The ClusterSubspace represents the space of cluster functions over which
# the fitting is done; it is created here with the same settings used for ce2.
cs = ClusterSubspace.from_radii(
    structure=lno_prim,
    radii={2: 5, 3: 4.1},
    ltol=0.15,
    stol=0.2,
    angle_tol=5,
    supercell_size='O2-',
    basis='indicator',
    orthonormal=True,
)
print(cs)

ClusterBasis: [Prim Composition] Li+0.5 Ni3+0.5 Ni4+0.5 O2-2
    size: 1
    [Orbit] id: 1    bit_id: 1   multiplicity: 1    no. symops: 12   [Cluster] id: 1    Radius: 0.0   Centroid: [0. 0. 0.]         Points: [[0. 0. 0.]]         
    [Orbit] id: 2    bit_id: 2   multiplicity: 1    no. symops: 12   [Cluster] id: 2    Radius: 0.0   Centroid: [0.5 0.5 0.5]      Points: [[0.5 0.5 0.5]]      
    size: 2
    [Orbit] id: 3    bit_id: 3   multiplicity: 6    no. symops: 2    [Cluster] id: 3    Radius: 2.97  Centroid: [0.75 0.25 0.75]   Points: [[1.  0.  1. ]  [0.5 0.5 0.5]]            
    [Orbit] id: 4    bit_id: 4   multiplicity: 3    no. symops: 4    [Cluster] id: 9    Radius: 2.97  Centroid: [0.5 0.5 0. ]      Points: [[1. 0. 0.]  [0. 1. 0.]]                  
    [Orbit] id: 5    bit_id: 5   multiplicity: 3    no. symops: 4    [Cluster] id: 12   Radius: 2.97  Centroid: [0.  0.5 0.5]      Points: [[-0.5  0.5  0.5]  [ 0.5  0.5  0.5]]      
    [Orbit] id: 6    bit_id: 6   multiplicity: 

#### We can use the ClusterSubspace to compute correlation vectors

In [10]:
# Take the structure of the second fitting entry as an example and compute
# its correlation vector in the subspace
test_struct = lno_data[1][0]
corr_vector = cs.corr_from_structure(test_struct)
print(f'Correlation vector for {test_struct.composition} is\n'
      f'{corr_vector}')

Correlation vector for Li+1 Ni4+5 Ni3+1 O2-12 is
[ 1.         -0.66666667 -0.66666667  0.44444444  0.55555556  0.55555556
  0.55555556 -0.44444444 -0.44444444 -0.66666667 -0.66666667]


#### And if we add an Ewald Term, it will be computed as the last entry

In [11]:
# Attach an Ewald term to the subspace; its value then shows up as the last
# entry of the correlation vector.
# NOTE(review): this modifies cs in place, so re-running the cell may register
# the term a second time. Confirm how add_external_term handles that.
cs.add_external_term(EwaldTerm)
corr_with_ewald = cs.corr_from_structure(test_struct)
print(f'Correlation vector for {test_struct.composition} is\n'
      f'{corr_with_ewald}')

Correlation vector for Li+1 Ni4+5 Ni3+1 O2-12 is
[   1.           -0.66666667   -0.66666667    0.44444444    0.55555556
    0.55555556    0.55555556   -0.44444444   -0.44444444   -0.66666667
   -0.66666667 -116.41651779]


#### A Structure Wrangler is used to load data, check that it matches the prim structure, and obtain a feature matrix (we can also filter inputs)

In [12]:
# A StructureWrangler holds the data that match the prim
sw = StructureWrangler(cluster_subspace=cs)

# Add the data, weighting each energy by its energy above the hull with T=2000
hull_weights = ['hull', {'temperature': 2000}]
sw.add_data(lno_data, weights=hull_weights, verbose=True)

n_matched_sw = len(sw.structures)
print(f'Total structures that match {n_matched_sw}/{len(lno_data)}')

# Filter the structures by their Ewald (electrostatic) energy, cutoff value 2.
# NOTE(review): the earlier comment said "low electrostatic energies" while the
# message below says "max ewald". Confirm which side of the cutoff is dropped.
print('Filtering by max ewald')
sw.filter_by_ewald(2, verbose=True)

Unable to match Ni4+6 O2-12 with energy -188.28833 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+2 Ni4+4 Ni3+2 O2-12 with energy -200.13866 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+2 Ni3+2 Ni4+4 O2-12 with energy -200.42049 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+3 Ni4+4 Ni2+1 Ni3+1 O2-12 with energy -206.70884 to supercell_structure. Throwing out.
Error Message: Supercell could not be found from structure.
Unable to match Li+3 Ni3+3 Ni4+3 O2-12 with energy -206.85997 to supercell_structure. Throwing out.
Error Message: Mapping could not be found from structure.
Unable to match Li+5 Ni4+1 Ni3+5 O2-12 with energy -218.82429 to supercell_structure. Throwing out.
Error Message: Mapping could not be found from structure.
Total structures that match 25/31
Filtering by ma

#### Finally create and fit a Cluster Expansion

In [13]:
# Create an estimator.
# CVXEstimator is the old L1regs estimator (suggestions for a better name are
# welcome); it is imported in the first cell together with the other imports.
# If no estimator is provided, this one is used as the default.
# NOTE(review): `est` is created here but never handed to the expansion below,
# so the fit relies on that default. Pass it explicitly if
# from_structure_wrangler accepts an estimator (TODO confirm signature).
est = CVXEstimator()

# Refresh the features so that they include the Ewald term added to the subspace.
# NOTE(review): in this notebook's cell order the term is added before the
# StructureWrangler is created, so this refresh may be redundant. Confirm.
sw.update_features()
# Create the CE from the wrangler
ce3 = ClusterExpansion.from_structure_wrangler(sw)
# Fit the CE
ce3.fit()

# Report coefficients, error and the number of coefficients above 1e-3
print(f"ECIS: {ce3.ecis}")
print(f"RMSE: {ce3.root_mean_squared_error} eV/prim")
print(f"Number non zero ECIs: {len([eci for eci in ce3.ecis if np.abs(eci) > 1e-3])}/{len(ce3.ecis)}")

ECIS: [-2.24686567e+01 -2.01540145e+00 -2.01540145e+00  1.47112693e-01
  3.33945950e-03 -4.01697167e-02 -1.71313669e-02 -1.93950446e-02
  3.62249602e-02 -1.55654190e-02 -1.46266269e-03  1.08601243e-01]
RMSE: 0.007531936125740539 eV/prim
Number non zero ECIs: 12/12


