# Translating a Cluster Expansion created with `pyabinitio.cluster_expansion`

In [1]:
import numpy as np
from monty.serialization import loadfn
from pyabinitio.cluster_expansion.eci_fit import EciGenerator
from smol.cofe import ClusterSubspace, StructureWrangler, ClusterExpansion
from smol.cofe.extern import EwaldTerm

MaterialsProjectCompatibility will be updated with new correction classes as well as new values of corrections and uncertainties in 2020
  def get_pourbaix_entries(self, chemsys, solid_compat=MaterialsProjectCompatibility()):


### 1) Loading and extracting information from a pyabinitio `EciGenerator`

Cluster expansions created with the `pyabinitio.cluster_expanstion` module,
can be very easily translated into `smol`.

The only thing you will need is the `EciGenerator` object
you created.

In [3]:
# load a ecigenerator object you previously created
file_path = 'data/ecigenerator.mson'
ecigen = loadfn(file_path)

# Get the prim structure
prim = ecigen.ce.structure

print(f'This is the prim \n{prim}\n')

# Get cluster diameter cutoffs from previous CE object
cutoffs = {}
for size, clusters in ecigen.ce.clusters.items():
    if size == 1: continue
    cutoffs[size] = max([c.max_radius for c in clusters])

print(f'Cutoff radii obtained are {cutoffs}\n')

# determine if an ewald term is used
use_ewald = ecigen.ce.use_ewald
print(f'Cluster expansion used Ewald: {use_ewald}')

# Check the fit did not constrain dielectric
# In case you plan to refit the same way.
print(f'Was the dieletric constrained: '
      f'{ecigen.max_dielectric is not None}')

# obtain the structurematcher parameters used
# You should get these because the default values
# in smol are different then pyabinitio
# in smol the pymatgen defaults are used.
supercell_size = ecigen.ce.supercell_size
ltol = ecigen.ce.ltol
stol = ecigen.ce.stol
angle_tol = ecigen.ce.angle_tol

This is the prim 
Full Formula (Li0.5 Ni1 O2)
Reduced Formula: Li0.5Ni1O2
abc   :   2.969848   2.969848   5.143928
angles:  73.221350  73.221347  60.000002
Sites (4)
  #  SP                         a     b     c
---  ----------------------  ----  ----  ----
  0  Li+:0.500               0     0     0
  1  Ni3+:0.500, Ni4+:0.500  0.5   0.5   0.5
  2  O2-                     0.75  0.75  0.75
  3  O2-                     0.25  0.25  0.25

Cutoff radii obtained are {2: 4.199999454995874, 3: 5.939696563574085, 4: 2.9698482817870424}

Cluster expansion used Ewald: True
Was the dieletric constrained: False


### 2) Now create the corresponding `ClusterSubspace`

We can now create the corrseponding subspace using `smol`. Here you can choose to use a different basis if you want to refit to an orthogonal one. In this example we will keep the **indicator** basis to compare that our new features match the ones from before.

In [7]:
basis='indicator'
subspace = ClusterSubspace.from_cutoffs(prim,
                                        cutoffs=cutoffs,
                                        ltol=ltol,
                                        stol=stol,
                                        angle_tol=angle_tol,
                                        supercell_size=supercell_size,
                                        basis=basis)

if use_ewald:
    subspace.add_external_term(EwaldTerm())


# Check that the number of orbits and orderings matches
assert subspace.num_orbits == ecigen.ce.n_sclusters
assert subspace.num_corr_functions == ecigen.ce.n_bit_orderings

### 3) We can now create a `StructureWrangler` and add the structures used.

If you don't plan to refit or compare that features are the same
(if still using indicator basis), then you can skip to 4)

In [8]:
# create the wrangler
wrangler = StructureWrangler(subspace)

for item in ecigen.items:
    structure = item['structure']
    # giving it the matrix will make things
    # faster but it should work either way
    # unless you didnt use the same StuctureMatcher options
    scmatrix = item['supercell'].supercell_matrix
    energy = item['energy']
    weight = item['weight']
    wrangler.add_data(structure,
                      properties={'energy': energy},
                      weights={'whatever_you_used': weight},
                      supercell_matrix=scmatrix,
                      verbose=True)

In [9]:
# If using indicator basis, check that feature matrices match
if basis == 'indicator':
    assert np.allclose(wrangler.feature_matrix,
                       ecigen.feature_matrix)

### 4) Creating new `ClusterExpansion` using previous fit

If you want to keep the previous fit you can just use the 
eci directly in the new cluster expansion
(the previous implementation in pyabinitio has the eci
as the eci * multiplicity so these are the coefficients)
Make sure you are using an indicator basis otherwise
it makes no sense to use the same eci

In [10]:
old_coefs = ecigen.ecis
expansion = ClusterExpansion(subspace,
                             coefficients=old_coefs,
                             feature_matrix=wrangler.feature_matrix)

# Compare a few predictions to make sure everything matches
ids = np.random.choice((len(wrangler.structures)), size=10)
for i in ids:
    struct = wrangler.structures[i]
    assert np.isclose(ecigen.structure_energy(struct),
                      expansion.predict(struct))

### 5) (Re)Fitting the `ClusterExpansion`

You can also refit the cluster expansion (this is useful
if you want to switch over to using an orthogonal basis)
Here we will just redo it with the original indicator basis
to compare with the old fit.

In [11]:
# the ecigenerators use the old l1regs by default
# so we will use the same for comparison
from theorytoolkit.regression import WDRLasso

estimator = WDRLasso()

# in this case it was an unweighted fit so
# adding the weights makes no difference, but
# if you used by e above hull or composition
# you need to do this to get a similar fit
estimator.fit(wrangler.feature_matrix,
              wrangler.get_property_vector('energy', True),
              sample_weight=wrangler.get_weights('whatever_you_used'))

new_coefs = estimator.coef_

expansion = ClusterExpansion(subspace,
                             coefficients=new_coefs,
                             feature_matrix=wrangler.feature_matrix)

# Lets check if our new fit is 
ids = np.random.choice((len(wrangler.structures)), size=10)
for i in ids:
    struct = wrangler.structures[i]
    print((ecigen.structure_energy(struct),
                      expansion.predict(struct)))

Consider using 3rd party estimators such as scikit learn.
  estimator = WDRLasso()


(-212.92970249164134, -212.929702396418)
(-212.14197927342383, -212.14197918191735)
(-206.95984156567107, -206.9598414710193)
(-212.75037649702597, -212.75037640515274)
(-207.1096266240102, -207.1096265250885)
(-212.75037649702597, -212.75037640515274)
(-207.29469231924628, -207.2946922217547)
(-200.62910151985614, -200.62910142172657)
(-194.51464229296096, -194.51464218954325)
(-218.95642689474653, -218.95642680173313)


### Now you are ready to use your cluster expansion from before in `smol`!