CederGroupHub · lbluque · Jul 28, 2021 · Jun 16, 2021 · Jun 30, 2021 · Jul 27, 2021
diff --git a/CHANGES.md b/CHANGES.md
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 Use this section to keep track of changes in the works.
 ### Added
 * `rotate` method in SiteBasis class.
+[\#130](https://github.com/CederGroupHub/smol/pull/130)
+  ([lbluque](https://github.com/lbluque))
 
 ### Changed
 
@@ -16,6 +18,9 @@ Use this section to keep track of changes in the works.
   `Structure.sites_in_sphere` return value.
   [\#125](https://github.com/CederGroupHub/smol/pull/125)
   ([lbluque](https://github.com/lbluque))
+* Fix saving bit_combos in `Orbit.as_dict` when pruning has been done.
+[\#130](https://github.com/CederGroupHub/smol/pull/130)
+ ([qchempku2017](https://github.com/qchempku2017))
 
 ## [v1.0.1](https://github.com/CederGroupHub/smol/tree/v1.0.1) (2021-03-03)
 #### [Full Changelog](https://github.com/CederGroupHub/smol/compare/v1.0.0...v1.0.1)

diff --git a/smol/cofe/__init__.py b/smol/cofe/__init__.py
@@ -6,7 +6,8 @@
 """
 
 from .space.clusterspace import ClusterSubspace
-from .expansion import ClusterExpansion
+from .expansion import ClusterExpansion, RegressionData
 from smol.cofe.wrangling.wrangler import (StructureWrangler)
 
-__all__ = ['ClusterSubspace', 'StructureWrangler', 'ClusterExpansion']
+__all__ = ['ClusterSubspace', 'StructureWrangler', 'ClusterExpansion',
+           'RegressionData']
diff --git a/smol/cofe/expansion.py b/smol/cofe/expansion.py
@@ -12,11 +12,50 @@
 
 __author__ = "Luis Barroso-Luque"
 
+from copy import deepcopy
+from dataclasses import dataclass, asdict
 import numpy as np
-from monty.json import MSONable
+from monty.json import MSONable, jsanitize
 from smol.cofe.space.clusterspace import ClusterSubspace
 
 
+@dataclass
+class RegressionData:
+    """Dataclass used to store regression model details.
+
+    This class is used to store the details used in fitting a cluster expansion
+    for future reference and good provenance practices. It is highly
+    recommended to initialize :class:`ClusterExpansion` objects with this
+    class.
+    """
+
+    module: str  # module of the estimator used in the fit
+    class_name: str  # class name of the estimator used in the fit
+    parameters: dict  # estimator parameters (e.g. from get_params())
+    feature_matrix: np.ndarray  # feature matrix used in the fit
+    property_vector: np.ndarray  # target property vector used in the fit
+
+    @classmethod
+    def from_sklearn(cls, estimator, feature_matrix, property_vector):
+        """Create a RegressionData object from sklearn estimator.
+
+        Args:
+            estimator (object):
+                scikit-learn estimator instance (or derived class instance).
+            feature_matrix (ndarray):
+                feature matrix used in fit.
+            property_vector (ndarray):
+                target property vector used in fit.
+        Returns:
+            RegressionData
+        """
+        return cls(module=estimator.__module__,
+                   class_name=estimator.__class__.__name__,
+                   parameters=estimator.get_params(),
+                   feature_matrix=feature_matrix,
+                   property_vector=property_vector)
+
+
 class ClusterExpansion(MSONable):
     """Class for the ClusterExpansion proper.
 
@@ -44,7 +83,7 @@ class ClusterExpansion(MSONable):
             expansion. i.e. if it was pruned, any error metrics etc.
     """
 
-    def __init__(self, cluster_subspace, coefficients, feature_matrix=None):
+    def __init__(self, cluster_subspace, coefficients, regression_data=None):
         r"""Initialize a ClusterExpansion.
 
         Args:
@@ -57,26 +96,30 @@ def __init__(self, cluster_subspace, coefficients, feature_matrix=None):
                 coefficients to the correlation vector terms (length and order)
                 These correspond to the
                 ECI x the multiplicity of orbit x multiplicity of bit ordering
-            feature_matrix (ndarray): optional
-                The feature matrix used in fitting the given coefficients.
-                Useful to report metrics when printing and numerically
-                converting eci to another basis.
+            regression_data (RegressionData): optional
+                RegressionData object with details used in the fit of the
+                corresponding expansion. The feature_matrix attribute here is
+                necessary to compute things like numerical ECI transformations
+                for different bases.
         """
-        if feature_matrix is not None and \
-                len(coefficients) != feature_matrix.shape[1]:
+        if regression_data is not None and \
+                len(coefficients) != regression_data.feature_matrix.shape[1]:
             raise AttributeError(
-                f"Feature matrix shape {feature_matrix.shape} does not match "
-                f"the number of coefficients {len(coefficients)}.")
+                f"Feature matrix shape {regression_data.feature_matrix.shape} "
+                f"does not match the number of coefficients "
+                f"{len(coefficients)}.")
 
         if len(coefficients) != len(cluster_subspace):
             raise AttributeError(
                 f"The size of the give subspace {len(cluster_subspace)} does "
                 f"not match the number of coefficients {len(coefficients)}")
 
         self.coefs = coefficients
-        self.metadata = {}
+        self.regression_data = regression_data
         self._subspace = cluster_subspace
-        self._feat_matrix = feature_matrix
+        # make copy for possible changes/pruning
+        self._feat_matrix = regression_data.feature_matrix.copy() \
+            if regression_data is not None else None
         self._eci = None
 
     @property
@@ -250,11 +293,23 @@ def __str__(self):
     @classmethod
     def from_dict(cls, d):
         """Create ClusterExpansion from serialized MSONable dict."""
+        if d.get('regression_data') is not None:
+            dd = deepcopy(d)
+            dd['regression_data']['feature_matrix'] = np.array(
+                d['regression_data']['feature_matrix'])
+            dd['regression_data']['property_vector'] = np.array(
+                d['regression_data']['property_vector'])
+            reg_data = RegressionData(**dd['regression_data'])
+        else:
+            reg_data = None
+
         ce = cls(ClusterSubspace.from_dict(d['cluster_subspace']),
                  coefficients=np.array(d['coefs']),
-                 feature_matrix=np.array(d['feature_matrix'])
-                 if d['feature_matrix'] is not None else d['feature_matrix'])
-        ce.metadata = d['metadata']
+                 regression_data=reg_data)
+
+        # update the instance's copy of the feature matrix to keep any changes
+        if d.get('feature_matrix') is not None:
+            ce._feat_matrix = np.array(d['feature_matrix'])
         return ce
 
     def as_dict(self):
@@ -268,10 +323,15 @@ def as_dict(self):
             feature_matrix = self._feat_matrix.tolist()
         else:
             feature_matrix = self._feat_matrix
+        if self.regression_data is not None:
+            reg_data = jsanitize(asdict(self.regression_data))
+        else:
+            reg_data = None
+
         d = {'@module': self.__class__.__module__,
              '@class': self.__class__.__name__,
              'cluster_subspace': self.cluster_subspace.as_dict(),
              'coefs': self.coefs.tolist(),
-             'feature_matrix': feature_matrix,
-             'metadata': self.metadata}
+             'regression_data': reg_data,
+             'feature_matrix': feature_matrix}
         return d
diff --git a/tests/test_cofe/test_expansion.py b/tests/test_cofe/test_expansion.py
@@ -1,9 +1,27 @@
 import unittest
 import json
 import numpy as np
-from smol.cofe import StructureWrangler, ClusterSubspace, ClusterExpansion
+from sklearn.linear_model import LinearRegression
+from smol.cofe import StructureWrangler, ClusterSubspace, ClusterExpansion, \
+    RegressionData
 from smol.cofe.extern import EwaldTerm
 from tests.data import synthetic_CE_binary, synthetic_CEewald_binary
+from tests.utils import assert_msonable
+
+
+def test_regression_data(cluster_subspace):
+    reg = LinearRegression(fit_intercept=False)  # only its metadata is read
+    n = np.random.randint(10, 100)  # random number of fake samples
+    feat_matrix = np.random.random((n, len(cluster_subspace)))
+    prop_vec = np.random.random(n)
+    reg_data = RegressionData.from_sklearn(reg, feature_matrix=feat_matrix,
+                                           property_vector=prop_vec)
+    coeffs = np.random.random(len(cluster_subspace))
+    expansion = ClusterExpansion(cluster_subspace, coeffs, reg_data)
+    assert reg_data.class_name == reg.__class__.__name__
+    assert reg_data.module == reg.__module__
+    assert reg_data.parameters == reg.get_params()
+    assert_msonable(expansion)
 
 
 # TODO add tests with synthetic ternary dataset
@@ -29,7 +47,11 @@ def setUp(self) -> None:
         coefs = np.linalg.lstsq(self.sw.feature_matrix,
                                 self.sw.get_property_vector('energy', True),
                                 rcond=None)[0]
-        self.ce = ClusterExpansion(cs, coefs, self.sw.feature_matrix)
+        reg = LinearRegression(fit_intercept=False)
+        reg_data = RegressionData.from_sklearn(
+            reg, feature_matrix=self.sw.feature_matrix,
+            property_vector=self.sw.get_property_vector('energy'))
+        self.ce = ClusterExpansion(cs, coefs, reg_data)
 
     def test_predict_train(self):
         preds = [self.ce.predict(s) for s in self.sw.structures]
@@ -61,7 +83,7 @@ def test_convert_eci(self):
 
     def test_prune(self):
         cs = ClusterSubspace.from_dict(synthetic_CE_binary['cluster_subspace'])
-        ce = ClusterExpansion(cs, self.ce.coefs.copy(), self.ce._feat_matrix)
+        ce = ClusterExpansion(cs, self.ce.coefs.copy(), self.ce.regression_data)
         thresh = 8E-3
         ce.prune(threshold=thresh)
         ids = [i for i, coef in enumerate(self.ce.coefs) if abs(coef) >= thresh]
@@ -89,12 +111,13 @@ def test_print(self):
         _ = str(self.ce)
 
     def test_msonable(self):
-        self.ce.metadata['somethingimportant'] = 75
         d = self.ce.as_dict()
         ce1 = ClusterExpansion.from_dict(d)
         self.assertTrue(np.array_equal(self.ce.coefs, ce1.coefs))
         self.assertIsInstance(self.ce.cluster_subspace, ClusterSubspace)
-        self.assertEqual(ce1.metadata, self.ce.metadata)
+        # change this to just use assert_msonable
+        self.assertEqual(ce1.regression_data.module,
+                         self.ce.regression_data.module)
         j = json.dumps(d)
         json.loads(j)
 
@@ -140,7 +163,11 @@ def _test_predictions(self, cs, data):
         ecis = np.linalg.lstsq(sw.feature_matrix,
                                sw.get_property_vector('energy', True),
                                rcond=None)[0]
-        ce = ClusterExpansion(cs, ecis, sw.feature_matrix)
+        reg_data = RegressionData(
+            module='foo.bar', class_name='Estimator', parameters={'foo': 'bar'},
+            feature_matrix=sw.feature_matrix,
+            property_vector=sw.get_property_vector('energy'))
+        ce = ClusterExpansion(cs, ecis, reg_data)
         test_structs = [data[i][0] for i in self.test_ids]
         test_energies = np.array([data[i][1] for i in self.test_ids])
         preds = [ce.predict(s) for s in sw.structures]

diff --git a/tests/test_io.py b/tests/test_io.py
@@ -18,7 +18,7 @@ def setUpClass(cls) -> None:
             cls.sw.add_data(struct, {'energy': energy})
 
         coefs = np.ones(cls.cs.num_corr_functions)
-        cls.ce = ClusterExpansion(cls.cs, coefs, cls.sw.feature_matrix)
+        cls.ce = ClusterExpansion(cls.cs, coefs)
         cls.pr = CEProcessor(cls.cs, 2 * np.eye(3), coefs)
         cls.en = CanonicalEnsemble(cls.pr)
         cls.file_path = './test_save_work.mson'

diff --git a/tests/test_moca/test_ensemble.py b/tests/test_moca/test_ensemble.py
@@ -3,7 +3,7 @@
 import numpy.testing as npt
 import numpy as np
 
-from smol.cofe import ClusterExpansion
+from smol.cofe import ClusterExpansion, RegressionData
 from smol.cofe.extern import EwaldTerm
 from smol.moca import (CanonicalEnsemble, MuSemiGrandEnsemble,
                        FuSemiGrandEnsemble, CompositeProcessor,
@@ -61,8 +61,13 @@ def test_from_cluster_expansion(cluster_subspace, ensemble_cls):
     proc.add_processor(EwaldProcessor(cluster_subspace, scmatrix,
                        cluster_subspace.external_terms[0],
                                       coefficient=coefs[-1]))
-    fake_feature_matrix = np.random.random((5, len(coefs)))
-    expansion = ClusterExpansion(cluster_subspace, coefs, fake_feature_matrix)
+    reg_data = RegressionData(
+        module='fake.module', class_name='Estimator',
+        feature_matrix=np.random.random((5, len(coefs))),
+        property_vector=np.random.random(len(coefs)),
+        parameters={'foo': 'bar'})
+    expansion = ClusterExpansion(cluster_subspace, coefs, reg_data)
+
     if ensemble_cls is MuSemiGrandEnsemble:
         kwargs = {'chemical_potentials':
                   {sp: 0.3 for space in proc.unique_site_spaces