Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RegressionData dataclass #132

Merged
merged 7 commits into from
Jul 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
Use this section to keep track of changes in the works.
### Added
* `rotate` method in SiteBasis class.
[\#130](https://github.com/CederGroupHub/smol/pull/130)
([lbluque](https://github.com/lbluque))

### Changed

Expand All @@ -16,6 +18,9 @@ Use this section to keep track of changes in the works.
`Structure.sites_in_sphere` return value.
[\#125](https://github.com/CederGroupHub/smol/pull/125)
([lbluque](https://github.com/lbluque))
* Fix saving bit_combos in `Orbit.as_dict` when pruning has been done.
[\#130](https://github.com/CederGroupHub/smol/pull/130)
([qchempku2017](https://github.com/qchempku2017))

## [v1.0.1](https://github.com/CederGroupHub/smol/tree/v1.0.1) (2021-03-03)
#### [Full Changelog](https://github.com/CederGroupHub/smol/compare/v1.0.0...v1.0.1)
Expand Down
5 changes: 3 additions & 2 deletions smol/cofe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"""

from .space.clusterspace import ClusterSubspace
from .expansion import ClusterExpansion
from .expansion import ClusterExpansion, RegressionData
from smol.cofe.wrangling.wrangler import (StructureWrangler)

__all__ = ['ClusterSubspace', 'StructureWrangler', 'ClusterExpansion']
__all__ = ['ClusterSubspace', 'StructureWrangler', 'ClusterExpansion',
'RegressionData']
94 changes: 77 additions & 17 deletions smol/cofe/expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,50 @@

__author__ = "Luis Barroso-Luque"

from copy import deepcopy
from dataclasses import dataclass, asdict
import numpy as np
from monty.json import MSONable
from monty.json import MSONable, jsanitize
from smol.cofe.space.clusterspace import ClusterSubspace


@dataclass
class RegressionData:
    """Dataclass used to store regression model details.

    This class is used to store the details used in fitting a cluster
    expansion for future reference and good provenance practices. It is
    highly recommended to initialize :class:`ClusterExpansion` objects with
    this class.

    Attributes:
        module (str):
            module name of the estimator used in the fit.
        class_name (str):
            class name of the estimator used in the fit.
        parameters (dict):
            parameters of the estimator used in the fit.
        feature_matrix (ndarray):
            feature matrix used in fit.
        property_vector (ndarray):
            target property vector used in fit.
    """

    module: str
    class_name: str
    parameters: dict
    feature_matrix: np.ndarray
    property_vector: np.ndarray

    def __eq__(self, other):
        """Compare two RegressionData objects field by field.

        The equality generated by ``dataclass`` compares the fields as a
        tuple, which raises a ValueError for distinct multi-element ndarrays
        (ambiguous truth value). Arrays are compared with np.array_equal
        instead. Defining __eq__ without __hash__ keeps instances unhashable,
        as they are with the generated equality.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (self.module == other.module
                and self.class_name == other.class_name
                and self.parameters == other.parameters
                and np.array_equal(self.feature_matrix, other.feature_matrix)
                and np.array_equal(self.property_vector,
                                   other.property_vector))

    @classmethod
    def from_sklearn(cls, estimator, feature_matrix, property_vector):
        """Create a RegressionData object from sklearn estimator.

        Args:
            estimator (object):
                scikit-learn estimator class or derived. Only its
                ``__module__``, class name and ``get_params()`` are read.
            feature_matrix (ndarray):
                feature matrix used in fit.
            property_vector (ndarray):
                target property vector used in fit.
        Returns:
            RegressionData
        """
        return cls(module=estimator.__module__,
                   class_name=estimator.__class__.__name__,
                   parameters=estimator.get_params(),
                   feature_matrix=feature_matrix,
                   property_vector=property_vector)


class ClusterExpansion(MSONable):
"""Class for the ClusterExpansion proper.

Expand Down Expand Up @@ -44,7 +83,7 @@ class ClusterExpansion(MSONable):
expansion. i.e. if it was pruned, any error metrics etc.
"""

def __init__(self, cluster_subspace, coefficients, feature_matrix=None):
def __init__(self, cluster_subspace, coefficients, regression_data=None):
r"""Initialize a ClusterExpansion.

Args:
Expand All @@ -57,26 +96,30 @@ def __init__(self, cluster_subspace, coefficients, feature_matrix=None):
coefficients to the correlation vector terms (length and order)
These correspond to the
ECI x the multiplicity of orbit x multiplicity of bit ordering
feature_matrix (ndarray): optional
The feature matrix used in fitting the given coefficients.
Useful to report metrics when printing and numerically
converting eci to another basis.
regression_data (RegressionData): optional
RegressionData object with details used in the fit of the
corresponding expansion. The feature_matrix attribute here is
necessary to compute things like numerical ECI transformations
for different bases.
"""
if feature_matrix is not None and \
len(coefficients) != feature_matrix.shape[1]:
if regression_data is not None and \
len(coefficients) != regression_data.feature_matrix.shape[1]:
raise AttributeError(
f"Feature matrix shape {feature_matrix.shape} does not match "
f"the number of coefficients {len(coefficients)}.")
f"Feature matrix shape {regression_data.feature_matrix.shape} "
f"does not match the number of coefficients "
f"{len(coefficients)}.")

if len(coefficients) != len(cluster_subspace):
raise AttributeError(
f"The size of the give subspace {len(cluster_subspace)} does "
f"not match the number of coefficients {len(coefficients)}")

self.coefs = coefficients
self.metadata = {}
self.regression_data = regression_data
self._subspace = cluster_subspace
self._feat_matrix = feature_matrix
# make copy for possible changes/pruning
self._feat_matrix = regression_data.feature_matrix.copy() \
if regression_data is not None else None
self._eci = None

@property
Expand Down Expand Up @@ -250,11 +293,23 @@ def __str__(self):
@classmethod
def from_dict(cls, d):
    """Create ClusterExpansion from serialized MSONable dict.

    Args:
        d (dict):
            dictionary with 'cluster_subspace' and 'coefs' entries, and
            optionally 'regression_data' and 'feature_matrix' entries.
            A missing optional entry is treated the same as None.
    Returns:
        ClusterExpansion
    """
    reg_dict = d.get('regression_data')
    if reg_dict is not None:
        # copy only the regression sub-dict so the input dict is not mutated
        reg_kwargs = deepcopy(reg_dict)
        reg_kwargs['feature_matrix'] = np.array(reg_dict['feature_matrix'])
        reg_kwargs['property_vector'] = np.array(
            reg_dict['property_vector'])
        reg_data = RegressionData(**reg_kwargs)
    else:
        reg_data = None

    ce = cls(ClusterSubspace.from_dict(d['cluster_subspace']),
             coefficients=np.array(d['coefs']),
             regression_data=reg_data)

    # update copy of feature matrix to keep any changes; this must be set
    # on the new instance, setting it on cls would make it a class attribute
    # shared by every ClusterExpansion
    feature_matrix = d.get('feature_matrix')
    if feature_matrix is not None:
        ce._feat_matrix = np.array(feature_matrix)
    return ce

def as_dict(self):
Expand All @@ -268,10 +323,15 @@ def as_dict(self):
feature_matrix = self._feat_matrix.tolist()
else:
feature_matrix = self._feat_matrix
if self.regression_data is not None:
reg_data = jsanitize(asdict(self.regression_data))
else:
reg_data = None

d = {'@module': self.__class__.__module__,
'@class': self.__class__.__name__,
'cluster_subspace': self.cluster_subspace.as_dict(),
'coefs': self.coefs.tolist(),
'feature_matrix': feature_matrix,
'metadata': self.metadata}
'regression_data': reg_data,
'feature_matrix': feature_matrix}
return d
39 changes: 33 additions & 6 deletions tests/test_cofe/test_expansion.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,27 @@
import unittest
import json
import numpy as np
from smol.cofe import StructureWrangler, ClusterSubspace, ClusterExpansion
from sklearn.linear_model import LinearRegression
from smol.cofe import StructureWrangler, ClusterSubspace, ClusterExpansion, \
RegressionData
from smol.cofe.extern import EwaldTerm
from tests.data import synthetic_CE_binary, synthetic_CEewald_binary
from tests.utils import assert_msonable


def test_regression_data(cluster_subspace):
    """Check from_sklearn provenance fields and expansion serialization."""
    estimator = LinearRegression(fit_intercept=False)
    num_samples = np.random.randint(10, 100)
    features = np.random.random((num_samples, len(cluster_subspace)))
    targets = np.random.random(num_samples)
    reg_data = RegressionData.from_sklearn(
        estimator, feature_matrix=features, property_vector=targets)
    coefficients = np.random.random(len(cluster_subspace))
    expansion = ClusterExpansion(cluster_subspace, coefficients, reg_data)

    # provenance fields must mirror the estimator they were built from
    assert reg_data.class_name == estimator.__class__.__name__
    assert reg_data.module == estimator.__module__
    assert reg_data.parameters == estimator.get_params()
    assert_msonable(expansion)


# TODO add tests with synthetic ternary dataset
Expand All @@ -29,7 +47,11 @@ def setUp(self) -> None:
coefs = np.linalg.lstsq(self.sw.feature_matrix,
self.sw.get_property_vector('energy', True),
rcond=None)[0]
self.ce = ClusterExpansion(cs, coefs, self.sw.feature_matrix)
reg = LinearRegression(fit_intercept=False)
reg_data = RegressionData.from_sklearn(
reg, feature_matrix=self.sw.feature_matrix,
property_vector=self.sw.get_property_vector('energy'))
self.ce = ClusterExpansion(cs, coefs, reg_data)

def test_predict_train(self):
preds = [self.ce.predict(s) for s in self.sw.structures]
Expand Down Expand Up @@ -61,7 +83,7 @@ def test_convert_eci(self):

def test_prune(self):
cs = ClusterSubspace.from_dict(synthetic_CE_binary['cluster_subspace'])
ce = ClusterExpansion(cs, self.ce.coefs.copy(), self.ce._feat_matrix)
ce = ClusterExpansion(cs, self.ce.coefs.copy(), self.ce.regression_data)
thresh = 8E-3
ce.prune(threshold=thresh)
ids = [i for i, coef in enumerate(self.ce.coefs) if abs(coef) >= thresh]
Expand Down Expand Up @@ -89,12 +111,13 @@ def test_print(self):
_ = str(self.ce)

def test_msonable(self):
self.ce.metadata['somethingimportant'] = 75
d = self.ce.as_dict()
ce1 = ClusterExpansion.from_dict(d)
self.assertTrue(np.array_equal(self.ce.coefs, ce1.coefs))
self.assertIsInstance(self.ce.cluster_subspace, ClusterSubspace)
self.assertEqual(ce1.metadata, self.ce.metadata)
# change this to just use assert_msonable
self.assertEqual(ce1.regression_data.module,
self.ce.regression_data.module)
j = json.dumps(d)
json.loads(j)

Expand Down Expand Up @@ -140,7 +163,11 @@ def _test_predictions(self, cs, data):
ecis = np.linalg.lstsq(sw.feature_matrix,
sw.get_property_vector('energy', True),
rcond=None)[0]
ce = ClusterExpansion(cs, ecis, sw.feature_matrix)
reg_data = RegressionData(
module='foo.bar', class_name='Estimator', parameters={'foo': 'bar'},
feature_matrix=sw.feature_matrix,
property_vector=sw.get_property_vector('energy'))
ce = ClusterExpansion(cs, ecis, reg_data)
test_structs = [data[i][0] for i in self.test_ids]
test_energies = np.array([data[i][1] for i in self.test_ids])
preds = [ce.predict(s) for s in sw.structures]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def setUpClass(cls) -> None:
cls.sw.add_data(struct, {'energy': energy})

coefs = np.ones(cls.cs.num_corr_functions)
cls.ce = ClusterExpansion(cls.cs, coefs, cls.sw.feature_matrix)
cls.ce = ClusterExpansion(cls.cs, coefs)
cls.pr = CEProcessor(cls.cs, 2 * np.eye(3), coefs)
cls.en = CanonicalEnsemble(cls.pr)
cls.file_path = './test_save_work.mson'
Expand Down
11 changes: 8 additions & 3 deletions tests/test_moca/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy.testing as npt
import numpy as np

from smol.cofe import ClusterExpansion
from smol.cofe import ClusterExpansion, RegressionData
from smol.cofe.extern import EwaldTerm
from smol.moca import (CanonicalEnsemble, MuSemiGrandEnsemble,
FuSemiGrandEnsemble, CompositeProcessor,
Expand Down Expand Up @@ -61,8 +61,13 @@ def test_from_cluster_expansion(cluster_subspace, ensemble_cls):
proc.add_processor(EwaldProcessor(cluster_subspace, scmatrix,
cluster_subspace.external_terms[0],
coefficient=coefs[-1]))
fake_feature_matrix = np.random.random((5, len(coefs)))
expansion = ClusterExpansion(cluster_subspace, coefs, fake_feature_matrix)
reg_data = RegressionData(
module='fake.module', class_name='Estimator',
feature_matrix=np.random.random((5, len(coefs))),
property_vector=np.random.random(len(coefs)),
parameters={'foo': 'bar'})
expansion = ClusterExpansion(cluster_subspace, coefs, reg_data)

if ensemble_cls is MuSemiGrandEnsemble:
kwargs = {'chemical_potentials':
{sp: 0.3 for space in proc.unique_site_spaces
Expand Down