diff --git a/RELEASE.md b/RELEASE.md index 6ea6d4b..51828c7 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,14 @@ # Upcoming release +# Release 0.8.1 + +* Added `DAGClassifier` sklearn interface using the PyTorch NOTEARS implementation. Supports binary classification. +* Added support for binary-distributed data in PyTorch NOTEARS. +* Added a "distribution type" schema system for PyTorch NOTEARS (`pytorch.dist_type`). +* Renamed "data type" to "distribution type" in internal language. +* Fixed the uniform discretiser (`Discretiser(method='uniform')`) so that all bins have identical widths. +* Fixed and updated the sklearn tutorial in the docs. + # Release 0.8.0 * Add DYNOTEARS (`from_numpy_dynamic`, an algorithm for structure learning on Dynamic Bayesian Networks). @@ -52,6 +61,6 @@ The initial release of CausalNex. ## Thanks for supporting contributions CausalNex was originally designed by [Paul Beaumont](https://www.linkedin.com/in/pbeaumont/) and [Ben Horsburgh](https://www.linkedin.com/in/benhorsburgh/) to solve challenges they faced in inferencing causality in their project work. This work was later turned into a product thanks to the following contributors: -[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), and [Zain Patel](https://www.linkedin.com/in/zain-patel/). 
+[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), [Zain Patel](https://www.linkedin.com/in/zain-patel/), and [Shuhei Ishida](https://www.linkedin.com/in/shuhei-i/). CausalNex would also not be possible without the generous sharing from leading researches in the field of causal inference and we are grateful to everyone who advised and supported us, filed issues or helped resolve them, asked and answered questions or simply be part of inspiring discussions. diff --git a/causalnex/__init__.py b/causalnex/__init__.py index 01312ed..c3b0fb1 100644 --- a/causalnex/__init__.py +++ b/causalnex/__init__.py @@ -30,6 +30,6 @@ causalnex toolkit for causal reasoning (Bayesian Networks / Inference) """ -__version__ = "0.8.0" +__version__ = "0.8.1" __all__ = ["structure", "discretiser", "evaluation", "inference", "network", "plots"] diff --git a/causalnex/discretiser/discretiser.py b/causalnex/discretiser/discretiser.py index 763a38e..c3e26f8 100644 --- a/causalnex/discretiser/discretiser.py +++ b/causalnex/discretiser/discretiser.py @@ -174,10 +174,9 @@ def fit(self, data: np.ndarray) -> "Discretiser": x.sort() if self.method == "uniform": - bucket_width = len(x) / self.num_buckets + bucket_width = (np.max(x) - np.min(x)) / self.num_buckets self.numeric_split_points = [ - x[int(np.floor((n + 1) * bucket_width))] - for n in range(self.num_buckets - 1) + np.min(x) + bucket_width * (n + 1) for n in range(self.num_buckets - 1) ] elif self.method == "quantile": diff --git a/causalnex/structure/__init__.py 
b/causalnex/structure/__init__.py index 3651f9a..7699e22 100644 --- a/causalnex/structure/__init__.py +++ b/causalnex/structure/__init__.py @@ -30,7 +30,14 @@ ``causalnex.structure`` provides functionality to define or learn structure. """ -__all__ = ["StructureModel", "notears", "dynotears", "data_generators", "DAGRegressor"] +__all__ = [ + "StructureModel", + "notears", + "dynotears", + "data_generators", + "DAGRegressor", + "DAGClassifier", +] -from .sklearn import DAGRegressor +from .pytorch import DAGClassifier, DAGRegressor from .structuremodel import StructureModel diff --git a/causalnex/structure/dynotears.py b/causalnex/structure/dynotears.py index e7d1202..cee0115 100644 --- a/causalnex/structure/dynotears.py +++ b/causalnex/structure/dynotears.py @@ -39,8 +39,7 @@ import scipy.optimize as sopt from causalnex.structure import StructureModel - -from .transformers import DynamicDataTransformer +from causalnex.structure.transformers import DynamicDataTransformer def from_pandas_dynamic( # pylint: disable=too-many-arguments diff --git a/causalnex/structure/pytorch/__init__.py b/causalnex/structure/pytorch/__init__.py index a17bd56..cf44b77 100644 --- a/causalnex/structure/pytorch/__init__.py +++ b/causalnex/structure/pytorch/__init__.py @@ -30,7 +30,8 @@ ``causalnex.structure.pytorch`` provides functionality to define or learn structure using pytorch. 
""" -__all__ = ["from_numpy", "from_pandas", "NotearsMLP"] +__all__ = ["from_numpy", "from_pandas", "NotearsMLP", "DAGRegressor", "DAGClassifier"] from .core import NotearsMLP from .notears import from_numpy, from_pandas +from .sklearn import DAGClassifier, DAGRegressor diff --git a/causalnex/structure/pytorch/core.py b/causalnex/structure/pytorch/core.py index 58564d9..87f9893 100644 --- a/causalnex/structure/pytorch/core.py +++ b/causalnex/structure/pytorch/core.py @@ -45,7 +45,8 @@ import torch.nn as nn from sklearn.base import BaseEstimator -from .nonlinear import LocallyConnected +from causalnex.structure.pytorch.dist_type._base import DistTypeBase +from causalnex.structure.pytorch.nonlinear import LocallyConnected class NotearsMLP(nn.Module, BaseEstimator): @@ -56,9 +57,11 @@ class NotearsMLP(nn.Module, BaseEstimator): loc_lin_layer weights are the weight of hidden layers after the first fully connected layer """ + # pylint: disable=too-many-arguments def __init__( self, n_features: int, + dist_types: List[DistTypeBase], use_bias: bool = False, hidden_layer_units: Iterable[int] = (0,), bounds: List[Tuple[int, int]] = None, @@ -70,7 +73,8 @@ def __init__( Constructor for NOTEARS MLP class. Args: - n_features: number of input features + n_features: number of input features. + dist_types: list of data type objects used to fit the NOTEARS algorithm. use_bias: True to add the intercept to the model hidden_layer_units: An iterable where its length determine the number of layers used, and the numbers determine the number of nodes used for the layer in order. 
@@ -116,6 +120,8 @@ def __init__( # set the bounds as an attribute on the weights object self.dag_layer.weight.bounds = bounds + # set the dist types + self.dist_types = dist_types # type the adjacency matrix self.adj = None self.adj_mean_effect = None @@ -175,6 +181,31 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # [n, d] -> [n, d] x = x.squeeze(dim=2) # [n, d] return x + def reconstruct_data(self, X: np.ndarray) -> np.ndarray: + """ + Performs X_hat reconstruction, + then converts latent space to original data space via link function. + + Args: + X: input data used to reconstruct + + Returns: + reconstructed data + """ + + with torch.no_grad(): + # convert the predict data to pytorch tensor + X = torch.from_numpy(X).float().to(self.device) + + # perform forward reconstruction + X_hat = self(X) + + # recover each one of the latent space projections + for dist_type in self.dist_types: + X_hat = dist_type.inverse_link_function(X_hat) + + return np.asarray(X_hat.cpu().detach().numpy().astype(np.float64)) + @property def bias(self) -> Union[np.ndarray, None]: """ @@ -334,7 +365,12 @@ def _func(flat_params: np.ndarray) -> Tuple[float, np.ndarray]: X_hat = self(X) h_val = self._h_func() - loss = (0.5 / X.shape[0]) * torch.sum((X_hat - X) ** 2) + # preallocate loss tensor + loss = torch.tensor(0, device=X.device) # pylint: disable=not-callable + # sum the losses across all dist types + for dist_type in self.dist_types: + loss = loss + dist_type.loss(X, X_hat) + lagrange_penalty = 0.5 * rho * h_val * h_val + alpha * h_val # NOTE: both the l2 and l1 regularization are NOT applied to the bias parameters l2_reg = 0.5 * self.ridge_beta * self._l2_reg(n_features) diff --git a/causalnex/structure/pytorch/dist_type/__init__.py b/causalnex/structure/pytorch/dist_type/__init__.py new file mode 100644 index 0000000..02c0b8e --- /dev/null +++ b/causalnex/structure/pytorch/dist_type/__init__.py @@ -0,0 +1,45 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +``causalnex.pytorch.dist_type`` provides distribution type support classes for the pytorch NOTEARS algorithm. 
+""" + +from .binary import DistTypeBinary +from .continuous import DistTypeContinuous + +dist_type_aliases = { + "bin": DistTypeBinary, + "cont": DistTypeContinuous, +} + + +__all__ = [ + "DistTypeBinary", + "DistTypeContinuous", +] diff --git a/causalnex/structure/pytorch/dist_type/_base.py b/causalnex/structure/pytorch/dist_type/_base.py new file mode 100644 index 0000000..ec61400 --- /dev/null +++ b/causalnex/structure/pytorch/dist_type/_base.py @@ -0,0 +1,79 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +``causalnex.pytorch.dist_type._base`` defines the distribution type class interface and default behavior. +""" + +from abc import ABCMeta, abstractmethod + +import torch + + +class DistTypeBase(metaclass=ABCMeta): + """ Base class defining the distribution default behavior and interface """ + + def __init__(self, idx: int): + """ + Default constructor for the DistTypeBase class. + Unless overridden, provides default behavior to all subclasses. + + Args: + idx: Positional index in data passed to the NOTEARS algorithm + which corresponds to this distribution type. + """ + self.idx = idx + + @abstractmethod + def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor: + """ + Args: + X: The original data passed into NOTEARS (i.e. the reconstruction target). + + X_hat: The reconstructed data. + + Returns: + Scalar pytorch tensor of the reconstruction loss between X and X_hat. + """ + raise NotImplementedError("Must implement the loss() method") + + @abstractmethod + def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor: + """ + Convert the transformed data from the latent space to the original dist_type space + using the inverse link function. + + Args: + X_hat: Reconstructed data in the latent space. + + Returns: + Modified X_hat. + MUST be same shape as passed in data. + Projects the self.idx column from the latent space to the dist_type space. + """ + raise NotImplementedError("Must implement the inverse_link_function() method") diff --git a/causalnex/structure/pytorch/dist_type/binary.py b/causalnex/structure/pytorch/dist_type/binary.py new file mode 100644 index 0000000..0dfa293 --- /dev/null +++ b/causalnex/structure/pytorch/dist_type/binary.py @@ -0,0 +1,77 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +``causalnex.pytorch.dist_type.binary`` defines the binary distribution type. +""" + +import torch +import torch.nn as nn + +from causalnex.structure.pytorch.dist_type._base import DistTypeBase + + +class DistTypeBinary(DistTypeBase): + """ Class defining binary distribution type functionality """ + + def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor: + """ + https://pytorch.org/docs/stable/nn.html#torch.nn.BCEWithLogitsLoss + Uses the functional implementation of the BCEWithLogitsLoss class. + + The average logit binary cross entropy loss. + Averages across sample dimension (dim=0). + + Args: + X: The original data passed into NOTEARS (i.e. the reconstruction target). + + X_hat: The reconstructed data. 
+ + Returns: + Scalar pytorch tensor of the reconstruction loss between X and X_hat. + """ + return nn.functional.binary_cross_entropy_with_logits( + input=X_hat[:, self.idx], + target=X[:, self.idx], + reduction="mean", + ) + + def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor: + """ + Inverse-logit (sigmoid) inverse link function for binary data. + + Args: + X_hat: Reconstructed data in the latent space. + + Returns: + Modified X_hat. + MUST be same shape as passed in data. + Projects the self.idx column from the latent space to the dist_type space. + """ + X_hat[:, self.idx] = torch.sigmoid(X_hat[:, self.idx]) + return X_hat diff --git a/causalnex/structure/pytorch/dist_type/continuous.py b/causalnex/structure/pytorch/dist_type/continuous.py new file mode 100644 index 0000000..7aab8d0 --- /dev/null +++ b/causalnex/structure/pytorch/dist_type/continuous.py @@ -0,0 +1,70 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +``causalnex.pytorch.dist_type.continuous`` defines the continuous distribution type. +""" + +import torch + +from causalnex.structure.pytorch.dist_type._base import DistTypeBase + + +class DistTypeContinuous(DistTypeBase): + """ Class defining continuous distribution type functionality """ + + def loss(self, X: torch.Tensor, X_hat: torch.Tensor) -> torch.Tensor: + """ + The average Gaussian loss. + + Args: + X: The original data passed into NOTEARS (i.e. the reconstruction target). + + X_hat: The reconstructed data. + + Returns: + Scalar pytorch tensor of the reconstruction loss between X and X_hat. + """ + + return (0.5 / X.shape[0]) * torch.sum( + (X_hat[:, self.idx] - X[:, self.idx]) ** 2 + ) + + def inverse_link_function(self, X_hat: torch.Tensor) -> torch.Tensor: + """ + Identity inverse link function for continuous data. + + Args: + X_hat: Reconstructed data in the latent space. + + Returns: + Modified X_hat. + MUST be same shape as passed in data. + Projects the self.idx column from the latent space to the dist_type space. 
+ """ + return X_hat diff --git a/causalnex/structure/pytorch/notears.py b/causalnex/structure/pytorch/notears.py index dd88ca6..79159fd 100644 --- a/causalnex/structure/pytorch/notears.py +++ b/causalnex/structure/pytorch/notears.py @@ -31,13 +31,14 @@ import logging from copy import deepcopy -from typing import Iterable, List, Tuple +from typing import Dict, Iterable, List, Tuple, Union import numpy as np import pandas as pd from sklearn.utils import check_array from causalnex.structure.pytorch.core import NotearsMLP +from causalnex.structure.pytorch.dist_type import DistTypeContinuous, dist_type_aliases from causalnex.structure.structuremodel import StructureModel __all__ = ["from_numpy", "from_pandas"] @@ -47,6 +48,7 @@ # pylint: disable=too-many-arguments def from_numpy( X: np.ndarray, + dist_type_schema: Dict[int, str] = None, lasso_beta: float = 0.0, ridge_beta: float = 0.0, use_bias: bool = False, @@ -74,6 +76,11 @@ def from_numpy( Args: X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented. + dist_type_schema: The dist type schema corresponding to the passed in data X. + It maps the positional column in X to the string alias of a dist type. + A list of alias names can be found in ``dist_type/__init__.py``. + If None, assumes that all data in X is continuous. + lasso_beta: Constant that multiplies the lasso term (l1 regularisation). NOTE when using nonlinearities, the l1 loss only applies to the dag_layer. @@ -102,6 +109,7 @@ def from_numpy( Raises: ValueError: If X does not contain data. + ValueError: If schema does not correspond to columns. """ # n examples, d properties if not X.size: @@ -110,6 +118,25 @@ def from_numpy( # Check array for NaN or inf values check_array(X) + if dist_type_schema is not None: + + # make sure that there is one provided key per column + if set(range(X.shape[1])).symmetric_difference(set(dist_type_schema.keys())): + raise ValueError( + "Difference indices and expected indices. 
Got {} schema".format( + dist_type_schema + ) + ) + + # all columns continuous if dist_type_schema is None, else init from schema + dist_types = ( + [DistTypeContinuous(idx=idx) for idx in np.arange(X.shape[1])] + if dist_type_schema is None + else [ + dist_type_aliases[alias](idx=idx) for idx, alias in dist_type_schema.items() + ] + ) + _, d = X.shape # if None or empty, convert into a list with single item @@ -139,6 +166,7 @@ def from_numpy( model = NotearsMLP( n_features=d, + dist_types=dist_types, hidden_layer_units=hidden_layer_units, lasso_beta=lasso_beta, ridge_beta=ridge_beta, @@ -171,6 +199,10 @@ def from_numpy( value = bias[node] sm.nodes[node]["bias"] = value + for dist_type in dist_types: + # attach each dist_type object to corresponding node + sm.nodes[dist_type.idx]["dist_type"] = dist_type + # preserve the structure_learner as a graph attribute sm.graph["structure_learner"] = model @@ -181,15 +213,16 @@ def from_numpy( # pylint: disable=too-many-arguments def from_pandas( X: pd.DataFrame, + dist_type_schema: Dict[Union[str, int], str] = None, lasso_beta: float = 0.0, ridge_beta: float = 0.0, + use_bias: bool = False, hidden_layer_units: Iterable[int] = None, max_iter: int = 100, w_threshold: float = None, tabu_edges: List[Tuple[str, str]] = None, tabu_parent_nodes: List[str] = None, tabu_child_nodes: List[str] = None, - use_bias: bool = False, **kwargs ) -> StructureModel: """ @@ -215,6 +248,11 @@ def from_pandas( Args: X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented. + dist_type_schema: The dist type schema corresponding to the passed in data X. + It maps the pandas column name in X to the string alias of a dist type. + A list of alias names can be found in ``dist_type/__init__.py``. + If None, assumes that all data in X is continuous. + lasso_beta: Constant that multiplies the lasso term (l1 regularisation). NOTE when using nonlinearities, the l1 loss only applies to the dag_layer. 
@@ -247,6 +285,13 @@ def from_pandas( data = deepcopy(X) + # if dist_type_schema is not None, convert dist_type_schema from cols to idx + dist_type_schema = ( + dist_type_schema + if dist_type_schema is None + else {X.columns.get_loc(col): alias for col, alias in dist_type_schema.items()} + ) + non_numeric_cols = data.select_dtypes(exclude="number").columns if len(non_numeric_cols) > 0: @@ -269,6 +314,7 @@ def from_pandas( g = from_numpy( X=data.values, + dist_type_schema=dist_type_schema, lasso_beta=lasso_beta, ridge_beta=ridge_beta, use_bias=use_bias, @@ -294,7 +340,7 @@ def from_pandas( mean_effect=edge_dict["mean_effect"], ) - # retrieve dtype information from graph attribute + # retrieve all graph attrs for key, val in g.graph.items(): sm.graph[key] = val @@ -303,4 +349,9 @@ def from_pandas( node_name = idx_col[node[0]] sm.nodes[node_name]["bias"] = node[1]["bias"] + # recover and preserve the node dist_types + for node in g.nodes(data=True): + node_name = idx_col[node[0]] + sm.nodes[node_name]["dist_type"] = node[1]["dist_type"] + return sm diff --git a/causalnex/structure/pytorch/sklearn/__init__.py b/causalnex/structure/pytorch/sklearn/__init__.py new file mode 100644 index 0000000..4836361 --- /dev/null +++ b/causalnex/structure/pytorch/sklearn/__init__.py @@ -0,0 +1,35 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. 
IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +``causalnex.structure.pytorch.sklearn`` provides sklearn style functionality to NOTEARS. +""" + +__all__ = ["DAGRegressor", "DAGClassifier"] + +from .clf import DAGClassifier +from .reg import DAGRegressor diff --git a/causalnex/structure/sklearn.py b/causalnex/structure/pytorch/sklearn/_base.py similarity index 75% rename from causalnex/structure/sklearn.py rename to causalnex/structure/pytorch/sklearn/_base.py index cabe896..152a17c 100644 --- a/causalnex/structure/sklearn.py +++ b/causalnex/structure/pytorch/sklearn/_base.py @@ -26,19 +26,18 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This module contains the implementation of ``DAGRegressor``. +This module contains the implementation of ``DAGBase``. -``DAGRegressor`` is a class which wraps the StructureModel in an sklearn interface for regression. +``DAGBase`` is a class which provides an interface and common function for sklearn style NOTEARS functions. 
""" - import copy import warnings -from typing import Iterable, List, Union +from abc import ABCMeta, abstractmethod +from typing import Dict, Iterable, List, Union import numpy as np import pandas as pd -import torch -from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.base import BaseEstimator from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import check_is_fitted, check_X_y @@ -46,37 +45,18 @@ from causalnex.structure.pytorch import notears -class DAGRegressor( - BaseEstimator, RegressorMixin +class DAGBase( + BaseEstimator, metaclass=ABCMeta ): # pylint: disable=too-many-instance-attributes """ - Regressor wrapper of the StructureModel. + Base class for all sklearn wrappers of the StructureModel. Implements the sklearn .fit and .predict interface. - Currently only supports linear NOTEARS fitting by the DAG. - - Example: - :: - >>> from causalnex.sklearn import DAGRegressor - >>> - >>> smr = DAGRegressor(threshold=0.1) - >>> smr.fit(X_train, y_train) - >>> - >>> y_preds = smr.predict(X_test) - >>> type(y_preds) - np.ndarray - >>> - >>> type(smr.feature_importances_) - np.ndarray - :: - - Attributes: - feature_importances_ (np.ndarray): An array of edge weights corresponding - positionally to the feature X. """ # pylint: disable=too-many-arguments def __init__( self, + dist_type_schema: Dict[Union[str, int], str] = None, alpha: float = 0.0, beta: float = 0.0, fit_intercept: bool = True, @@ -92,6 +72,12 @@ def __init__( ): """ Args: + dist_type_schema: The dist type schema corresponding to the X data passed to fit or predict. + It maps the pandas column name in X to the string alias of a dist type. + If X is a np.ndarray, it maps the positional index to the string alias of a dist type. + A list of alias names can be found in ``dist_type/__init__.py``. + If None, assumes that all data in X is continuous. + alpha: l1 loss weighting. When using nonlinear layers this is only applied to the first layer. 
@@ -140,6 +126,7 @@ def __init__( self.beta = beta self.fit_intercept = fit_intercept self.hidden_layer_units = hidden_layer_units + self.dist_type_schema = dist_type_schema self.threshold = threshold self.tabu_edges = tabu_edges self.tabu_parent_nodes = tabu_parent_nodes @@ -160,9 +147,15 @@ def __init__( self.enforce_dag = enforce_dag self.standardize = standardize - def fit( - self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray] - ) -> "DAGRegressor": + @abstractmethod + def _target_dist_type(self) -> str: + """ + NOTE: + When extending this class override this method to return a dist_type alias + """ + raise NotImplementedError("Must implement _target_dist_type()") + + def fit(self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]): """ Fits the sm model using the concat of X and y. """ @@ -170,21 +163,40 @@ def fit( # defensive X, y checks check_X_y(X, y, y_numeric=True) - # force as DataFrame and Series (for later calculations) + # force X, y to DataFrame, Series for later calculations X = pd.DataFrame(X) y = pd.Series(y) # force name so that name != None (causes errors in notears) y.name = y.name or "__target" + # if self.dist_type_schema is None, assume all columns are continuous + dist_type_schema = self.dist_type_schema or {col: "cont" for col in X.columns} + if self.standardize: - self.ss_X = StandardScaler() - self.ss_y = StandardScaler() - X = pd.DataFrame(self.ss_X.fit_transform(X), columns=X.columns) - y = pd.Series( - self.ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1), - name=y.name, + # only standardize the continuous dist type columns. 
+ self.continuous_col_idxs = [ + X.columns.get_loc(col) + for col, alias in dist_type_schema.items() + if alias == "cont" + ] + + # copy X to prevent changes to underlying array data + X = X.copy() + self._ss_X = StandardScaler() + X.iloc[:, self.continuous_col_idxs] = self._ss_X.fit_transform( + X.iloc[:, self.continuous_col_idxs] ) + # if it's a continuous target also standardize + if self._target_dist_type() == "cont": + y = y.copy() + self._ss_y = StandardScaler() + y[:] = self._ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1) + + # add the target to the dist_type_schema + # NOTE: this must be done AFTER standardize + dist_type_schema[y.name] = self._target_dist_type() + # preserve the feature and target colnames self._features = tuple(X.columns) self._target = y.name @@ -203,6 +215,7 @@ def fit( # fit the structured model self.graph_ = notears.from_pandas( X, + dist_type_schema=dist_type_schema, lasso_beta=self.alpha, ridge_beta=self.beta, hidden_layer_units=self.hidden_layer_units, @@ -220,42 +233,39 @@ def fit( return self - def _predict_from_parents(self, X: Union[pd.DataFrame, np.ndarray]): - - # extract the base solver - structure_learner = self.graph_.graph["structure_learner"] - - # convert the predict data to pytorch tensor - X = torch.from_numpy(X).float().to(structure_learner.device) - # need to concat y onto X so that the dimensions are the same - y = torch.zeros(X.shape[0], 1).float().to(structure_learner.device) - X = torch.cat([X, y], dim=1) - - # perform forward reconstruction - X_hat = structure_learner(X) - - # FUTURE NOTE: with dtypes the projection from latent -> dtype goes here - - # extract the desired y column, return as array - y_pred = X_hat[:, -1] - return y_pred.cpu().detach().numpy() - def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: """ - Get the predictions of the structured model. - This is done by multiplying the edge weights with the feature i.e.
X @ W + Uses the fitted NOTEARS algorithm to reconstruct y from known X data. + + Returns: + Predicted y values for each row of X. """ # force convert to ndarray X = np.asarray(X) if self.standardize: - X = self.ss_X.transform(X) + X = X.copy() + X[:, self.continuous_col_idxs] = self._ss_X.transform( + X[:, self.continuous_col_idxs] + ) + + # insert dummy y column + y_fill = np.zeros(shape=(X.shape[0], 1)) + X = np.hstack([X, y_fill]) # check that the model has been fit check_is_fitted(self, "graph_") - y_pred = np.asarray(self._predict_from_parents(X)) - if self.standardize: - y_pred = self.ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1) + # extract the base solver + structure_learner = self.graph_.graph["structure_learner"] + # use base solver to reconstruct data + X_hat = structure_learner.reconstruct_data(X) + # pull off reconstructed y column + y_pred = X_hat[:, -1] + + # inverse-standardize + if self.standardize and self._target_dist_type() == "cont": + y_pred = self._ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1) + return y_pred def get_edges_to_node(self, name: str, data: str = "weight") -> pd.Series: @@ -321,9 +331,7 @@ def plot_dag(self, enforce_dag: bool = False, filename: str = "./graph.png"): # pylint: disable=import-outside-toplevel from IPython.display import Image except ImportError as e: - raise ImportError( - "DAGRegressor.plot_dag method requires IPython installed." - ) from e + raise ImportError("plot_dag method requires IPython installed.") from e check_is_fitted(self, "graph_") diff --git a/causalnex/structure/pytorch/sklearn/clf.py b/causalnex/structure/pytorch/sklearn/clf.py new file mode 100644 index 0000000..880b7f8 --- /dev/null +++ b/causalnex/structure/pytorch/sklearn/clf.py @@ -0,0 +1,132 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains the implementation of ``DAGClassifier``. + +``DAGClassifier`` is a class which wraps the StructureModel in an sklearn interface for classification. +""" + +from typing import Union + +import numpy as np +import pandas as pd +from sklearn.base import ClassifierMixin +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.multiclass import check_classification_targets + +from causalnex.structure.pytorch.sklearn._base import DAGBase + + +class DAGClassifier(ClassifierMixin, DAGBase): + """ + Classifier wrapper of the StructureModel. + Implements the sklearn .fit and .predict interface. 
+ + Example: + :: + >>> from causalnex.structure import DAGClassifier + >>> + >>> clf = DAGClassifier(threshold=0.1) + >>> clf.fit(X_train, y_train) + >>> + >>> y_preds = clf.predict(X_test) + >>> type(y_preds) + np.ndarray + >>> + >>> type(clf.feature_importances_) + np.ndarray + :: + + Attributes: + feature_importances_ (np.ndarray): An array of edge weights corresponding + positionally to the feature X. + + coef_ (np.ndarray): An array of edge weights corresponding + positionally to the feature X. + + intercept_ (float): The target node bias value. + """ + + def _target_dist_type(self) -> str: + return self.__target_dist_type + + def fit( + self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray] + ) -> "DAGClassifier": + """ + Fits the sm model using the concat of X and y. + """ + # clf target check + check_classification_targets(y) + + # encode the categories to be numeric + enc = LabelEncoder() + y = y.copy() + y[:] = enc.fit_transform(y) + # store the classes from the LabelEncoder + self.classes_ = enc.classes_ + + # class number checks + n_classes = len(self.classes_) + if n_classes < 2: + raise ValueError( + "This solver needs samples of at least 2 classes" + " in the data, but the data contains only one" + " class: {}".format(self.classes_[0]) + ) + if n_classes > 2: + raise ValueError("This solver does not support more than 2 classes") + + # store the private attr __target_dist_type + self.__target_dist_type = "bin" + # fit the NOTEARS model + super().fit(X, y) + return self + + def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: + """ + Uses the fitted NOTEARS algorithm to reconstruct y from known X data. + + Returns: + Predicted y values for each row of X.
+ """ + probs = self.predict_proba(X) + + # get the class by rounding the (0, 1) bound probability + indices = probs.round().astype(np.int64) + + return self.classes_[indices] + + def predict_proba(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: + """ + Uses the fitted NOTEARS algorithm to reconstruct y from known X data. + + Returns: + Predicted y class probabilities for each row of X. + """ + return super().predict(X) diff --git a/causalnex/structure/pytorch/sklearn/reg.py b/causalnex/structure/pytorch/sklearn/reg.py new file mode 100644 index 0000000..378dc33 --- /dev/null +++ b/causalnex/structure/pytorch/sklearn/reg.py @@ -0,0 +1,86 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains the implementation of ``DAGRegressor``. + +``DAGRegressor`` is a class which wraps the StructureModel in an sklearn interface for regression. +""" + +from typing import Union + +import numpy as np +import pandas as pd +from sklearn.base import RegressorMixin + +from causalnex.structure.pytorch.sklearn._base import DAGBase + + +class DAGRegressor(RegressorMixin, DAGBase): + """ + Regressor wrapper of the StructureModel. + Implements the sklearn .fit and .predict interface. + + Example: + :: + >>> from causalnex.structure import DAGRegressor + >>> + >>> reg = DAGRegressor(threshold=0.1) + >>> reg.fit(X_train, y_train) + >>> + >>> y_preds = reg.predict(X_test) + >>> type(y_preds) + np.ndarray + >>> + >>> type(reg.feature_importances_) + np.ndarray + :: + + Attributes: + feature_importances_ (np.ndarray): An array of edge weights corresponding + positionally to the feature X. + + coef_ (np.ndarray): An array of edge weights corresponding + positionally to the feature X. + + intercept_ (float): The target node bias value. + """ + + def _target_dist_type(self) -> str: + return self.__target_dist_type + + def fit( + self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray] + ) -> "DAGRegressor": + """ + Fits the sm model using the concat of X and y.
+ """ + # store the private attr __target_dist_type + self.__target_dist_type = "cont" + # fit the NOTEARS model + super().fit(X, y) + return self diff --git a/doc_requirements.txt b/doc_requirements.txt index 744f4c3..e248479 100644 --- a/doc_requirements.txt +++ b/doc_requirements.txt @@ -4,6 +4,7 @@ jupyter_client>=5.1.0, <6.0 nbsphinx==0.4.2 nbstripout==0.3.3 patchy>=1.5, <2.0 +pygments>=2.6.1, <3.0 recommonmark==0.5.0 sphinx-autodoc-typehints>=1.6.0, < 1.11.0 sphinx-markdown-tables==0.0.9 diff --git a/docs/source/03_tutorial/regressor_tutorial.ipynb b/docs/source/03_tutorial/sklearn_tutorial.ipynb similarity index 59% rename from docs/source/03_tutorial/regressor_tutorial.ipynb rename to docs/source/03_tutorial/sklearn_tutorial.ipynb index 648653c..6eb0540 100644 --- a/docs/source/03_tutorial/regressor_tutorial.ipynb +++ b/docs/source/03_tutorial/sklearn_tutorial.ipynb @@ -1,35 +1,17 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import os\n", - "import sys\n", - "module_path = os.path.abspath(os.path.join(\"../../..\"))\n", - "if module_path not in sys.path:\n", - " sys.path.append(module_path)" + "# Sklearn Tutorial" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Contents\n", - "\n", - "This notebook walks through using the DAGRegressor model.\n", - "\n", - "The material covered here is as follows:\n", - "- Linear Interface\n", - "- Nonlinear Interface" + "This notebook walks through using the sklearn style DAGRegressor and DAGClassifier models." ] }, { @@ -37,11 +19,11 @@ "metadata": {}, "source": [ "___\n", - "## Real Data (boston housing)\n", - "\n", - "This section demonstrates the performance of the algorithm on a real-world dataset. 
The main things to note in this section are:\n", + "## DAGRegressor\n", + "This section demonstrates the performance of the DAGRegressor on a real-world dataset. The main things to note in this section are:\n", "- The scale sensitivity of the algorithm\n", "- Interpretability of nonlinear `.coef_`\n", + "### The Data: Boston Housing\n", "\n", "The boston housing dataset is a classic benchmark regression task. The objective is to predict a set of house prices given a small set of features.\n", "\n", @@ -112,6 +94,8 @@ } ], "source": [ + "import numpy as np\n", + "import pandas as pd\n", "from sklearn.datasets import load_boston\n", "print(load_boston(return_X_y=False)[\"DESCR\"])" ] @@ -120,7 +104,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Lets initially benchmark the performance of an `ElasticNetCV` fitted across the entire dataset." + "Lets initially benchmark the performance of an `ElasticNetCV` fitted across the entire dataset." ] }, { @@ -160,13 +144,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Linear DAGRegressor" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "### Linear DAGRegressor\n", + "\n", "The DAGRegressor has several parameters which can be used to better fit a more complicated noisy DAG:\n", "- `alpha`: The l1 (lasso) regularisation parameter. 
Increasing this creates a sparser DAG.\n", "- `beta`: The l2 (ridge) regularisation parameter.\n", @@ -251,13 +230,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### NonLinear DAGRegressor" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "### NonLinear DAGRegressor\n", + "\n", "Specifying a nonlinear model is extremely simple, only a single parameter needs to be altered: `hidden_layer_units`\n", "\n", "`hidden_layer_units` takes _any_ **iterable** of **integers**: \n", @@ -332,7 +306,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Interpereting the Nonlinear DAG\n", + "#### Interpereting the Nonlinear DAG\n", "\n", "For nonlinear analysis, understanding the impact of one feature on another is not as simple as taking the mean effect as in the linear case.\n", "Instead, a combination of `reg.coef_` and `reg.feature_importances` should be used:\n", @@ -516,7 +490,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Dependent Target\n", + "#### Dependent Target\n", "\n", "Setting the `dependent_target=False` has an impact on performance as shown below, but can give better insight into the overall nonlinear structure of the data.\n", "\n", @@ -605,6 +579,236 @@ "reg.plot_dag(True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "___\n", + "## DAGClassifier\n", + "This section demonstrates the performance of the algorithm on a real-world dataset.\n", + "\n", + "The interface is very similar to the DAGRegressor so key details should be found there.\n", + "### The Data: Breast Cancer" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_breast_cancer_dataset:\n", + "\n", + "Breast cancer wisconsin (diagnostic) dataset\n", + "--------------------------------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 569\n", + "\n", + " :Number of Attributes: 30 numeric, predictive attributes and the class\n", + "\n", + " :Attribute Information:\n", + " - radius (mean of distances from center to points on the perimeter)\n", + " - texture (standard deviation of gray-scale values)\n", + " - perimeter\n", + " - area\n", + " - smoothness (local variation in radius lengths)\n", + " - compactness (perimeter^2 / area - 1.0)\n", + " - concavity (severity of concave portions of the contour)\n", + " - concave points (number of concave portions of the contour)\n", + " - symmetry \n", + " - fractal dimension (\"coastline approximation\" - 1)\n", + "\n", + " The mean, standard error, and \"worst\" or largest (mean of the three\n", + " largest values) of these features were computed for each image,\n", + " resulting in 30 features. 
For instance, field 3 is Mean Radius, field\n", + " 13 is Radius SE, field 23 is Worst Radius.\n", + "\n", + " - class:\n", + " - WDBC-Malignant\n", + " - WDBC-Benign\n", + "\n", + " :Summary Statistics:\n", + "\n", + " ===================================== ====== ======\n", + " Min Max\n", + " ===================================== ====== ======\n", + " radius (mean): 6.981 28.11\n", + " texture (mean): 9.71 39.28\n", + " perimeter (mean): 43.79 188.5\n", + " area (mean): 143.5 2501.0\n", + " smoothness (mean): 0.053 0.163\n", + " compactness (mean): 0.019 0.345\n", + " concavity (mean): 0.0 0.427\n", + " concave points (mean): 0.0 0.201\n", + " symmetry (mean): 0.106 0.304\n", + " fractal dimension (mean): 0.05 0.097\n", + " radius (standard error): 0.112 2.873\n", + " texture (standard error): 0.36 4.885\n", + " perimeter (standard error): 0.757 21.98\n", + " area (standard error): 6.802 542.2\n", + " smoothness (standard error): 0.002 0.031\n", + " compactness (standard error): 0.002 0.135\n", + " concavity (standard error): 0.0 0.396\n", + " concave points (standard error): 0.0 0.053\n", + " symmetry (standard error): 0.008 0.079\n", + " fractal dimension (standard error): 0.001 0.03\n", + " radius (worst): 7.93 36.04\n", + " texture (worst): 12.02 49.54\n", + " perimeter (worst): 50.41 251.2\n", + " area (worst): 185.2 4254.0\n", + " smoothness (worst): 0.071 0.223\n", + " compactness (worst): 0.027 1.058\n", + " concavity (worst): 0.0 1.252\n", + " concave points (worst): 0.0 0.291\n", + " symmetry (worst): 0.156 0.664\n", + " fractal dimension (worst): 0.055 0.208\n", + " ===================================== ====== ======\n", + "\n", + " :Missing Attribute Values: None\n", + "\n", + " :Class Distribution: 212 - Malignant, 357 - Benign\n", + "\n", + " :Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. 
Mangasarian\n", + "\n", + " :Donor: Nick Street\n", + "\n", + " :Date: November, 1995\n", + "\n", + "This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\n", + "https://goo.gl/U2Uwz2\n", + "\n", + "Features are computed from a digitized image of a fine needle\n", + "aspirate (FNA) of a breast mass. They describe\n", + "characteristics of the cell nuclei present in the image.\n", + "\n", + "Separating plane described above was obtained using\n", + "Multisurface Method-Tree (MSM-T) [K. P. Bennett, \"Decision Tree\n", + "Construction Via Linear Programming.\" Proceedings of the 4th\n", + "Midwest Artificial Intelligence and Cognitive Science Society,\n", + "pp. 97-101, 1992], a classification method which uses linear\n", + "programming to construct a decision tree. Relevant features\n", + "were selected using an exhaustive search in the space of 1-4\n", + "features and 1-3 separating planes.\n", + "\n", + "The actual linear program used to obtain the separating plane\n", + "in the 3-dimensional space is that described in:\n", + "[K. P. Bennett and O. L. Mangasarian: \"Robust Linear\n", + "Programming Discrimination of Two Linearly Inseparable Sets\",\n", + "Optimization Methods and Software 1, 1992, 23-34].\n", + "\n", + "This database is also available through the UW CS ftp server:\n", + "\n", + "ftp ftp.cs.wisc.edu\n", + "cd math-prog/cpo-dataset/machine-learn/WDBC/\n", + "\n", + ".. topic:: References\n", + "\n", + " - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n", + " for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on \n", + " Electronic Imaging: Science and Technology, volume 1905, pages 861-870,\n", + " San Jose, CA, 1993.\n", + " - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and \n", + " prognosis via linear programming. Operations Research, 43(4), pages 570-577, \n", + " July-August 1995.\n", + " - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. 
Machine learning techniques\n", + " to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n", + " 163-171.\n" + ] + } + ], + "source": [ + "from sklearn.datasets import load_breast_cancer\n", + "print(load_breast_cancer(return_X_y=False)[\"DESCR\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEAN Score: 0.977\n", + "MEAN EFFECT DIRECTIONAL:\n", + "fractal dimension error 0.183807\n", + "compactness error 0.151190\n", + "mean fractal dimension 0.141346\n", + "symmetry error 0.066475\n", + "concavity error 0.001481\n", + "texture error 0.000643\n", + "smoothness error 0.000028\n", + "mean compactness -0.000081\n", + "mean symmetry -0.000104\n", + "concave points error -0.011845\n", + "worst fractal dimension -0.039221\n", + "worst compactness -0.074652\n", + "mean smoothness -0.096894\n", + "perimeter error -0.301725\n", + "mean concavity -0.302624\n", + "area error -0.315381\n", + "mean area -0.335588\n", + "mean perimeter -0.349792\n", + "mean texture -0.360211\n", + "mean radius -0.362093\n", + "worst concavity -0.364120\n", + "worst symmetry -0.388658\n", + "worst smoothness -0.444673\n", + "mean concave points -0.454980\n", + "radius error -0.457226\n", + "worst area -0.458990\n", + "worst perimeter -0.488915\n", + "worst radius -0.538895\n", + "worst texture -0.555389\n", + "worst concave points -0.559847\n", + "dtype: float64\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from causalnex.structure import DAGClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from sklearn.datasets import load_breast_cancer\n", + "X, y = load_breast_cancer(return_X_y=True)\n", + "names = load_breast_cancer(return_X_y=False)[\"feature_names\"]\n", + "\n", + "reg = 
DAGClassifier(\n", + " alpha=0.1,\n", + " beta=0.5,\n", + " hidden_layer_units=[0],\n", + " fit_intercept=True,\n", + " standardize=True\n", + " )\n", + "from sklearn.model_selection import KFold\n", + "scores = cross_val_score(reg, X, y, cv=KFold(shuffle=True, random_state=42))\n", + "print(f'MEAN Score: {np.mean(scores).mean():.3f}')\n", + "\n", + "X_pd = pd.DataFrame(X, columns=names)\n", + "y_pd = pd.Series(y, name=\"NOT CANCER\")\n", + "reg.fit(X_pd, y_pd)\n", + "print(\"MEAN EFFECT DIRECTIONAL:\")\n", + "print(pd.Series(reg.coef_, index=names).sort_values(ascending=False))\n", + "reg.plot_dag(True)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -636,7 +840,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.9" } }, "nbformat": 4, diff --git a/docs/source/api_docs/index.rst b/docs/source/api_docs/index.rst index 61e55e4..b73ed2e 100644 --- a/docs/source/api_docs/index.rst +++ b/docs/source/api_docs/index.rst @@ -61,7 +61,7 @@ Welcome to CausalNex's API docs and tutorials! 03_tutorial/03_tutorial.md 03_tutorial/plotting_tutorial.md - 03_tutorial/regressor_tutorial.md + 03_tutorial/sklearn_tutorial.md .. toctree:: :maxdepth: 2 diff --git a/tests/structure/data_generators/test_core.py b/tests/structure/data_generators/test_core.py index e1dad0d..ffeb112 100644 --- a/tests/structure/data_generators/test_core.py +++ b/tests/structure/data_generators/test_core.py @@ -335,7 +335,7 @@ def test_mixed_type_independence( seed=seed, ) - atol = 0.05 # 5% difference bewteen joint & factored! + atol = 0.02 # at least 2% difference bewteen joint & factored! # 1. dependent links # 0 -> 1 (we look at the class with the highest deviation from uniform # to avoid small values) @@ -360,7 +360,7 @@ def test_mixed_type_independence( atol=atol, ) - tol = 0.15 # relative tolerance of +- 15% of the + tol = 0.20 # at most relative tolerance of +- 20% of the # 2. 
independent links # categorical c, _ = max( diff --git a/tests/structure/data_generators/test_wrappers.py b/tests/structure/data_generators/test_wrappers.py index 486db8c..c2ae19f 100644 --- a/tests/structure/data_generators/test_wrappers.py +++ b/tests/structure/data_generators/test_wrappers.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-lines # Copyright 2019-2020 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/structure/test_dist_type.py b/tests/structure/test_dist_type.py new file mode 100644 index 0000000..8187301 --- /dev/null +++ b/tests/structure/test_dist_type.py @@ -0,0 +1,110 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest +import torch + +from causalnex.structure.pytorch.dist_type import DistTypeBinary, DistTypeContinuous +from causalnex.structure.pytorch.notears import from_numpy, from_pandas + + +class TestDistTypeClasses: + @pytest.mark.parametrize("dist_type", [DistTypeBinary, DistTypeContinuous]) + def test_default_init(self, dist_type): + idx = 1 + dt = dist_type(idx=idx) + + assert dt.idx == idx + + @pytest.mark.parametrize( + "dist_type, X, X_hat", + [ + ( + DistTypeContinuous, + torch.from_numpy(np.random.normal(size=(5, 2))), + torch.from_numpy(np.random.normal(size=(5, 2))), + ), + ( + DistTypeBinary, + torch.from_numpy(np.random.randint(2, size=(5, 2))).float(), + torch.from_numpy(np.random.randint(2, size=(5, 2))).float(), + ), + ], + ) + def test_loss(self, dist_type, X, X_hat): + dist_types = [dist_type(idx=idx) for idx in np.arange(X.shape[1])] + loss = 0.0 + with torch.no_grad(): + for dt in dist_types: + loss = loss + dt.loss(X, X_hat) + + assert isinstance(loss, torch.Tensor) + assert loss.shape == torch.Size([]) + + +class TestDistTypeNotears: + def test_schema_mismatch_error(self): + X = np.ones(shape=(10, 2)) + schema = {0: "cont", 1: "cont", 2: "cont"} + with pytest.raises(ValueError): + from_numpy(X, schema) + + @pytest.mark.parametrize( + "X, schema", + [ + (np.random.normal(size=(10, 3)), {0: "cont", 1: "cont", 2: "cont"}), + (np.random.randint(2, size=(10, 3)), {0: "bin", 1: "bin", 2: "bin"}), + ( + np.hstack( + [np.random.normal(size=(5, 2)), np.random.randint(2, size=(5, 2))] + ), + {0: "cont", 1: 
"cont", 2: "bin", 3: "bin"}, + ), + ], + ) + def test_numpy_notears_with_schema(self, X, schema): + from_numpy(X, schema) + + @pytest.mark.parametrize( + "X, schema", + [ + (np.random.normal(size=(10, 3)), {0: "cont", 1: "cont", 2: "cont"}), + (np.random.randint(2, size=(10, 3)), {0: "bin", 1: "bin", 2: "bin"}), + ( + np.hstack( + [np.random.normal(size=(5, 2)), np.random.randint(2, size=(5, 2))] + ), + {0: "cont", 1: "cont", 2: "bin", 3: "bin"}, + ), + ], + ) + def test_pandas_notears_with_schema(self, X, schema): + X = pd.DataFrame(X) + from_pandas(X, schema) diff --git a/tests/structure/test_pytorch_notears.py b/tests/structure/test_pytorch_notears.py index 6ec5f8d..db87cf0 100644 --- a/tests/structure/test_pytorch_notears.py +++ b/tests/structure/test_pytorch_notears.py @@ -36,7 +36,12 @@ from mock import patch from causalnex.structure import StructureModel -from causalnex.structure.data_generators import generate_continuous_dataframe +from causalnex.structure.data_generators import ( + generate_binary_data, + generate_binary_dataframe, + generate_continuous_dataframe, + generate_structure, +) from causalnex.structure.pytorch.notears import from_numpy, from_pandas @@ -58,7 +63,7 @@ def test_isolated_nodes_exist(self, train_data_idx): def test_expected_structure_learned(self, train_data_idx, train_model): """Given a small data set that can be examined by hand, the structure should be deterministic""" - g = from_pandas(train_data_idx, w_threshold=0.25) + g = from_pandas(train_data_idx, w_threshold=0.15) assert set(g.edges) == set(train_model.edges) def test_empty_data_raises_error(self): @@ -220,6 +225,35 @@ def test_check_array(self, data): ): from_pandas(pd.DataFrame(data=data, columns=["a"])) + def test_f1score_generated_binary(self): + """ Binary strucutre learned should have good f1 score """ + np.random.seed(10) + sm = generate_structure(5, 2.0) + df = generate_binary_dataframe( + sm, 1000, intercept=False, noise_scale=0.1, seed=10 + ) + + dist_type_schema 
= {i: "bin" for i in range(df.shape[1])} + sm_fitted = from_pandas( + df, + dist_type_schema=dist_type_schema, + lasso_beta=0.1, + ridge_beta=0.0, + w_threshold=0.1, + use_bias=False, + ) + + right_edges = sm.edges + n_predictions_made = len(sm_fitted.edges) + n_correct_predictions = len(set(sm_fitted.edges).intersection(set(right_edges))) + n_relevant_predictions = len(right_edges) + + precision = n_correct_predictions / n_predictions_made + recall = n_correct_predictions / n_relevant_predictions + f1_score = 2 * (precision * recall) / (precision + recall) + + assert f1_score > 0.8 + class TestFromNumpy: """Test behaviour of the from_numpy_lasso method""" @@ -239,7 +273,7 @@ def test_isolated_nodes_exist(self, train_data_idx): def test_expected_structure_learned(self, train_data_idx, train_model_idx): """Given a small data set that can be examined by hand, the structure should be deterministic""" - g = from_numpy(train_data_idx.values, w_threshold=0.25) + g = from_numpy(train_data_idx.values, w_threshold=0.15) assert set(g.edges) == set(train_model_idx.edges) def test_empty_data_raises_error(self): @@ -420,3 +454,30 @@ def test_check_array(self, data): match="Input contains NaN, infinity or a value too large for dtype*", ): from_numpy(np.array([data])) + + def test_f1score_generated_binary(self): + """ Binary strucutre learned should have good f1 score """ + np.random.seed(10) + sm = generate_structure(5, 2.0) + df = generate_binary_data(sm, 1000, intercept=False, noise_scale=0.1, seed=10) + + dist_type_schema = {i: "bin" for i in range(df.shape[1])} + sm_fitted = from_numpy( + df, + dist_type_schema=dist_type_schema, + lasso_beta=0.1, + ridge_beta=0.0, + w_threshold=0.1, + use_bias=False, + ) + + right_edges = sm.edges + n_predictions_made = len(sm_fitted.edges) + n_correct_predictions = len(set(sm_fitted.edges).intersection(set(right_edges))) + n_relevant_predictions = len(right_edges) + + precision = n_correct_predictions / n_predictions_made + recall = 
n_correct_predictions / n_relevant_predictions + f1_score = 2 * (precision * recall) / (precision + recall) + + assert f1_score > 0.8 diff --git a/tests/structure/test_sklearn.py b/tests/structure/test_sklearn.py index bbb2a97..18fa154 100644 --- a/tests/structure/test_sklearn.py +++ b/tests/structure/test_sklearn.py @@ -36,11 +36,14 @@ from sklearn.gaussian_process.kernels import RBF from sklearn.model_selection import KFold, cross_val_score +from causalnex.structure import DAGClassifier, DAGRegressor from causalnex.structure import data_generators as dg -from causalnex.structure.sklearn import DAGRegressor -class TestStructureModel: +class TestDAGSklearn: + """ Tests aspects common to both DAGRegressor and DAGClassifier """ + + @pytest.mark.parametrize("model", [DAGRegressor, DAGClassifier]) @pytest.mark.parametrize( "val, msg, error", [ @@ -50,38 +53,162 @@ class TestStructureModel: ({"threshold": "0.0"}, "threshold should be numeric", TypeError), ], ) - def test_input_type_assertion(self, val, msg, error): + def test_input_type_assertion(self, val, msg, error, model): with pytest.raises(error, match=msg): - DAGRegressor(**val) + model(**val) - def test_pandas_fit(self): - reg = DAGRegressor() - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + @pytest.mark.parametrize("model", [DAGRegressor, DAGClassifier]) + def test_notfitted_error(self, model): + m = model() + X = np.random.normal(size=(100, 2)) + with pytest.raises(NotFittedError): + m.predict(X) + + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + def test_tabu_parent_nodes(self, model, y): + X = np.random.normal(size=(100, 2)) + X, y = pd.DataFrame(X), pd.Series(y, name="test") + + m = model(dependent_target=True, tabu_parent_nodes=["test"]) + assert "test" in m.tabu_parent_nodes + + m = model(dependent_target=True, tabu_parent_nodes=[]) + m.fit(X, y) + assert "test" not in 
m.tabu_parent_nodes + + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + def test_numpy_fit(self, model, y): + m = model() + X = np.random.normal(size=(100, 2)) + m.fit(X, y) + + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + def test_pandas_fit(self, model, y): + m = model() + X = np.random.normal(size=(100, 2)) X, y = pd.DataFrame(X), pd.Series(y) - reg.fit(X, y) + m.fit(X, y) - def test_numpy_fit(self): - reg = DAGRegressor() - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - reg.fit(X, y) + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + @pytest.mark.parametrize( + "fit_intercept, equals_zero", [(True, False), (False, True)] + ) + def test_intercept(self, fit_intercept, equals_zero, model, y): + m = model(fit_intercept=fit_intercept) + X = np.random.normal(size=(100, 2)) + X, y = pd.DataFrame(X), pd.Series(y) + m.fit(X, y) + # intercept should return zero when fit_intercept == False + assert (m.intercept_ == 0) is equals_zero + assert isinstance(m.intercept_, float) - def test_predict_type(self): - reg = DAGRegressor() - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - reg.fit(X, y) - assert isinstance(reg.predict(X), np.ndarray) - reg = DAGRegressor() - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + @pytest.mark.parametrize("enforce_dag", [True, False]) + def test_plot_dag(self, enforce_dag, model, y): + m = model() + X = np.random.normal(size=(100, 2)) + m.fit(X, y) + image = m.plot_dag(enforce_dag=enforce_dag) 
+ assert isinstance(image, Image) + + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + def test_plot_dag_importerror(self, model, y): + with patch.dict("sys.modules", {"IPython.display": None}): + m = model() + X = np.random.normal(size=(100, 2)) + m.fit(X, y) + + with pytest.raises( + ImportError, + match=r"plot_dag method requires IPython installed.", + ): + m.plot_dag() + + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + @pytest.mark.parametrize( + "hidden_layer_units", [None, [], [0], [1], (0,), (1,), [1, 1], (1, 1)] + ) + def test_hidden_layer_units(self, hidden_layer_units, model, y): + m = model(hidden_layer_units=hidden_layer_units) + X = np.random.normal(size=(100, 2)) + m.fit(X, y) + + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + def test_enforce_dag(self, model, y): + m = model(enforce_dag=True) + X = np.random.normal(size=(100, 2)) X, y = pd.DataFrame(X), pd.Series(y) - reg.fit(X, y) - assert isinstance(reg.predict(X), np.ndarray) + m.fit(X, y) + assert nx.algorithms.is_directed_acyclic_graph(m.graph_) - def test_notfitted_error(self): - reg = DAGRegressor() + @pytest.mark.parametrize( + "model, y", + [ + (DAGRegressor, np.random.normal(size=(100,))), + (DAGClassifier, np.random.randint(2, size=(100,))), + ], + ) + def test_container_predict_type(self, model, y): + m = model() X = np.random.normal(size=(100, 2)) - with pytest.raises(NotFittedError): - reg.predict(X) + m.fit(X, y) + assert isinstance(m.predict(X), np.ndarray) + m = model() + X = np.random.normal(size=(100, 2)) + X, y = pd.DataFrame(X), pd.Series(y) + m.fit(X, y) + assert isinstance(m.predict(X), np.ndarray) + +class TestDAGRegressor: 
@pytest.mark.parametrize("hidden_layer_units", [None, [2], [2, 2]]) def test_coef(self, hidden_layer_units): reg = DAGRegressor(hidden_layer_units=hidden_layer_units) @@ -110,64 +237,6 @@ def test_feature_importances(self, hidden_layer_units): # assert that the sign of the coefficient is positive for both nonlinear and linear cases assert coef_["true_feat"] > 0 - def test_tabu_parent_nodes(self): - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - X, y = pd.DataFrame(X), pd.Series(y, name="test") - - reg = DAGRegressor(dependent_target=True, tabu_parent_nodes=["test"]) - assert "test" in reg.tabu_parent_nodes - - reg = DAGRegressor(dependent_target=True, tabu_parent_nodes=[]) - reg.fit(X, y) - assert "test" not in reg.tabu_parent_nodes - - @pytest.mark.parametrize( - "fit_intercept, equals_zero", [(True, False), (False, True)] - ) - def test_intercept(self, fit_intercept, equals_zero): - reg = DAGRegressor(fit_intercept=fit_intercept) - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - X, y = pd.DataFrame(X), pd.Series(y) - reg.fit(X, y) - # intercept should return zero when fit_intercept == False - assert (reg.intercept_ == 0) is equals_zero - assert isinstance(reg.intercept_, float) - - @pytest.mark.parametrize("enforce_dag", [True, False]) - def test_plot_dag(self, enforce_dag): - reg = DAGRegressor() - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - reg.fit(X, y) - image = reg.plot_dag(enforce_dag=enforce_dag) - assert isinstance(image, Image) - - def test_plot_dag_importerror(self): - with patch.dict("sys.modules", {"IPython.display": None}): - reg = DAGRegressor() - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - reg.fit(X, y) - - with pytest.raises( - ImportError, - match=r"DAGRegressor\.plot_dag method requires IPython installed\.", - ): - reg.plot_dag() - - @pytest.mark.parametrize( - "hidden_layer_units", [None, [], [0], [1], (0,), (1,), [1, 1], (1, 1)] - ) - def 
test_hidden_layer_units(self, hidden_layer_units): - reg = DAGRegressor(hidden_layer_units=hidden_layer_units) - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - reg.fit(X, y) - - def test_enforce_dag(self): - reg = DAGRegressor(enforce_dag=True) - X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) - X, y = pd.DataFrame(X), pd.Series(y) - reg.fit(X, y) - assert nx.algorithms.is_directed_acyclic_graph(reg.graph_) - @pytest.mark.parametrize("standardize", [True, False]) def test_nonlinear_performance(self, standardize): np.random.seed(42) @@ -182,10 +251,8 @@ def test_nonlinear_performance(self, standardize): reg = DAGRegressor( alpha=0.0, - l1_ratio=0.0, fit_intercept=True, dependent_target=True, - enforce_dag=False, hidden_layer_units=[0], standardize=standardize, ) @@ -195,9 +262,7 @@ def test_nonlinear_performance(self, standardize): reg = DAGRegressor( alpha=0.1, - l1_ratio=1.0, fit_intercept=True, - enforce_dag=False, hidden_layer_units=[2], standardize=standardize, ) @@ -207,9 +272,7 @@ def test_nonlinear_performance(self, standardize): reg = DAGRegressor( alpha=0.1, - l1_ratio=1.0, fit_intercept=True, - enforce_dag=False, hidden_layer_units=[4], standardize=standardize, ) @@ -219,3 +282,59 @@ def test_nonlinear_performance(self, standardize): assert small_nl_score > linear_score assert medium_nl_score > small_nl_score + + +class TestDAGClassifier: + @pytest.mark.parametrize("hidden_layer_units", [None, [2], [2, 2]]) + def test_coef(self, hidden_layer_units): + clf = DAGClassifier(alpha=0.1, hidden_layer_units=hidden_layer_units) + X, y = ( + pd.DataFrame(np.random.normal(size=(100, 2))), + pd.Series(np.zeros(shape=(100,), dtype=int)), + ) + y[X[0] < 0] = 1 + clf.fit(X, y) + + assert isinstance(clf.coef_, np.ndarray) + coef_ = pd.Series(clf.coef_, index=X.columns) + # assert that the sign of the coefficient is correct for both nonlinear and linear cases + assert coef_[0] < 0 + + 
@pytest.mark.parametrize("hidden_layer_units", [None, [2], [2, 2]]) + def test_feature_importances(self, hidden_layer_units): + clf = DAGClassifier(alpha=0.1, hidden_layer_units=hidden_layer_units) + X, y = ( + pd.DataFrame(np.random.normal(size=(100, 2))), + pd.Series(np.zeros(shape=(100,), dtype=int)), + ) + y[X[0] < 0] = 1 + clf.fit(X, y) + + assert isinstance(clf.feature_importances_, np.ndarray) + coef_ = pd.Series(clf.feature_importances_, index=X.columns) + # assert that the sign of the coefficient is positive for both nonlinear and linear cases + assert coef_[0] > 0 + + @pytest.mark.parametrize("y_type", [float, str, np.int32, np.int64, np.float32]) + def test_value_predict_type(self, y_type): + clf = DAGClassifier(alpha=0.1) + X, y = ( + pd.DataFrame(np.random.normal(size=(100, 2))), + pd.Series(np.zeros(shape=(100,), dtype=y_type)), + ) + y[X[0] < 0] = y_type(1) + clf.fit(X, y) + + y_pred = clf.predict(X) + assert isinstance(y_pred[0], y_type) + y_pred_proba = clf.predict_proba(X) + assert isinstance(y_pred_proba[0], np.float64) + + @pytest.mark.parametrize( + "y", [np.random.randint(1, size=(100,)), np.random.randint(3, size=(100,))] + ) + def test_class_number_error(self, y): + clf = DAGClassifier(alpha=0.1) + X = (pd.DataFrame(np.random.normal(size=(100, 2))),) + with pytest.raises(ValueError): + clf.fit(X, y) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 78f2d37..6104d5b 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -36,26 +36,16 @@ class TestUniform: def test_fit_creates_exactly_uniform_splits_when_possible(self): - """splits should be exactly uniform if possible""" + """splits should be exactly uniform""" - arr = np.array(range(20)) + arr = np.array(range(21)) np.random.shuffle(arr) - d = Discretiser(method="uniform", num_buckets=4) + d = Discretiser(method="uniform", num_buckets=5) d.fit(arr) for n in range(2): - assert 4 < (d.numeric_split_points[n + 1] - d.numeric_split_points[n]) <= 
5 - - def test_fit_creates_close_to_uniform_splits_when_uniform_not_possible(self): - """splits should be close to uniform if uniform is not possible""" - - arr = np.array(range(9)) - np.random.shuffle(arr) - d = Discretiser(method="uniform", num_buckets=4) - d.fit(arr) - - assert len(d.numeric_split_points) == 3 - for n in range(2): - assert 2 <= (d.numeric_split_points[n + 1] - d.numeric_split_points[n]) <= 3 + assert (d.numeric_split_points[n + 1] - d.numeric_split_points[n]) == ( + (d.numeric_split_points[n + 2] - d.numeric_split_points[n + 1]) + ) def test_fit_does_not_attempt_to_deal_with_identical_split_points(self): """if all data is identical, and num_buckets>1, then this is not possible. @@ -70,20 +60,6 @@ def test_fit_does_not_attempt_to_deal_with_identical_split_points(self): d.numeric_split_points, ) - def test_transform_uneven_split(self): - """Data that cannot be split evenly between buckets should be transformed - into near-even buckets""" - - arr = np.array([n + 1 for n in range(10)]) - np.random.shuffle(arr) - d = Discretiser(method="uniform", num_buckets=4) - d.fit(arr) - unique, counts = np.unique(d.transform(arr), return_counts=True) - # check all 4 buckets are used - assert np.array_equal([0, 1, 2, 3], unique) - # check largest difference in distribution is 1 item - assert (np.max(counts) - np.min(counts)) <= 1 - def test_transform_larger_than_fit_range_goes_into_last_bucket(self): """If a value larger than the input is transformed, then it should go into the maximum bucket"""