In [1]:
#| default_exp data_preprocessor

# Data Preprocessing

`DataPreprocessor` transforms *individual* features into numerical representations for the machine learning and recourse generation workflows. 
It can be considered a drop-in, JAX-friendly replacement for the
[sklearn.preprocessing](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing) module.
The supported preprocessing methods include `MinMaxScaler` and `OneHotEncoder`. 

However, unlike the [sklearn.preprocessing](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing) module,
every `DataPreprocessor` works only with a single feature (e.g., an array of shape `(B,)` or `(B, 1)`).


In [2]:
#| export
from __future__ import annotations
import jax
import jax.numpy as jnp
import numpy as np

In [3]:
import sklearn.preprocessing as skp
from fastcore.test import test_fail
from sklearn.compose import ColumnTransformer

In [4]:
class DataPreprocessor:
    """Base class for preprocessors that operate on a single feature.

    The public methods delegate to the `_fit`, `_transform` and
    `_inverse_transform` hooks. A subclass either implements those hooks or
    overrides the public methods directly.
    """

    def _fit(self, xs, y=None):
        """Hook behind `fit`; subclasses must implement it."""
        raise NotImplementedError(f"{type(self).__name__} does not implement `_fit`.")

    def _transform(self, xs):
        """Hook behind `transform`; subclasses must implement it."""
        raise NotImplementedError(f"{type(self).__name__} does not implement `_transform`.")

    def _inverse_transform(self, xs):
        """Hook behind `inverse_transform`; subclasses must implement it."""
        raise NotImplementedError(f"{type(self).__name__} does not implement `_inverse_transform`.")

    def fit(self, xs, y=None):
        """Fit the preprocessor with `xs` and `y`, and return `self` for chaining."""
        self._fit(xs, y)
        return self

    def transform(self, xs):
        """Transform `xs` and return the result."""
        return self._transform(xs)

    def fit_transform(self, xs, y=None):
        """Fit the preprocessor with `xs` and `y`, then transform `xs`."""
        self.fit(xs, y)
        return self.transform(xs)

    def inverse_transform(self, xs):
        """Inverse transform `xs` and return the result."""
        return self._inverse_transform(xs)

In [5]:
#| export
def _check_xs(xs: np.ndarray, name: str):
    """Check if `xs` is a 1D array with shape (n_samples,) or a 2D array with shape (n_samples, 1)."""
    if xs.ndim > 2 or (xs.ndim == 2 and xs.shape[1] != 1):
        raise ValueError(f"`{name}` only supports array with a single feature, but got shape={xs.shape}.")

In [6]:
class MinMaxScaler(DataPreprocessor):
    """Scale a single feature to [0, 1] using the min and max seen in `fit`.

    A constant feature (min == max) has a zero range; a scale of 1 is used
    instead so nothing is divided by zero. The fitted values then map to 0
    and `inverse_transform` maps them back to the constant, matching how
    scikit-learn's `MinMaxScaler` treats zero-range features.
    """

    def fit(self, xs, y=None):
        """Record the minimum and maximum of `xs`; `y` is ignored.

        Raises `ValueError` (via `_check_xs`) if `xs` has more than one feature.
        Returns `self`.
        """
        _check_xs(xs, name="MinMaxScaler")
        self.min_ = xs.min(axis=0)
        self.max_ = xs.max(axis=0)
        return self

    def _scale(self):
        """Return the fitted range, with a zero range replaced by 1."""
        data_range = self.max_ - self.min_
        return np.where(data_range == 0, 1, data_range)

    def transform(self, xs):
        """Map `xs` so that the fitted min becomes 0 and the fitted max becomes 1."""
        return (xs - self.min_) / self._scale()

    def inverse_transform(self, xs):
        """Undo `transform`, mapping scaled values back to the original range."""
        # The same zero-safe scale is used in both directions, so a constant
        # feature round-trips to its constant instead of a hard-coded value.
        return xs * self._scale() + self.min_

In [7]:
#!!! Do not edit things below.
# `xs` represents 100 data points with 1 feature.
xs = np.random.randn(100, )
scaler = MinMaxScaler()
transformed_xs = scaler.fit_transform(xs)
# A 1D input keeps its 1D shape after scaling.
assert transformed_xs.shape == (100, )
# Round trip: inverse_transform must recover the original values.
assert np.allclose(xs, scaler.inverse_transform(transformed_xs))
# Test correctness against scikit-learn's MinMaxScaler.
assert np.allclose(
    transformed_xs, 
    skp.MinMaxScaler().fit_transform(xs.reshape(100, 1)).reshape(100,)
)
# The same data can also be represented as a 2D array of shape (100, 1).
xs = xs.reshape(100, 1)
scaler = MinMaxScaler()
transformed_xs = scaler.fit_transform(xs)
assert np.allclose(xs, scaler.inverse_transform(transformed_xs))
assert np.allclose(
    transformed_xs, 
    skp.MinMaxScaler().fit_transform(xs.reshape(100, 1))
)

# It will fail if `xs` has more than 1 feature.
xs = xs.reshape(50, 2)
scaler = MinMaxScaler()
test_fail(lambda: scaler.fit_transform(xs), 
          contains="`MinMaxScaler` only supports array with a single feature")

# Constant feature (min == max): the scaler must still round-trip and match scikit-learn.
xs = np.ones((100, 1))
scaler = MinMaxScaler()
transformed_xs = scaler.fit_transform(xs)
assert np.allclose(xs, scaler.inverse_transform(transformed_xs))
assert np.allclose(
    transformed_xs, 
    skp.MinMaxScaler().fit_transform(xs.reshape(100, 1))
)

In [8]:
class OneHotEncoder(DataPreprocessor):
    """One-hot encoder for a single categorical feature.

    Wraps a scikit-learn `OneHotEncoder` that is fitted once in `fit` and
    reused by `transform` and `inverse_transform`.
    """

    def fit(self, xs, y=None):
        """Fit the OneHotEncoder with `xs`; `y` is ignored.

        Stores the learned categories in `categories_` and returns `self`.
        Raises `ValueError` (via `_check_xs`) if `xs` has more than one feature.
        """
        _check_xs(xs, name="OneHotEncoder")
        # Fit once here instead of refitting a new encoder on every call.
        self._encoder = skp.OneHotEncoder(sparse_output=False)
        self._encoder.fit(xs.reshape(-1, 1))
        # Take the categories from the fitted encoder so `categories_` always
        # has one entry per output column.
        self.categories_ = self._encoder.categories_[0]
        return self

    def transform(self, xs):
        """Transform `xs` into a dense one-hot array with one column per category."""
        _check_xs(xs, name="OneHotEncoder")
        return self._encoder.transform(xs.reshape(-1, 1))

    @staticmethod
    def _contains_nan(values):
        """Return True if any entry of `values` is NaN.

        NaN is detected as the value that is not equal to itself, which also
        works for string and mixed object arrays where `np.isnan` would raise.
        """
        if values.dtype == object:
            return any(value != value for value in values.ravel())
        return bool((values != values).any())

    def inverse_transform(self, xs):
        """Inverse transform one-hot rows `xs` back to a column of categories.

        Returns an array of shape (n_samples, 1). If any decoded category is
        NaN, the result is converted to strings.
        """
        decoded = self._encoder.inverse_transform(xs).reshape(-1, 1)
        if self._contains_nan(decoded):
            return decoded.astype(str)
        return decoded

In [9]:
#!!! Do not edit things below.
# Integer categories: output must match scikit-learn and round-trip exactly.
# NOTE(review): the reference encoder below is built with `sparse=False`, while
# `OneHotEncoder.transform` in this notebook uses `sparse_output=False`; confirm
# that the installed scikit-learn version accepts both keyword names.
xs = np.random.choice([0, 1, 2], size=(100, 1))
enc = OneHotEncoder().fit(xs)
transformed_xs = enc.transform(xs)
assert np.array_equal(
    transformed_xs,
    skp.OneHotEncoder(sparse=False).fit_transform(xs)
)
assert np.all(enc.inverse_transform(transformed_xs) == xs)

# Object array with missing values: NaN is encoded as its own category.
xs = np.array([0, 1, 2, np.nan, 0, 1, 2, np.nan], dtype=object).reshape(-1, 1)
enc = OneHotEncoder().fit(xs)
transformed_xs = enc.transform(xs)
assert np.array_equal(
    transformed_xs,
    skp.OneHotEncoder(sparse=False).fit_transform(xs)
)
# The inverse is compared as strings when NaN is present.
assert np.all(enc.inverse_transform(transformed_xs) == xs.astype(str))

# It will fail if `xs` has more than 1 feature.
xs = xs.reshape(-1, 2)
enc = OneHotEncoder()
test_fail(lambda: enc.fit_transform(xs), 
          contains="`OneHotEncoder` only supports array with a single feature")



In [10]:
class Feature:
    """A named feature column with an optional preprocessor.

    Both `transform` and `inverse_transform` return 2D arrays; a 1D result is
    reshaped to a single column whether or not a preprocessor is set.
    """

    def __init__(
            self,
            name: str,
            data: np.ndarray,
            preprocessor: DataPreprocessor = None,
    ):
        """Store the feature's `name`, its `data` and an optional `preprocessor`."""
        self.name = name
        self.data = data
        self.preprocessor = preprocessor

    @staticmethod
    def _as_column(xs):
        """Return `xs` as a 2D column when it is 1D, otherwise unchanged."""
        return xs.reshape(-1, 1) if xs.ndim == 1 else xs

    def transform(self, xs):
        """Transform `xs` with the preprocessor (if any) and return a 2D array."""
        if self.preprocessor is not None:
            # NOTE(review): this re-fits the preprocessor on `xs` at every call
            # (`fit_transform`); `self.data` is not used for fitting here.
            xs = self.preprocessor.fit_transform(xs)
        # Reshape outside the branch so a feature without a preprocessor also
        # yields a column that can be concatenated with the others.
        return self._as_column(xs)

    def inverse_transform(self, xs):
        """Inverse transform `xs` with the preprocessor (if any) and return a 2D array.

        When a preprocessor is set, its output is cast to float64 if it has
        any other dtype.
        """
        if self.preprocessor is not None:
            xs = self.preprocessor.inverse_transform(xs)
            if xs.dtype != np.float64:
                xs = xs.astype(np.float64)
        return self._as_column(xs)

In [11]:
class FeaturesList:
    """An ordered list of features applied column by column to a 2D array."""

    def __init__(self, features: list[Feature]):
        """Store `features`; feature `i` corresponds to input column `i`."""
        self.features = features

    def transform(self, xs: np.ndarray) -> np.ndarray:
        """Transform the input data using the preprocessors of the features.

        Column `i` of `xs` is passed to feature `i`; the per-feature outputs
        are concatenated along the last axis.
        """
        columns = [
            feature.transform(xs[:, idx])
            for idx, feature in enumerate(self.features)
        ]
        return np.concatenate(columns, axis=-1)

    def inverse_transform(self, xs: np.ndarray) -> np.ndarray:
        """Inverse transform the input data using the preprocessors of the features.

        Each feature consumes as many columns of `xs` as its preprocessor has
        `categories_`, or a single column when the preprocessor has no
        `categories_` attribute (or there is no preprocessor).
        """
        inv_xs = []
        start = 0
        for feature in self.features:
            # Derive the encoded width from the preprocessor itself rather
            # than from hard-coded feature names.
            categories = getattr(feature.preprocessor, "categories_", None)
            if categories is None:
                inv_xs.append(feature.inverse_transform(xs[:, start]))
                start += 1
            else:
                width = len(categories)
                inv_xs.append(feature.inverse_transform(xs[:, start:start + width]))
                start += width
        return np.concatenate(inv_xs, axis=-1)

In [12]:
#!!! Do not edit things below.
# Four raw columns: continuous, categorical {0, 1, 2}, continuous, categorical {0, 1, NaN}.
train_xs = np.concatenate([
    np.random.randn(100, 1), 
    np.random.choice([0, 1, 2], size=(100, 1)), 
    np.random.randn(100, 1), 
    np.random.choice([0, 1, np.nan], size=(100, 1)),
], axis=-1)
test_xs = np.concatenate([
    np.random.randn(100, 1), 
    np.random.choice([0, 1, 2], size=(100, 1)), 
    np.random.randn(100, 1), 
    np.random.choice([0, 1, np.nan], size=(100, 1)),
], axis=-1)

# One Feature per column, each holding its training column and a preprocessor.
feats = [
    Feature("a", train_xs[:, 0], MinMaxScaler()),
    Feature("b", train_xs[:, 1], OneHotEncoder()),
    Feature("c", train_xs[:, 2], MinMaxScaler()),
    Feature("d", train_xs[:, -1], OneHotEncoder()),
]
feats_list = FeaturesList(feats)
transformed_xs = feats_list.transform(test_xs)
# 1 (scaled) + 3 (one-hot) + 1 (scaled) + 3 (one-hot) = 8 columns.
assert transformed_xs.shape == (100, 8)
inv_xs = feats_list.inverse_transform(transformed_xs)
# NOTE(review): the last column of `test_xs` is drawn from [0, 1, np.nan], and
# `np.allclose` treats NaN as unequal to NaN unless `equal_nan=True` is passed,
# so this assertion fails whenever a NaN was sampled.
assert np.allclose(test_xs, inv_xs)

AssertionError: 

In [13]:
#!!! Do not edit things below.
# Reference pipeline: the same four per-column preprocessors built with scikit-learn.
ct = ColumnTransformer([
    ("a", skp.MinMaxScaler(), [0]),
    ("b", skp.OneHotEncoder(), [1]),
    ("c", skp.MinMaxScaler(), [2]),
    ("d", skp.OneHotEncoder(), [3]),
])
# The reference is fitted on `test_xs` itself, so `transformed_xs` only matches
# if it was also produced by preprocessors fitted on `test_xs`.
sk_transformed_xs = ct.fit_transform(test_xs)
assert np.allclose(transformed_xs, sk_transformed_xs)