-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathbin_number_transformer.py
246 lines (197 loc) · 8.37 KB
/
bin_number_transformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
from __future__ import absolute_import, division, print_function
import logging
import numpy as np
import pandas as pd
from cyclic_boosting import flags
from ._binary_search import eq_multi, ge_multi
from ._utils import _read_feature_property, check_frame_empty
from .ecdf_transformer import ECdfTransformer
from typing import Union, Optional
MISSING_VALUE_AS_BINNO = -1
_logger = logging.getLogger(__name__)
class BinNumberTransformer(ECdfTransformer):
r"""This transformer bins feature-variables in ``X`` into integral bins,
depending on each feature's *feature property*. Features
with discrete preprocessing (not continuous, but ordered or unordered) are
enumerated by their unique values, ascending from the lowest (Thus, a
column with ``10, 11, 12`` would be binned as ``0, 1, 2``).
If no ``feature_properties`` are passed, all columns in ``X`` are treated
as :obj:`cyclic_boosting.flags.IS_CONTINUOUS`. If a ``feature_properties``
dictionary is supplied, it must contain feature properties for each feature
in ``X``.
Not-a-number values in the input feature matrix are mapped to
:obj:`cyclic_boosting.binning.MISSING_VALUE_AS_BINNO` in the transform
step. This value can then be treated as a missing value by Cyclic Boosting.
The feature property :obj:`cyclic_boosting.flags.HAS_MAGIC_INT_MISSING`
enables missing-value treatment for values of -999 and -9 in integer-typed
feature columns (for both continuous and non-continuous features).
Binning is performed for each feature-column individually. For example, two
columns with the same value range can end up with totally different bin
numbers. Also, the ``n_bins`` argument which is typically an integer, can
be indivualized by passing a dict that provides column-names and the
respective number of bins, that should be used for continuous
preprocessing.
During the fit, all features are treated in the same way as in
:class:`ECdfTransformer`. During the transform step, each feature value is
transformed to the number of its feature bin. The range of bin numbers
is::
[0, trans.bins_and_cdfs_[feature_no][1].shape[0] - 1)
For the **estimated parameters** see :class:`ECdfTransformer`.
Parameters
----------
n_bins: int
Maximum number of bins used to estimate the empirical CDF. ``n_bins`` is
ignored for features with discrete preprocessing.
If a dict is passed, the feature names/indices should be the keys and the
n_bins are the values. Example : ``{'feature a': 150, 'feature b': 20}``
feature_properties: dict
Dictionary listing the names of all features as keys and their
preprocessing flags as values. When using a numpy feature matrix X with
no column names the keys of the feature properties are the column
indices.
weight_column: str or int
Optional column label or column index for the weight column. If not set
all samples receive the same weight 1.
epsilon: float
Used thresholds for the comparison of float values:
* ``epsilon * 1.0`` for the comparison of CDF values
* ``epsilon * minimal_bin_width`` for the comparison with bin
boundaries of a given feature
Default value for epsilon: 1e-9
tolerance: double
Relative tolerance of the minimum bin weight. (E.g.
if you specify 100 bins and a tolerance of 0.05 the bins are
required to have only 0.95% of the total bin weights instead of
1.0%)
Examples
--------
>>> feature_1 = np.asarray([2.1, 2.2, 2.5, 3.1, 3.3, 3.7, 4.1, 4.4])
>>> X = np.c_[feature_1]
>>> from cyclic_boosting.binning import BinNumberTransformer
>>> trans = BinNumberTransformer(n_bins=4, epsilon=1e-8)
>>> trans = trans.fit(X)
>>> # only one input column
>>> column, epsilon, bins_cdfs = trans.bins_and_cdfs_[0]
>>> assert column == 0, np.allclose(epsilon, 1e-8 * 0.1)
>>> bins_cdfs
array([[ 2.1 , 0. ],
[ 2.2 , 0.25],
[ 3.1 , 0.5 ],
[ 3.7 , 0.75],
[ 4.4 , 1. ]])
>>> X_test = np.c_[[1.9, 2.15, 2.4, 2.2, 3.6, 3.5, 4.3, 5.1]]
>>> trans.transform(X_test)
array([[0],
[0],
[1],
[0],
[2],
[2],
[3],
[3]], dtype=int8)
"""
def __init__(
self,
n_bins=100,
feature_properties=None,
weight_column=None,
epsilon=1e-9,
tolerance=0.1,
inplace=False,
):
self.n_bins = n_bins
self.feature_properties = feature_properties
self.weight_column = weight_column
self.epsilon = epsilon
self.tolerance = tolerance
self.nan_representation = MISSING_VALUE_AS_BINNO
self.inplace = inplace
ECdfTransformer.__init__(
self,
n_bins=self.n_bins,
feature_properties=self.feature_properties,
weight_column=self.weight_column,
epsilon=self.epsilon,
tolerance=self.tolerance,
)
def _transform_one_feature(self, X, feature_prop, col, epsilon, bins_and_cdfs):
xt = column_selector(X, col).astype(np.float64)
def is_finite(x):
if flags.has_magic_missing_set(feature_prop):
return (x != -9) & (x != -999) & np.isfinite(xt)
else:
return np.isfinite(xt)
if bins_and_cdfs is not None:
finite_mask = is_finite(xt)
xt_f = xt[finite_mask]
if flags.is_continuous_set(feature_prop):
ge_multi(bins_and_cdfs[1:, 0], xt_f - epsilon, 1, xt_f)
else:
eq_multi(
bins_and_cdfs[1:, 0],
xt_f,
np.arange(len(bins_and_cdfs[1:, 0]), dtype=np.float64),
epsilon,
xt_f,
)
xt[finite_mask] = xt_f
# re_check for nans, which may have been brought in by the
# binary search (values out of bounds)
xt[~is_finite(xt)] = MISSING_VALUE_AS_BINNO
else:
xt = MISSING_VALUE_AS_BINNO
return xt
def transform(
self, X_orig: Union[pd.DataFrame, np.ndarray], y: Optional[np.ndarray] = None
) -> Union[pd.DataFrame, np.ndarray]:
self._check_input_for_transform(X_orig)
if not self.inplace:
X = X_orig.copy()
else:
X = X_orig
if check_frame_empty(X):
if isinstance(X, pd.DataFrame):
X = X.astype({col: np.int8 for col, _, _ in self.bins_and_cdfs_})
else:
return _as_int_array_of_minimum_dtype(X)
return X
n_transformed_features = len(self.bins_and_cdfs_)
for col, epsilon, bins_and_cdfs in self.bins_and_cdfs_:
feature_prop = _read_feature_property(col, self.feature_properties)
if feature_prop is None:
pass
else:
xt = self._transform_one_feature(X, feature_prop, col, epsilon, bins_and_cdfs)
column_setter(X, col, xt)
if not isinstance(X, pd.DataFrame) and n_transformed_features == X.shape[1]:
X = _as_int_array_of_minimum_dtype(X)
return X
def get_feature_bin_boundaries(self):
return {feature: probas for feature, epsilon, probas in self.bins_and_cdfs_}
def column_selector(X, column):
"""Dispatches to column selection via pandas or numpy, depending on the type of X"""
if isinstance(X, pd.DataFrame):
return X[column].values
else:
return X[:, int(column)]
def _as_int_array_of_minimum_dtype(arr):
if isinstance(arr, int):
maximum = abs(arr)
elif len(arr) == 0:
maximum = 0
else:
maximum = max(arr.max(), abs(arr.min()))
if maximum <= np.iinfo(np.int8).max:
return np.asarray(arr, dtype=np.int8)
elif maximum <= np.iinfo(np.int16).max:
return np.asarray(arr, dtype=np.int16)
elif maximum <= np.iinfo(np.int32).max:
return np.asarray(arr, dtype=np.int32)
else:
return np.asarray(arr, dtype=np.int64)
def column_setter(X, column, rhs):
"""Dispatches to column selection via pandas or numpy, depending on the type of X"""
if isinstance(X, pd.DataFrame):
X[column] = _as_int_array_of_minimum_dtype(rhs)
else:
X[:, int(column)] = rhs