# -*- encoding: utf-8 -*-
import logging
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy.sparse import spmatrix
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

from autosklearn.data.feature_validator import SUPPORTED_FEAT_TYPES, FeatureValidator
from autosklearn.data.target_validator import SUPPORTED_TARGET_TYPES, TargetValidator
from autosklearn.util.logging_ import get_named_client_logger


def convert_if_sparse(
    y: SUPPORTED_TARGET_TYPES,
) -> Union[np.ndarray, List, pd.DataFrame, pd.Series]:
"""If the labels `y` are sparse, it will convert it to its dense representation
Parameters
----------
y: {array-like, sparse matrix} of shape (n_samples,) or (n_samples, n_outputs)
The labels to 'densify' if sparse
Returns
-------
np.ndarray of shape (n_samples, ) or (n_samples, n_outputs)
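
    Examples
    --------
    A minimal sketch; scipy sparse matrices are always 2-D, so a sparse
    label vector densifies to a single row that gets flattened, while
    dense inputs pass through unchanged:

    >>> from scipy import sparse
    >>> convert_if_sparse(sparse.csr_matrix([[1, 0, 3]]))
    array([1, 0, 3])
    >>> convert_if_sparse([1, 0, 3])
    [1, 0, 3]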
"""
    if isinstance(y, spmatrix):
        y_ = y.toarray()

        # scipy sparse matrices are always 2-D, so a sparse vector of labels
        # comes back from `toarray` as a (1, n_samples) row, e.g. [[1, 2, 3]].
        # Flatten it back to one dimension before returning.
        if y_.shape[0] == 1:
            y_ = y_.flatten()
    else:
        y_ = y

    return y_


class InputValidator(BaseEstimator):
"""
Makes sure the input data complies with Auto-sklearn requirements.
Categorical inputs are encoded via a Label Encoder, if the input
is a dataframe.
This class also perform checks for data integrity and flags the user
via informative errors.
Attributes
----------
feat_type: Optional[List[str]] = None
In case the dataset is not a pandas DataFrame:
+ If provided, this list indicates which columns should be treated as
categorical it is internally transformed into a dictionary that
indicates a mapping from column index to categorical/numerical.
+ If not provided, by default all columns are treated as numerical
If the input dataset is of type pandas dataframe, this argument
must be none, as the column type will be inferred from the pandas dtypes.
is_classification: bool
For classification task, this flag indicates that the target data
should be encoded
feature_validator: FeatureValidator
A FeatureValidator instance used to validate and encode feature columns to match
sklearn expectations on the data
target_validator: TargetValidator
A TargetValidator instance used for classification to validate and encode the
target values
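
    Examples
    --------
    A minimal sketch of the intended workflow; ``fit`` learns any encoders
    needed from the training split and ``transform`` then applies them:

    >>> import numpy as np
    >>> X = np.array([[1.0, 2.0], [3.0, 4.0]])
    >>> y = np.array(["cat", "dog"])
    >>> validator = InputValidator(is_classification=True).fit(X, y)
    >>> X_t, y_t = validator.transform(X, y)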
"""

    def __init__(
        self,
        feat_type: Optional[List[str]] = None,
        is_classification: bool = False,
        logger_port: Optional[int] = None,
        allow_string_features: bool = True,
    ) -> None:
        self.feat_type = feat_type
        self.is_classification = is_classification
        self.logger_port = logger_port
        if self.logger_port is not None:
            self.logger = get_named_client_logger(
                name="Validation",
                port=self.logger_port,
            )
        else:
            self.logger = logging.getLogger("Validation")
        self.allow_string_features = allow_string_features

        self.feature_validator = FeatureValidator(
            feat_type=self.feat_type,
            logger=self.logger,
            allow_string_features=self.allow_string_features,
        )
        self.target_validator = TargetValidator(
            is_classification=self.is_classification, logger=self.logger
        )
        self._is_fitted = False

    def fit(
        self,
        X_train: SUPPORTED_FEAT_TYPES,
        y_train: SUPPORTED_TARGET_TYPES,
        X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
        y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> BaseEstimator:
"""
Validates and fit a categorical encoder (if needed) to the features, and
a encoder for targets in the case of classification. Specifically:
For features:
Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy
sparse) as well as dimensionality checks
If the provided data is a pandas DataFrame with categorical/boolean/int columns,
such columns will be encoded using an Ordinal Encoder
For targets:
* Checks for dimensionality as well as missing values are performed.
* If performing a classification task, the data is going to be encoded
Parameters
----------
X_train: SUPPORTED_FEAT_TYPES
A set of features that are going to be validated (type and dimensionality
checks). If this data contains categorical columns, an encoder is going to
be instantiated and trained with this data.
y_train: SUPPORTED_TARGET_TYPES
A set of targets to encoded if the task is for classification.
X_test: Optional[SUPPORTED_FEAT_TYPES]
A hold out set of features used for checking
y_test: SUPPORTED_TARGET_TYPES
A hold out set of targets used for checking. Additionally, if the current
task is a classification task, this y_test categories are also going to be
used to fit a pre-processing encoding (to prevent errors on unseen classes).
Returns
-------
self
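
        Examples
        --------
        A minimal sketch; mismatched sample counts are rejected before any
        validator is fit:

        >>> import numpy as np
        >>> try:
        ...     InputValidator().fit(np.ones((3, 2)), np.array([0, 1]))
        ... except ValueError:
        ...     pass  # 3 feature rows but only 2 targets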
"""
        # Check that the data is valid
        if np.shape(X_train)[0] != np.shape(y_train)[0]:
            raise ValueError(
                "Inconsistent number of train datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_train)[0],
                    np.shape(y_train)[0],
                )
            )
        # Only compare test shapes when both splits were given; np.shape(None)
        # would otherwise raise an uninformative IndexError.
        if (
            X_test is not None
            and y_test is not None
            and np.shape(X_test)[0] != np.shape(y_test)[0]
        ):
            raise ValueError(
                "Inconsistent number of test datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_test)[0],
                    np.shape(y_test)[0],
                )
            )

        self.feature_validator.fit(X_train, X_test)
        self.target_validator.fit(y_train, y_test)
        self._is_fitted = True

        return self

    def transform(
        self,
        X: SUPPORTED_FEAT_TYPES,
        y: Optional[Union[List, pd.Series, pd.DataFrame, np.ndarray]] = None,
    ) -> Tuple[Union[np.ndarray, pd.DataFrame, spmatrix], Optional[np.ndarray]]:
"""
Transform the given target or features to a numpy array
Parameters
----------
X: SUPPORTED_FEAT_TYPES
A set of features to transform
y: Optional[SUPPORTED_TARGET_TYPES]
A set of targets to transform
Return
------
np.ndarray:
The transformed features array
np.ndarray:
The transformed targets array
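
        Examples
        --------
        A minimal sketch; ``transform`` refuses to run before ``fit``:

        >>> from sklearn.exceptions import NotFittedError
        >>> try:
        ...     InputValidator().transform([[1, 2]])
        ... except NotFittedError:
        ...     pass  # fit the validator first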
"""
        if not self._is_fitted:
            raise NotFittedError(
                "Cannot call transform on a validator that is not fitted"
            )

        X_transformed = self.feature_validator.transform(X)
        if y is not None:
            y_transformed = self.target_validator.transform(y)
            return X_transformed, y_transformed
        else:
            return X_transformed, None