-
Notifications
You must be signed in to change notification settings - Fork 23
/
standard_scaler.py
345 lines (286 loc) · 11.2 KB
/
standard_scaler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import json
import os
import pickle
import numpy as np
from pycompss.api.constraint import constraint
from pycompss.api.parameter import Depth, Type, COLLECTION_IN, COLLECTION_OUT
from pycompss.api.task import task
from scipy.sparse import csr_matrix, issparse
from dislib.data.array import Array
import dislib as ds
import dislib.data.util.model as utilmodel
from dislib.data.util import encoder_helper, decoder_helper, sync_obj
class StandardScaler(object):
""" Standardize features by removing the mean and scaling to unit variance
Centering and scaling happen independently on each feature by computing the
relevant statistics on the samples in the training set. Mean and standard
deviation are then stored to be used on later data using the transform
method.
Attributes
----------
mean_ : ds-array, shape (1, n_features)
The mean value for each feature in the training set.
var_ : ds-array, shape (1, n_features)
The variance for each feature in the training set.
"""
def __init__(self):
self.mean_ = None
self.var_ = None
def fit(self, x):
""" Compute the mean and std to be used for later scaling.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
self : StandardScaler
"""
self.mean_ = ds.apply_along_axis(np.mean, 0, x)
var_blocks = [[]]
for row, m_row in zip(x._iterator(1), self.mean_._iterator(1)):
var_blocks[0].append(_compute_var(row._blocks, m_row._blocks))
self.var_ = Array(var_blocks,
top_left_shape=self.mean_._top_left_shape,
reg_shape=self.mean_._reg_shape,
shape=self.mean_.shape, sparse=x._sparse)
return self
def fit_transform(self, x):
""" Fit to data, then transform it.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
x_new : ds-array, shape=(n_samples, n_features)
Scaled data.
"""
return self.fit(x).transform(x)
def transform(self, x):
"""
Standarize data.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
x_new : ds-array, shape=(n_samples, n_features)
Scaled data.
"""
if self.mean_ is None or self.var_ is None:
raise Exception("Model has not been initialized.")
n_blocks = x._n_blocks[1]
blocks = []
m_blocks = self.mean_._blocks
v_blocks = self.var_._blocks
for row in x._iterator(axis=0):
out_blocks = [object() for _ in range(n_blocks)]
_transform(row._blocks, m_blocks, v_blocks, out_blocks)
blocks.append(out_blocks)
return Array(blocks, top_left_shape=x._top_left_shape,
reg_shape=x._reg_shape, shape=x.shape,
sparse=x._sparse)
def inverse_transform(self, x):
"""
Returns data to its original values. The Scaler should be fitted
before using this function.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
x_new : ds-array, shape=(n_samples, n_features)
Original valued data.
"""
if self.mean_ is None or self.var_ is None:
raise Exception("Model has not been initialized.")
n_blocks = x._n_blocks[1]
blocks = []
m_blocks = self.mean_._blocks
v_blocks = self.var_._blocks
for row in x._iterator(axis=0):
out_blocks = [object() for _ in range(n_blocks)]
_inverse_transform(row._blocks, m_blocks, v_blocks, out_blocks)
blocks.append(out_blocks)
return Array(blocks, top_left_shape=x._top_left_shape,
reg_shape=x._reg_shape, shape=x.shape,
sparse=x._sparse)
def save_model(self, filepath, overwrite=True, save_format="json"):
"""Saves a model to a file.
The model is synchronized before saving and can be reinstantiated in
the exact same state, without any of the code used for model
definition or fitting.
Parameters
----------
filepath : str
Path where to save the model
overwrite : bool, optional (default=True)
Whether any existing model at the target
location should be overwritten.
save_format : str, optional (default='json)
Format used to save the models.
Examples
--------
>>> from dislib.classification import CascadeSVM
>>> import numpy as np
>>> import dislib as ds
>>> x = ds.array(np.array([[1, 2], [2, 1], [-1, -2],
>>> [-2, -1]]), (2, 2))
>>> y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))
>>> model = StandardScaler()
>>> model.fit(x)
>>> model.save_model('/tmp/model')
>>> loaded_model = StandardScaler()
>>> loaded_model.load_model('/tmp/model')
>>> x_test = ds.array(np.array([[1, 2], [2, 1], [-1, -2], [-2, -1],
>>> [1, 1], [-1, -1]]), (2, 2))
>>> x_transformed = model.transform(x_test)
>>> x_loaded_pred = loaded_model.transform(x_test)
>>> assert np.allclose(x_transformed.collect(),
>>> x_loaded_pred.collect())
"""
# Check overwrite
if not overwrite and os.path.isfile(filepath):
return
sync_obj(self.__dict__)
model_metadata = self.__dict__
model_metadata["model_name"] = "standardscaler"
# Save model
if save_format == "json":
with open(filepath, "w") as f:
json.dump(model_metadata, f, default=_encode_helper)
elif save_format == "cbor":
if utilmodel.cbor2 is None:
raise ModuleNotFoundError("No module named 'cbor2'")
with open(filepath, "wb") as f:
utilmodel.cbor2.dump(model_metadata, f,
default=_encode_helper_cbor)
elif save_format == "pickle":
with open(filepath, "wb") as f:
pickle.dump(model_metadata, f)
else:
raise ValueError("Wrong save format.")
def load_model(self, filepath, load_format="json"):
"""Loads a model from a file.
The model is reinstantiated in the exact same state in which it was
saved, without any of the code used for model definition or fitting.
Parameters
----------
filepath : str
Path of the saved the model
load_format : str, optional (default='json')
Format used to load the model.
Examples
--------
>>> from dislib.preprocessing import StandardScaler
>>> import numpy as np
>>> import dislib as ds
>>> x = ds.array(np.array([[1, 2], [2, 1], [-1, -2],
>>> [-2, -1]]), (2, 2))
>>> y = ds.array(np.array([0, 1, 1, 0]).reshape(-1, 1), (2, 1))
>>> model = StandardScaler()
>>> model.fit(x)
>>> model.save_model('/tmp/model')
>>> loaded_model = StandardScaler()
>>> loaded_model.load_model('/tmp/model')
>>> x_test = ds.array(np.array([[1, 2], [2, 1], [-1, -2], [-2, -1],
>>> [1, 1], [-1, -1]]), (2, 2))
>>> x_transformed = model.transform(x_test)
>>> x_loaded_pred = loaded_model.transform(x_test)
>>> assert np.allclose(x_transformed.collect(),
>>> x_loaded_pred.collect())
"""
# Load model
if load_format == "json":
with open(filepath, "r") as f:
model_metadata = json.load(f, object_hook=_decode_helper)
elif load_format == "cbor":
if utilmodel.cbor2 is None:
raise ModuleNotFoundError("No module named 'cbor2'")
with open(filepath, "rb") as f:
model_metadata = utilmodel.cbor2. \
load(f, object_hook=_decode_helper_cbor)
elif load_format == "pickle":
with open(filepath, "rb") as f:
model_metadata = pickle.load(f)
else:
raise ValueError("Wrong load format.")
for key, val in model_metadata.items():
setattr(self, key, val)
def _encode_helper_cbor(encoder, obj):
encoder.encode(_encode_helper(obj))
def _encode_helper(obj):
encoded = encoder_helper(obj)
if encoded is not None:
return encoded
def _decode_helper_cbor(decoder, obj):
"""Special decoder wrapper for dislib using cbor2."""
return _decode_helper(obj)
def _decode_helper(obj):
if isinstance(obj, dict) and "class_name" in obj:
class_name = obj["class_name"]
decoded = decoder_helper(class_name, obj)
if decoded is not None:
return decoded
elif class_name == "RandomState":
random_state = np.random.RandomState()
random_state.set_state(_decode_helper(obj["items"]))
return random_state
return obj
@constraint(computing_units="${ComputingUnits}")
@task(blocks={Type: COLLECTION_IN, Depth: 2},
m_blocks={Type: COLLECTION_IN, Depth: 2},
returns=1)
def _compute_var(blocks, m_blocks):
x = Array._merge_blocks(blocks)
mean = Array._merge_blocks(m_blocks)
sparse = issparse(x)
if sparse:
x = x.toarray()
mean = mean.toarray()
var = np.mean(np.array(x - mean) ** 2, axis=0)
if sparse:
return csr_matrix(var)
else:
return var
@constraint(computing_units="${ComputingUnits}")
@task(blocks={Type: COLLECTION_IN, Depth: 2},
m_blocks={Type: COLLECTION_IN, Depth: 2},
v_blocks={Type: COLLECTION_IN, Depth: 2},
out_blocks=COLLECTION_OUT)
def _transform(blocks, m_blocks, v_blocks, out_blocks):
x = Array._merge_blocks(blocks)
mean = Array._merge_blocks(m_blocks)
var = Array._merge_blocks(v_blocks)
sparse = issparse(x)
if sparse:
x = x.toarray()
mean = mean.toarray()
var = var.toarray()
scaled_x = (x - mean) / np.sqrt(var)
constructor_func = np.array if not sparse else csr_matrix
start, end = 0, 0
for i, block in enumerate(blocks[0]):
end += block.shape[1]
out_blocks[i] = constructor_func(scaled_x[:, start:end])
start += block.shape[1]
@constraint(computing_units="${ComputingUnits}")
@task(blocks={Type: COLLECTION_IN, Depth: 2},
m_blocks={Type: COLLECTION_IN, Depth: 2},
v_blocks={Type: COLLECTION_IN, Depth: 2},
out_blocks=COLLECTION_OUT)
def _inverse_transform(blocks, m_blocks, v_blocks, out_blocks):
x = Array._merge_blocks(blocks)
mean = Array._merge_blocks(m_blocks)
var = Array._merge_blocks(v_blocks)
sparse = issparse(x)
if sparse:
x = x.toarray()
mean = mean.toarray()
var = var.toarray()
x = x * np.sqrt(var) + mean
constructor_func = np.array if not sparse else csr_matrix
start, end = 0, 0
for i, block in enumerate(blocks[0]):
end += block.shape[1]
out_blocks[i] = constructor_func(x[:, start:end])
start += block.shape[1]