-
Notifications
You must be signed in to change notification settings - Fork 23
/
classes.py
141 lines (109 loc) · 3.98 KB
/
classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import numpy as np
from pycompss.api.parameter import Depth, Type, COLLECTION_IN, COLLECTION_OUT
from pycompss.api.task import task
from scipy.sparse import csr_matrix, issparse
from dislib.data.array import Array
import dislib as ds
class StandardScaler(object):
""" Standardize features by removing the mean and scaling to unit variance
Centering and scaling happen independently on each feature by computing the
relevant statistics on the samples in the training set. Mean and standard
deviation are then stored to be used on later data using the transform
method.
Attributes
----------
mean_ : ds-array, shape (1, n_features)
The mean value for each feature in the training set.
var_ : ds-array, shape (1, n_features)
The variance for each feature in the training set.
"""
def __init__(self):
self.mean_ = None
self.var_ = None
def fit(self, x):
""" Compute the mean and std to be used for later scaling.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
self : StandardScaler
"""
self.mean_ = ds.apply_along_axis(np.mean, 0, x)
var_blocks = [[]]
for row, m_row in zip(x._iterator(1), self.mean_._iterator(1)):
var_blocks[0].append(_compute_var(row._blocks, m_row._blocks))
self.var_ = Array(var_blocks,
top_left_shape=self.mean_._top_left_shape,
reg_shape=self.mean_._reg_shape,
shape=self.mean_.shape, sparse=x._sparse)
return self
def fit_transform(self, x):
""" Fit to data, then transform it.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
x_new : ds-array, shape=(n_samples, n_features)
Scaled data.
"""
return self.fit(x).transform(x)
def transform(self, x):
"""
Standarize data.
Parameters
----------
x : ds-array, shape=(n_samples, n_features)
Returns
-------
x_new : ds-array, shape=(n_samples, n_features)
Scaled data.
"""
if self.mean_ is None or self.var_ is None:
raise Exception("Model has not been initialized.")
n_blocks = x._n_blocks[1]
blocks = []
m_blocks = self.mean_._blocks
v_blocks = self.var_._blocks
for row in x._iterator(axis=0):
out_blocks = [object() for _ in range(n_blocks)]
_transform(row._blocks, m_blocks, v_blocks, out_blocks)
blocks.append(out_blocks)
return Array(blocks, top_left_shape=x._top_left_shape,
reg_shape=x._reg_shape, shape=x.shape,
sparse=x._sparse)
@task(blocks={Type: COLLECTION_IN, Depth: 2},
m_blocks={Type: COLLECTION_IN, Depth: 2},
returns=1)
def _compute_var(blocks, m_blocks):
x = Array._merge_blocks(blocks)
mean = Array._merge_blocks(m_blocks)
sparse = issparse(x)
if sparse:
x = x.toarray()
mean = mean.toarray()
var = np.mean(np.array(x - mean) ** 2, axis=0)
if sparse:
return csr_matrix(var)
else:
return var
@task(blocks={Type: COLLECTION_IN, Depth: 2},
m_blocks={Type: COLLECTION_IN, Depth: 2},
v_blocks={Type: COLLECTION_IN, Depth: 2},
out_blocks=COLLECTION_OUT)
def _transform(blocks, m_blocks, v_blocks, out_blocks):
x = Array._merge_blocks(blocks)
mean = Array._merge_blocks(m_blocks)
var = Array._merge_blocks(v_blocks)
sparse = issparse(x)
if sparse:
x = x.toarray()
mean = mean.toarray()
var = var.toarray()
scaled_x = (x - mean) / np.sqrt(var)
constructor_func = np.array if not sparse else csr_matrix
start, end = 0, 0
for i, block in enumerate(blocks[0]):
end += block.shape[1]
out_blocks[i] = constructor_func(scaled_x[:, start:end])