forked from AzimAhmadzadeh/mvtsdata_toolkit
-
Notifications
You must be signed in to change notification settings - Fork 3
/
normalizer.py
executable file
·222 lines (174 loc) · 9.03 KB
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import pandas as pd
import numpy as np
'''
This module contains several normalization methods.
See ReadMe.md for usage example.
'''
def zero_one_normalize(df: pd.DataFrame, excluded_colnames: list = None) -> pd.DataFrame:
    """
    Scales every numeric column of `df` into the range [0, 1] using the
    `MinMaxScaler` from the module `sklearn.preprocessing`. The
    transformation is given by::

        X_scaled = (X - X.min(axis=0)) / ranges

    where::

        ranges = X.max(axis=0) - X.min(axis=0)

    Note: In case multiple dataframes are used (i.e., several partitions of
    the dataset in training and testing), make sure that all of them will
    be passed to this method at once, and as one single dataframe. Otherwise,
    the normalization will be carried out on local (as opposed to global)
    extrema, which is incorrect.

    :param df: The dataframe to be normalized. It is not modified.
    :param excluded_colnames: The names of the columns (e.g. TimeStamp, ID,
        labels) that must be left out of the normalization. Non-numeric
        columns are left out automatically. All of them are added back,
        unchanged, to the returned dataframe.
    :return: A new dataframe with the same index and the same column order
        as `df`, in which the numeric (and not excluded) columns are
        transformed into the [0, 1] range and stored as floats.
    """
    from sklearn.preprocessing import MinMaxScaler

    excluded_colnames = excluded_colnames if excluded_colnames else []
    colnames_original_order = list(df)

    # Candidates for scaling: everything the caller did not exclude ...
    included_cnames = [cname for cname in colnames_original_order
                       if cname not in excluded_colnames]
    # ... restricted to the numeric columns.
    df_numeric = df[included_cnames].select_dtypes(include=np.number)
    numeric_cnames = list(df_numeric)

    # Everything else (explicitly excluded or non-numeric) passes through.
    excluded_cnames = [cname for cname in colnames_original_order
                       if cname not in numeric_cnames]
    df_excluded = df[excluded_cnames]

    # prepare normalizer and normalize
    scaler = MinMaxScaler()
    res_ndarray = scaler.fit_transform(df_numeric)

    # The scaler returns a bare ndarray, so the row labels are lost. Re-attach
    # the input's index here; otherwise the scaled rows would be labeled
    # 0..n-1 and would not line up with the excluded columns whenever `df`
    # does not carry a default index (e.g. after filtering or concatenation).
    df_scaled = pd.DataFrame(res_ndarray, columns=numeric_cnames,
                             index=df.index, dtype=float)

    # Add the excluded columns back; both pieces carry `df.index`.
    df_norm = pd.concat([df_excluded, df_scaled], axis=1)
    # Restore the original order of columns
    df_norm = df_norm[colnames_original_order]
    return df_norm
def negativeone_one_normalize(df: pd.DataFrame, excluded_colnames: list = None) -> pd.DataFrame:
    """
    Scales every numeric column of `df` into the range [-1, 1] using the
    `MinMaxScaler` from the module `sklearn.preprocessing`. The
    transformation is given by::

        X_scaled = scale * X - 1 - X.min(axis=0) * scale

    where::

        scale = 2 / (X.max(axis=0) - X.min(axis=0))

    Note: In case multiple dataframes are used (i.e., several partitions of
    the dataset in training and testing), make sure that all of them will
    be passed to this method at once, and as one single dataframe. Otherwise,
    the normalization will be carried out on local (as opposed to global)
    extrema, which is incorrect.

    :param df: The dataframe to be normalized. It is not modified.
    :param excluded_colnames: The names of the columns (e.g. TimeStamp, ID,
        labels) that must be left out of the normalization. Non-numeric
        columns are left out automatically. All of them are added back,
        unchanged, to the returned dataframe.
    :return: A new dataframe with the same index and the same column order
        as `df`, in which the numeric (and not excluded) columns are
        transformed into the [-1, 1] range and stored as floats.
    """
    from sklearn.preprocessing import MinMaxScaler

    excluded_colnames = excluded_colnames if excluded_colnames else []
    colnames_original_order = list(df)

    # Candidates for scaling: everything the caller did not exclude ...
    included_cnames = [cname for cname in colnames_original_order
                       if cname not in excluded_colnames]
    # ... restricted to the numeric columns.
    df_numeric = df[included_cnames].select_dtypes(include=np.number)
    numeric_cnames = list(df_numeric)

    # Everything else (explicitly excluded or non-numeric) passes through.
    excluded_cnames = [cname for cname in colnames_original_order
                       if cname not in numeric_cnames]
    df_excluded = df[excluded_cnames]

    # prepare normalizer and normalize
    scaler = MinMaxScaler((-1, 1))
    res_ndarray = scaler.fit_transform(df_numeric)

    # The scaler returns a bare ndarray, so the row labels are lost. Re-attach
    # the input's index here; otherwise the scaled rows would be labeled
    # 0..n-1 and would not line up with the excluded columns whenever `df`
    # does not carry a default index (e.g. after filtering or concatenation).
    df_scaled = pd.DataFrame(res_ndarray, columns=numeric_cnames,
                             index=df.index, dtype=float)

    # Add the excluded columns back; both pieces carry `df.index`.
    df_norm = pd.concat([df_excluded, df_scaled], axis=1)
    # Restore the original order of columns
    df_norm = df_norm[colnames_original_order]
    return df_norm
def standardize(df: pd.DataFrame, excluded_colnames: list = None) -> pd.DataFrame:
    """
    Standardizes every numeric column of `df` using the `StandardScaler`
    from the module `sklearn.preprocessing`, i.e., by removing the mean and
    scaling to unit variance. The transformation is given by:

    .. math::

        z = (x - u) / s

    where `x` is a feature vector, `u` is the mean of the vector, and `s`
    represents its standard deviation.

    Note: In case multiple dataframes are used (i.e., several partitions of
    the dataset in training and testing), make sure that all of them will
    be passed to this method at once, and as one single dataframe. Otherwise,
    the standardization will be carried out on local (as opposed to global)
    mean and standard deviation, which is incorrect.

    :param df: The dataframe to be standardized. It is not modified.
    :param excluded_colnames: The names of the columns (e.g. TimeStamp, ID,
        labels) that must be left out of the standardization. Non-numeric
        columns are left out automatically. All of them are added back,
        unchanged, to the returned dataframe.
    :return: A new dataframe with the same index and the same column order
        as `df`, in which the numeric (and not excluded) columns are
        transformed to have their mean at 0 and unit standard deviation,
        and are stored as floats.
    """
    from sklearn.preprocessing import StandardScaler

    excluded_colnames = excluded_colnames if excluded_colnames else []
    colnames_original_order = list(df)

    # Candidates for scaling: everything the caller did not exclude ...
    included_cnames = [cname for cname in colnames_original_order
                       if cname not in excluded_colnames]
    # ... restricted to the numeric columns.
    df_numeric = df[included_cnames].select_dtypes(include=np.number)
    numeric_cnames = list(df_numeric)

    # Everything else (explicitly excluded or non-numeric) passes through.
    excluded_cnames = [cname for cname in colnames_original_order
                       if cname not in numeric_cnames]
    df_excluded = df[excluded_cnames]

    # prepare normalizer and normalize
    scaler = StandardScaler()
    res_ndarray = scaler.fit_transform(df_numeric)

    # The scaler returns a bare ndarray, so the row labels are lost. Re-attach
    # the input's index here; otherwise the scaled rows would be labeled
    # 0..n-1 and would not line up with the excluded columns whenever `df`
    # does not carry a default index (e.g. after filtering or concatenation).
    df_scaled = pd.DataFrame(res_ndarray, columns=numeric_cnames,
                             index=df.index, dtype=float)

    # Add the excluded columns back; both pieces carry `df.index`.
    df_norm = pd.concat([df_excluded, df_scaled], axis=1)
    # Restore the original order of columns
    df_norm = df_norm[colnames_original_order]
    return df_norm
def robust_standardize(df: pd.DataFrame, excluded_colnames: list = None) -> pd.DataFrame:
    """
    Standardizes every numeric column of `df` using the `RobustScaler` from
    the module `sklearn.preprocessing`, i.e., by removing the median and
    scaling the data according to the interquartile range (IQR). This
    transformation is robust to outliers.

    Note: In case multiple dataframes are used (i.e., several partitions of
    the dataset in training and testing), make sure that all of them will
    be passed to this method at once, and as one single dataframe. Otherwise,
    the scaling will be carried out on local (as opposed to global) medians
    and quartiles, hence an unrepresentative IQR. This is a bad practice.

    :param df: The dataframe to be standardized. It is not modified.
    :param excluded_colnames: The names of the columns (e.g. TimeStamp, ID,
        labels) that must be left out of the standardization. Non-numeric
        columns are left out automatically. All of them are added back,
        unchanged, to the returned dataframe.
    :return: A new dataframe with the same index and the same column order
        as `df`, in which the numeric (and not excluded) columns are
        transformed into a new range determined by the IQR and stored as
        floats.
    """
    from sklearn.preprocessing import RobustScaler

    excluded_colnames = excluded_colnames if excluded_colnames else []
    colnames_original_order = list(df)

    # Candidates for scaling: everything the caller did not exclude ...
    included_cnames = [cname for cname in colnames_original_order
                       if cname not in excluded_colnames]
    # ... restricted to the numeric columns.
    df_numeric = df[included_cnames].select_dtypes(include=np.number)
    numeric_cnames = list(df_numeric)

    # Everything else (explicitly excluded or non-numeric) passes through.
    excluded_cnames = [cname for cname in colnames_original_order
                       if cname not in numeric_cnames]
    df_excluded = df[excluded_cnames]

    # prepare normalizer and normalize
    scaler = RobustScaler()
    res_ndarray = scaler.fit_transform(df_numeric)

    # The scaler returns a bare ndarray, so the row labels are lost. Re-attach
    # the input's index here; otherwise the scaled rows would be labeled
    # 0..n-1 and would not line up with the excluded columns whenever `df`
    # does not carry a default index (e.g. after filtering or concatenation).
    df_scaled = pd.DataFrame(res_ndarray, columns=numeric_cnames,
                             index=df.index, dtype=float)

    # Add the excluded columns back; both pieces carry `df.index`.
    df_norm = pd.concat([df_excluded, df_scaled], axis=1)
    # Restore the original order of columns
    df_norm = df_norm[colnames_original_order]
    return df_norm