/
transforming.py
413 lines (340 loc) · 12.8 KB
/
transforming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
'''
transforming (:mod:`calour.transforming`)
=========================================
.. warning:: Some of the functions require a dense matrix and will therefore convert sparse data to a dense matrix.
.. currentmodule:: calour.transforming
Functions
^^^^^^^^^
.. autosummary::
:toctree: generated
normalize
normalize_by_subset_features
normalize_compositional
scale
random_permute_data
binarize
log_n
transform
center_log_ratio
subsample_count
'''
# ----------------------------------------------------------------------------
# Copyright (c) 2016--, Calour development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from logging import getLogger
from copy import deepcopy
from collections import defaultdict
import numpy as np
from sklearn import preprocessing
from skbio.stats.composition import clr, centralize as skbio_centralize
from skbio.stats import subsample_counts
from . import Experiment
from ._doc import ds
logger = getLogger(__name__)
@Experiment._record_sig
def normalize(exp: Experiment, total=10000, axis=0, inplace=False):
    '''Normalize the sum of each sample (axis=0) or feature (axis=1) to sum total

    Parameters
    ----------
    total : float
        the sum (along axis) to normalize to. Must be a positive number.
    axis : 0, 1, 's', or 'f', optional
        the axis to normalize. 0 or 's' (default) is normalize each sample;
        1 or 'f' to normalize each feature
    inplace : bool, optional
        False (default) to create a copy, True to replace values in exp

    Returns
    -------
    Experiment
        the normalized experiment

    Raises
    ------
    ValueError
        if ``total`` is a bool or is not positive
    '''
    # bool is a subclass of int, so reject it explicitly before the numeric test
    if isinstance(total, bool):
        raise ValueError('Normalization total (%s) not numeric' % total)
    if total <= 0:
        raise ValueError('Normalization total (%s) must be positive' % total)
    # The documented axis names 's'/'f' cannot be used in the ``1 - axis``
    # arithmetic below, so translate them to their numeric equivalents.
    # NOTE(review): harmless if a decorator already performs this conversion.
    if isinstance(axis, str):
        axis = {'s': 0, 'f': 1}.get(axis, axis)
    if not inplace:
        exp = deepcopy(exp)
    # L1-normalize along the flipped axis (1 - axis) and scale up to `total`
    exp.data = preprocessing.normalize(exp.data, norm='l1', axis=1 - axis) * total
    # store the normalization depth into the experiment metadata
    exp.normalized = total
    return exp
@Experiment._record_sig
def rescale(exp: Experiment, total=10000, axis=0, inplace=False):
    '''Rescale the data to mean sum of all samples (axis=0) or features (axis=1) to be total.

    This function rescales by multiplying ALL entries in exp.data by same number.

    Parameters
    ----------
    total : float
        the mean sum (along axis) to normalize to
    axis : 0, 1, 's', or 'f', optional
        the axis to normalize. 0 or 's' (default) is normalize each sample;
        1 or 'f' to normalize each feature
    inplace : bool, optional
        False (default) to create a copy, True to replace values in exp

    Returns
    -------
    Experiment
        the normalized experiment
    '''
    # The documented axis names 's'/'f' cannot be used in the ``1 - axis``
    # arithmetic below, so translate them to their numeric equivalents.
    # NOTE(review): harmless if a decorator already performs this conversion.
    if isinstance(axis, str):
        axis = {'s': 0, 'f': 1}.get(axis, axis)
    if not inplace:
        exp = deepcopy(exp)
    # mean of the per-sample (or per-feature) sums over the whole experiment
    current_mean = np.mean(exp.data.sum(axis=1 - axis))
    # a single global factor: relative abundances between entries are unchanged
    exp.data = exp.data * total / current_mean
    return exp
@Experiment._record_sig
def scale(exp: Experiment, axis=0, inplace=False):
    '''Standardize a dataset along an axis

    .. warning:: It will convert the ``Experiment.data`` from the sparse matrix to dense array.

    Parameters
    ----------
    axis : 0, 1, 's', or 'f'
        0 or 's' means scaling occur sample-wise; 1 or 'f' feature-wise.
    inplace : bool, optional
        False (default) to create a copy, True to replace values in exp

    Returns
    -------
    Experiment
        the experiment with its data standardized
    '''
    # The documented axis names 's'/'f' would break both the ``%d`` log
    # format and the ``1 - axis`` arithmetic, so translate them first.
    # NOTE(review): harmless if a decorator already performs this conversion.
    if isinstance(axis, str):
        axis = {'s': 0, 'f': 1}.get(axis, axis)
    logger.debug('scaling the data, axis=%d' % axis)
    if not inplace:
        exp = deepcopy(exp)
    if exp.sparse:
        exp.sparse = False
    # Keep the returned array: ``copy=False`` only avoids a copy when
    # scikit-learn does not need to convert the input (e.g. integer data is
    # cast to float), so relying on in-place mutation can silently lose the
    # result.
    exp.data = preprocessing.scale(exp.data, axis=1 - axis, copy=False)
    return exp
@Experiment._record_sig
def binarize(exp: Experiment, threshold=1, inplace=False):
    '''Binarize the data with a threshold.

    It calls scikit-learn to do the real work.

    Parameters
    ----------
    threshold : Numeric
        the cutoff value. Any values below or equal to this will be replaced by 0,
        above it by 1.
    inplace : bool, optional
        False (default) to create a copy, True to replace values in exp

    Returns
    -------
    Experiment
        the experiment with its data binarized
    '''
    logger.debug('binarizing the data. threshold=%f' % threshold)
    if not inplace:
        exp = deepcopy(exp)
    # Keep the returned matrix: ``copy=False`` is only a request, and
    # scikit-learn may still hand back a converted copy, in which case the
    # original data would be left untouched.
    exp.data = preprocessing.binarize(exp.data, threshold=threshold, copy=False)
    return exp
@Experiment._record_sig
def log_n(exp: Experiment, n=1, inplace=False):
    '''Log transform the data

    Every value smaller than ``n`` is first raised to ``n``; the data are
    then log2 transformed.

    .. warning:: It will convert the ``Experiment.data`` from the sparse matrix to dense array.

    Parameters
    ----------
    n : numeric, optional
        cap the tiny values and then log transform the data.
    inplace : bool, optional
        False (default) to create a copy, True to replace values in exp

    Returns
    -------
    Experiment
        the experiment with its data log2 transformed
    '''
    logger.debug('log_n transforming the data, min. threshold=%f' % n)
    if not inplace:
        exp = deepcopy(exp)
    if exp.sparse:
        exp.sparse = False
    # np.maximum promotes the dtype when needed. An in-place masked assignment
    # would truncate a fractional `n` on integer data (e.g. 0.5 -> 0) and the
    # logarithm would then produce -inf.
    exp.data = np.log2(np.maximum(exp.data, n))
    return exp
@ds.get_sectionsf('transforming.transform')
@Experiment._record_sig
def transform(exp: Experiment, steps=[], inplace=False, **kwargs):
    '''Chain transformations together.

    Parameters
    ----------
    steps : list of callable
        each callable is a transformer that takes :class:`.Experiment` object as
        its 1st argument and has a boolean parameter of ``inplace``. Each
        callable should return an :class:`.Experiment` object.
    inplace : bool
        transformation occurring in the original data or a copy
    kwargs : dict
        keyword arguments to pass to each transformers. The key should
        be in the form of "<transformer_name>__<param_name>". For
        example, "transform(exp, steps=[log_n], log_n__n=3)" will set
        "n" of function "log_n" to 3

    Returns
    -------
    Experiment
        with its data transformed

    Raises
    ------
    ValueError
        if a keyword is not of the form "<transformer_name>__<param_name>",
        or if it tries to set the ``inplace`` parameter of a transformer
    '''
    if not inplace:
        exp = deepcopy(exp)
    # group the keyword arguments by the name of the transformer they target
    params = defaultdict(dict)
    for k, v in kwargs.items():
        parts = k.split('__')
        if len(parts) != 2:
            raise ValueError('Keyword argument %r should be in the form of '
                             '"<transformer_name>__<param_name>".' % k)
        transformer, param_name = parts
        if param_name == 'inplace':
            raise ValueError('You should not give %s argument. It should be '
                             'set thru `inplace` argument for this function.' % k)
        params[transformer][param_name] = v
    # `exp` is already a private copy when inplace is False, so every step can
    # safely operate in place on it
    for step in steps:
        step(exp, inplace=True, **params[step.__name__])
    return exp
@Experiment._record_sig
def normalize_by_subset_features(exp: Experiment, features, total=10000, negate=True, inplace=False):
    '''Normalize each sample by the total sum computed over a subset of features

    The per-sample normalization factor is calculated only from the
    selected features (by default: every feature NOT listed in
    ``features``), but it is then applied to all features of the sample,
    including the ones left out of the calculation. This is meant to
    alleviate compositionality effects by deriving the factor only from
    features that are believed not to change across samples.

    .. note:: the sum is not identical in all samples after normalization
       (since the excluded features are kept in the data)

    Parameters
    ----------
    features : list of str
        The feature IDs to exclude (or include if negate=False)
    total : int, optional
        The total abundance for the non-excluded features per sample
    negate : bool, optional
        True (default) to calculate normalization factor without features in features list.
        False to calculate normalization factor only with features in features list.
    inplace : bool, optional
        False (default) to create a new experiment, True to normalize in place

    Returns
    -------
    Experiment
        The normalized experiment
    '''
    # boolean mask marking the features that contribute to the factor
    selected = exp.feature_metadata.index.isin(features)
    if negate:
        selected = ~selected

    dense = exp.get_data(sparse=False)
    # one normalization factor per row (sample)
    per_sample_reads = dense[:, selected].sum(axis=1)

    result = exp if inplace else deepcopy(exp)
    result.data = total * dense / per_sample_reads[:, np.newaxis]
    # store the normalization depth into the experiment metadata
    result.exp_metadata['normalized'] = total
    return result
def normalize_compositional(exp: Experiment, min_frac=0.05, total=10000, inplace=False):
    '''Normalize each sample while ignoring the features with mean>=min_frac over the experiment

    The features returned by ``exp.filter_mean_abundance(min_frac)`` are left
    out when the per-sample normalization factor is computed. This assumes
    that the majority of features have a mean below min_frac, and that the
    majority of features don't change between samples in a constant
    direction.

    Parameters
    ----------
    min_frac : float, optional
        ignore features with mean (over all samples) >= min_frac.
    total : int, optional
        The total abundance for the non-excluded features per sample
    inplace : bool, optional
        False (default) to create a new experiment, True to normalize in place

    Returns
    -------
    Experiment
        The normalized experiment. Note that all features are normalized (including the ones with mean>=min_frac)
    '''
    abundant = exp.filter_mean_abundance(min_frac)
    n_ignored = abundant.shape[1]
    logger.info('ignoring %d features' % n_ignored)
    ignored_ids = abundant.feature_metadata.index.values
    # negate=True: the factor is computed WITHOUT the abundant features
    return exp.normalize_by_subset_features(ignored_ids, total=total,
                                            negate=True, inplace=inplace)
def random_permute_data(exp: Experiment, normalize=True):
    '''Shuffle the reads of each feature independently across samples

    Produces a new experiment in which there is no dependence between the
    features. The input experiment is not modified; the result always holds
    dense data.

    Parameters
    ----------
    normalize : bool, optional
        True (default) to normalize each sample after completing the feature shuffling.
        False to not normalize

    Returns
    -------
    Experiment
        With each feature shuffled independently
    '''
    shuffled = exp.copy()
    shuffled.sparse = False
    n_features = shuffled.shape[1]
    for idx in range(n_features):
        # shuffle one feature column at a time, in place
        column = shuffled.data[:, idx]
        np.random.shuffle(column)
    if not normalize:
        return shuffled
    # bring every sample back to the mean per-sample sum of the original data
    mean_depth = np.mean(exp.data.sum(axis=1))
    shuffled.normalize(mean_depth, inplace=True)
    return shuffled
@Experiment._record_sig
def center_log_ratio(exp: Experiment, method=lambda matrix: matrix + 1, centralize=False, inplace=False):
    """Apply a centered log-ratio (clr) transform to each sample.

    .. warning:: It will convert the ``Experiment.data`` from the sparse matrix to dense array.

    Parameters
    ----------
    method : callable, optional
        Function applied to the data matrix before the transform to handle
        zeros (the pseudocount strategy). By default 1 is added to every
        entry.
    centralize : bool, optional
        centralize feature-wise to zero or not
    inplace : bool, optional
        False (default) to create a new experiment, True to normalize in place

    Returns
    -------
    Experiment
        The normalized experiment. Note that all features are clr normalized.

    See Also
    --------
    skbio.stats.composition.clr
    skbio.stats.composition.centralize
    """
    logger.debug('clr transforming the data')
    if not inplace:
        exp = deepcopy(exp)
    if exp.sparse:
        exp.sparse = False
    # remove zeros first, then optionally centralize, and finally clr
    prepared = method(exp.data)
    if centralize:
        prepared = skbio_centralize(prepared)
    exp.data = clr(prepared)
    return exp
@Experiment._record_sig
def subsample_count(exp: Experiment, total, replace=False, inplace=False, random_seed=None):
    """Randomly subsample each sample to the same number of counts.

    .. warning:: This function will change the :attr:`Experiment.data`
       object from sparse to dense. The input ``Experiment`` object
       should not have been normalized by total sum and its data
       should be discrete counts. When subsampling without replacement,
       the samples that have fewer total counts than ``total`` will be
       dropped.

    .. note:: This function may not work on Windows OS. It relies on
       the :func:`skbio.stats.subsample_counts` which have
       `ValueError: Buffer dtype mismatch, expected 'int64_t' but got
       'long'` in `_subsample_counts_without_replacement` function of
       `skbio/stats/__subsample.pyx`

    Parameters
    ----------
    total : int
        the number of counts to draw from each sample
    replace : bool, optional
        If True, subsample with replacement. If False (the default), subsample without replacement
    inplace : bool, optional
        False (default) to create a new experiment, True to do it in place
    random_seed : int or None, optional, default=None
        passed to :func:`numpy.random.seed`

    Returns
    -------
    Experiment
        The subsampled experiment.

    Raises
    ------
    ValueError
        if the data are not of an integer dtype

    See Also
    --------
    :func:`skbio.stats.subsample_counts`
    """
    # subsample_counts() requires int as input. Validate before copying or
    # densifying so a failed call leaves the input experiment untouched.
    if exp.data.dtype.kind not in {'u', 'i'}:
        raise ValueError('Your `Experiment` object is normalized: subsample operates on integer raw data, not on normalized data.')
    newexp = exp if inplace else deepcopy(exp)
    if newexp.sparse:
        newexp.sparse = False
    # NOTE: this reseeds numpy's global random state
    np.random.seed(random_seed)
    n_samples = newexp.data.shape[0]
    # keep[i] is False for the samples that are too shallow to subsample
    keep = [True] * n_samples
    for row in range(n_samples):
        counts = newexp.data[row, :]
        if total > counts.sum() and not replace:
            keep[row] = False
        else:
            newexp.data[row, :] = subsample_counts(counts, n=total, replace=replace)
    newexp.reorder(keep, inplace=True)
    # store the subsampling depth into the experiment
    newexp.normalized = total
    return newexp