/
amplicon_experiment.py
289 lines (240 loc) · 10.5 KB
/
amplicon_experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
'''
amplicon experiment (:mod:`calour.amplicon_experiment`)
=======================================================
.. currentmodule:: calour.amplicon_experiment
Classes
^^^^^^^
.. autosummary::
:toctree: generated
AmpliconExperiment
'''
# ----------------------------------------------------------------------------
# Copyright (c) 2016--, Calour development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from logging import getLogger
import numpy as np
import skbio
from .experiment import Experiment
from .util import _get_taxonomy_string, _to_list
logger = getLogger(__name__)
class AmpliconExperiment(Experiment):
    '''Store an amplicon abundance table together with its metadata.

    This is a child class of :class:`.Experiment`

    Parameters
    ----------
    data : numpy.ndarray or scipy.sparse.csr_matrix
        The abundance table for OTUs, metabolites, genes, etc.
        Samples are rows; features are columns.
    sample_metadata : pandas.DataFrame
        Per-sample metadata
    feature_metadata : pandas.DataFrame
        Per-feature metadata
    description : str
        name of experiment
    sparse : bool
        store the data array in :class:`scipy.sparse.csr_matrix`
        or :class:`numpy.ndarray`

    Attributes
    ----------
    data : numpy.ndarray or scipy.sparse.csr_matrix
        The abundance table for OTUs, metabolites, genes, etc.
        Samples are rows; features are columns.
    sample_metadata : pandas.DataFrame
        Per-sample metadata
    feature_metadata : pandas.DataFrame
        Per-feature metadata
    exp_metadata : dict
        metadata about the experiment (data md5, filenames, etc.)
    shape : tuple of (int, int)
        the dimension of data
    sparse : bool
        store the data as sparse matrix (scipy.sparse.csr_matrix) or dense numpy array.
    description : str
        name of the experiment
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # default annotation databases offered in the interactive heatmap
        self.heatmap_databases = ('dbbact',)
def filter_taxonomy(exp: Experiment, values, negate=False, inplace=False, substring=True):
    '''Keep only features whose taxonomy string matches ``values``.

    Matching is case insensitive. When ``substring`` is True a partial
    (substring) match is sufficient; otherwise the full taxonomy string
    must be equal.

    Parameters
    ----------
    values : str or list of str
        the taxonomy string/strings to filter (can be partial if substring is True)
    negate : bool, optional
        False (default) to keep matching taxonomies, True to remove matching taxonomies
    inplace : bool, optional
        do the filtering on the original :class:`.Experiment` object or a copied one.
    substring : bool, optional
        True (default) to do partial (substring) matching for the taxonomy string,
        False to do exact matching

    Returns
    -------
    AmpliconExperiment
        With only features with matching taxonomy
    '''
    if 'taxonomy' not in exp.feature_metadata.columns:
        logger.warning('No taxonomy field in experiment')
        return None
    lowered_tax = exp.feature_metadata['taxonomy'].str.lower()
    keep = np.zeros(len(lowered_tax), dtype=bool)
    for needle in _to_list(values):
        needle = needle.lower()
        if substring:
            hits = [needle in tax for tax in lowered_tax]
        else:
            hits = [needle == tax for tax in lowered_tax]
        keep = np.logical_or(keep, hits)
    if negate is True:
        keep = ~keep
    logger.info('%s remaining' % np.sum(keep))
    return exp.reorder(keep, axis=1, inplace=inplace)
def filter_fasta(exp: Experiment, filename, negate=False, inplace=False):
    '''Filter features from experiment based on fasta file

    Parameters
    ----------
    filename : str
        the fasta filename containing the sequences to use for filtering
    negate : bool, optional
        False (default) to keep only sequences matching the fasta file;
        True to remove sequences in the fasta file.
    inplace : bool, optional
        False (default) to create a copy of the experiment, True to filter inplace

    Returns
    -------
    newexp : Experiment
        filtered so contains only sequence present in exp and in the fasta file
    '''
    logger.debug('Filter by sequence using fasta file %s' % filename)
    okpos = []
    seen = set()
    tot_seqs = 0
    for cseq in skbio.read(filename, format='fasta'):
        tot_seqs += 1
        cseq = str(cseq).upper()
        # Skip duplicate fasta entries: otherwise okpos would contain
        # repeated positions, duplicating features on reorder and
        # violating np.setdiff1d's assume_unique=True precondition below.
        if cseq in seen:
            continue
        seen.add(cseq)
        if cseq in exp.feature_metadata.index:
            okpos.append(exp.feature_metadata.index.get_loc(cseq))
    logger.debug('loaded %d sequences. found %d sequences in experiment' % (tot_seqs, len(okpos)))
    if negate:
        okpos = np.setdiff1d(np.arange(len(exp.feature_metadata.index)), okpos, assume_unique=True)
    newexp = exp.reorder(okpos, axis=1, inplace=inplace)
    return newexp
@Experiment._record_sig
def sort_taxonomy(exp: Experiment, inplace=False):
    '''Sort the features alphabetically by their taxonomy string.

    Parameters
    ----------
    inplace : bool, optional
        False (default) to create a copy
        True to Replace data in exp

    Returns
    -------
    Experiment
        sorted by taxonomy
    '''
    logger.debug('sort features by taxonomies')
    tax_strings = _get_taxonomy_string(exp, remove_underscore=True)
    # mergesort keeps the relative order of equal taxonomies stable
    order = np.argsort(tax_strings, kind='mergesort')
    return exp.reorder(order, axis=1, inplace=inplace)
@Experiment._record_sig
def filter_orig_reads(exp, minreads, **kwargs):
    '''Filter keeping only samples with >= minreads in the original reads column

    Note this function uses the '_calour_original_abundance' field set at
    load time rather than the current sum of sequences per sample.
    So if you start with a sample with 100 reads, normalizing and filtering
    with other functions will not change the original reads column
    (which will remain 100).
    If you want to filter based on current total reads, use ``filter_by_data()`` instead

    Parameters
    ----------
    minreads : numeric
        Keep only samples with >= minreads reads (when loaded - not affected by normalization)
    **kwargs
        passed to :meth:`.Experiment.reorder` (e.g. ``inplace``)

    Returns
    -------
    AmpliconExperiment - with only samples with enough original reads

    Raises
    ------
    ValueError
        If the '_calour_original_abundance' sample metadata field is missing
        (i.e. the data was not loaded with calour.read_amplicon()).
    '''
    origread_field = '_calour_original_abundance'
    if origread_field not in exp.sample_metadata.columns:
        # fixed typo in user-facing message: "initialzed" -> "initialized"
        raise ValueError('%s field not initialized. Did you load the data with calour.read_amplicon() ?' % origread_field)
    good_pos = (exp.sample_metadata[origread_field] >= minreads).values
    newexp = exp.reorder(good_pos, axis=0, **kwargs)
    return newexp
def collapse_taxonomy(exp: Experiment, level='genus', inplace=False):
    '''Collapse all features sharing the same taxonomy up to level into a single feature

    Sums abundances of all features sharing the same taxonomy up to level.

    Parameters
    ----------
    level: str or int, optional
        the level to bin the taxonmies. can be int (0=kingdom, 1=phylum,...6=species)
        or a string ('kingdom' or 'k' etc.)
    inplace : bool, optional
        False (default) to create a copy
        True to Replace data in exp
    '''
    name_to_level = {'kingdom': 0, 'k': 0,
                     'phylum': 1, 'p': 1,
                     'class': 2, 'c': 2,
                     'order': 3, 'o': 3,
                     'family': 4, 'f': 4,
                     'genus': 5, 'g': 5,
                     'species': 6, 's': 6}
    if not isinstance(level, int):
        if level not in name_to_level:
            raise ValueError('Unsupported taxonomy level %s. Please use out of %s' % (level, list(name_to_level.keys())))
        level = name_to_level[level]

    newexp = exp if inplace else exp.copy()

    def _truncate_taxonomy(tax_str, level):
        # taxonomy string truncated after the requested level,
        # padding missing levels with 'other'
        parts = tax_str.split(';')
        wanted = level + 1
        if len(parts) < wanted:
            parts.extend(['other'] * (wanted - len(parts)))
        return ';'.join(parts[:wanted])

    newexp.feature_metadata['_calour_tax_group'] = newexp.feature_metadata['taxonomy'].apply(_truncate_taxonomy, level=level)
    newexp.aggregate_by_metadata('_calour_tax_group', agg='sum', axis=1, inplace=True)
    newexp.feature_metadata['taxonomy'] = newexp.feature_metadata['_calour_tax_group']
    return newexp
def split_taxonomy(self, field='taxonomy', sep=';', names=None):
    '''Split taxonomy column into individual column per level.

    Assume the taxonomy string is in QIIME style:
    "k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae;g__Staphylococcus;s__"

    Parameters
    ----------
    field : str, optional
        feature metadata column holding the full taxonomy string
    sep : str
        the separator between taxa levels
    names : list, optional
        the column names for the new columns split from ``field``.
        None (default) uses the 7 standard ranks
        ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'].
    '''
    # avoid a mutable default argument; None stands for the standard 7 ranks
    if names is None:
        names = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    self.feature_metadata[names] = self.feature_metadata[field].str.split(sep, expand=True)
    # return so you can chain the functions
    return self
def find_lowest_taxonomy(self, field='taxonomy', new_field='taxa'):
    '''Create a new column that contains the taxonomy of lowest possible level.

    For example, 'k__Bacteria; p__Firmicutes; c__Bacilli;
    o__Lactobacillales; f__Enterococcaceae; g__Enterococcus;
    s__' will return 'g__Enterococcus'

    Parameters
    ----------
    field : str
        column name that contains all levels of taxonomy
    new_field : str
        new column name

    Returns
    -------
    AmpliconExperiment
    '''
    def _lowest_assigned(s):
        # Strip whitespace around each level: with '; '-separated strings
        # an unassigned level like ' s__' is 4 characters long and would
        # otherwise wrongly pass the len > 3 test (a level is considered
        # assigned only if it has text after the 'x__' prefix).
        levels = [lv.strip() for lv in s.split(';')]
        assigned = [lv for lv in levels if len(lv) > 3]
        return assigned[-1]
    self.feature_metadata[new_field] = self.feature_metadata[field].apply(_lowest_assigned)
    return self