/
_interval.py
1025 lines (857 loc) · 35.3 KB
/
_interval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
import operator
import copy
import functools
from ._intersection import IntervalTree
from skbio.util._decorator import experimental, classonlymethod
class Interval:
"""Stores the bounds and metadata of an interval feature.
This class stores an interval feature. An interval feature
is defined as a sub-region of a biological sequence or sequence
alignment that is a functional entity, e.g., a gene, a riboswitch,
an exon, etc. It can span a single contiguous region or multiple
non-contiguous regions (e.g. multiple exons in a transcript, or
multiple genes in an operon).
Parameters
----------
interval_metadata : object
A reference to the ``IntervalMetadata`` object that this
``Interval`` object is associated to.
bounds : iterable of tuple of int
Tuples representing start and end coordinates. It is *zero-based*
numbering. It is always inclusive on start bound and exclusive on
end bound.
fuzzy : iterable of tuple of bool, optional
Tuples representing the fuzziness of each bound coordinates.
If this isn't specified, then the fuzziness of all bound
coordinates are ``False``. If any of the coordinate fuzziness
is ``True``, it indicates that the exact bound point of a
interval feature is unknown. The bound may begin or end at
some points outside the specified coordinates. This
accommodates the location format [1]_ of INSDC.
metadata : dict, optional
Dictionary of attributes storing information of the feature
such as "strand", "gene_name", or "product".
See Also
--------
skbio.metadata.IntervalMetadata
Notes
-----
While the construction of an ``Interval`` object automatically add
itself to its associated ``IntervalMetadata`` object,
``IntervalMetadata.add`` is the typical/easier way to
create and add it to ``IntervalMetadata``.
References
----------
.. [1] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html#3.4.3
Examples
--------
Hypothetically, let's say we have a gene called "genA" with 10 nt
as shown in the following diagram. The second row represents the
two exons (indicated by "=") on this gene:
::
TGGATTCTGC
-====--==-
0123456789
We can create an ``Interval`` object to represent the exons of the gene:
>>> from skbio.metadata import Interval, IntervalMetadata
>>> interval_metadata = IntervalMetadata(10)
Remember the coordinates are inclusive in lower bound and exclusive on
upper bound:
>>> gene = Interval(interval_metadata,
... bounds=[(1, 5), (7, 9)],
... metadata={'name': 'genA'})
>>> gene # doctest: +ELLIPSIS
Interval(interval_metadata=..., bounds=[(1, 5), (7, 9)], \
fuzzy=[(False, False), (False, False)], metadata={'name': 'genA'})
"""
def __init__(self, interval_metadata, bounds,
fuzzy=None, metadata=None):
if not isinstance(interval_metadata, IntervalMetadata):
raise TypeError('You need to provide an IntervalMetadata'
'object, not %r' % interval_metadata)
# Intervals
self._interval_metadata = interval_metadata
self._bounds_fuzzy_setter(bounds, fuzzy)
# Metadata
if metadata is None:
metadata = {}
self.metadata = metadata
# add this interval feature to the associated IntervalMetadata
self._add()
def _add(self):
"""Add the current ``Interval`` to the IntervalMetadata object."""
for bound in self.bounds:
start, end = bound
self._interval_metadata._interval_tree.add(start, end, self)
self._interval_metadata._intervals.append(self)
@experimental(as_of='0.5.1')
def __eq__(self, other):
'''Test if this ``Interval`` object is equal to another.
The equality is performed by checking if the ``metadata``,
``bounds`` and ``fuzzy`` are equal. Since the ``bounds``
and the ``fuzzy`` are sorted, the permutations of them during
the ``Interval`` construction or assignment won't matter.
Parameters
----------
other : Interval
Interval to test for equality against.
Returns
-------
bool
Indicates if the two objects are equal.
'''
return ((self.metadata == other.metadata) and
(self.bounds == other.bounds) and
(self.fuzzy == other.fuzzy))
@experimental(as_of='0.5.1')
def __ne__(self, other):
'''Test if this ``Interval`` object is not equal to another.
Parameters
----------
other : Interval
Interval to test for inequality against.
Returns
-------
bool
Indicates if the two objects are not equal.
'''
return not (self == other)
@experimental(as_of='0.5.1')
def __repr__(self):
'''Return a string representation of this ``Interval`` object.
Returns
-------
str
String representation of this ``Interval`` object.
'''
if self.dropped:
s = ('{}(dropped=True, bounds={!r}, '
'fuzzy={!r}, metadata={!r})')
return s.format(self.__class__.__name__,
self.bounds, self.fuzzy, self.metadata)
else:
s = ('{}(interval_metadata=<{!r}>, bounds={!r}, '
'fuzzy={!r}, metadata={!r})')
return s.format(self.__class__.__name__,
id(self._interval_metadata),
self.bounds, self.fuzzy, self.metadata)
@experimental(as_of='0.5.1')
def drop(self):
'''Drop this ``Interval`` object from the interval metadata it links to.
If the ``Interval`` object is dropped, you can still get values of
``bounds``, ``fuzzy``, and ``metadata`` attributes, but you
can not change their values with the setters.
See Also
--------
skbio.metadata.IntervalMetadata.drop
'''
if not self.dropped:
self._interval_metadata.drop([self])
def _bounds_fuzzy_setter(self, bounds=None, fuzzy=None):
if self.dropped:
raise RuntimeError('Cannot change `bounds` or `fuzzy` '
'on a dropped Interval object.')
# Casts to `list`, validation, sorting, and setting of `bounds`
# and `fuzzy` happen here.
if bounds is not None:
# check iterability
try:
# check iterability
bounds = list(bounds)
except TypeError:
raise TypeError('Cannot give an non-iterable (%r) '
'to `bounds`.' % bounds)
# check it is not empty
if not bounds:
raise ValueError('Cannot give empty `bounds`.')
# check each contiguous span is in right format
for bound in bounds:
_assert_valid_bound(bound)
spans = len(bounds)
else:
spans = len(self.bounds)
if fuzzy is not None:
try:
fuzzy = list(fuzzy)
except TypeError:
raise TypeError(
'Cannot give a non-iterable (%r) '
'to `fuzzy`.' % fuzzy)
if len(fuzzy) != spans:
raise ValueError(
'The length of fuzzy must '
'be equal to the length of bounds.')
for fuzzy_i in fuzzy:
_assert_valid_fuzzy(fuzzy_i)
if bounds is None:
# `bounds` and `fuzzy` cannot both be omitted.
if fuzzy is None:
raise ValueError('Cannot give `None` to both `bounds` '
'and `fuzzy`.')
# If only `fuzzy` is provided, set `self.fuzzy` and don't
# change `self.bounds`.
else:
self._fuzzy = fuzzy
else:
# If only `bounds` is provided, reset `self.fuzzy` to
# all `False`.
if fuzzy is None:
bounds.sort()
self._check_bounds(bounds)
self._bounds = bounds
# reset all the fuzzy to False!!
del self.fuzzy
# If both `bounds` and `fuzzy` are provided, set
# `self.bounds` and `self.fuzzy`.
else:
bounds, fuzzy = [
list(e) for e in zip(*sorted(zip(bounds, fuzzy)))]
self._check_bounds(bounds)
self._bounds = bounds
self._fuzzy = fuzzy
self._interval_metadata._is_stale_tree = True
def _check_bounds(self, bounds):
'''input `bounds` must be sorted.'''
upper_bound = self._interval_metadata.upper_bound
lower_bound = self._interval_metadata.lower_bound
if upper_bound is not None and bounds[-1][-1] > upper_bound:
raise ValueError('Cannot set `bounds` (%r) with coordinate '
'larger than upper bound (%r)' %
(bounds, upper_bound))
if bounds[0][0] < lower_bound:
raise ValueError('Cannot set `bounds` (%r) with coordinate '
'smaller than lower bound (%r).' %
(bounds, lower_bound))
@property
@experimental(as_of='0.5.1')
def fuzzy(self):
'''The openness of each coordinate.
This indicates that the exact bound of a interval feature
is unknown. The bound may begin or end at some points outside
the specified coordinates. This accommodates the bound format [1]_
of INSDC.
References
----------
.. [1] ftp://ftp.ebi.ac.uk/pub/databases/embl/doc/FT_current.html#3.4.3
'''
return self._fuzzy
@fuzzy.setter
@experimental(as_of='0.5.1')
def fuzzy(self, value):
'''Set ``fuzzy``.
The ``value`` should an iterable matching ``self.bounds``.
'''
self._bounds_fuzzy_setter(fuzzy=value)
@fuzzy.deleter
@experimental(as_of='0.5.1')
def fuzzy(self):
'''Delete ``fuzzy``.
This set all fuzzy to be ``False``.
'''
if self.dropped:
raise RuntimeError('Cannot change fuzzy on dropped '
'Interval object.')
self._fuzzy = [(False, False)] * len(self.bounds)
@property
@experimental(as_of='0.5.1')
def bounds(self):
'''The coordinates of the interval feature.
It should be a list of tuples of int pair. Each tuple stores
the start and end coordinates of a span of the interval
feature. The coordinates are *zero-based*. They are inclusive on
the start and exclusive on the end.
'''
return self._bounds
@bounds.setter
@experimental(as_of='0.5.1')
def bounds(self, value):
'''Set ``bounds``.
WARNING: setting ``bounds`` will reset ``fuzzy`` value to ``False``.
This is not totally surprising because it is justifiable your old
``fuzzy`` don't fit the new bounds.
'''
self._bounds_fuzzy_setter(bounds=value)
@property
@experimental(as_of='0.5.1')
def metadata(self):
'''The metadata of the interval feature.
It stores the metadata (eg. gene name, function, ID, etc.) of
the interval feature as a ``dict``.
'''
return self._metadata
@metadata.setter
@experimental(as_of='0.5.1')
def metadata(self, value):
if self.dropped:
raise RuntimeError('Cannot change metadata on dropped '
'Interval object.')
if not isinstance(value, dict):
raise TypeError("metadata must be a dict, not %r" % value)
self._metadata = value
@metadata.deleter
@experimental(as_of='0.5.1')
def metadata(self):
'''Delete metadata.
This sets metadata to be empty dict.
'''
if self.dropped:
raise RuntimeError('Cannot change metadata to dropped '
'Interval object.')
self._metadata = {}
@property
@experimental(as_of='0.5.1')
def dropped(self):
'''Boolean value indicating if the ``Interval`` object is dropped.
If it is dropped, it means it is not associated with IntervalMetadata
object any more.
Notes
-----
This property is not writable.
See Also
--------
skbio.metadata.Interval.drop
skbio.metadata.IntervalMetadata.drop
'''
return self._interval_metadata is None
class IntervalMetadata():
"""Stores the interval features.
``IntervalMetadata`` object allows storage, modification, and
querying of interval features covering a region of a single coordinate
system. For instance, this can be used to store functional annotations
about genes across a genome. This object is also applied to the sequence
alignment.
This object is typically coupled with another object, such as a
``Sequence`` object (or its child class), or a ``TabularMSA`` object.
Parameters
----------
upper_bound : int or None
Defines the exclusive upper bound of the interval features. No
coordinate can be greater than it. ``None``
means that the coordinate space is unbounded.
copy_from : IntervalMetadata or None, optional
Create a new object from the input ``IntervalMetadata`` object by
shallow copying if it is not ``None``. The upper bound of the new
object will be updated with the ``upper_bound`` parameter specified.
Notes
-----
This class stores coordinates of all feature bounds into a interval
tree. It allows the speed up of query-by-bound. The building of
interval tree is deferred until necessary to save computation. It is
updated from all coordinates only when you need to fetch info from
the interval tree.
When you add a method into this class and if you method need to fetch
info from ``IntervalMetadata._interval_tree``, you should decorate it with
``_rebuild_tree``. This decorator will check if the current interval tree
is stale and will update it if so. Additionally, if your method add,
delete, or changes the coordinates of any interval features, you should
set ``self._is_stale_tree`` to ``True`` at the end of your method to
indicate the interval tree becomes stale.
See Also
--------
skbio.metadata.Interval
Examples
--------
Let's say we have a sequence of length 10 and want to add annotation
to it. Create an ``IntervalMetadata`` object:
>>> from skbio.metadata import Interval, IntervalMetadata
>>> im = IntervalMetadata(10)
Let's add annotations of 3 genes:
>>> im.add(bounds=[(3, 9)],
... metadata={'gene': 'sagB'}) # doctest: +ELLIPSIS
Interval(interval_metadata=..., bounds=[(3, 9)], \
fuzzy=[(False, False)], metadata={'gene': 'sagB'})
>>> im.add(bounds=[(3, 7)],
... metadata={'gene': 'sagC'}) # doctest: +ELLIPSIS
Interval(interval_metadata=..., bounds=[(3, 7)], \
fuzzy=[(False, False)], metadata={'gene': 'sagC'})
>>> im.add(bounds=[(1, 2), (4, 7)],
... metadata={'gene': 'sagA'}) # doctest: +ELLIPSIS
Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})
Show the object representation:
>>> im # doctest: +ELLIPSIS
3 interval features
-------------------
Interval(interval_metadata=..., bounds=[(3, 9)], \
fuzzy=[(False, False)], metadata={'gene': 'sagB'})
Interval(interval_metadata=..., bounds=[(3, 7)], \
fuzzy=[(False, False)], metadata={'gene': 'sagC'})
Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})
We can sort the genes by their bounds:
>>> im.sort()
>>> im # doctest: +ELLIPSIS
3 interval features
-------------------
Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})
Interval(interval_metadata=..., bounds=[(3, 7)], \
fuzzy=[(False, False)], metadata={'gene': 'sagC'})
Interval(interval_metadata=..., bounds=[(3, 9)], \
fuzzy=[(False, False)], metadata={'gene': 'sagB'})
Query the genes by bound and/or metadata:
>>> intvls = im.query([(1, 2)], metadata={'gene': 'foo'})
>>> list(intvls)
[]
>>> intvls = im.query([(7, 9)])
>>> list(intvls) # doctest: +ELLIPSIS
[Interval(interval_metadata=..., bounds=[(3, 9)], \
fuzzy=[(False, False)], metadata={'gene': 'sagB'})]
>>> intvls = im.query(metadata={'gene': 'sagA'})
>>> intvls = list(intvls)
>>> intvls # doctest: +ELLIPSIS
[Interval(interval_metadata=..., bounds=[(1, 2), (4, 7)], \
fuzzy=[(False, False), (False, False)], metadata={'gene': 'sagA'})]
Drop the gene(s) we get from query:
>>> im.drop(intvls)
>>> im.sort()
>>> im # doctest: +ELLIPSIS
2 interval features
-------------------
Interval(interval_metadata=..., bounds=[(3, 7)], \
fuzzy=[(False, False)], metadata={'gene': 'sagC'})
Interval(interval_metadata=..., bounds=[(3, 9)], \
fuzzy=[(False, False)], metadata={'gene': 'sagB'})
"""
default_write_format = 'gff3'
def __init__(self, upper_bound, copy_from=None):
self._upper_bound = upper_bound
if self.upper_bound is not None:
if self.upper_bound < self.lower_bound:
raise ValueError('Cannot set `upper_bound` (%r) '
'smaller than `lower_bound` (%r)'
% (self.upper_bound, self.lower_bound))
# List of Interval objects.
self._intervals = []
# IntervalTree object to allow faster querying of interval objects.
self._interval_tree = IntervalTree()
# Indicates if the IntervalTree needs to be rebuilt.
self._is_stale_tree = False
if copy_from is not None:
for interval in copy_from._intervals:
bounds_cp = interval.bounds[:]
fuzzy_cp = interval.fuzzy[:]
metadata_cp = copy.copy(interval.metadata)
self.add(bounds_cp, fuzzy=fuzzy_cp, metadata=metadata_cp)
@property
@experimental(as_of='0.5.1')
def upper_bound(self):
'''The exclusive upper bound of interval features.'''
return self._upper_bound
@property
@experimental(as_of='0.5.1')
def lower_bound(self):
'''The inclusive lower bound of interval features.'''
return 0
@property
@experimental(as_of='0.5.1')
def num_interval_features(self):
'''The total number of interval features.'''
return len(self._intervals)
def _rebuild_tree(method):
"""Rebuild the IntervalTree."""
@functools.wraps(method)
def inner(self, *args, **kwargs):
if self._is_stale_tree is False:
return method(self, *args, **kwargs)
self._interval_tree = IntervalTree()
for f in self._intervals:
for start, end in f.bounds:
self._interval_tree.add(start, end, f)
self._is_stale_tree = False
return method(self, *args, **kwargs)
return inner
def _reverse(self):
"""Reverse ``IntervalMetadata`` object.
This operation reverses all of the interval coordinates.
For instance, this can be used to compare coordinates
in the forward strand to coordinates in the reversal strand.
"""
for f in self._intervals:
try:
intvls = [(self.upper_bound - x[1], self.upper_bound - x[0])
for x in reversed(f.bounds)]
except TypeError:
raise TypeError('You cannot reverse the coordinates '
'when the upper bound is `None`')
f.bounds = intvls
# DON'T forget this!!!
self._is_stale_tree = True
@classonlymethod
@experimental(as_of="0.5.1")
def concat(cls, interval_metadata):
'''Concatenate an iterable of ``IntervalMetadata`` objects.
It concatenates the multiple ``IntervalMetadata`` objects into
one coordinate space. The order of the objects in the input
iterable matters. The coordinate of the second
``InterableMetadata`` will be shifted up with the length of
the first ``IntervalMetadata`` object.
This function is useful when you concatenate multiple sequences.
Parameters
----------
interval_metadata : Iterable (IntervalMetadata)
The interval metadata to concatenate.
Returns
-------
IntervalMetadata
Concatenated interval metadata.
Examples
--------
>>> from skbio.metadata import IntervalMetadata
Create two ``IntervalMetadata`` objects:
>>> im1 = IntervalMetadata(3)
>>> _ = im1.add([(0, 2)], [(True, False)], {'gene': 'sagA'})
>>> im2 = IntervalMetadata(4)
>>> _ = im2.add([(1, 4)], [(True, True)], {'gene': 'sagB'})
Concatenate them into a single coordinate space. The second
``IntervalMetadata``'s interval features are all shifted
up. The resulting ``IntervalMetadata``'s upper bound is the
sum of upper bounds of concatenated objects:
>>> im = IntervalMetadata.concat([im1, im2])
>>> im # doctest: +ELLIPSIS
2 interval features
-------------------
Interval(interval_metadata=<...>, bounds=[(0, 2)], \
fuzzy=[(True, False)], metadata={'gene': 'sagA'})
Interval(interval_metadata=<...>, bounds=[(4, 7)], \
fuzzy=[(True, True)], metadata={'gene': 'sagB'})
>>> im.upper_bound
7
'''
interval_metadata = list(interval_metadata)
if len(interval_metadata) == 0:
return cls(0)
upper_bound = 0
for im in interval_metadata:
try:
upper_bound += im.upper_bound
except TypeError:
raise TypeError('You cannot concat the interval metadata '
'because its upper bound is `None`:\n%r' % im)
new = cls(upper_bound)
length = 0
for i, im in enumerate(interval_metadata):
for intvl in im._intervals:
bounds = intvl.bounds
fuzzy = intvl.fuzzy
if i != 0:
bounds = [(start + length, end + length)
for start, end in bounds]
new.add(bounds, fuzzy, intvl.metadata)
length += im.upper_bound
return new
@experimental(as_of='0.5.1')
def merge(self, other):
'''Merge the interval features of another ``IntervalMetadata`` object.
It adds all the interval features of the other object into
``self``. Note this will not check if there are any duplicates
of interval features after merge.
Notes
-----
It will raise error if you merge an unbounded
``IntervalMetadata`` object to the current object if it is
bounded. This avoids partially updating the current object if
the merge fails in the middle of the process due to the
possibility that some interval features to be merged may live
outside the current defined upper bound.
Parameters
----------
other : ``IntervalMetadata``
The other ``IntervalMetadata`` to be merged.
'''
if self.upper_bound is not None:
if other.upper_bound is None:
raise ValueError(
'Cannot merge an unbound IntervalMetadata object '
'to a bounded one')
elif self.upper_bound != other.upper_bound:
raise ValueError(
'The upper bounds of the two IntervalMetadata objects '
'are not equal (%r != %r)' % (
self.upper_bound, other.upper_bound))
if self.lower_bound != other.lower_bound:
raise ValueError(
'The lower bounds of the two IntervalMetadata objects '
'are not equal (%d != %d)' % (
self.lower_bound, other.lower_bound))
for intvl in other._intervals:
self.add(intvl.bounds, intvl.fuzzy, intvl.metadata)
@experimental(as_of='0.5.1')
def sort(self, ascending=True):
'''Sort interval features by their coordinates.
It sorts by the start coordinate first. If they are the same between
two interval features, they will be sorted by comparing their end
coordinates. For example, an interval feature with [(1, 2), (4, 7)]
will be sorted in front of another one with [(1, 2), (3, 8)].
Parameters
----------
ascending : bool, optional
sort in ascending or descending coordinates.
'''
self._intervals.sort(
key=lambda i: [i.bounds[0][0], i.bounds[-1][1]],
reverse=not ascending)
@experimental(as_of='0.5.1')
def add(self, bounds, fuzzy=None, metadata=None):
"""Create and add an ``Interval`` to this ``IntervalMetadata``.
This method creates an ``Interval`` object and inserts it into
the ``IntervalMetadata`` object.
Parameters
----------
bounds : iterable of tuple of ints
Tuples representing start and end coordinates. It is *zero-based*
numbering. It is always inclusive on start bound and exclusive on
end bound.
fuzzy : iterable of tuple of bool, optional
Tuples representing the fuzziness of each bound coordinates.
metadata : dict, optional
A dictionary of key-value pairs associated with the
``Interval`` object.
Returns
-------
Interval
The ``Interval`` object added.
See Also
--------
skbio.metadata.Interval
"""
# Add an interval to the tree. Note that the add functionality is
# built within the Interval constructor.
return Interval(interval_metadata=self,
bounds=bounds,
fuzzy=fuzzy,
metadata=metadata)
@_rebuild_tree
def _query_interval(self, bound):
"""Yield ``Interval`` objects that overlap with the bound."""
_assert_valid_bound(bound)
start, end = bound
intvls = self._interval_tree.find(start, end)
# if a ``Interval`` has many non-contiguous spans and
# multiple of them overlap with the bound, then
# this ``Interval`` object will be returned
# multiple times. So we need to remove duplicates.
seen = set()
for intvl in intvls:
if id(intvl) not in seen:
seen.add(id(intvl))
yield intvl
def _query_attribute(self, metadata, intervals=None):
"""Yield ``Interval`` objects based on query attributes.
Parameters
----------
metadata : dict or ``None``
If it is ``None``, return empty iterator; if it is
``{}``, return an interator of all the ``Interval``
objects.
intervals : an iterable of ``Interval`` objects
"""
if metadata is None:
return
if intervals is None:
intervals = self._intervals
for intvl in intervals:
for (key, value) in metadata.items():
if (key not in intvl.metadata or
intvl.metadata[key] != value):
break
else:
yield intvl
@experimental(as_of='0.5.1')
@_rebuild_tree
def query(self, bounds=None, metadata=None):
"""Yield ``Interval`` object with the bounds and attributes.
The ``Interval`` objects must meet both requirements: 1) overlap
with any of the spans specified by ``bounds``; 2) satisfy
``metadata`` specification. For instance, you can identify
all the recA genes that overlap with (10, 100) or (900, 1000)
with this code ``interval_metadata.query([(10, 100),
(900, 1000)], {'gene': 'recA'})``.
Parameters
----------
bounds : iterable of tuples of int pair, optional
Specifies bounds to look for the ``Interval``
objects. An satisfying interval feature only need to overlap with
one bound. Default (``None``) means all ``Intervals`` meet
this requirement.
metadata : dict, optional
A dictionary of key word attributes associated with the
``Interval`` object. It specifies what metadata keywords and
values to look for. Default (``None``) means all ``Intervals``
meet this requirement.
Yields
------
Interval
``Interval`` object satisfying the search criteria.
"""
if bounds is None:
for intvl in self._query_attribute(metadata):
yield intvl
else:
for loc in bounds:
intvls = self._query_interval(loc)
if metadata is None:
metadata = {}
for intvl in self._query_attribute(metadata, intvls):
yield intvl
@experimental(as_of='0.5.1')
def drop(self, intervals, negate=False):
"""Drops Interval objects.
The given ``Interval`` objects will be removed and their
associated ``IntervalMetadata`` will be set to ``None``.
Parameters
----------
intervals : iterable of ``Interval``
``Interval`` objects to drop from this object.
negate : bool
Negate the drop operation, i.e. keeping the specified intervals
instead of dropping them.
"""
to_delete = {id(f) for f in intervals}
new_intvls = []
# iterate through queries and drop them
for intvl in self._intervals:
drop = id(intvl) in to_delete
if negate is True:
drop = not drop
if drop:
intvl._interval_metadata = None
else:
new_intvls.append(intvl)
self._intervals = new_intvls
self._is_stale_tree = True
@experimental(as_of='0.5.1')
def __eq__(self, other):
'''Test if this object is equal to another.
It checks if the coordinate spaces are the same between the
two objects. If so, then check if all the interval features
are equal between the two objects after sorting them by
bounds.
Parameters
----------
other : IntervalMetadata
Interval metadata to test for equality against.
Returns
-------
bool
Indicates if the two objects are equal.
'''
if self.upper_bound != other.upper_bound or \
self.lower_bound != other.lower_bound:
return False
else:
self_intervals = sorted(self._intervals,
key=operator.attrgetter('bounds'))
other_intervals = sorted(other._intervals,
key=operator.attrgetter('bounds'))
return self_intervals == other_intervals
@experimental(as_of='0.5.1')
def __ne__(self, other):
'''Test if this object is not equal to another.
Parameters
----------
other : IntervalMetadata
Interval metadata to test for inequality against.
Returns
-------
bool
Indicates if the two objects are not equal.
See Also
--------
skbio.metadata.IntervalMetadata.__eq__
'''
return not (self == other)
@experimental(as_of='0.5.1')
def __repr__(self):
'''Return a string representation of this object.
Returns
-------
str
String representation of this ``IntervalMetadata`` object.
'''
n = self.num_interval_features
l1 = '{} interval feature'.format(n)
if n != 1:
l1 += 's'
l2 = '-' * len(l1)
if n <= 5:
items = [repr(i) for i in self._intervals]
else:
# intentionally overwrite items[2] to make code cleaner
items = [repr(self._intervals[i]) for i in [0, 1, 2, n-2, n-1]]
items[2] = '...'
return '\n'.join([l1, l2] + items)
@experimental(as_of='0.5.1')
def __copy__(self):
'''Return a shallow copy.
Notes
-----
The ``IntervalMetadata`` copy will have copies of the
``Interval`` objects present in this object. The ``metadata``
dictionary of each ``Interval`` object will be a shallow copy.
See Also
--------
__deepcopy__
'''
return self._copy(False, {})
@experimental(as_of='0.5.1')
def __deepcopy__(self, memo):
'''Return a deep copy.
Notes
-----
The ``IntervalMetadata`` copy will have copies of the
``Interval`` objects present in this object. The ``metadata``
dictionary of each ``Interval`` object will be a deep copy.
See Also
--------
__copy__
'''
return self._copy(True, memo)
def _copy(self, deep, memo):
cp = IntervalMetadata(self.upper_bound)
for interval in self._intervals:
# Only need to shallow-copy `bounds` and `fuzzy`
# because their elements are immutable.
bounds_cp = interval.bounds[:]
fuzzy_cp = interval.fuzzy[:]
if deep:
metadata_cp = copy.deepcopy(interval.metadata, memo)
else:
metadata_cp = copy.copy(interval.metadata)
cp.add(bounds_cp,
fuzzy=fuzzy_cp,
metadata=metadata_cp)
return cp
def _assert_valid_bound(bound):
if isinstance(bound, tuple):
try: