-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
PhyloXML.py
1162 lines (972 loc) · 42.8 KB
/
PhyloXML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (C) 2009 by Eric Talevich (eric.talevich@gmail.com)
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Classes corresponding to phyloXML elements.
See U{ http://phyloxml.org/ } for the official specification.
See also Han and Zmasek (2009) doi:10.1186/1471-2105-10-356
"""
__docformat__ = "epytext en"
import re
import warnings
from Bio import Alphabet
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
import BaseTree
import _sugar
class PhyloXMLWarning(Warning):
    """Warning category for data that violates the phyloXML specification."""
def check_str(text, testfunc):
    """Warn when a string fails the given validity test.

    @param text: the string to validate; None is skipped without being tested
    @param testfunc: callable applied to text; a false result triggers a
        PhyloXMLWarning (the string itself is never modified or rejected)
    """
    if text is None:
        return
    if testfunc(text):
        return
    message = "String %s doesn't match the given regexp" % text
    # stacklevel=2 attributes the warning to whoever called check_str
    warnings.warn(message, PhyloXMLWarning, stacklevel=2)
# Core elements
class PhyloElement(BaseTree.TreeElement):
    """Common base class shared by all PhyloXML objects."""

    def __str__(self):
        """Return a short label for this object.

        The first of the attributes 'name', 'value' and 'id' that exists and
        is truthy is used ('name' and 'value' are passed through
        _sugar.trim_str with maxlen=40); when none qualifies, the class name
        is returned instead.
        """
        label = hasattr(self, 'name') and self.name
        if label:
            return _sugar.trim_str(label, maxlen=40)
        content = hasattr(self, 'value') and self.value
        if content:
            return _sugar.trim_str(unicode(content), maxlen=40)
        ident = hasattr(self, 'id') and self.id
        if ident:
            return str(ident)
        return self.__class__.__name__
class Phyloxml(PhyloElement):
    """Root node of the PhyloXML document.

    Contains an arbitrary number of Phylogeny elements, possibly followed by
    elements from other namespaces.

    @param attributes: (XML namespace definitions)
    @param phylogenies: list of phylogenetic trees
    @param other: list of arbitrary non-phyloXML elements, if any
    """

    def __init__(self, attributes, phylogenies=None, other=None):
        self.attributes = attributes
        self.phylogenies = phylogenies or []
        self.other = other or []

    def __getitem__(self, index):
        """Get a phylogeny by index or name.

        An int or slice indexes the list of phylogenies directly; a string
        returns the first phylogeny whose 'name' equals it.

        @raise KeyError: if index is neither int, slice nor string, or if no
            phylogeny has the requested name
        @raise IndexError: if an integer index is out of range
        """
        if isinstance(index, (int, slice)):
            return self.phylogenies[index]
        if not isinstance(index, basestring):
            # Call-style raise: the "raise E, msg" statement form is a syntax
            # error on Python 3 and builds the same exception on Python 2.
            raise KeyError("can't use %s as an index" % type(index))
        for tree in self.phylogenies:
            if tree.name == index:
                return tree
        raise KeyError("no phylogeny found with name " + repr(index))

    def __iter__(self):
        """Iterate through the phylogenetic trees in this object."""
        return iter(self.phylogenies)

    def __len__(self):
        """Number of phylogenetic trees in this object."""
        return len(self.phylogenies)
class Other(PhyloElement):
    """Generic holder for XML elements that are not part of phyloXML.

    An instance normally carries either a text 'value' or a non-empty list of
    'children', not both; nothing in this class enforces that.

    @param tag: local tag for the XML node
    @param namespace: XML namespace for the node -- should not be the default
        phyloXML namespace.
    @param attributes: string attributes on the XML node
    @param value: text contained directly within this XML node
    @param children: list of child nodes, if any (also Other instances)
    """

    def __init__(self, tag, namespace=None, attributes=None, value=None,
                 children=None):
        # A missing or empty 'children' argument becomes a fresh list
        if not children:
            children = []
        self.tag = tag
        self.namespace = namespace
        self.attributes = attributes
        self.value = value
        self.children = children

    def __iter__(self):
        """Iterate over the child nodes of this object, if there are any."""
        return iter(self.children)
class Phylogeny(PhyloElement, BaseTree.Tree):
    """A phylogenetic tree.

    @param root: the root node/clade of this tree
    @param rooted: True if this tree is rooted
    @param rerootable: True if this tree is rerootable
    @param branch_length_unit: unit for branch_length values on clades
    @type type: str

    @param name: string identifier for this tree, not required to be unique
    @param id: unique identifier for this tree (type Id)
    @param description: plain-text description
    @param date: date for the root node of this tree (type Date)
    @param confidences: list of Confidence objects for this tree
    @param clade_relations: list of CladeRelation objects
    @param sequence_relations: list of SequenceRelation objects
    @param properties: list of Property objects
    @param other: list of non-phyloXML elements (type Other)
    """

    def __init__(self, root=None, rooted=True,
                 rerootable=None, branch_length_unit=None, type=None,
                 # Child nodes
                 name=None, id=None, description=None, date=None,
                 # Collections
                 confidences=None, clade_relations=None,
                 sequence_relations=None, properties=None, other=None):
        assert isinstance(rooted, bool)
        self.root = root
        self.rooted = rooted
        self.rerootable = rerootable
        self.branch_length_unit = branch_length_unit
        self.type = type
        self.name = name
        self.id = id
        self.description = description
        self.date = date
        self.confidences = confidences or []
        self.clade_relations = clade_relations or []
        self.sequence_relations = sequence_relations or []
        self.properties = properties or []
        self.other = other or []

    @classmethod
    def from_tree(cls, tree, **kwargs):
        """Create a new Phylogeny from a generic tree object.

        The tree's root (converted with Clade.from_subtree), its 'rooted'
        flag and its name are copied; an id that is not None is wrapped as
        Id(str(tree.id)).

        @param tree: object providing 'root', 'rooted', 'name' and 'id'
        @param kwargs: extra attributes, written directly into the new
            instance's __dict__
        """
        # Explicit branch instead of "cond and x or None", which would drop
        # the Id if the wrapper object ever evaluated as false.
        tree_id = None
        if tree.id is not None:
            tree_id = Id(str(tree.id))
        phy = cls(root=Clade.from_subtree(tree.root),
                  rooted=tree.rooted,
                  name=tree.name,
                  id=tree_id)
        phy.__dict__.update(kwargs)
        return phy

    @classmethod
    def from_subtree(cls, subtree, **kwargs):
        """Create a new phylogeny rooted at a converted copy of a subtree.

        Delegates to Clade.from_subtree(subtree).to_phylogeny(**kwargs).

        @param subtree: subtree object accepted by Clade.from_subtree
        @param kwargs: extra attributes forwarded to Clade.to_phylogeny
        """
        return Clade.from_subtree(subtree).to_phylogeny(**kwargs)

    def to_phyloxml(self, **kwargs):
        """Create a new PhyloXML object containing just this phylogeny.

        @param kwargs: stored as the 'attributes' of the new Phyloxml object
        """
        return Phyloxml(kwargs, phylogenies=[self])

    def to_alignment(self):
        """Construct an alignment from the aligned sequences in this tree.

        Sequence objects are collected in preorder; those without a mol_seq,
        or whose mol_seq is not flagged as aligned, are skipped. When nothing
        qualifies an empty MultipleSeqAlignment is returned; otherwise the
        alphabet of the first aligned sequence is used for the alignment.
        """
        def is_aligned_seq(elem):
            if not isinstance(elem, Sequence):
                return False
            # mol_seq defaults to None on Sequence; such an element carries
            # no residues and therefore cannot be part of an alignment.
            mol_seq = elem.mol_seq
            return mol_seq is not None and bool(mol_seq.is_aligned)

        seqs = list(self._filter_search(is_aligned_seq, 'preorder', True))
        if not seqs:
            # No aligned sequences were found --> empty MSA
            return MultipleSeqAlignment([])
        first_seq = seqs[0]
        msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
                                   first_seq.get_alphabet())
        msa.extend(seq.to_seqrecord() for seq in seqs[1:])
        return msa

    # Singular property for plural attribute
    def _get_confidence(self):
        """Equivalent to self.confidences[0] if there is only 1 value.

        Returns None when there are no confidence values, and raises
        AttributeError when there is more than one.

        See also: Clade.confidence, Clade.taxonomy
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Phylogeny.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value of this tree.

        A plain int or float is wrapped in a Confidence object first.

        @raise ValueError: if value is neither a number nor a Confidence, or
            if several confidence values are already present
        """
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            raise ValueError("multiple confidence values already exist; "
                             "use Phylogeny.confidences instead")

    confidence = property(_get_confidence, _set_confidence)
class Clade(PhyloElement, BaseTree.Subtree):
    """Describes a branch of the current phylogenetic tree.

    Used recursively, describes the topology of a phylogenetic tree.

    Both 'color' and 'width' elements should be interpreted by client code as
    applying to the whole clade, including all descendants, unless overridden
    in sub-clades. This module doesn't automatically assign these attributes
    to sub-clades to achieve this cascade -- and neither should you.

    @param branch_length: parent branch length of this clade
    @param id_source: link other elements to a clade (on the xml-level)

    @param name: short string label for this clade
    @param confidences: list of Confidence objects, used to indicate the
        support for a clade/parent branch.
    @param width: branch width for this clade (including branch from parent)
    @param color: color used for graphical display of this clade; None, a
        BranchColor, a known color name, a '#RRGGBB' string or an RGB triplet
    @param node_id: unique identifier for the root node of this clade
    @param taxonomies: list of Taxonomy objects
    @param sequences: list of Sequence objects
    @param events: describe such events as gene-duplications at the root
        node/parent branch of this clade
    @param binary_characters: a BinaryCharacters object
    @param distributions: list of Distribution objects
    @param date: a date for the root node of this clade (type Date)
    @param references: list of Reference objects
    @param properties: list of Property objects
    @param clades: list of sub-clades (type Clade)
    @param other: list of non-phyloXML objects
    """

    def __init__(self,
                 # Attributes
                 branch_length=None, id_source=None,
                 # Child nodes
                 name=None, width=None, color=None, node_id=None, events=None,
                 binary_characters=None, date=None,
                 # Collections
                 confidences=None, taxonomies=None, sequences=None,
                 distributions=None, references=None, properties=None,
                 clades=None, other=None):
        self.branch_length = branch_length
        self.id_source = id_source
        self.name = name
        self.width = width
        # Goes through the 'color' property setter, which normalizes names,
        # hex strings and RGB triplets to BranchColor instances.
        self.color = color
        self.node_id = node_id
        self.events = events
        self.binary_characters = binary_characters
        self.date = date
        self.confidences = confidences or []
        self.taxonomies = taxonomies or []
        self.sequences = sequences or []
        self.distributions = distributions or []
        self.references = references or []
        self.properties = properties or []
        self.clades = clades or []
        self.other = other or []

    @classmethod
    def from_subtree(cls, subtree, **kwargs):
        """Create a new Clade from a BaseTree.Subtree object.

        The branch length and name are copied and the sub-clades are
        converted recursively.

        @param kwargs: extra attributes written directly into the __dict__ of
            the top-level clade only (they are not passed down the recursion)
        """
        clade = cls(branch_length=subtree.branch_length,
                    name=subtree.name)
        clade.clades = [cls.from_subtree(st) for st in subtree.clades]
        clade.__dict__.update(kwargs)
        return clade

    def to_phylogeny(self, **kwargs):
        """Create a new phylogeny containing just this clade.

        The clade becomes the root of the phylogeny and its date is reused.

        @param kwargs: extra attributes written directly into the new
            phylogeny's __dict__
        """
        phy = Phylogeny(root=self, date=self.date)
        phy.__dict__.update(kwargs)
        return phy

    # Shortcuts for list attributes that are usually only 1 item
    def _get_confidence(self):
        """Return the only confidence value, or None if there is none.

        @raise AttributeError: if more than one confidence value is present
        """
        if len(self.confidences) == 0:
            return None
        if len(self.confidences) > 1:
            raise AttributeError("more than 1 confidence value available; "
                                 "use Clade.confidences")
        return self.confidences[0]

    def _set_confidence(self, value):
        """Set the single confidence value of this clade.

        A plain int or float is wrapped in a Confidence object first.

        @raise ValueError: if value is neither a number nor a Confidence, or
            if several confidence values are already present
        """
        if isinstance(value, (float, int)):
            value = Confidence(value)
        elif not isinstance(value, Confidence):
            raise ValueError("value must be a number or Confidence instance")
        if len(self.confidences) == 0:
            self.confidences.append(value)
        elif len(self.confidences) == 1:
            self.confidences[0] = value
        else:
            # Names this class (the message previously pointed at Phylogeny)
            raise ValueError("multiple confidence values already exist; "
                             "use Clade.confidences instead")

    confidence = property(_get_confidence, _set_confidence)

    def _get_taxonomy(self):
        """Return the only taxonomy, or None if there is none.

        @raise AttributeError: if more than one taxonomy is present
        """
        if len(self.taxonomies) == 0:
            return None
        if len(self.taxonomies) > 1:
            raise AttributeError("more than 1 taxonomy value available; "
                                 "use Clade.taxonomies")
        return self.taxonomies[0]

    def _set_taxonomy(self, value):
        """Set the single taxonomy of this clade.

        @raise ValueError: if value is not a Taxonomy instance, or if several
            taxonomies are already present
        """
        if not isinstance(value, Taxonomy):
            raise ValueError("assigned value must be a Taxonomy instance")
        if len(self.taxonomies) == 0:
            self.taxonomies.append(value)
        elif len(self.taxonomies) == 1:
            self.taxonomies[0] = value
        else:
            # Phylogeny has no 'taxonomies' attribute; point at this class
            raise ValueError("multiple taxonomy values already exist; "
                             "use Clade.taxonomies instead")

    taxonomy = property(_get_taxonomy, _set_taxonomy)

    # Syntax sugar for setting the branch color
    def _get_color(self):
        """Return the stored BranchColor, or None if no color is set."""
        return self._color

    def _set_color(self, arg):
        """Store a branch color given in any of the supported forms.

        Accepted: None, a BranchColor instance, a key of
        BranchColor.color_names, a 7-character '#RRGGBB' string, or an
        iterable of three RGB values.

        @raise ValueError: if arg matches none of these forms
        """
        if arg is None or isinstance(arg, BranchColor):
            self._color = arg
        elif isinstance(arg, basestring):
            if arg in BranchColor.color_names:
                # Known color name
                self._color = BranchColor.from_name(arg)
            elif arg.startswith('#') and len(arg) == 7:
                # HTML-style hex string
                self._color = BranchColor.from_hex(arg)
            else:
                raise ValueError("invalid color string %s" % arg)
        elif hasattr(arg, '__iter__') and len(arg) == 3:
            # RGB triplet
            self._color = BranchColor(*arg)
        else:
            # Wrap in a 1-tuple: a bare tuple operand (e.g. a 2- or 4-item
            # color) would make '%' raise TypeError instead of ValueError.
            raise ValueError("invalid color value %s" % (arg,))

    color = property(_get_color, _set_color, doc="Branch color.")
# PhyloXML-specific complex types
class Accession(PhyloElement):
    """The local part of a sequence identifier, together with its source.

    Example: for 'UniProtKB:P17304' the 'value' attribute is 'P17304' and the
    'source' attribute is 'UniProtKB'.

    @param value: local part of the identifier
    @param source: name of the database or authority the identifier is from
    """

    def __init__(self, value, source):
        self.source = source
        self.value = value
class Annotation(PhyloElement):
    """Annotation attached to a molecular sequence.

    Using the optional 'ref' attribute is the recommended way to annotate
    (examples of acceptable values: 'GO:0008270',
    'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1').

    @type ref: str
    @param source: plain-text source for this annotation
    @param evidence: describe evidence as free text (e.g. 'experimental')
    @type type: str

    @param desc: free text description
    @param confidence: state the type and value of support (type Confidence)
    @param properties: list of typed and referenced annotations from external
        resources
    @type uri: Uri
    """

    # Expected shape of 'ref': "<prefix>:<local part>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')

    def __init__(self,
                 # Attributes
                 ref=None, source=None, evidence=None, type=None,
                 # Child nodes
                 desc=None, confidence=None, uri=None,
                 # Collection
                 properties=None):
        # A malformed ref only produces a warning; the value is still stored
        check_str(ref, self.re_ref.match)
        if not properties:
            properties = []
        self.ref = ref
        self.source = source
        self.evidence = evidence
        self.type = type
        self.desc = desc
        self.confidence = confidence
        self.uri = uri
        self.properties = properties
class BinaryCharacters(PhyloElement):
    """Binary characters at the root of a clade.

    Holds the names and/or counts of the binary characters that are present,
    absent, gained and lost. Each of the four name collections defaults to an
    empty list.
    """

    def __init__(self,
                 # Attributes
                 type=None, gained_count=None, lost_count=None,
                 present_count=None, absent_count=None,
                 # Child nodes (flattened into collections)
                 gained=None, lost=None, present=None, absent=None):
        self.type = type
        # Counts
        self.gained_count = gained_count
        self.lost_count = lost_count
        self.present_count = present_count
        self.absent_count = absent_count
        # Name collections
        self.gained = gained if gained else []
        self.lost = lost if lost else []
        self.present = present if present else []
        self.absent = absent if absent else []
class BranchColor(PhyloElement):
    """Indicates the color of a clade when rendered graphically.

    The color should be interpreted by client code (e.g. visualization
    programs) as applying to the whole clade, unless overwritten by the
    color(s) of sub-clades.

    Color values must be integers from 0 to 255.

    @param red: red component, 0 to 255
    @param green: green component, 0 to 255
    @param blue: blue component, 0 to 255
    """

    # Maps color names (and one-letter abbreviations) to RGB triplets
    color_names = {
        'red': (255, 0, 0),
        'r': (255, 0, 0),
        'yellow': (255, 255, 0),
        'y': (255, 255, 0),
        'green': (0, 128, 0),
        'g': (0, 128, 0),
        'cyan': (0, 255, 255),
        'c': (0, 255, 255),
        'blue': (0, 0, 255),
        'b': (0, 0, 255),
        'magenta': (255, 0, 255),
        'm': (255, 0, 255),
        'black': (0, 0, 0),
        'k': (0, 0, 0),
        'white': (255, 255, 255),
        'w': (255, 255, 255),
        # Names standardized in HTML/CSS spec
        # http://w3schools.com/html/html_colornames.asp
        'maroon': (128, 0, 0),
        'olive': (128, 128, 0),
        'lime': (0, 255, 0),
        'aqua': (0, 255, 255),
        'teal': (0, 128, 128),
        'navy': (0, 0, 128),
        'fuchsia': (255, 0, 255),
        'purple': (128, 0, 128),
        'silver': (192, 192, 192),
        'gray': (128, 128, 128),
        # More definitions from matplotlib/gcolor2
        'grey': (128, 128, 128),
        'pink': (255, 192, 203),
        'salmon': (250, 128, 114),
        'orange': (255, 165, 0),
        'gold': (255, 215, 0),
        'tan': (210, 180, 140),
        'brown': (165, 42, 42),
    }

    def __init__(self, red, green, blue):
        # Kept as assertions so callers that catch AssertionError still work
        for color in (red, green, blue):
            assert (isinstance(color, int)
                    and 0 <= color <= 255
                    ), "Color values must be integers between 0 and 255."
        self.red = red
        self.green = green
        self.blue = blue

    @classmethod
    def from_hex(cls, hexstr):
        """Construct a BranchColor object from a hexadecimal string.

        The string format is the same style used in HTML and CSS, such as
        '#FF8000' for an RGB value of (255, 128, 0).

        @param hexstr: 7-character string: '#' followed by six hex digits
        """
        assert (isinstance(hexstr, basestring)
                and hexstr.startswith('#')
                and len(hexstr) == 7
                ), "need a 24-bit hexadecimal string, e.g. #000000"

        def unpack(cc):
            # The explicit '0x' prefix rejects signs and padding in cc
            return int('0x' + cc, base=16)

        channels = (hexstr[1:3], hexstr[3:5], hexstr[5:])
        return cls(*[unpack(cc) for cc in channels])

    @classmethod
    def from_name(cls, colorname):
        """Construct a BranchColor object by the color's name.

        @param colorname: a key of BranchColor.color_names
        @raise KeyError: if the name is not a known color
        """
        return cls(*cls.color_names[colorname])

    def to_hex(self):
        """Return a 24-bit hexadecimal RGB representation of this color.

        The returned string is suitable for use in HTML/CSS, as a color
        parameter in matplotlib, and perhaps other situations.

        Example:

            >>> bc = BranchColor(12, 200, 100)
            >>> bc.to_hex()
            '#0cc864'
        """
        # Pack the three channels into one integer and zero-pad to 6 digits
        packed = self.red * (16 ** 4) + self.green * (16 ** 2) + self.blue
        return '#%06x' % packed

    def to_rgb(self):
        """Return a tuple of RGB values (0 to 255) representing this color.

        Example:

            >>> bc = BranchColor(255, 165, 0)
            >>> bc.to_rgb()
            (255, 165, 0)
        """
        return (self.red, self.green, self.blue)

    def __repr__(self):
        """Preserve the standard RGB order when representing this object."""
        # Returned as a native string: encoding it to UTF-8 was a no-op on
        # Python 2 and would make repr() fail with bytes on Python 3.
        return ('%s(red=%d, green=%d, blue=%d)'
                % (self.__class__.__name__, self.red, self.green, self.blue))

    def __str__(self):
        """Show the color's RGB values."""
        return "(%d, %d, %d)" % (self.red, self.green, self.blue)
class CladeRelation(PhyloElement):
    """A typed relationship between two clades.

    One possible use is describing several parents of a single clade.

    @type id_ref_0: str
    @type id_ref_1: str
    @type distance: str
    @type type: str

    @type confidence: Confidence
    """

    def __init__(self, type, id_ref_0, id_ref_1,
                 distance=None, confidence=None):
        self.type = type
        self.id_ref_0 = id_ref_0
        self.id_ref_1 = id_ref_1
        self.distance = distance
        self.confidence = confidence
class Confidence(PhyloElement):
    """General-purpose confidence element.

    Can express, for instance, the bootstrap support value of a clade (the
    'type' attribute is then 'bootstrap').

    @type value: float
    @type type: str
    """

    def __init__(self, value, type='unknown'):
        self.type = type
        self.value = value

    def __float__(self):
        """Return the confidence value converted with float()."""
        as_float = float(self.value)
        return as_float

    def __int__(self):
        """Return the confidence value converted with int()."""
        as_int = int(self.value)
        return as_int
class Date(PhyloElement):
    """A date associated with a clade/node.

    The date can be numerical (the 'value' element) and/or free text (the
    'desc' element, e.g. 'Silurian'). When a numerical value is given, using
    the 'unit' attribute as well is recommended.

    @param unit: type of numerical value (e.g. 'mya' for 'million years ago')

    @type value: float
    @param desc: plain-text description of the date
    @param minimum: lower bound on the date value
    @param maximum: upper bound on the date value
    """

    def __init__(self, value=None, unit=None, desc=None,
                 minimum=None, maximum=None):
        self.value = value
        self.unit = unit
        self.desc = desc
        self.minimum = minimum
        self.maximum = maximum

    def __str__(self):
        """Return a human-readable date, falling back to the class name.

        '<value> <unit>' is preferred when a unit is set and the value is not
        None; otherwise 'desc' is used if it is not None.
        """
        has_number = bool(self.unit) and self.value is not None
        if has_number:
            return '%s %s' % (self.value, self.unit)
        if self.desc is None:
            return self.__class__.__name__
        return self.desc
class Distribution(PhyloElement):
    """Geographic distribution of the items of a clade (species, sequences).

    Meant for phylogeographic applications. A location may be given as free
    text in 'desc' and/or as the coordinates of one or more 'Points' (similar
    to the 'Point' element in Google's KML format) or as 'Polygons'.

    @param desc: free-text description of the location
    @param points: list of Point objects
    @param polygons: list of Polygon objects
    """

    def __init__(self, desc=None, points=None, polygons=None):
        if not points:
            points = []
        if not polygons:
            polygons = []
        self.desc = desc
        self.points = points
        self.polygons = polygons
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    @param length: total length of the protein sequence (type int)
    @param domains: list of ProteinDomain objects; defaults to an empty list
    """

    def __init__(self, length=None, domains=None):
        self.length = length
        # Always a list, like every other collection attribute in this
        # module, so the domains can be iterated without a None check.
        self.domains = domains or []
class Events(PhyloElement):
    """Events at the root node of a clade (e.g. one gene duplication).

    Every attribute defaults to None. The object also behaves like a
    dictionary over its attributes: a None value counts as a missing key, and
    deleting a key resets the corresponding attribute to None.
    """

    # Values accepted for 'type' without a warning
    ok_type = set(['transfer', 'fusion', 'speciation_or_duplication',
                   'other', 'mixed', 'unassigned'])

    def __init__(self, type=None, duplications=None, speciations=None,
                 losses=None, confidence=None):
        # An unrecognized type only triggers a warning; it is still stored
        check_str(type, self.ok_type.__contains__)
        self.type = type
        self.duplications = duplications
        self.speciations = speciations
        self.losses = losses
        self.confidence = confidence

    def iteritems(self):
        """Iterate over (name, value) pairs whose value is not None."""
        return ((name, val) for name, val in vars(self).iteritems()
                if val is not None)

    def iterkeys(self):
        """Iterate over the names of attributes whose value is not None."""
        return (name for name, val in vars(self).iteritems()
                if val is not None)

    def itervalues(self):
        """Iterate over the attribute values that are not None."""
        return (val for val in vars(self).itervalues() if val is not None)

    def items(self):
        """List of (name, value) pairs whose value is not None."""
        return [pair for pair in self.iteritems()]

    def keys(self):
        """List of the names of attributes whose value is not None."""
        return [name for name in self.iterkeys()]

    def values(self):
        """List of the attribute values that are not None."""
        return [val for val in self.itervalues()]

    def __len__(self):
        """Number of attributes whose value is not None."""
        set_values = self.values()
        return len(set_values)

    def __getitem__(self, key):
        """Return the attribute named key.

        @raise KeyError: if there is no such attribute or its value is None
        """
        if hasattr(self, key):
            found = getattr(self, key)
            if found is not None:
                return found
            raise KeyError("%s has not been set in this object" % repr(key))
        raise KeyError(key)

    def __setitem__(self, key, val):
        """Set the attribute named key to val."""
        setattr(self, key, val)

    def __delitem__(self, key):
        """Reset the attribute named key to None."""
        setattr(self, key, None)

    def __iter__(self):
        """Iterate over the names of attributes whose value is not None."""
        return iter(self.iterkeys())

    def __contains__(self, key):
        """True if the attribute named key exists and is not None."""
        if not hasattr(self, key):
            return False
        return getattr(self, key) is not None
class Id(PhyloElement):
    """General-purpose identifier element.

    Stores the identifier itself together with, optionally, the provider (or
    authority) that issued it, e.g. NCBI.

    @param value: the identifier
    @param provider: provider or authority of the identifier, if known
    """

    def __init__(self, value, provider=None):
        self.provider = provider
        self.value = value
class MolSeq(PhyloElement):
    """Holder for a molecular sequence.

    @param value: the sequence, as a string
    @param is_aligned: True if mol_seq is aligned (usu. meaning gaps are
        introduced and all aligned seqs are the same length)
    """

    # Characters expected in a sequence string
    re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')

    def __init__(self, value, is_aligned=None):
        # A non-matching value only triggers a warning; it is still stored
        check_str(value, self.re_value.match)
        self.is_aligned = is_aligned
        self.value = value

    def __str__(self):
        """Return the sequence string itself."""
        sequence = self.value
        return sequence
class Point(PhyloElement):
    """Geographic coordinates of a point, optionally with an altitude.

    Used by the 'Distribution' element.

    @param geodetic_datum: indicate the geodetic datum (also called 'map
        datum'). For example, Google's KML uses 'WGS84'. (required)
    @param lat: latitude
    @param long: longitude
    @param alt: altitude
    @param alt_unit: unit for the altitude (e.g. 'meter')
    """

    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        self.geodetic_datum = geodetic_datum
        # Horizontal position
        self.lat = lat
        self.long = long
        # Optional vertical position
        self.alt = alt
        self.alt_unit = alt_unit
class Polygon(PhyloElement):
    """Polygon given by a list of 'Points' (used by element 'Distribution').

    @param points: list of 3 or more points representing vertices.
    """

    def __init__(self, points=None):
        if not points:
            points = []
        self.points = points
class Property(PhyloElement):
    """A typed and referenced property from an external resource.

    Can be attached to 'Phylogeny', 'Clade', and 'Annotation' objects.

    @param ref: reference to an external resource, e.g. "NOAA:depth"
    @param unit: the unit of the property, e.g. "METRIC:m" (optional)
    @param datatype: indicates the type of a property and is limited to
        xsd-datatypes (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer',
        'xsd:decimal', 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
    @param applies_to: indicates the item to which a property applies (e.g.
        'node' for the parent node of a clade, 'parent_branch' for the parent
        branch of a clade, or just 'clade').
    @param id_ref: allows attaching a property specifically to one element
        (on the xml-level). (optional)

    @type value: str
    """

    # Expected shape of 'ref' and 'unit': "<prefix>:<local part>"
    re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
    # Values accepted for 'applies_to' without a warning
    ok_applies_to = set(['phylogeny', 'clade', 'node', 'annotation',
                         'parent_branch', 'other'])
    # Values accepted for 'datatype' without a warning
    ok_datatype = set([
        'xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
        'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time',
        'xsd:date', 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay',
        'xsd:gDay', 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary',
        'xsd:anyURI', 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
        'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long',
        'xsd:int', 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger',
        'xsd:unsignedLong', 'xsd:unsignedInt', 'xsd:unsignedShort',
        'xsd:unsignedByte', 'xsd:positiveInteger',
    ])

    def __init__(self, value, ref, applies_to, datatype,
                 unit=None, id_ref=None):
        # Each check only warns on a mismatch; all values are stored as given
        check_str(ref, self.re_ref.match)
        check_str(applies_to, self.ok_applies_to.__contains__)
        check_str(datatype, self.ok_datatype.__contains__)
        check_str(unit, self.re_ref.match)
        self.value = value
        self.ref = ref
        self.applies_to = applies_to
        self.datatype = datatype
        self.unit = unit
        self.id_ref = id_ref
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    @param value: label of the domain; used as the id of the SeqFeature
        produced by to_seqfeature
    @param start: start of the domain on the sequence, using 0-based indexing
    @type start: non-negative integer
    @param end: end of the domain on the sequence
    @type end: non-negative integer
    @param confidence: can be used to store e.g. E-values. (type float)
    @param id: unique identifier/name
    """

    # TODO: confirm whether 'start' as written in phyloXML files counts from
    # 1 rather than 0 (this class itself documents 0-based indexing)
    def __init__(self, value, start, end, confidence=None, id=None):
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a new domain from a SeqFeature object.

        The feature's id becomes the domain's value, its non-fuzzy start and
        end become the location, and the 'confidence' qualifier (None when
        absent) becomes the confidence.
        """
        # Build through cls so that subclasses get instances of themselves
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature object from this domain.

        The confidence, when the attribute exists (even if it is None), is
        stored under the 'confidence' qualifier.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
class Reference(PhyloElement):
    """Literature reference for a clade.

    Whenever possible, the 'doi' attribute should be preferred over the free
    text 'desc' element.

    @param doi: DOI of the referenced publication
    @param desc: free-text description of the reference
    """

    # Expected shape of a DOI: "<prefix>/<suffix>"
    re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')

    def __init__(self, doi=None, desc=None):
        # A malformed DOI only triggers a warning; it is still stored
        check_str(doi, self.re_doi.match)
        self.desc = desc
        self.doi = doi
class Sequence(PhyloElement):
"""A molecular sequence (Protein, DNA, RNA) associated with a node.
One intended use for 'id_ref' is to link a sequence to a taxonomy (via the
taxonomy's 'id_source') in case of multiple sequences and taxonomies per
node.
@param type: type of sequence ('dna', 'rna', or 'protein').
@type id_ref: str
@type id_source: str
@param symbol: short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
@type accession: Accession
@param name: full name of the sequence, e.g. 'muscle Actin'
@param location: location of a sequence on a genome/chromosome.
@type mol_seq: MolSeq
@type uri: Uri
@param annotations: list of Annotation objects
@param domain_architecture: protein domains on this sequence (type
DomainArchitecture)
@param other: list of non-phyloXML elements (type Other)
"""
alphabets = {'dna': Alphabet.generic_dna,
'rna': Alphabet.generic_rna,
'protein': Alphabet.generic_protein}
re_symbol = re.compile(r'\S{1,10}')
def __init__(self,
             # Attributes
             type=None, id_ref=None, id_source=None,
             # Child nodes
             symbol=None, accession=None, name=None, location=None,
             mol_seq=None, uri=None, domain_architecture=None,
             # Collections
             annotations=None, other=None):
    """Store the given values; 'type' and 'symbol' are checked first.

    A type that is not a key of 'alphabets', or a symbol that does not match
    're_symbol', only triggers a warning. Missing or empty 'annotations' and
    'other' arguments become fresh lists.
    """
    check_str(type, self.alphabets.__contains__)
    check_str(symbol, self.re_symbol.match)
    if not annotations:
        annotations = []
    if not other:
        other = []
    # Attributes
    self.type = type
    self.id_ref = id_ref
    self.id_source = id_source
    # Child nodes
    self.symbol = symbol
    self.accession = accession
    self.name = name
    self.location = location
    self.mol_seq = mol_seq
    self.uri = uri
    self.domain_architecture = domain_architecture
    # Collections
    self.annotations = annotations
    self.other = other
@classmethod
def from_seqrecord(cls, record, is_aligned=None):
    """Create a new PhyloXML Sequence from a SeqRecord object.

    The record's id, name and description become the accession, symbol and
    name; the sequence string becomes the mol_seq. The sequence type is
    derived from the record's alphabet when it is a DNA, RNA or protein
    alphabet. Recognized entries of record.annotations ('id_ref',
    'id_source', 'location', 'uri', 'annotations') are unpacked, and the
    record's features, if any, become the domain architecture.

    @param record: the SeqRecord to convert
    @param is_aligned: whether the sequence is aligned; when None, this is
        inferred from whether the record's alphabet is an Alphabet.Gapped
        instance
    """
    alphabet = record.seq.alphabet
    if is_aligned is None:
        is_aligned = isinstance(alphabet, Alphabet.Gapped)
    params = {
        'accession': Accession(record.id, ''),
        'symbol': record.name,
        'name': record.description,
        'mol_seq': MolSeq(str(record.seq), is_aligned),
    }
    if isinstance(alphabet, Alphabet.DNAAlphabet):
        params['type'] = 'dna'
    elif isinstance(alphabet, Alphabet.RNAAlphabet):
        params['type'] = 'rna'
    elif isinstance(alphabet, Alphabet.ProteinAlphabet):
        params['type'] = 'protein'

    # Unpack record.annotations
    for key in ('id_ref', 'id_source', 'location'):
        if key in record.annotations:
            params[key] = record.annotations[key]
    if isinstance(record.annotations.get('uri'), dict):
        params['uri'] = Uri(**record.annotations['uri'])
    # Build a Sequence.annotation object
    if record.annotations.get('annotations'):
        params['annotations'] = []
        for annot in record.annotations['annotations']:
            ann_args = {}
            for key in ('ref', 'source', 'evidence', 'type', 'desc'):
                if key in annot:
                    ann_args[key] = annot[key]
            # 'confidence' is expected as a list of Confidence arguments
            if isinstance(annot.get('confidence'), list):
                ann_args['confidence'] = Confidence(
                    *annot['confidence'])
            # Entries of 'properties' that are not dicts are ignored
            if isinstance(annot.get('properties'), list):
                ann_args['properties'] = [Property(**prop)
                                          for prop in annot['properties']
                                          if isinstance(prop, dict)]
            params['annotations'].append(Annotation(**ann_args))

    # Unpack record.features
    if record.features:
        params['domain_architecture'] = DomainArchitecture(
            length=len(record.seq),
            domains=[ProteinDomain.from_seqfeature(feat)
                     for feat in record.features])

    # Build through cls so that subclasses get instances of themselves
    return cls(**params)
def to_seqrecord(self):
"""Create a SeqRecord object from this Sequence instance.
The seqrecord.annotations dictionary is packed like so::
{ # Sequence attributes with no SeqRecord equivalent:
'id_ref': self.id_ref,
'id_source': self.id_source,
'location': self.location,
'uri': { 'value': self.uri.value,
'desc': self.uri.desc,
'type': self.uri.type },
# Sequence.annotations attribute (list of Annotations)
'annotations': [{ 'ref': ann.ref,
'source': ann.source,
'evidence': ann.evidence,
'type': ann.type,
'confidence': [ ann.confidence.value,
ann.confidence.type ],
'properties': [{ 'value': prop.value,
'ref': prop.ref,
'applies_to': prop.applies_to,
'datatype': prop.datatype,
'unit': prop.unit,