-
Notifications
You must be signed in to change notification settings - Fork 11
/
record_operations.py
163 lines (127 loc) · 4.41 KB
/
record_operations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from copy import copy
try:
# Biopython <1.78
from Bio.Alphabet import DNAAlphabet
has_dna_alphabet = True
except ImportError:
# Biopython >=1.78
has_dna_alphabet = False
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
def complement(dna_sequence):
"""Return the complement of the DNA sequence.
For instance ``complement("ATGCCG")`` returns ``"TACGGC"``.
Uses BioPython for speed.
"""
return str(Seq(dna_sequence).complement())
def set_record_topology(record, topology):
"""Set the Biopython record's topology, possibly passing if already set.
This actually sets the ``record.annotations['topology']``.The ``topology``
parameter can be "circular", "linear", "default_to_circular" (will default
to circular if ``annotations['topology']`` is not already set) or
"default_to_linear".
"""
valid_topologies = [
"circular",
"linear",
"default_to_circular",
"default_to_linear",
]
if topology not in valid_topologies:
raise ValueError("topology should be one of %s." % ", ".join(valid_topologies))
annotations = record.annotations
default_prefix = "default_to_"
if topology.startswith(default_prefix):
if "topology" not in annotations:
annotations["topology"] = topology[len(default_prefix) :]
else:
annotations["topology"] = topology
def record_is_linear(record, default=True):
if "topology" not in record.annotations:
return default
else:
return record.annotations["topology"] == "linear"
def reverse_complement(sequence):
"""Return the reverse-complement of the DNA sequence.
For instance ``complement("ATGCCG")`` returns ``"GCCGTA"``.
Uses BioPython for speed.
"""
return complement(sequence)[::-1]
def sequence_to_biopython_record(
sequence, id="<unknown id>", name="same_as_id", features=()
):
"""Return a SeqRecord of the sequence, ready to be Genbanked."""
if has_dna_alphabet:
seqrecord = SeqRecord(
Seq(sequence, alphabet=DNAAlphabet()),
id=id,
name=id if name == "same_as_id" else name,
features=list(features),
)
else:
seqrecord = SeqRecord(
Seq(sequence),
id=id,
name=id if name == "same_as_id" else name,
features=list(features),
)
seqrecord.annotations["molecule_type"] = "DNA"
return seqrecord
def annotate_record(
seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers
):
"""Add a feature to a Biopython SeqRecord.
Parameters
----------
seqrecord
The Biopython seqrecord to be annotated.
location
Either (start, end) or (start, end, strand). (strand defaults to +1)
feature_type
The type associated with the feature.
margin
Number of extra bases added on each side of the given location.
qualifiers
Dictionary that will be the Biopython feature's `qualifiers` attribute.
"""
if location == "full":
location = (margin, len(seqrecord) - margin)
strand = location[2] if len(location) == 3 else 1
seqrecord.features.append(
SeqFeature(
FeatureLocation(location[0], location[1], strand),
qualifiers=qualifiers,
type=feature_type,
)
)
def crop_record_with_saddling_features(record, start, end, filters=()):
"""Crop the Biopython record, but keep features that are only partially in.
Parameters
----------
record
The Biopython record to crop.
start, end
Coordinates of the segment to crop.
filters
list of functions (feature=>True/False). Any feature that doesn't pass
at least one filter will be filtered out.
"""
cropped = record[start:end]
def is_saddling(f_start, f_end):
return (f_start < start <= f_end) or (f_start <= end < f_end)
saddling_features = [
copy(f)
for f in record.features
if all([fl(f) for fl in filters])
and f.location is not None
and is_saddling(f.location.start, f.location.end)
]
for f in saddling_features:
f.location = FeatureLocation(
start=max(f.location.start, start),
end=min(f.location.end, end),
strand=f.location.strand,
)
cropped.features.append(f)
return cropped