-
Notifications
You must be signed in to change notification settings - Fork 188
/
umi_methods.py
239 lines (185 loc) · 7.56 KB
/
umi_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
'''
umi_methods.py - Methods for dealing with UMIs
=========================================================
'''
from __future__ import absolute_import
import itertools
import collections
import pysam
import numpy as np
# required to make iteritems python2 and python3 compatible
from future.utils import iteritems
from builtins import dict
import umi_tools.Utilities as U
from umi_tools._dedup_umi import edit_distance
RANGES = {
'phred33': (33, 77),
'solexa': (59, 106),
'phred64': (64, 106),
}
###############################################################################
# The code for Record and fastqIterate are taken from CGAT.Fastq:
# https://github.com/CGATOxford/cgat/blob/master/CGAT/Fastq.py
###############################################################################
class Record:
"""A record representing a :term:`fastq` formatted record.
Attributes
----------
identifier : string
Sequence identifier
seq : string
Sequence
quals : string
String representation of quality scores.
format : string
Quality score format. Can be one of ``sanger``,
``phred33``, ``phred64`` or ``solexa``.
"""
def __init__(self, identifier, seq, quals, entry_format=None):
self.identifier, self.seq, self.quals, entry_format = (
identifier, seq, quals, entry_format)
def guessFormat(self):
'''return quality score format -
might return several if ambiguous.'''
c = [ord(x) for x in self.quals]
mi, ma = min(c), max(c)
r = []
for entry_format, v in iteritems(RANGES):
m1, m2 = v
if mi >= m1 and ma < m2:
r.append(entry_format)
return r
def __str__(self):
return "@%s\n%s\n+\n%s" % (self.identifier, self.seq, self.quals)
def fastqIterate(infile, remove_suffix=False):
'''iterate over contents of fastq file. If remove_suffic is True then /1
and /2 is removed from the first field of the identifier. Raises ValueError
if remove_suffix is true and /1 or /2 is not present at the end of the
first field of any reads'''
def convert2string(b):
if type(b) == str:
return b
else:
return b.decode("utf-8")
def removeReadIDSuffix(line):
'''Removes /1 or /2 from the a provided identifier :param:line.
Identifier must be in the first field of the identifier (with ' ' as
the field seperator) and sepreated with a '/'. Raises ValueError if this
is not the case'''
components = line.split(' ')
read_id = components[0]
if not read_id[-2:] in ['/1', '/2']:
raise ValueError(
'read suffix must be /1 or /2. Observed: %s' % read_id[-2:])
read_id = read_id[:-2]
components[0] = read_id
line = ' '.join(components)
return(line)
while 1:
line1 = convert2string(infile.readline()).strip()
if not line1:
break
if not line1.startswith('@'):
U.error("parsing error: expected '@' in line %s" % line1)
line2 = convert2string(infile.readline())
line3 = convert2string(infile.readline())
if not line3.startswith('+'):
U.error("parsing error: expected '+' in line %s" % line3)
line4 = convert2string(infile.readline())
# incomplete entry
if not line4:
U.error("incomplete entry for %s" % line1)
if remove_suffix:
line1 = removeReadIDSuffix(line1)
yield Record(line1[1:], line2[:-1], line4[:-1])
# End of FastqIterate()
###############################################################################
###############################################################################
def joinedFastqIterate(fastq_iterator1, fastq_iterator2,
strict=True):
'''This will return an iterator that returns tuples of fastq records.
At each step it will confirm that the first field of the read name
(before the first whitespace character) is identical between the
two reads. The response if it is not depends on the value of
:param:`strict`. If strict is true an error is returned. If strict
is `False` the second file is advanced until a read that matches
is found.
This allows for protocols where read one contains cell barcodes, and these
reads have been filtered and corrected before processing without regard
to read2
If provided iterators were created with `remove_suffix=True`, /1 and /2 will
be removed from the end of read1 and read2, respectively before
checking their names are identical
'''
def getReadID(read):
return(read.identifier.split(' ')[0])
for read1 in fastq_iterator1:
read2 = next(fastq_iterator2)
read1_id = getReadID(read1)
read2_id = getReadID(read2)
if not strict:
while read2_id != read1_id:
read2 = next(fastq_iterator2)
read2_id = getReadID(read2)
if not read2_id == read1_id:
raise ValueError("\nRead pairs do not match\n%s != %s" %
(read1_id, read2_id))
yield (read1, read2)
def get_average_umi_distance(umis):
if len(umis) == 1:
return -1
dists = [edit_distance(x, y) for
x, y in itertools.combinations(umis, 2)]
return float(sum(dists))/(len(dists))
class random_read_generator:
''' class to generate umis at random based on the
distributon of umis in a bamfile '''
def __init__(self, bamfile, chrom, barcode_getter):
inbam = pysam.Samfile(bamfile)
if chrom:
self.inbam = inbam.fetch(reference=chrom)
else:
self.inbam = inbam.fetch()
self.umis = collections.defaultdict(int)
self.barcode_getter = barcode_getter
self.random_fill_size = 100000 # Higher = faster, more memory
self.first_kerror = 1
self.fill()
def refill_random(self):
''' refill the list of random_umis '''
self.random_umis = np.random.choice(
list(self.umis.keys()), self.random_fill_size, p=self.prob)
self.random_ix = 0
def fill(self):
''' parse the BAM to obtain the frequency for each UMI'''
self.frequency2umis = collections.defaultdict(list)
for read in self.inbam:
if read.is_unmapped:
continue
if read.is_read2:
continue
try:
self.umis[self.barcode_getter(read)[0]] += 1
except KeyError:
continue
self.umis_counter = collections.Counter(self.umis)
total_umis = sum(self.umis_counter.values())
U.info("total_umis %i" % total_umis)
U.info("#umis %i" % len(self.umis_counter))
self.prob = self.umis_counter.values()
sum_prob = sum(self.prob)
self.prob = [float(x) / sum_prob for x in self.prob]
self.refill_random()
def getUmis(self, n):
''' return n umis from the random_umis atr.'''
if n < (self.random_fill_size - self.random_ix):
barcodes = self.random_umis[self.random_ix: self.random_ix+n]
else:
# could use the end of the random_umis but
# let's just make a new random_umis
if n > self.random_fill_size: # ensure random_umis is long enough
self.random_fill_size = n * 2
self.refill_random()
barcodes = self.random_umis[self.random_ix: self.random_ix+n]
self.random_ix += n
return barcodes