forked from jgurtowski/ectools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gc_count.py
68 lines (48 loc) · 1.82 KB
/
gc_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
import sys
import operator
from seqio import fastaIterator
from itertools import groupby,izip,imap
from nucio import NucRecord, NucRecordTypes, getNucmerAlignmentIterator
from cov import fillc, getMarkedRanges, getCoverageFromNucAlignments
from gccontent import getGCSlidingWindow
GC_WINDOW_SIZE = 300
GC_THRESHOLD = 0.7
MIN_COV_GAP = 100
if not len(sys.argv) == 4:
print "gc_count.py reads.fa alignments.sc outprefix"
sys.exit(1)
rfh = open(sys.argv[1])
afh = open(sys.argv[2])
ofh = open(sys.argv[3]+".uncov.gc.bases","w")
reads = {}
for entry in fastaIterator(rfh):
reads[str(entry.name)] = str(entry.seq)
sys.stderr.write("Loaded reads\n")
alignmentIt = getNucmerAlignmentIterator(afh)
sys.stderr.write("Loaded Alignments\n");
counter = 0
for name,group in groupby(alignmentIt, lambda x: x.sname):
#build coverage vector
cov = getCoverageFromNucAlignments(group)
#mark the regions with 0 (no) coverage as 1 and change
#everything else to 0
cov_inv = map(lambda c: 1 if c == 0 else 0, cov)
#ranges with zero coverage
zero_cov_ranges = getMarkedRanges(cov_inv)
seq = reads[name]
#calculate GC % for windows of GC_WINDOW_SIZE
gc_sliding_window = getGCSlidingWindow(seq, GC_WINDOW_SIZE)
#filter gaps that are at > MIN_COV_GAP
#and have at least one base > GC_THRESHOLD
#take the sum of the lengths of all of the regions
gc_gap_bases = sum(map(lambda (s,e):
e-s if e-s > MIN_COV_GAP and any(map(lambda x: True if x > GC_THRESHOLD else False, gc_sliding_window[s:e])) else 0,
zero_cov_ranges))
ofh.write("%s\t%d\n" % (name,gc_gap_bases))
if counter % 10000 == 0:
sys.stderr.write("Read num: %d\n" % counter)
counter += 1
rfh.close()
afh.close()
ofh.close()