-
Notifications
You must be signed in to change notification settings - Fork 0
/
gess.py
executable file
·139 lines (115 loc) · 4.77 KB
/
gess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
#docstring
"""
gess: a tool to estimate genome size from Illumina reads
Uses k-mer counting: requires kmc
input: illumina fastq reads - R1 and R2
output: estimated genome size
"""
#global variables
MIN_KMER = 16
MAX_KMER = 127
MIN_BINS = 10
MAX_BINS = 300
MIN_CUTOFF = 3
MAX_CUTOFF = 20
#import modules
import subprocess
import sys
import os
import logging
import re #regular expressions
import argparse
from argparse import RawTextHelpFormatter
def main():
# make parser, add arguments, read in args from user
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='estimates genome size', usage='\n %(prog)s args.fastq_file', prog='gess')
parser.add_argument('--version', '-v', action='version', version='%(prog)s 0.1')
parser.add_argument("fastq_file")
parser.add_argument('--kmer', '-k', type=int, default=25, help='kmer size for KMC')
#kmer size can affect the estimate; depends on noise and depth
parser.add_argument('--nbins', '-n', type=int, default=200, help='number of bins')
parser.add_argument('--cutoff', '-ci', type=int, default=3, help='discard kmers if frequency < cutoff')
parser.add_argument('--tmpdir', '-t', default='/tmp')
parser.add_argument('--quiet', '-q', action="store_true") #if user specifies this arg, it has value True
parser.add_argument('--threads', type=int, default=1)
parser.add_argument('--citation', action="store_true")
parser.add_argument('--check') ##TODO - check kmc runs
args = parser.parse_args()
#set logging level
if args.quiet:
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.DEBUG)
#citation
if args.citation:
print("Citation: Gess, a tool for genome size estimation. GitHub")
#check input is correct
if args.kmer<MIN_KMER or args.kmer>MAX_KMER:
logging.error("kmer value should be between {} and {}".format(MIN_KMER, MAX_KMER))
#TODO find out kmc's allowed kmers
sys.exit(1)
if args.nbins<MIN_BINS or args.nbins>MAX_BINS: #TODO check suitable bin sizes
logging.error("nbins should be between {} and {}".format(MIN_BINS, MAX_BINS))
sys.exit(1)
if args.cutoff<MIN_CUTOFF or args.cutoff>MAX_CUTOFF: #TODO check suitable cutoffs
logging.error("cutoff should be between {} and {}".format(MIN_CUTOFF, MAXCUTOFF))
sys.exit(1)
if args.threads < 1:
logging.error("threads should be greater than 0")
sys.exit(1)
# use process ID to name temp files
pid = str(os.getpid())
logging.info("the pid is: " + pid)
#TODO: check that kmc works
#get the version of kmc
#kmc_cmd = ["kmc"]
#kmc_check = subprocess.check_output(kmc_cmd)
#kmc_pattern = re.search(r'ver.\s(d+)', kmc_check)
# check kmc runs at all
# try / except
# talk to the user
logging.info("You provided this fastq file: {}". format(args.fastq_file))
logging.info("Now running KMC with these commands:")
cmd = ["kmc", "-k"+str(args.kmer), "-n"+str(args.nbins), "-ci"+str(args.cutoff), "-t"+str(args.threads), args.fastq_file, pid, args.tmpdir]
logging.debug(cmd)
# -ci3 option discards kmers with freq <3
# -n200 sets number of bins to 200 because you can only have 256 files open at once
# could also change this with ulimit
#/tmp is the working directory
# check that kmc ran properly
try:
result = subprocess.check_output(cmd) #this makes a bytes thing
#subprocess.check_output is equivalent to subprocess.run with the default arg check=True
except:
logging.exception("error running kmc")
sys.exit(1)
#delete unnecessary files
logging.info("removing unnecessary files")
os.remove(pid + ".kmc_pre")
os.remove(pid + ".kmc_suf")
#Report the output
#print(type(result))
#print(result)
converted = str(result, 'utf-8') #converts bytes to string
#print(type(converted))
#print(converted)
#No. of unique counted k-mers : 2784627
pattern = re.search(r'unique counted k-mers\s*:\s*(\d+)', converted)
#use re.search not re.match
genome_size = (pattern.groups())
#print(genome_size[0]) #extract first item of tuple
#can exclude this step now
#split = converted.splitlines() # a list of strings
#print(split)
#for line in split:
# if "unique counted" in line:
# #print(line)
# numbers = line
# numbers = ''.join(n for n in numbers if n in '0123456789')
genome_size_inMbp = int(genome_size[0])/1_000_000
logging.info("Estimated genome size is {0:.2f} Mbp".format(genome_size_inMbp))
print("Estimated genome size is {0:.2f} Mbp".format(genome_size_inMbp))
#print(genome_size[0]) #TODO make stdout?
if __name__=="__main__":
main()