/
parse_fasta.py
executable file
·76 lines (70 loc) · 3.78 KB
/
parse_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#! /usr/bin/env python3
"""
Parse/reformat a fasta file
--Weronika Patena, nov2008
Read standard fasta files:
The input is a a generator - call as :parse_fasta(open(filename)):
The output is a generator - creates on the fly an iterator over
all the (header, seq) pairs of a fasta file, to use like this:
>> for (header,seq) in parse_fasta(open(infile)): <do stuff>
"""
import sys
debug = 0
def print_seq(line):
""" printing help function - takes (header,seq) tuples from fasta, or line strings from text. """
# if the line is a string instead of a (header,seq) tuple, just print it
if isinstance(line,str):
print(line)
# otherwise you got a (header,seq) tuple
else:
(header,seq) = line
print(">%s\n%s"%(header,seq))
def parse_fasta(INPUT, not_standard_nucleotides=False):
""" Usage: for (header,seq) in parse_fasta(input): <do stuff>. Input can be a filename or generator. """
header,seq = '',''
# If the input is a filename, open it first
if type(INPUT)==str:
isfile = True
INPUT = open(INPUT)
else:
isfile = False
for line in INPUT:
# DON'T skip empty lines - there may be empty sequences in the file!
#if not line.strip(): continue
# if you find a header line, start a new entry, first yielding the previous entry
# (unless there is no previous entry, i.e no header, since a sequence can be empty)
if line and line[0]=='>':
if header:
yield (header,''.join(seq))
header = line[1:].strip() # line[1:] to get rid of the initial >
seq = []
# otherwise it's a seq line - add it to the current sequence (strip spaces/newlines)
else:
# get rid of all whitespace characters by splitting and joining (there's probably a better way...)
curr_seq = ''.join(line.strip().split())
# if there are illegal seq characters, exit with an error
# (checking for illegal characters by using translate to delete all legal characters and seeing if there's anything left)
if not not_standard_nucleotides:
# maketrans makes a translation table - translate arg1 characters to arg2 characters and delete arg3 ones
# (I don't know why it's a method instead of a function, it's stupid, since it doesn't use the parent string!)
wrong_characters = curr_seq.upper().translate(''.maketrans('', '', 'ACTGURYKMSWBDHVN.-*'))
# in python 2 this looked like this instead (empty translation table and a separate list of chars to delete)
# from string import maketrans
# wrong_characters = seq.upper().translate(maketrans('',''), 'ACTGURYKMSWBDHVN.-*'})
if wrong_characters:
raise Exception("Error: invalid sequence line! %s - wrong characters \"%s\""%(curr_seq, wrong_characters))
# TODO shouldn't really hard-code the allowed bases...
# MAYBE-TODO include option for proteins to check those? And maybe an option for just ACTG?
seq.append(curr_seq)
if not header:
raise Exception("Error: Found a sequence line without a header first! %s"%curr_seq)
if isfile: INPUT.close()
# also return the last entry, if it's non-empty!
if header:
yield (header,''.join(seq))
### If called directly, test everything
if __name__ == '__main__':
# TODO unfortunately if the file's wrong read_input will still read it whole before noticing! But if the parse_fasta function is called directly, it can be given a filename and read it line-by-line.
import read_input
input = read_input.read_input()
for (header,seq) in parse_fasta(input): print_seq((header,seq)) # nice printing