/
matchELMpattern.py
executable file
·82 lines (73 loc) · 2.93 KB
/
matchELMpattern.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#---------------------matchELMpattern.py-
#
# Author Perry Evans
# evansjp@mail.med.upenn.edu
# 2008
#
#----------------------------------------
""" Match regular expressions from ELM given a pattern file and fasta file.
Arguments:
    pattern file, e.g. ../../../school/Data/Protein_Annotations/ELM/elm2pattern
    fasta file """
import string, re, sys, utils
import utils_scripting
from collections import defaultdict
# Pre-compiled regex that matchSeq switches to when it re-searches the
# remainder of a sequence for a pattern whose ELM names include 'LIG_PCNA'.
special_p = re.compile("Q.[^FHWY][ILM][^P][^FHILVWYP][DHFM][FMY]..")
# add one to start b/c match.start() is 0-index based
# match.end() is the exclusive bound used for slicing the match, not the
# 0-based index of the last matched residue, so it is printed without adding one
def printResult(protein, elm, match, seq, offset):
    """Print one tab-separated ELM hit to standard output.

    The emitted columns are: protein, start, end, elm, matched text and
    the literal tag 'ELM'.

    protein -- protein identifier (string)
    elm     -- ELM name (string)
    match   -- regex match object obtained by searching `seq`
    seq     -- the string that `match` was found in
    offset  -- number added to the match positions; callers that search
               a truncated sequence pass the count of residues removed
               from its front
    """
    start = match.start()
    end = match.end()
    fields = [
        protein,
        # start is 0-based in the match object, so shift it to 1-based
        str(offset + start + 1),
        # end is an exclusive bound, which already equals the 1-based
        # position of the last matched residue
        str(offset + end),
        elm,
        seq[start:end],
        'ELM',
    ]
    # A single parenthesized argument prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print('\t'.join(fields))
def _printOverlappingMatches(protein, elms, match, regex, seq):
    """Print `match` and every later hit of `regex`, overlapping ones included.

    `match` is the first hit (or None, in which case nothing is printed).
    After each hit the sequence is cut one residue past the hit's start
    and `regex` is searched again on the remainder; `offset` tracks how
    many residues were cut so printed positions refer to the full `seq`.
    """
    remaining = seq
    offset = 0
    while match:
        for elm in elms:
            printResult(protein, elm, match, remaining, offset)
        step = match.start() + 1
        remaining = remaining[step:]
        offset += step
        match = regex.search(remaining)


def matchSeq(protein, seq, pattern2elm, pattern2regex):
    """Search one protein sequence with every ELM pattern and print the hits.

    protein       -- protein identifier, passed through to printResult
    seq           -- the protein sequence (string)
    pattern2elm   -- maps pattern text to a dict keyed by the ELM names
                     that use that pattern
    pattern2regex -- maps pattern text to its compiled regex

    Each hit is printed once for every ELM name mapped to the pattern.
    Three cases are handled:

    * the pattern's ELM names include 'LIG_PCNA': the first hit comes from
      the pattern's own regex, and all following searches on the remainder
      of the sequence use the module-level `special_p` instead;
    * the pattern text starts with '^' or '(^': only the first hit is
      printed, since there is no need to look for more matches when the
      match must occur at the amino end;
    * any other pattern: all hits, including overlapping ones, are printed.
    """
    for elm_pattern, regex in pattern2regex.items():
        elms = pattern2elm[elm_pattern]
        match = regex.search(seq)
        if 'LIG_PCNA' in elms:
            _printOverlappingMatches(protein, elms, match, special_p, seq)
        elif elm_pattern.startswith(('^', '(^')):
            if match:
                for elm in elms:
                    printResult(protein, elm, match, seq, 0)
        else:
            _printOverlappingMatches(protein, elms, match, regex, seq)
# Command-line entry: read the ELM pattern file, compile each distinct
# pattern once, then scan every record of the FASTA file.
req_args = ['pattern file',
            'fasta file']
examples = ['../../Data/ELM/elm2pattern',
            '../../Data/FASTA/Human/hprd.intr.fasta']
# NOTE(review): checkStart presumably validates sys.argv against req_args
# and reports usage on failure -- confirm in utils_scripting.
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)
input_pattern_file = sys.argv[1]
fasta_file = sys.argv[2]

pattern2regex = {}
# pattern text -> {ELM name: True}; several ELM names may share a pattern
pattern2elm = defaultdict(dict)
with open(input_pattern_file) as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the
        # file) instead of failing on the two-way unpack below.
        if not line:
            continue
        elm, pattern = line.split('\t')
        pattern2elm[pattern][elm] = True

for pattern in pattern2elm:
    pattern2regex[pattern] = re.compile(pattern)

for protein, seq in utils.fasta_iter(fasta_file):
    matchSeq(protein, seq, pattern2elm, pattern2regex)