/
look_at_polPA_mapk.py
121 lines (104 loc) · 3.7 KB
/
look_at_polPA_mapk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from collections import defaultdict
import os, utils
def get_host_freqs(afile):
freqs = {}
len_counts = defaultdict(utils.init_zero)
with open(afile) as f:
for line in f:
elm, seq, count, freq = line.strip().split('\t')
if elm == 'LIG_MAPK_1':
freqs[elm + ':' + seq + ':' + str(len(seq))] = float(count)
len_counts[str(len(seq))] += float(count)
for elm_seq_len in freqs:
alen = elm_seq_len.split(':')[2]
freqs[elm_seq_len] /= len_counts[alen]
return freqs
def get_freqs(afile, seq_percents):
"""Look at flu seq % coverage"""
seen = {}
with open(afile) as f:
for line in f:
protein, elmseq, freq = line.strip().split('\t')
key = ':'.join([protein, elmseq.split(':')[1],
str(len( elmseq.split(':')[1]))])
if key not in seen:
seq_percents[key].append(float(freq))
seen[key] = True
def get_annotations(afile):
"""Get ELM seqs that are conserved"""
d = defaultdict(dict)
with open(afile) as f:
for line in f:
protein, elm = line.strip().split('\t')
if protein == 'polymerase PA':
if 'LIG_MAPK_1' in elm:
seq = elm.split(':')[1] + ':' + str(len(elm.split(':')[1]))
d[protein][seq] = True
return d
def get_uniq(uniq, ls1, ls2):
"""Of the conserved ELM seqs, which ones are unique?"""
for protein in ls1:
for elm in ls1[protein]:
if elm not in ls2[protein]:
uniq[protein + ':' + elm] = True
human_elmseqs_file = 'working/Jul7/mammal_elms'
bird_elmseqs_file = 'working/Jul7/bird_elms'
human_elmseqs = get_annotations(human_elmseqs_file)
bird_elmseqs = get_annotations(bird_elmseqs_file)
uniq_human = {}
uniq_bird = {}
get_uniq(uniq_human, human_elmseqs, bird_elmseqs)
get_uniq(uniq_bird, bird_elmseqs, human_elmseqs)
print len(uniq_human), len(uniq_bird)
dir = 'working/Jul7'
years = range(2000,2011,1)
# bird
birds = ('chicken','duck')
strains = ('H9N2', 'H5N1')
seq_percents_bird = defaultdict(list)
total_bird = 0
for bird in birds:
for strain in strains:
for year in years:
file = os.path.join(dir, '.'.join((bird, strain, str(year),
'elms.conservation')))
if os.path.exists(file):
total_bird += 1
get_freqs(file, seq_percents_bird)
# human
hosts = ('human',)
strains = ('H5N1', 'H1N1', 'H3N2', 'H3N8')
seq_percents_human = defaultdict(list)
total_human = 0
for host in hosts:
for strain in strains:
for year in years:
file = os.path.join(dir, '.'.join((host, strain, str(year),
'elms.conservation')))
if os.path.exists(file):
total_human += 1
get_freqs(file, seq_percents_human)
human_host_file = 'working/Jul7/elmdict_H_sapiens.RWinit'
chicken_host_file = 'working/Jul7/elmdict_Gallus_gallus.RWinit'
human_host_freqs = get_host_freqs(human_host_file)
chicken_host_freqs = get_host_freqs(chicken_host_file)
print 'BIRD'
sum = float(0)
for elmseq in uniq_bird:
seq = elmseq.split(':')[1] + ':' + elmseq.split(':')[2]
elmseq = 'LIG_MAPK_1' + ':' + seq
diff = chicken_host_freqs[elmseq]-human_host_freqs[elmseq]
print elmseq, diff
if diff > float(0):
sum += 1
print sum
print 'MAMMAL'
s = float(0)
for elmseq in uniq_human:
seq = elmseq.split(':')[1] + ':' + elmseq.split(':')[2]
elmseq = 'LIG_MAPK_1' + ':' + seq
diff = human_host_freqs[elmseq]-chicken_host_freqs[elmseq]
print elmseq, diff
if diff > float(0):
s += 1
print s