/
js_elmDist_host_flu.py
114 lines (98 loc) · 3.68 KB
/
js_elmDist_host_flu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""Count ELMs (not ELM sequences) on hosts
and use Jensen-Shannon
divergence between human and chicken.
This is not finished."""
import itertools, utils, global_settings, os, random, utils_graph
from collections import defaultdict
def get_flu_counts(flus, proteins):
""" this is not finished"""
flu_elmCounts = {}
i=1/0
def get_host_counts(ls_of_hosts):
"""Count total ELM hits for each
host in ls_of_hosts.
Return counts & ELMs found."""
host_elmCounts = {}
seen_elms = defaultdict(dict)
for host in ls_of_hosts:
host_elmCounts[host] = defaultdict(utils.init_zero)
with open('results/roundup_all/elmdict_' + host + '.init') as f:
for line in f:
(elm, seq, count, fq) = line.strip().split('\t')
host_elmCounts[host][elm] += int(count)
seen_elms[elm][host] = True
use_elms = {}
for elm in seen_elms:
if len(seen_elms[elm]) == len(ls_of_hosts):
use_elms[elm] = True
print len(use_elms)
return (host_elmCounts, use_elms)
def mk_vec(counts, all_elmSeqs):
"""mk long vector of ELM:seq counts for this host's counts"""
vec = []
for elmseq in all_elmSeqs:
if elmseq in counts:
vec.append(counts[elmseq])
else:
vec.append(float(0))
return vec
def mk_count_vecs(counts, all_elmSeqs):
"""mk long vector of ELM:seq counts for all hosts"""
vecs = {}
for host in counts:
vecs[host] = mk_vec(counts[host],
all_elmSeqs)
return vecs
def mk_count_dists(vecs):
"""change count vectors into distributions"""
dists = {}
for host in vecs:
dists[host] = utils.getDistFromCount(vecs[host])
return dists
hosts = ('H_sapiens', 'Gallus_gallus')
flus = ('human', 'chicken')
proteins = ('hemagglutinin', 'neuraminidase', 'nucleocapsid protein',
'matrix protein 1', 'nonstructural protein 1', 'matrix protein 2',
'nonstructural protein 2', 'polymerase PA', 'polymerase PB2',
'polymerase PB1', 'PB1-F2 protein')
flu_counts, flu_elms = get_flu_counts(flus, proteins)
host_elmCounts, elms = get_host_counts(hosts)
host_vecs = mk_count_vecs(host_elmCounts, elms)
host_dists = mk_count_dists(host_vecs)
tmp_input = 'tmp_data'
tmp_r = 'tmp_r' + str(random.randint(0,100))
tmp_labels = 'labels' + str(random.randint(0,100))
out_file = 'working/js_elm_host_phylogeny.png'
js_distances = defaultdict(dict)
for host1, host2 in itertools.combinations(host_dists, 2):
js_dis = utils.jensen_shannon_dists(host_dists[host1],
host_dists[host2])
js_distances[host1][host2] = js_dis
js_distances[host2][host1] = js_dis
with open(tmp_input, 'w') as f:
for host1 in host_dists:
line = ''
for host2 in host_dists:
if host1 == host2:
line += '0\t'
else:
line += str(js_distances[host1][host2]) + '\t'
f.write(line.strip('\t') + '\n')
with open(tmp_labels, 'w') as f:
f.write('\t'.join(host_dists) + '\n')
with open(tmp_r, 'w') as f:
f.write("source('funcs.R')\n")
f.write("library('MASS')\n")
f.write("d<-read.delim('"
+ tmp_input
+ "',header=FALSE,sep='\\t')\n")
f.write('dist.r<-as.dist(d)\n')
f.write("labels.d<-read.delim('"
+ tmp_labels
+ "',header=FALSE,sep='\\t')\n")
f.write('labels<-as.matrix(labels.d)\n')
f.write("h<-hclust(dist.r,method='average')\n")
f.write("png('" + out_file + "')\n")
f.write("plot(h,hang=-1,labels=labels[1,],main='Species Dendrogram')\n")
f.write('dev.off()\n')
os.system('R < ' + tmp_r + ' --no-save')