-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarize_annotations.py
125 lines (112 loc) · 3.53 KB
/
summarize_annotations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
import csv
import gzip
import sys
from collections import Counter
sample = "$sample"
hits_file = "$hits"
results_file = sample + "_summary.txt"
gzopen = lambda f: gzip.open(f, "rt") if f.endswith(".gz") else open(f)
levels = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"]
required = ["prokka_gene", "prokka_EC_number", "prokka_product", "swissprot_gene", "swissprot_EC_number", "swissprot_product"]
summaries = Counter()
def standard_database(observed, expected):
for e in expected:
if e not in observed:
return False
return True
with gzopen(hits_file) as in_fh, open(results_file, "w") as out_fh:
reader = csv.DictReader(in_fh, delimiter="\\t")
if not standard_database(reader.fieldnames, required):
print("custom databases without the expected columns are not supported in the summary function", file=out_fh)
sys.exit(0)
for i, row in enumerate(reader):
if row["status"] == "U":
continue
for assignment, level in zip(row["taxonomic_lineage"].split(";"), levels):
if assignment.strip() == "NA":
continue
summaries.update([level])
if (
row["prokka_gene"]
or row["prokka_EC_number"]
or row["prokka_product"]
or row["swissprot_gene"]
or row["swissprot_EC_number"]
or row["swissprot_product"]
):
summaries.update(["function"])
hypothetical = False
for assignment in [
row["prokka_gene"],
row["prokka_EC_number"],
row["prokka_product"],
row["swissprot_gene"],
row["swissprot_EC_number"],
row["swissprot_product"],
]:
if "hypothetical" in assignment:
hypothetical = True
if not hypothetical:
summaries.update(["nonhypothetical"])
if row["prokka_EC_number"] or row["swissprot_EC_number"]:
summaries.update(["ec"])
print("Sequences: ", i, file=out_fh)
i = i / 100.0
print("Taxonomy assignments", file=out_fh)
print(
" Superkingdom:",
summaries["superkingdom"],
"(%.2f%%)" % (summaries["superkingdom"] / i),
file=out_fh,
)
print(
" Phylum:",
summaries["phylum"],
"(%.2f%%)" % (summaries["phylum"] / i),
file=out_fh,
)
print(
" Class:",
summaries["class"],
"(%.2f%%)" % (summaries["class"] / i),
file=out_fh,
)
print(
" Order:",
summaries["order"],
"(%.2f%%)" % (summaries["order"] / i),
file=out_fh,
)
print(
" Family:",
summaries["family"],
"(%.2f%%)" % (summaries["family"] / i),
file=out_fh,
)
print(
" Genus:",
summaries["genus"],
"(%.2f%%)" % (summaries["genus"] / i),
file=out_fh,
)
print(
" Species:",
summaries["species"],
"(%.2f%%)" % (summaries["species"] / i),
file=out_fh,
)
print("Functional assignments", file=out_fh)
print(
" Function:",
summaries["function"],
"(%.2f%%)" % (summaries["function"] / i),
file=out_fh,
)
print(
" Non-hypothetical:",
summaries["nonhypothetical"],
"(%.2f%%)" % (summaries["nonhypothetical"] / i),
file=out_fh,
)
print(" EC:", summaries["ec"], "(%.2f%%)" % (summaries["ec"] / i), file=out_fh)