#!/usr/bin/env python
"""Classify a set of proteins from an InterPro query based on descriptions.
This uses a download of InterPro IDs and groups them based on metadata retrieved
through Zemanta analysis of the functional descriptions. This will be useful for
well-characterized organisms that contain hand-written descriptions.
Usage:
interpro_query_classify.py <input tab file> <API key>
"""
from __future__ import with_statement
import sys
import os
import urllib
import urllib2
import xml.etree.ElementTree as ET
import time
import shelve
import operator
import collections

import simplejson
import numpy
from Bio import Cluster
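# Third-party dependencies: simplejson for JSON parsing, numpy for the term
# matrix, and Biopython's Bio.Cluster (a wrapper around the C Clustering
# Library) for k-means clustering.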

def main(target_id, in_file, api_key):
    cache_dir = os.path.join(os.getcwd(), "cache")
    uniprot_retriever = UniprotRestRetrieval(cache_dir)
    cur_db = shelve.open("%s.db" % os.path.splitext(in_file)[0])
    # load the database, fetching terms for any IDs not yet stored
    with open(in_file) as in_handle:
        in_handle.readline() # header
        for line in in_handle:
            uniprot_id = line.split()[0].strip()
            if uniprot_id not in cur_db:
                cur_terms = get_description_terms(uniprot_retriever,
                        uniprot_id, api_key)
                if len(cur_terms) > 0:
                    cur_db[uniprot_id] = cur_terms
    # cluster and print out cluster details
    term_matrix, uniprot_ids = organize_term_array(cur_db)
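    # k-means via Bio.Cluster: method='a' uses the arithmetic mean for
    # cluster centers and dist='e' the Euclidean distance; npass=20 runs the
    # algorithm from 20 random initializations and keeps the best solution
    # (nclusters=10 is an arbitrary choice for this analysis).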
    cluster_ids, error, nfound = Cluster.kcluster(term_matrix,
            nclusters=10, npass=20, method='a', dist='e')
    cluster_dict = collections.defaultdict(list)
    for i, cluster_id in enumerate(cluster_ids):
        cluster_dict[cluster_id].append(uniprot_ids[i])
    for cluster_group in cluster_dict.values():
        if target_id in cluster_group:
            for item in cluster_group:
                print item, cur_db[item]
    cur_db.close()

def organize_term_array(cur_db):
    """Organize a set of terms into a binary matrix for classification.

    The rows in the final matrix are the database ids, while the columns are
    terms. Each value is 1 if the term is relevant to that ID and 0 otherwise.
    """
    # flatten all terms and get a unique set
    all_terms = reduce(operator.add, cur_db.values())
    # term frequencies; computed but not currently used below
    term_counts = collections.defaultdict(int)
    for term in all_terms:
        term_counts[term] += 1
    all_terms = list(set(all_terms))
    term_matrix = []
    all_ids = []
    for uniprot_id, cur_terms in cur_db.items():
        cur_row = [(1 if t in cur_terms else 0) for t in all_terms]
        term_matrix.append(cur_row)
        all_ids.append(uniprot_id)
    return numpy.array(term_matrix), all_ids

def get_description_terms(retriever, cur_id, api_key):
    metadata = retriever.get_xml_metadata(cur_id)
    if "function_descr" in metadata:
        keywords = zemanta_link_kws(metadata["function_descr"], api_key)
        if len(keywords) > 0:
            return keywords
    return []

def zemanta_link_kws(search_text, api_key):
    """Query Zemanta for keywords linked out to Wikipedia or Freebase.
    """
    gateway = 'http://api.zemanta.com/services/rest/0.0/'
    args = {'method': 'zemanta.suggest',
            'api_key': api_key,
            'text': search_text,
            'return_categories': 'dmoz',
            'return_images': 0,
            'return_rdf_links': 1,
            'format': 'json'}
    args_enc = urllib.urlencode(args)
    raw_output = urllib2.urlopen(gateway, args_enc).read()
    output = simplejson.loads(raw_output)
    link_kws = []
    for link in output['markup']['links']:
        for target in link['target']:
            if target['type'] in ['wikipedia', 'rdf']:
                link_kws.append(target['title'])
    return list(set(link_kws))

class _BaseCachingRetrieval:
    """Provide a base class for web retrieval with local file caching.
    """
    def __init__(self, cache_dir):
        self._cache_dir = cache_dir
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        # cache 404 errors so we don't call the page multiple times
        self._not_found_file = os.path.join(self._cache_dir,
                '404_not_found.txt')
        self._not_found = []
        if os.path.exists(self._not_found_file):
            with open(self._not_found_file) as in_handle:
                self._not_found = in_handle.read().split()

    def _get_open_handle(self, full_url):
        if full_url in self._not_found:
            return None
        url_parts = [p for p in full_url.split("/") if p]
        cache_file = os.path.join(self._cache_dir, "_".join(url_parts[1:]))
        if not os.path.exists(cache_file):
            in_handle = self._safe_open(full_url)
            if in_handle is None:
                return None
            with open(cache_file, 'w') as out_handle:
                out_handle.write(in_handle.read())
            in_handle.close()
        return open(cache_file, 'r')

    def _safe_open(self, url):
        while True:
            try:
                return urllib2.urlopen(url)
            except urllib2.URLError, msg:
                if str(msg).find("404: Not Found") >= 0:
                    self._add_not_found(url)
                    return None
                print msg
                time.sleep(5)

    def _add_not_found(self, url):
        with open(self._not_found_file, 'a') as out_handle:
            out_handle.write("%s\n" % url)
        self._not_found.append(url)

class UniprotRestRetrieval(_BaseCachingRetrieval):
    """Retrieve XML and RDF data from UniProt for proteins of interest.
    """
    def __init__(self, cache_dir):
        _BaseCachingRetrieval.__init__(self, cache_dir)
        self._server = "http://www.uniprot.org"
        self._xml_ns = "{http://uniprot.org/uniprot}"

    def get_xml_metadata(self, uniprot_id):
        """Retrieve data from the UniProt XML for a record.

        XXX This retrieves only a subset of metadata right now. Needs to
        be complete.
        """
        url_base = "%s/uniprot/%s.xml"
        full_url = url_base % (self._server, uniprot_id)
        in_handle = self._get_open_handle(full_url)
        if in_handle is None: # previously recorded as 404 Not Found
            return {}
        # check for empty files -- which have been deleted
        with in_handle:
            if in_handle.readline() == "":
                return {}
        metadata = {}
        with self._get_open_handle(full_url) as in_handle:
            root = ET.parse(in_handle).getroot()
        metadata = self._get_org_metadata(root, metadata)
        metadata = self._get_interpro_metadata(root, metadata)
        metadata = self._get_function_metadata(root, metadata)
        return metadata

    def _get_org_metadata(self, root, metadata):
        """Retrieve the organism information from UniProt XML.
        """
        org = root.find("%sentry/%sorganism" % (self._xml_ns, self._xml_ns))
        for org_node in org:
            if org_node.tag == "%sname" % self._xml_ns:
                if org_node.attrib["type"] == "scientific":
                    metadata["org_scientific_name"] = org_node.text
                elif org_node.attrib["type"] == "common":
                    metadata["org_common_name"] = org_node.text
            elif org_node.tag == "%slineage" % self._xml_ns:
                metadata["org_lineage"] = [n.text for n in org_node]
        return metadata

    def _get_interpro_metadata(self, root, metadata):
        """Retrieve InterPro domains present in the protein.
        """
        db_refs = root.findall("%sentry/%sdbReference" % (self._xml_ns,
            self._xml_ns))
        all_refs = []
        for db_ref in db_refs:
            if db_ref.attrib["type"] == "InterPro":
                all_refs.append("%s:%s" % (db_ref.attrib["type"],
                    db_ref.attrib["id"]))
        if len(all_refs) > 0:
            metadata["db_refs"] = all_refs
        return metadata

    def _get_function_metadata(self, root, metadata):
        """Retrieve the functional description comment from UniProt XML.
        """
        comments = root.findall("%sentry/%scomment" % (self._xml_ns,
            self._xml_ns))
        for comment in comments:
            if comment.attrib["type"] == "function":
                for comment_node in comment:
                    if comment_node.tag == "%stext" % self._xml_ns:
                        metadata["function_descr"] = comment_node.text
        return metadata

    def get_rdf_metadata(self, uniprot_id):
        """Retrieve RDF metadata for the given UniProt accession.

        XXX Not finished. XML parsing looks to be more straightforward.
        """
        from rdflib import ConjunctiveGraph as Graph
        url_base = "%s/uniprot/%s.rdf"
        full_url = url_base % (self._server, uniprot_id)
        graph = Graph()
        with self._get_open_handle(full_url) as in_handle:
            graph.parse(in_handle)
        main_subject = [s for s in list(set(graph.subjects())) if
                s.split('/')[-1] == uniprot_id][0]
        for sub, pred, obj in graph:
            print sub, pred, obj
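
# A minimal usage sketch for the retriever on its own (hypothetical
# accession; fetches from UniProt on the first call, then reads the
# local cache on subsequent calls):
#   retriever = UniprotRestRetrieval("cache")
#   metadata = retriever.get_xml_metadata("P12345")
#   print metadata.get("org_scientific_name")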

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Incorrect arguments"
        print __doc__
        sys.exit(1)
    main(sys.argv[1], sys.argv[2], sys.argv[3])