##########################################################################################
# RDFizer.py
# Purpose: convert SemRep predications into RDF triples
# version 1.0.0
# date: 12.19.2017
# Python 2.7.13
# ###################################
# Licensing: The SemRep (Database Version 3.0_A; SemRep Version 1.7; Date 12/31/2016) is the original source of the
# predications. Neither the United States Government, the U.S. Department of Health and Human Services, the National
# Institutes of Health, the National Library of Medicine, the Lister Hill National Center for Biomedical Communications,
# nor any of their agencies, contractors, subcontractors, or employees make any warranties, expressed or implied, with
# respect to the SKR resources, and furthermore, they assume no liability for any party's use, or the results of such
# use, of any part of these tools. For the SemRepRDF-Open version, minor modifications to the original SemRep
# predications include the mapping of UMLS CUIs to open resource concept identifiers.
##########################################################################################
## NEEDS:
# 1. Docker images for the triples and a Docker image that can download a SemMedDB instance
# 2. automatic build
# 3. add GitHub topics to repo/create a wiki page for it
# 4. write tests
# 5. think through combining with PubAnnotation - and/or the way that we will map the UMLS concepts to other
#    terminologies
# 6. add wording to the top of the output file to be in compliance with the terms-of-use agreement.
# import needed libraries
import argparse
import base64
from functools import partial
import hashlib
import multiprocessing
from mysql.connector import errorcode
import MySQLdb
import os
import sys

from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import DCTERMS, RDF, RDFS

def DBConnect(host, username, password, db):
    """
    Function takes four strings that contain the information needed to connect to a MySQL database containing the
    SemMedDB database instance. The function returns a connection object that can be used to create cursors for
    querying the database (the connection must be closed once querying is complete).
    :param host: string representing the database host name
    :param username: string representing the database username
    :param password: string representing the database password
    :param db: string representing the database name
    :return: a MySQLdb connection object for the database
    """

    try:
        cnx = MySQLdb.connect(host=host,
                              user=username,
                              passwd=password,
                              db=db)

    # verify that connection to database is valid (MySQLdb errors carry the numeric error code in args[0])
    except MySQLdb.Error as err:
        if err.args[0] == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif err.args[0] == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(err)

    else:
        return cnx
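

# A minimal usage sketch for DBConnect (the host name and credentials are hypothetical placeholders):
#   cnx = DBConnect('localhost', 'semmed_user', 'semmed_pass', 'SemMedDB')
#   cursor = cnx.cursor()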


def QueryRunner(db_info, query):
    """
    Function takes a list storing the information needed to access a MySQL database and a string storing an SQL query
    as input. The query is then executed against the database via a cursor. Prior to returning results, the function
    verifies that the query returned data, otherwise an exception is raised. The function returns a list of tuples
    where each tuple contains one row of output.
    :param db_info (list): list of information needed to connect to a database
    :param query (str): string storing an SQL query
    :return: list of tuples where each tuple contains one row of output
    """

    # execute query against a MySQL DB
    db_cnx = DBConnect(db_info[0], db_info[1], db_info[2], db_info[3])
    cursor = db_cnx.cursor()
    cursor.execute(query)
    results = cursor.fetchall()

    # close the cursor and connection once the results have been fetched
    cursor.close()
    db_cnx.close()

    # verify that query returned results
    if len(results) > 0:
        return results
    else:
        raise Exception("Query returned no results")
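

# A minimal usage sketch for QueryRunner (the db_info values are hypothetical placeholders):
#   db_info = ['localhost', 'semmed_user', 'semmed_pass', 'SemMedDB']
#   rows = QueryRunner(db_info, "select PMID from SemMedDB.CITATIONS")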


def SentenceSpan(db_info, sentence_id):
    """
    Function takes a list storing the information needed to access a MySQL database and a string storing a sentence
    identifier. The function uses this information to run a query against the database. The query uses the identifier
    to retrieve the overall start and end indices spanned by the given sentence's predications. Prior to returning
    results, the function verifies that the returned start and stop indices for the sentence make sense (i.e., that
    the stop index is equal to or larger than the start index), otherwise an exception is raised. The function
    returns the start and stop indices as strings.
    :param db_info (list): list of information needed to connect to a database
    :param sentence_id (str): sentence identifier
    :return: the start and stop indices for the input sentence identifier
    """

    sent_span = QueryRunner(db_info,
                            ("select p.SENTENCE_ID, "
                             " MIN(LEAST(pa.PREDICATE_START_INDEX, pa.SUBJECT_START_INDEX, pa.OBJECT_START_INDEX)),"
                             " MAX(GREATEST(pa.PREDICATE_END_INDEX, pa.SUBJECT_END_INDEX, pa.OBJECT_END_INDEX))"
                             " from SemMedDB.PREDICATION p "
                             " inner join SemMedDB.PREDICATION_AUX pa on pa.PREDICATION_ID = p.PREDICATION_ID"
                             " where p.SENTENCE_ID = " + "'" + str(sentence_id) + "'" +
                             " GROUP BY p.SENTENCE_ID"))

    # verify that the returned span is valid (end index >= start index)
    if int(sent_span[0][2]) >= int(sent_span[0][1]):
        return str(sent_span[0][1]), str(sent_span[0][2])
    else:
        raise Exception("Sentence start and end indices are incorrect")
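

# A minimal usage sketch for SentenceSpan (the sentence identifier is a hypothetical placeholder):
#   start, end = SentenceSpan(db_info, '12345')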


def TripleMaker(db_info, pmid):
    """
    Function takes a list storing the information needed to access a MySQL database and a string storing a PubMed
    identifier. For the given identifier, the function retrieves all predications from the database, converts them
    to RDF triples, and serializes the resulting annotation graph to an RDF/XML file. Note: db_info is the first
    argument so that it can be bound with functools.partial when mapping over pmids in main().
    :param db_info (list): list of information needed to connect to a database
    :param pmid (str): PubMed identifier
    :return: None (the annotation graph is written to RDF_output/)
    """

    ## GET DATA FOR TRIPLES ##
    # run query to get results needed for generating triples
    res = [map(str, x) for x in QueryRunner(db_info,
                                            ("select *"
                                             " from SemMedDB.PREDICATION p"
                                             " inner join SemMedDB.SENTENCE s on s.SENTENCE_ID = p.SENTENCE_ID"
                                             " inner join SemMedDB.CITATIONS c on c.PMID = p.PMID"
                                             " inner join SemMedDB.PREDICATION_AUX pa on pa.PREDICATION_ID = p.PREDICATION_ID"
                                             " where p.PMID = " + "'" + str(pmid) + "'" +
                                             " order by p.PREDICATION_ID"))]

    # get database version information
    vers = [map(str, x) for x in QueryRunner(db_info, "select * from SemMedDB.METAINFO")]
    # use the URL-safe base64 alphabet so the hash can be embedded in a URI (standard base64 may contain '/')
    version = base64.urlsafe_b64encode(hashlib.sha1("semrep" + str(vers[0][1])).digest())

    ## CREATE TRIPLES ##
    # add namespaces
    nlm = Namespace("https://skr3.nlm.nih.gov/")
    oa = Namespace("http://www.w3.org/ns/oa#")
    oa_ext = Namespace("http://www.weneedaurl.now/")
    obo = Namespace("http://purl.obolibrary.org/obo/")
    prov = Namespace("http://www.w3.org/ns/prov#")
    sep = Namespace("http://purl.obolibrary.org/obo/sep/")
    swo = Namespace("http://purl.obolibrary.org/obo/swo/")
    tao = Namespace("http://pubannotation.org/ontology/tao.owl#")
    umls = Namespace("http://uts-ws.nlm.nih.gov/rest/content/current/CUI/")

    # add identifiers
    prj = "http://pubannotation.org/projects/semrep/"
    doc = "http://pubannotation.org/docs/sourcedb/PMC/sourceid/" + str(pmid) + "/divs/0"
    sent = "http://pubannotation.org/projects/sentences/PMC-" + str(pmid) + "-0-sentence_"
    span = "http://pubannotation.org/docs/sourcedb/PMC/sourceid/" + str(pmid) + "/divs/0/spans/"
    concept = "http://pubannotation.org/projects/sentences/PMC-" + str(pmid) + "-0-T_"
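    # e.g., for a pmid of '12345' (a hypothetical value) the document identifier resolves to:
    #   http://pubannotation.org/docs/sourcedb/PMC/sourceid/12345/divs/0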

    # create graph
    g = Graph()
    annot_graph = base64.urlsafe_b64encode(hashlib.sha1(str(pmid)).digest())

    # document provenance
    g.add((URIRef(str(prj) + str(annot_graph)), URIRef(str(oa) + 'has_source'), URIRef(str(doc))))
    g.add((URIRef(str(doc)), DCTERMS.published, Literal(str(res[0][21]))))
    g.add((URIRef(str(doc)), RDF.type, URIRef(str(obo) + "IAO_0000310")))
    g.add((URIRef(str(doc)), DCTERMS.identifier, Literal(str(pmid))))

    # annotation description triples (prov:atTime takes a literal timestamp, not a URI)
    g.add((URIRef(str(prj) + str(annot_graph)), URIRef(str(prov) + 'atTime'), Literal(str(vers[0][2]))))
    g.add((URIRef(str(prj) + str(annot_graph)), URIRef(str(prov) + 'wasGeneratedBy'), URIRef(str(nlm) + str(version))))
    g.add((URIRef(str(prj) + str(annot_graph)), RDF.type, URIRef(str(prov) + 'Activity')))
    g.add((URIRef(str(prj) + str(annot_graph)), RDF.type, URIRef(str(oa_ext) + 'OA_Concept_Annotation')))
    g.add((URIRef(str(nlm) + str(version)), URIRef(str(sep) + 'SEP_00065'), Literal(str(vers[0][1]))))
    g.add((URIRef(str(nlm) + str(version)), RDF.type, URIRef(str(swo) + 'SWO_0000001')))
    g.add((URIRef(str(nlm) + str(version)), RDFS.label, Literal("SemRep")))
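
    # NOTE: the annot[...] indices below assume the column order of the joined PREDICATION/SENTENCE/CITATIONS/
    # PREDICATION_AUX row in this SemMedDB version: annot[3] = predicate, annot[4]/annot[5] = subject CUI(s)/name(s),
    # annot[8]/annot[9] = object CUI(s)/name(s), annot[14]/annot[16] = sentence source field/text, and
    # annot[27]-annot[37] = the subject/predicate/object start and end indices from PREDICATION_AUX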
    # loop over predications and add annotations to graph
    for annot in res:
        # get sentence span information
        sent_span = SentenceSpan(db_info, annot[1])

        # resource description
        sents = str(sent) + str(annot[1])
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'belongs_to'), URIRef(str(doc))))
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'begins_at'), Literal(str(sent_span[0]))))
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'ends_at'), Literal(str(sent_span[1]))))
        g.add((URIRef(str(sents)), DCTERMS.source, Literal(str(annot[14]))))
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'has_text'), Literal(str(annot[16]))))
        g.add((URIRef(str(sents)), RDF.type, URIRef(str(tao) + 'Text_span')))

        # subject
        span1 = str(span) + str(annot[27]) + "-" + str(annot[28])
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'contains'), URIRef(str(span1))))
        g.add((URIRef(str(span1)), URIRef(str(tao) + 'begins_at'), Literal(str(annot[27]))))
        g.add((URIRef(str(span1)), URIRef(str(tao) + 'ends_at'), Literal(str(annot[28]))))

        if '|' in annot[4]:
            # pipe-delimited subject CUIs; assumes the names in annot[5] are pipe-delimited in the same order
            for con, name in zip(annot[4].split('|'), annot[5].split('|')):
                concept1 = str(concept) + str(con)
                g.add((URIRef(str(span1)), URIRef(str(tao) + 'denotes'), URIRef(str(concept1))))
                g.add((URIRef(str(concept1)), RDF.type, URIRef(str(tao) + 'Concept_entity')))
                g.add((URIRef(str(concept1)), RDF.type, URIRef(str(RDF) + 'Subject')))
                g.add((URIRef(str(RDF) + 'Subject'), DCTERMS.identifier, URIRef(str(umls) + str(con))))
                g.add((URIRef(str(RDF) + 'Subject'), RDFS.label, Literal(str(name))))
        else:
            concept1 = str(concept) + str(annot[4])
            g.add((URIRef(str(span1)), URIRef(str(tao) + 'denotes'), URIRef(str(concept1))))
            g.add((URIRef(str(concept1)), RDF.type, URIRef(str(tao) + 'Concept_entity')))
            g.add((URIRef(str(concept1)), RDF.type, URIRef(str(RDF) + 'Subject')))
            g.add((URIRef(str(RDF) + 'Subject'), DCTERMS.identifier, URIRef(str(umls) + str(annot[4]))))
            g.add((URIRef(str(RDF) + 'Subject'), RDFS.label, Literal(str(annot[5]))))

        # predicate
        span2 = str(span) + str(annot[31]) + "-" + str(annot[32])
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'contains'), URIRef(str(span2))))
        g.add((URIRef(str(span2)), URIRef(str(tao) + 'begins_at'), Literal(str(annot[31]))))
        g.add((URIRef(str(span2)), URIRef(str(tao) + 'ends_at'), Literal(str(annot[32]))))

        if '|' in annot[3]:
            for con in annot[3].split('|'):
                concept2 = str(concept) + str(con)
                g.add((URIRef(str(span2)), URIRef(str(tao) + 'denotes'), URIRef(str(concept2))))
                g.add((URIRef(str(concept2)), RDF.type, URIRef(str(tao) + 'Concept_entity')))
                g.add((URIRef(str(concept2)), RDF.type, URIRef(str(RDF) + 'Predicate')))
                g.add((URIRef(str(RDF) + 'Predicate'), DCTERMS.identifier, URIRef(str(umls) + str(con))))
                g.add((URIRef(str(RDF) + 'Predicate'), RDFS.label, Literal(str(con))))
        else:
            concept2 = str(concept) + str(annot[3])
            g.add((URIRef(str(span2)), URIRef(str(tao) + 'denotes'), URIRef(str(concept2))))
            g.add((URIRef(str(concept2)), RDF.type, URIRef(str(tao) + 'Concept_entity')))
            g.add((URIRef(str(concept2)), RDF.type, URIRef(str(RDF) + 'Predicate')))
            g.add((URIRef(str(RDF) + 'Predicate'), DCTERMS.identifier, URIRef(str(umls) + str(annot[3]))))
            g.add((URIRef(str(RDF) + 'Predicate'), RDFS.label, Literal(str(annot[3]))))

        # object
        span3 = str(span) + str(annot[36]) + "-" + str(annot[37])
        g.add((URIRef(str(sents)), URIRef(str(tao) + 'contains'), URIRef(str(span3))))
        g.add((URIRef(str(span3)), URIRef(str(tao) + 'begins_at'), Literal(str(annot[36]))))
        g.add((URIRef(str(span3)), URIRef(str(tao) + 'ends_at'), Literal(str(annot[37]))))

        if '|' in annot[8]:
            # pipe-delimited object CUIs; assumes the names in annot[9] are pipe-delimited in the same order
            for con, name in zip(annot[8].split('|'), annot[9].split('|')):
                concept3 = str(concept) + str(con)
                g.add((URIRef(str(span3)), URIRef(str(tao) + 'denotes'), URIRef(str(concept3))))
                g.add((URIRef(str(concept3)), RDF.type, URIRef(str(tao) + 'Concept_entity')))
                g.add((URIRef(str(concept3)), RDF.type, URIRef(str(RDF) + 'Object')))
                g.add((URIRef(str(RDF) + 'Object'), DCTERMS.identifier, URIRef(str(umls) + str(con))))
                g.add((URIRef(str(RDF) + 'Object'), RDFS.label, Literal(str(name))))
        else:
            concept3 = str(concept) + str(annot[8])
            g.add((URIRef(str(span3)), URIRef(str(tao) + 'denotes'), URIRef(str(concept3))))
            g.add((URIRef(str(concept3)), RDF.type, URIRef(str(tao) + 'Concept_entity')))
            g.add((URIRef(str(concept3)), RDF.type, URIRef(str(RDF) + 'Object')))
            g.add((URIRef(str(RDF) + 'Object'), DCTERMS.identifier, URIRef(str(umls) + str(annot[8]))))
            g.add((URIRef(str(RDF) + 'Object'), RDFS.label, Literal(str(annot[9]))))

        # graph-level triples
        g.add((URIRef(str(span1)), URIRef(str(tao) + 'follows'), URIRef(str(span2))))
        g.add((URIRef(str(span2)), URIRef(str(tao) + 'follows'), URIRef(str(span3))))

    # serialize annotation graph for pmid, creating the output directory first if needed
    try:
        os.makedirs("RDF_output")
    except OSError:
        pass  # the directory already exists (it may be created concurrently by another worker)
    out = "RDF_output/semrep_" + str(pmid)
    g.serialize(destination=str(out) + ".xml", format='xml')
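

# A minimal usage sketch for TripleMaker (the pmid value is a hypothetical placeholder):
#   TripleMaker(db_info, '12345')  # writes RDF_output/semrep_12345.xml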


def main():
    parser = argparse.ArgumentParser(description='SemRepRDF: converts SemRep predications stored in a SemMedDB '
                                                 'instance into RDF triples')
    # '-H' is used for the host flag because argparse reserves '-h' for help
    parser.add_argument('-H', '--host', help='MySQL host name', required=True)
    parser.add_argument('-u', '--user', help='MySQL username', required=True)
    parser.add_argument('-p', '--passwd', help='MySQL password', required=True)
    parser.add_argument('-d', '--db', help='MySQL database name', required=True)
    args = parser.parse_args()

    # set default encoding to utf8
    reload(sys)
    sys.setdefaultencoding('utf8')  # needed to parse article titles

    # gather database connection information from the command-line arguments
    db_info = [args.host, args.user, args.passwd, args.db]

    # retrieve all pubmed ids to facilitate parallel processing
    pmids = [str(row[0]) for row in QueryRunner(db_info, "select PMID from SemMedDB.CITATIONS")]

    # configure multiprocessing settings
    pool = multiprocessing.Pool()  # set up pool
    func = partial(TripleMaker, db_info)  # bind db_info so each worker receives only a pmid
    pool.map(func, pmids)

    # close and join pool
    pool.close()
    pool.join()


if __name__ == '__main__':
    main()
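
# Example invocation (the host name and credentials are hypothetical placeholders):
#   python RDFizer.py -H localhost -u semmed_user -p semmed_pass -d SemMedDB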