-
Notifications
You must be signed in to change notification settings - Fork 27
/
rdflite.rb
414 lines (341 loc) · 11.9 KB
/
rdflite.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
# RDFLite is a lightweight RDF database on top of sqlite3. It can act as an
# adapter in ActiveRDF. It supports on-disk and in-memory usage, and allows
# keyword search if ferret is installed.
#
# Author:: Eyal Oren
# Copyright:: (c) 2005-2006 Eyal Oren
# License:: LGPL
require 'sqlite3'
require 'active_rdf'
require 'federation/connection_pool'
$log.info "loading RDFLite adapter"

# Ferret is optional: remember whether it could be loaded so that the adapter
# can switch keyword search on or off later.
# NOTE(review): assigning a class variable at top level raises RuntimeError on
# Ruby 3.0 and later -- confirm which Ruby versions this file has to support.
@@have_ferret =
  begin
    require 'ferret'
    true
  rescue LoadError
    $log.info "Keyword search is disabled since we could not load Ferret. To\nenable, please do \"gem install ferret\""
    false
  end
class RDFLite < ActiveRdfAdapter
ConnectionPool.register_adapter(:rdflite,self)
bool_accessor :keyword_search
# Opens (or creates) the sqlite3 store and, when possible, the Ferret index.
#
# Recognised parameters:
# * :location => path of the database file (in-memory store when omitted)
# * :keyword => true/false, request keyword search (true when omitted); it
#   is only really enabled if Ferret could be loaded
# * :sidx, :pidx, :oidx, :spidx, :soidx, :poidx, :opidx => index switches,
#   passed on unchanged to create_indices
def initialize(params = {})
  $log.info "initialised rdflite with params #{params.to_s}"

  # without an explicit location the triples live in memory only
  location = params[:location]
  file = location || ':memory:'
  @db = SQLite3::Database.new(file)

  # keyword search is wanted unless explicitly switched off, and only
  # possible when ferret was found
  requested = params[:keyword]
  requested = true if requested.nil?
  @keyword_search = requested & @@have_ferret
  $log.debug "we #{@keyword_search ? "do" : "don't"} have keyword search"

  if @keyword_search
    # the index keeps the subject for retrieval and indexes (but does not
    # store) the object's contents
    infos = Ferret::Index::FieldInfos.new
    infos.add_field(:subject, :store => :yes, :index => :no, :term_vector => :no)
    infos.add_field(:object, :store => :no, :index => :omit_norms)

    # the index is kept next to the database file, or in memory
    @ferret =
      if location
        Ferret::I.new(:path => location + '.ferret', :field_infos => infos)
      else
        Ferret::I.new(:field_infos => infos)
      end
  end

  # turn off filesystem synchronisation for speed
  @db.synchronous = 'off'

  # create the triples table; triples are unique, so inserted duplicates are
  # ignored ("on conflict ignore")
  @db.execute('create table if not exists triple(s,p,o, unique(s,p,o) on conflict ignore)')
  create_indices(params)

  $log.debug("opened connection to #{file}")
  $log.debug("database contains #{size} triples")
end
# This adapter accepts writes ...
def writes?
  true
end

# ... and it can be queried as well.
def reads?
  true
end
# Counts the rows of the triple table and returns that number as an Integer.
def size
  rows = @db.execute('select count(*) from triple')
  rows.first[0].to_i
end
# Returns all triples in the datastore.
#
# The result is an Array of Strings, one per row of the triple table, each
# made of the stored subject, predicate and object joined by single spaces.
def dump
  # collect the rows ourselves: when execute is given a block it only yields
  # the rows and the values computed inside the block are thrown away
  @db.execute('select s,p,o from triple').map { |row| row.join(' ') }
end
# Removes every row from the triple table.
# Only the sqlite table is emptied here; @ferret is not touched.
def clear
  delete_all = 'delete from triple'
  @db.execute(delete_all)
end
# Adds the triple (s,p,o) to the datastore.
#
# s and p must respond to #uri, otherwise an ActiveRdfError is raised; o may
# be an RDFS::Resource or any other value, which is stored as a literal.
def add(s,p,o)
  # reject subjects and predicates that are not resources
  [s, p].each do |node|
    raise(ActiveRdfError, "adding non-resource #{node}") unless node.respond_to?(:uri)
  end

  # internal format: <uri> for resources, "text" for literals
  subject = "<#{s.uri}>"
  predicate = "<#{p.uri}>"
  object = RDFS::Resource === o ? "<#{o.uri}>" : "\"#{o.to_s}\""

  add_internal(subject, predicate, object)
end
# Flushes outstanding changes to the underlying sqlite3 database.
# Every change is written to sqlite3 right away, so there is never anything
# left to do; the method simply reports success.
def flush
  true
end
# Loads triples from a file in ntriples format.
#
# file:: path of the file to read
#
# All inserts happen inside a single database transaction. Blank lines and
# lines starting with '#' are skipped, and so is any line in which fewer than
# three nodes (<uri> or "literal") are found, so that no row with empty
# columns ends up in the triple table.
def load(file)
  lines = File.readlines(file)
  loaded = 0

  @db.transaction do
    lines.each do |line|
      stripped = line.strip
      next if stripped.empty? || stripped.start_with?('#')

      subject, predicate, object = line.scan(Node)
      if object.nil?
        $log.debug "skipping malformed ntriples line: #{stripped}"
        next
      end

      add_internal(subject, predicate, object)
      loaded += 1
    end
  end

  # report what was actually inserted, not the number of lines in the file
  $log.debug "read #{loaded} triples from file #{file}"
end
# Executes an ActiveRDF query on the datastore.
#
# The query is translated to SQL; the values collected in @right_hand_sides
# during translation are handed to sqlite as bind parameters (so that sqlite
# takes care of quoting). ASK queries return a one-element Array holding
# true or false, all other queries return the rows passed through wrap.
def query(query)
  $log.debug "received query: #{query.to_sp}"

  sql = translate(query)

  # translate has filled @right_hand_sides as a side effect
  bindings = @right_hand_sides.map(&:to_s)
  readable = sql.gsub('?','"%s"')
  $log.debug format("executing: #{readable}", *bindings)

  rows = @db.execute(sql, *bindings)

  # an ASK query is answered by a positive result count
  return [rows[0][0].to_i > 0] if query.ask?

  wrap(query, rows)
end
# Translates an ActiveRDF query into one sqlite query string by gluing
# together the select, join, where and limit parts (built in that order).
def translate(query)
  [
    construct_select(query),
    construct_join(query),
    construct_where(query),
    construct_limit(query)
  ].join
end
private
# Stores one triple in sqlite and, when keyword search is on, in ferret.
# s, p and o are expected in internal format: <uri> and "literal".
def add_internal(s,p,o)
  # the values are passed as bind parameters of the insert statement
  @db.execute('insert into triple values (?,?,?)', s,p,o)

  # nothing more to do without keyword search
  return unless @keyword_search

  @ferret << {:subject => s, :object => o}
end
# Builds the select part of the SQL statement.
#
# ASK queries only count results (the caller checks for a count > 0). For
# other queries each select clause becomes a column name such as t0.s or
# t1.p, optionally wrapped in distinct and/or count(...).
def construct_select(query)
  return "select count(*)" if query.ask?

  # the column names depend on where the term occurs in the where clauses
  columns = query.select_clauses.map { |term| variable_name(query, term) }.join(', ')

  projection = query.distinct? ? "distinct #{columns}" : columns
  projection = "count(#{projection})" if query.count?

  "select #{projection}"
end
# Builds the limit/offset part of the SQL statement.
# A missing limit becomes -1 (no limit), a missing offset becomes 0.
def construct_limit(query)
  limit = query.limits
  limit = -1 if limit.nil?

  offset = query.offsets
  offset = 0 if offset.nil?

  " limit #{limit} offset #{offset}"
end
# Constructs the from/join part of the SQL statement.
#
# Only query.where_clauses is read. Every variable (Symbol) that occurs more
# than once in the flattened where clauses produces join conditions between
# aliased copies of the triple table (t0, t1, ...). The result is a String
# starting with " from " and ending in a space; it is ' from triple as t0 '
# when there is exactly one where clause or when no variable is shared.
#
# NOTE(review): alias and column are derived from the flattened position
# (index / 3 and index % 3), which presumably assumes that every where clause
# has exactly three terms -- confirm against the query class.
#
# TODO: joins don't work this way, they have to be linear (in one direction
# only, and we should only alias tables we didn't alias yet)
# we should only look for one join clause in each where-clause: when we find
# one, we skip the rest of the variables in this clause.
def construct_join(query)
join_stmt = ''
# no join necessary if only one where clause given
return ' from triple as t0 ' if query.where_clauses.size == 1
where_clauses = query.where_clauses.flatten
# only variables (Symbols) can act as join terms; each one is handled once
considering = where_clauses.uniq.select{|w| w.is_a?(Symbol)}
# constructing hash with indices for all terms
# e.g. {?s => [1,3,5], ?p => [2], ... }
term_occurrences = Hash.new()
where_clauses.each_with_index do |term, index|
ary = (term_occurrences[term] ||= [])
ary << index
end
# NOTE(review): aliases is assigned here but never read in this method
aliases = {}
where_clauses.each_with_index do |term, index|
# if the term has been joined with its buddy already, we can skip it
next unless considering.include?(term)
# we find all occurrences of this term (including the current one)
indices = term_occurrences[term]
# if the term doesn't have a join-buddy, we can skip it
next if indices.size == 1
# construct t0,t1,... as aliases for term
# and construct join condition, e.g. t0.s
termalias = "t#{index / 3}"
termjoin = "#{termalias}.#{SPO[index % 3]}"
# only introduce the alias if the statement built so far does not contain it
# NOTE(review): this is a substring test, so "t1" is also found inside "t10"
join = if join_stmt.include?(termalias)
""
else
"triple as #{termalias}"
end
indices.each do |i|
# skip the current term itself
next if i==index
# construct t0,t1, etc. as aliases for buddy,
# and construct join condition, e.g. t0.s = t1.p
buddyalias = "t#{i/3}"
buddyjoin = "#{buddyalias}.#{SPO[i%3]}"
# TODO: fix reuse of same table names as aliases, e.g.
# "from triple as t1 join triple as t2 on ... join t1 on ..."
# is not allowed as such by sqlite
# but on the other hand, restating the aliases gives ambiguity:
# "from triple as t1 join triple as t2 on ... join triple as t1 ..."
# is ambiguous
join << " join triple as #{buddyalias} on #{termjoin} = #{buddyjoin} "
end
join_stmt << join
# remove term from 'todo' list of still-considered terms
considering.delete(term)
end
# without any shared variable we fall back to the single-table form
if join_stmt == ''
return " from triple as t0 "
else
return " from #{join_stmt} "
end
end
# Builds the where part of the SQL statement.
#
# Every non-variable term of a where clause becomes a condition of the form
# "tN.s = ?" (N = number of the clause, s/p/o = position of the term). The
# values belonging to the question marks are collected in @right_hand_sides,
# in the same order, so that the caller can pass them to sqlite as bind
# parameters and sqlite encodes quoted literals correctly.
#
# Keyword clauses are resolved through the ferret index: the subjects whose
# object matches the keyword are looked up and turned into an "in (?,?,...)"
# condition on the variable of that clause.
#
# Returns '' when there is no condition at all, "where ..." otherwise.
def construct_where(query)
  conditions = []
  @right_hand_sides = []

  query.where_clauses.each_with_index do |clause, level|
    clause.each_with_index do |term, position|
      # variables do not constrain anything by themselves
      next if term.is_a?(Symbol)

      conditions << "t#{level}.#{SPO[position]} = ?"
      @right_hand_sides << (RDFS::Resource === term ? "<#{term.uri}>" : term.to_s)
    end
  end

  if query.keyword?
    query.keywords.each do |subject, key|
      # start with a fresh list for every keyword clause, otherwise the
      # matches of earlier keywords leak into the conditions of later ones
      subjects = []
      @ferret.search_each("object:\"#{key}\"") do |idx, _score|
        subjects << @ferret[idx][:subject]
      end
      subjects.uniq! if query.distinct?

      placeholders = subjects.map { '?' }.join(',')
      conditions << "#{variable_name(query,subject)} in (#{placeholders})"
      @right_hand_sides.concat(subjects)
    end
  end

  conditions.empty? ? '' : "where " + conditions.join(' and ')
end
# Returns the SQL column (e.g. "t0.s" or "t1.p") that stands for a query term.
#
# The column is derived from the first occurrence of the term in the
# flattened where clauses. A term that only occurs in a keyword clause, as in
# select(:o).where(:o, :keyword, 'eyal'), maps to "t0.s". A term found in
# neither place raises an ActiveRdfError.
def variable_name(query,term)
  position = query.where_clauses.flatten.index(term)

  unless position.nil?
    # position / 3 selects the table alias, position % 3 the s/p/o column
    clause_no, slot = position.divmod(3)
    return "t#{clause_no}.#{SPO[slot]}"
  end

  # not in any where clause, but maybe in a keyword clause
  return "t0.s" if query.keywords.flatten.include?(term)

  raise ActiveRdfError,'unbound variable in select clause'
end
# Converts raw sqlite result rows into ActiveRDF values.
#
# query::   the query that produced the rows (not used for the conversion)
# results:: Array of rows, each row an Array of column values
#
# Values stored as <uri> become RDFS::Resource objects, values stored as
# "text" become Strings without the surrounding quotes, and anything else
# (e.g. the number delivered by a count query) is handed back unchanged.
# Returns a new Array of rows with the converted values.
def wrap(query, results)
  results.map do |row|
    row.map do |value|
      # the whole value is matched, so that a literal which merely contains
      # <...> is not mistaken for a resource and a literal with embedded
      # quotes is not cut off at the first inner quote
      case value
      when /\A<([^>]*)>\z/
        RDFS::Resource.new(Regexp.last_match(1))
      when /\A"(.*)"\z/m
        Regexp.last_match(1)
      else
        # hand back this single value, not the complete result set
        value
      end
    end
  end
end
# Creates the lookup indices on the triple table.
#
# params:: Hash with optional switches :sidx, :pidx, :oidx, :spidx, :soidx,
#          :poidx and :opidx (true/false). When a switch is missing (nil),
#          spidx and poidx are created and the other indices are not.
#
# All statements run inside one transaction and use "if not exists", so an
# index that is already present is left alone.
def create_indices(params)
  # index name, indexed columns, enabled by default?
  index_specs = [
    [:sidx,  's',   false],
    [:pidx,  'p',   false],
    [:oidx,  'o',   false],
    [:spidx, 's,p', true],
    [:soidx, 's,o', false],
    [:poidx, 'p,o', true],
    [:opidx, 'o,p', false]
  ]

  @db.transaction do
    index_specs.each do |name, columns, default|
      # an explicit false has to win over a true default, so test for nil
      # instead of using ||
      enabled = params[name].nil? ? default : params[name]
      next unless enabled

      @db.execute("create index if not exists #{name} on triple(#{columns})")
    end
  end
end
# internal format of a resource: <uri>; the uri is captured in group 1
Resource = /<([^>]*)>/
# internal format of a literal: "text"; the text is captured in group 1
Literal = /"([^"]*)"/
# matches either kind of node (without capturing), used to split ntriples lines
Node = Regexp.union(/<[^>]*>/, /"[^"]*"/)
# column names of the triple table, in subject/predicate/object order
SPO = %w[s p o]
end