Skip to content

Commit

Permalink
Solr Cell attachment indexing patch
Browse files Browse the repository at this point in the history
  • Loading branch information
isaac committed May 24, 2010
1 parent 7dc6082 commit 7b127a9
Show file tree
Hide file tree
Showing 17 changed files with 434 additions and 15 deletions.
2 changes: 1 addition & 1 deletion sunspot/lib/sunspot.rb
Expand Up @@ -14,7 +14,7 @@

%w(util adapters configuration setup composite_setup text_field_setup field
field_factory data_extractor indexer query search session session_proxy
type dsl).each do |filename|
type dsl rich_document).each do |filename|
require File.join(File.dirname(__FILE__), 'sunspot', filename)
end

Expand Down
31 changes: 31 additions & 0 deletions sunspot/lib/sunspot/composite_setup.rb
Expand Up @@ -116,6 +116,18 @@ def all_more_like_this_fields
@more_like_this_fields ||= more_like_this_fields_hash.values.map { |set| set.to_a }.flatten
end

#
# Collection of all attachment fields configured for any of the enclosed
# types. The result is memoized after the first call.
#
# === Returns
#
# Array:: Attachment fields configured for the enclosed types
#
def all_attachment_fields
  return @attachment_fields if @attachment_fields

  # attachment_fields_hash maps each field name to a collection of fields;
  # unpack every collection and merge them into one flat list.
  field_lists = attachment_fields_hash.values.map { |field_set| field_set.to_a }
  @attachment_fields = field_lists.flatten
end


private

#
Expand Down Expand Up @@ -146,6 +158,25 @@ def more_like_this_fields_hash
end
end

#
# Return a hash of field names to attachment field objects, containing all
# fields that are configured for any of the types enclosed.
#
# ==== Returns
#
# Hash:: Hash of field names to Sets of attachment field objects.
#
def attachment_fields_hash
  # Memoized in its own instance variable. Sharing @text_fields_hash here
  # would let whichever of the two caches is built first be returned for both.
  @attachment_fields_hash ||=
    setups.inject({}) do |hash, setup|
      setup.all_attachment_fields.each do |attachment_field|
        (hash[attachment_field.name] ||= Set.new) << attachment_field
      end
      hash
    end
end


#
# Return a hash of field names to field objects, containing all fields
# that are common to all of the classes enclosed. In order for fields
Expand Down
11 changes: 10 additions & 1 deletion sunspot/lib/sunspot/dsl/fields.rb
Expand Up @@ -43,7 +43,16 @@ def text(*names, &block)
end
end

#
# Add one or more attachment fields. The value of an attachment field is the
# attachment's filename, which is passed to Solr so the file's contents can
# be extracted and indexed (by Tika, per the original author's note).
#
# ==== Parameters
#
# names...<Symbol>:: One or more attachment field names
#
def attachment(*names)
  names.each { |field_name| @setup.add_attachment_field_factory(field_name) }
end

#
# Specify a method or block that returns the geographical coordinates
# associated with the document. The object returned must respond to #first
# and #last (e.g., a two-element Array); or to #lat and one of #lng, #lon,
Expand Down
11 changes: 11 additions & 0 deletions sunspot/lib/sunspot/dsl/standard_query.rb
Expand Up @@ -99,8 +99,19 @@ def fulltext(keywords, options = {}, &block)
end
end
end

if !field_names && (!fulltext_dsl || !fulltext_dsl.fields_added?)
unless @setup.all_attachment_fields.empty?
@setup.all_attachment_fields.each do |attachment_text_field|
unless fulltext_dsl && fulltext_dsl.exclude_fields.include?(attachment_text_field.name)
fulltext_query.add_fulltext_field(attachment_text_field, attachment_text_field.default_boost)
end
end
end
end
end
end

alias_method :keywords, :fulltext

#
Expand Down
2 changes: 2 additions & 0 deletions sunspot/lib/sunspot/field.rb
Expand Up @@ -133,6 +133,8 @@ def indexed_name
# scoping, sorting, and faceting is done with attribute fields.
#
class AttributeField < Field #:nodoc:
attr_reader :default_boost

def initialize(name, type, options = {})
super(name, type, options)
@multiple = !!options.delete(:multiple)
Expand Down
13 changes: 13 additions & 0 deletions sunspot/lib/sunspot/field_factory.rb
Expand Up @@ -143,5 +143,18 @@ def populate_document(document, model)
end
end
end

#
# Field factory for attachments. It only selects a data extractor at
# construction time; populating a document is currently a no-op.
#
class Attachment
  # name:: attribute to read when no block is supplied
  # block:: optional block used to extract the value instead of an attribute
  def initialize(name = nil, &block)
    @data_extractor =
      block ? DataExtractor::BlockExtractor.new(&block) : DataExtractor::AttributeExtractor.new(name)
  end

  # Intentionally empty in this patch: nothing is written to the document.
  def populate_document(document, model)
  end
end
end
end
22 changes: 20 additions & 2 deletions sunspot/lib/sunspot/indexer.rb
Expand Up @@ -98,7 +98,25 @@ def prepare(model)
end

#
# Send the given document(s) to Solr. Documents without an attachment are
# added in one batch through the connection; documents that contain an
# attachment add themselves one at a time via their own #add method.
#
# ==== Parameters
#
# documents:: a single document or a collection of documents
#
def add_documents(documents)
  rich_docs, plain_docs = Util.Array(documents).partition do |document|
    document.contains_attachment?
  end

  # Both groups must always be processed. Handling attachments only in an
  # `else` branch dropped them whenever plain documents were also present.
  @connection.add(plain_docs) unless plain_docs.empty?
  rich_docs.each { |document| document.add(@connection) }
end

#
Expand All @@ -107,7 +125,7 @@ def add_documents(documents)
# pairs.
#
def document_for(model)
RSolr::Message::Document.new(
Sunspot::RichDocument.new(
:id => Adapters::InstanceAdapter.adapt(model).index_id,
:type => Util.superclasses_for(model.class).map { |clazz| clazz.name }
)
Expand Down
45 changes: 45 additions & 0 deletions sunspot/lib/sunspot/rich_document.rb
@@ -0,0 +1,45 @@
module Sunspot
  #
  # A Solr document that may reference a file attachment. Documents with an
  # attachment are posted to the 'update/extract' handler instead of going
  # through the regular add, so the file's contents can be extracted.
  #
  class RichDocument < RSolr::Message::Document
    include Enumerable

    #
    # Whether any field of this document is an attachment field, i.e. its
    # indexed name contains "_attachment".
    #
    # ==== Returns
    #
    # Boolean:: true if at least one attachment field is present
    #
    def contains_attachment?
      @fields.any? { |field| attachment_field?(field) }
    end

    #
    # Post this document to the 'update/extract' handler. The attachment
    # field's value is sent as the file to extract; every other field is sent
    # as a literal value.
    #
    # ==== Parameters
    #
    # connection:: Solr connection that receives the request
    #
    # ==== Returns
    #
    # Whatever the connection returns for the request.
    #
    def add(connection)
      params = {
        :wt => :ruby,
        'idx.attr' => false, # don't index any attributes, unless explicitly mapped
        'ignore.und.fl' => true, # ignore all undefined fields
        'map.title' => 'title_text'
      }

      @fields.each do |field|
        if attachment_field?(field)
          params['resource.name'] = field.value # TIKA-154 workaround
          params['stream.file'] = field.value
          # Plain assignment: a trailing comma here would turn the value
          # into a two-element array.
          params['def.fl'] = field.name
          params['fmap.content'] = field.name
        else
          # Several fields may share a name, so literals accumulate in arrays.
          (params["literal.#{field.name}"] ||= []) << field.value
        end

        # TODO: per-field boosts are not sent yet:
        # if field.boost
        #   params["boost.#{field.name.to_s}"] = field.boost
        # end
      end

      connection.send('update/extract', params)
    end

    private

    # True when the field's name marks it as an attachment field.
    def attachment_field?(field)
      field.name.to_s.include?("_attachment")
    end
  end
end
56 changes: 46 additions & 10 deletions sunspot/lib/sunspot/setup.rb
Expand Up @@ -5,12 +5,13 @@ module Sunspot
#
class Setup #:nodoc:
attr_reader :class_object_id

def initialize(clazz)
@class_object_id = clazz.object_id
@class_name = clazz.name
@field_factories, @text_field_factories, @dynamic_field_factories,
@field_factories, @text_field_factories, @dynamic_field_factories, @attachment_field_factories,
@field_factories_cache, @text_field_factories_cache,
@dynamic_field_factories_cache = *Array.new(6) { Hash.new }
@dynamic_field_factories_cache, @attachment_field_factories_cache = *Array.new(8) { Hash.new }
@stored_field_factories_cache = Hash.new { |h, k| h[k] = [] }
@more_like_this_field_factories_cache = Hash.new { |h, k| h[k] = [] }
@dsl = DSL::Fields.new(self)
Expand Down Expand Up @@ -38,6 +39,22 @@ def add_field_factory(name, type, options = {}, &block)
end

#
# Add field_factories for fulltext search on attachments
#
# ==== Parameters
#
# name<Symbol>:: Name of the attachment field
# options<Hash>:: Options passed to the field factory; :stored is also read here
#
def add_attachment_field_factory(name, options = {}, &block)
  # Read :stored before building the factory, in case the factory consumes
  # keys from the options hash (presumably it may -- confirm against
  # FieldFactory::Static).
  stored = options[:stored]
  field_factory = FieldFactory::Static.new(name, Type::AttachmentType.instance, options, &block)
  @attachment_field_factories[name] = field_factory
  @attachment_field_factories_cache[field_factory.name] = field_factory
  # Stored fields are tracked in the stored-field cache, whose entries are
  # arrays. Appending to the attachment cache entry would call << on the
  # factory itself.
  @stored_field_factories_cache[field_factory.name] << field_factory if stored
end

#
# Add field_factories for fulltext search
#
# ==== Parameters
Expand Down Expand Up @@ -131,14 +148,18 @@ def field(field_name)
# CompositeSetup objects might return more than one.
#
def text_fields(field_name)
  key = field_name.to_sym
  # Text fields take precedence; attachment fields are the fallback. The
  # error is raised only after both caches have been consulted.
  field_factory = @text_field_factories_cache[key] || @attachment_field_factories_cache[key]
  unless field_factory
    raise(
      UnrecognizedFieldError,
      "No text field configured for #{@class_name} with name '#{field_name}'"
    )
  end
  # Always a one-element array for a plain Setup.
  [field_factory.build]
end
Expand Down Expand Up @@ -200,7 +221,11 @@ def all_more_like_this_fields
end.flatten
end

#
#
# Build one field from each attachment field factory of this setup.
#
# ==== Returns
#
# Array:: Fields built from attachment_field_factories, in the same order
#
def all_attachment_fields
  attachment_field_factories.inject([]) do |fields, factory|
    fields << factory.build
  end
end

#
# Get the field_factories associated with this setup as well as all inherited field_factories
#
# ==== Returns
Expand All @@ -211,7 +236,18 @@ def field_factories
collection_from_inheritable_hash(:field_factories)
end

#
# Get the attachment field_factories for this setup, collected through
# collection_from_inheritable_hash (presumably this includes inherited
# attachment field_factories -- confirm against that helper).
#
# ==== Returns
#
# Array:: Collection of attachment field_factories associated with this setup
#
def attachment_field_factories
collection_from_inheritable_hash(:attachment_field_factories)
end

#
# Get the text field_factories associated with this setup as well as all inherited
# text field_factories
#
Expand All @@ -233,7 +269,7 @@ def text_field_factories
#
def all_field_factories
  all_field_factories = []
  # Each group is appended exactly once, attachment factories last among the
  # collection-backed groups.
  [
    field_factories,
    text_field_factories,
    dynamic_field_factories,
    attachment_field_factories
  ].each { |group| all_field_factories.concat(group) }
  # The coordinates factory is optional and is appended only when configured.
  all_field_factories << @coordinates_field_factory if @coordinates_field_factory
  all_field_factories
end
Expand Down
14 changes: 14 additions & 0 deletions sunspot/lib/sunspot/type.rb
Expand Up @@ -110,6 +110,20 @@ def accepts_more_like_this?
end
end

#
# Type for attachment fields. Indexed names carry the "_attachment" suffix,
# values are indexed as strings, and values read back are returned unchanged.
#
class AttachmentType < AbstractType
  def indexed_name(name) #:nodoc:
    name.to_s + "_attachment"
  end

  def to_indexed(value) #:nodoc:
    # nil or false yields nil rather than an empty or "false" string
    return unless value
    value.to_s
  end

  # No conversion is applied to stored text.
  def cast(text)
    text
  end
end

#
# The String type represents string data.
#
Expand Down
32 changes: 32 additions & 0 deletions sunspot/spec/attachments/highlighting_spec.rb
@@ -0,0 +1,32 @@
require File.join(File.dirname(__FILE__), 'spec_helper')
require 'pp'

# Integration spec: indexes two PDF-backed posts once, then checks the
# highlights returned for a keyword search over the attachment field.
describe 'attachment keyword highlighting' do
  before :all do
    # Fixture PDFs live in the sibling test_docs directory.
    test_docs = File.expand_path(File.join(File.dirname(File.dirname(__FILE__)), 'test_docs'))
    @posts = []
    @posts << RichTextPost.new(:rich_attachment => File.join(test_docs, 'TestPDF.pdf'))
    @posts << RichTextPost.new(:rich_attachment => File.join(test_docs, 'JustAnotherTest.pdf'), :title => "This is the title")
    Sunspot.index!(*@posts)
    @search_result = Sunspot.search(RichTextPost) { keywords 'lorem', :highlight => true }
  end

  it 'should include highlights in the results' do
    @search_result.hits.first.highlights.length.should == 1
  end

  it 'should return formatted highlight fragments' do
    @search_result.hits.first.highlights(:rich_attachment).should_not be_empty
    @search_result.hits.first.highlights(:rich_attachment).first.format.should == "This is a test \nPDF file. <em>Lorem</em> ipsum dolor sit amet, consectetur adipiscing elit"
  end

  it 'should be empty for non-keyword searches' do
    search_result = Sunspot.search(RichTextPost){ with :title, "This is the title" }
    search_result.hits.first.highlights.should be_empty
  end

  it 'should return multple hits for multiple occurances' do
    # Only the expectation remains here; the example no longer prints the
    # highlights to stdout.
    @search_result.hits.first.highlights(:rich_attachment).length.should > 1
  end
end

0 comments on commit 7b127a9

Please sign in to comment.