Skip to content

Commit

Permalink
Merge pull request #1 from ifad/features/txt_conversion
Browse files Browse the repository at this point in the history
Add support for conversion to text
  • Loading branch information
vjt committed Nov 17, 2015
2 parents e2fa09b + d648282 commit df3757f
Show file tree
Hide file tree
Showing 14 changed files with 176 additions and 3 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ gem 'sidekiq'
gem 'sidetiq'
gem 'rest-client'
gem 'mail'
gem 'nokogiri'

group :development do
gem 'rake'
Expand Down
7 changes: 7 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ GEM
mail (2.6.3)
mime-types (>= 1.16, < 3)
mime-types (2.4.3)
mini_portile (0.6.2)
multi_json (1.10.1)
netrc (0.10.2)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
rack (1.6.0)
rack-protection (1.5.3)
rack
Expand Down Expand Up @@ -97,6 +100,7 @@ DEPENDENCIES
json
mail
mime-types
nokogiri
rack-test
rake
rest-client
Expand All @@ -111,3 +115,6 @@ DEPENDENCIES
unicorn
wkhtmltopdf
yard

BUNDLED WITH
1.10.6
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,14 @@ will be a normal form post, sending these values:
* `action` - the conversion action performed
* `path` - a path to the converted file. You will have to tack the Colore URL base onto this

## Dependencies

Colore expects the following commands to be available in its PATH:

* `libreoffice` - From LibreOffice, `libreoffice` on Debian.
* `convert` - From ImageMagick, `imagemagick` on Debian.
* `pdftotext` - From Poppler, `poppler-utils` on Debian.

## Contributing

Want to contribute? Great!
Expand Down
5 changes: 5 additions & 0 deletions lib/heathen/job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,10 @@ def content_file suffix=''
end
@tempfile.path
end

# Forgets the cached tempfile reference held in @tempfile.
# Call this to reset the tempfile between the steps of a multi-step task.
# NOTE(review): only the reference is dropped here; the file is not closed or
# unlinked by this method. Presumably content_file builds a fresh tempfile on
# its next call once @tempfile is nil - confirm against content_file.
def reset_content_file!
@tempfile = nil
end
end
end
22 changes: 22 additions & 0 deletions lib/heathen/processor_methods/htmltotext.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
require 'nokogiri'

module Heathen
  class Processor
    # Replaces the job's HTML content with the plain text found in the
    # document's <body>.
    #
    # Script, style and link elements are removed before the text is
    # extracted so that JavaScript and CSS do not end up in the output.
    #
    # @raise [ConversionFailed] if Nokogiri reports a syntax error while parsing
    def htmltotext
      expect_mime_type 'text/html'

      begin
        # Block form so the file handle is closed as soon as parsing is done
        doc = File.open(job.content_file) { |file| Nokogiri::HTML(file) }

        # Strip JS / CSS from the file so it doesn't appear in the output.
        # <style> is included because inline stylesheets may sit inside <body>
        # and their contents would otherwise be returned as text.
        doc.css('script, style, link').each(&:remove)

        text = doc.css('body').text
      rescue Nokogiri::SyntaxError => e
        raise ConversionFailed.new(e)
      end

      job.content = text
    end
  end
end
13 changes: 11 additions & 2 deletions lib/heathen/processor_methods/libreoffice.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,33 @@ def libreoffice( format: )
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'odt',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'ods',
'application/vnd.openxmlformats-officedocument.presentationml.presentation' => 'odp',
},
'txt' => {
'.*' => 'txt'
}
}

conversion_methods = {
'txt' => 'txt:Text'
}

raise InvalidParameterInStep.new('format', format) unless suffixes[format.to_s]
to_suffix = nil
suffixes[format.to_s].each do |k,v|
to_suffix = v if job.mime_type =~ /#{k}/
end
raise InvalidMimeTypeInStep.new('(various document formats)', job.mime_type) unless to_suffix

target_file = "#{job.content_file}.#{to_suffix}"
conversion_method = conversion_methods[to_suffix] || to_suffix
target_file = "#{job.content_file}.#{to_suffix}"
executioner.execute(
'libreoffice',
'--convert-to', to_suffix,
'--convert-to', conversion_method,
'--outdir', sandbox_dir,
job.content_file,
'--headless',
)

raise ConversionFailed.new(executioner.last_messages) if executioner.last_exit_status != 0
raise ConversionFailed.new("Cannot find converted file (looking for #{File.basename(target_file)})" ) unless File.exist? target_file
c = File.read(target_file)
Expand Down
17 changes: 17 additions & 0 deletions lib/heathen/processor_methods/pdftotext.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
module Heathen
  class Processor
    # Replaces the job's PDF content with the plain text produced by the
    # `pdftotext` command (listed in the README as part of Poppler).
    #
    # The intermediate output file is removed whether or not the conversion
    # succeeds.
    #
    # @raise [ConversionFailed] if `pdftotext` exits with a non-zero status or
    #   does not leave an output file behind
    def pdftotext
      expect_mime_type 'application/pdf'

      target_file = temp_file_name
      begin
        executioner.execute(
          'pdftotext',
          job.content_file,
          target_file
        )
        raise ConversionFailed.new(executioner.last_messages) if executioner.last_exit_status != 0

        # Report a missing output file as a conversion failure rather than
        # letting File.read raise Errno::ENOENT
        unless File.exist?(target_file)
          raise ConversionFailed.new("Cannot find converted file (looking for #{File.basename(target_file)})")
        end

        job.content = File.read(target_file)
      ensure
        # Never leave the intermediate file behind, even on failure
        File.unlink(target_file) if File.exist?(target_file)
      end
    end
  end
end
15 changes: 15 additions & 0 deletions lib/heathen/task.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,13 @@ def task_key action, mime_type

# OCR actions for images: the image is first converted to TIFF, the cached
# content tempfile is reset, and tesseract is then run. The two actions differ
# only in the format handed to tesseract.
[
  ['ocr', 'pdf'],
  ['ocr_text', nil]
].each do |action, tesseract_format|
  Heathen::Task.register action, 'image/.*' do
    convert_image to: :tiff, params: '-depth 8 -density 300 -background white +matte'
    # Reset the tempfile between the two steps of this multi-step task
    job.reset_content_file!
    tesseract format: tesseract_format
  end
end

Expand All @@ -82,6 +84,19 @@ def task_key action, mime_type
libreoffice format: 'ooffice'
end

# Converts any supported input to plain text, choosing the converter from the
# job's MIME type: images go through the 'ocr_text' task, HTML through
# htmltotext, PDF through pdftotext, and everything else through LibreOffice.
Heathen::Task.register 'txt', '.*' do
  case job.mime_type
  # Anchored on the type prefix. The previous %r[image/*] meant "image followed
  # by zero or more slashes" and so matched any type merely containing "image".
  when %r{\Aimage/}
    perform_task 'ocr_text'
  when %r{\Atext/html}
    htmltotext
  when %r{\Aapplication/pdf}
    pdftotext
  else
    libreoffice format: 'txt'
  end
end

# support legacy method
Heathen::Task.register 'doc', '.*' do
perform_task 'msoffice'
Expand Down
Binary file added spec/fixtures/heathen/quickfox.bmp
Binary file not shown.
Binary file added spec/fixtures/heathen/quickfox.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions spec/heathen/processor_methods/htmltotext_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
require 'spec_helper'

describe Heathen::Processor do
  # HTML fixture handed to the job under test
  let(:content) do
    File.read(fixture('heathen/quickfox.html'))
  end
  let(:job) { Heathen::Job.new('foo', content, 'en') }
  let(:processor) do
    described_class.new(job: job, logger: Logger.new($stderr))
  end

  # Let the processor clean up after every example
  after { processor.clean_up }

  context '#htmltotext' do
    it 'converts HTML to TXT' do
      processor.htmltotext

      detected_type = job.content.mime_type
      expect(detected_type).to eq('text/plain; charset=us-ascii')
    end
  end
end
13 changes: 13 additions & 0 deletions spec/heathen/processor_methods/libreoffice_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,18 @@ def new_job content
expect(@job.content.mime_type).to eq 'application/vnd.oasis.opendocument.presentation; charset=binary'
end
end

context 'convert to TXT' do
  # Both word-processor inputs are expected to come out as UTF-8 plain text
  it 'from MS word' do
    new_job(ms_word_content)
    @processor.libreoffice(format: 'txt')

    detected_type = @job.content.mime_type
    expect(detected_type).to eq('text/plain; charset=utf-8')
  end

  it 'from OO word' do
    new_job(oo_word_content)
    @processor.libreoffice(format: 'txt')

    detected_type = @job.content.mime_type
    expect(detected_type).to eq('text/plain; charset=utf-8')
  end
end
end
end
18 changes: 18 additions & 0 deletions spec/heathen/processor_methods/pdftotext_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
require 'spec_helper'

describe Heathen::Processor do
  # PDF fixture handed to the job under test
  let(:content) do
    File.read(fixture('heathen/quickfox.pdf'))
  end
  let(:job) { Heathen::Job.new('foo', content, 'en') }
  let(:processor) do
    described_class.new(job: job, logger: Logger.new($stderr))
  end

  # Let the processor clean up after every example
  after { processor.clean_up }

  context '#pdftotext' do
    it 'converts PDF to TXT' do
      processor.pdftotext

      detected_type = job.content.mime_type
      expect(detected_type).to eq('text/plain; charset=us-ascii')
    end
  end
end
42 changes: 41 additions & 1 deletion spec/integration/standard_tasks_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@
end

context 'ocr_text' do
  # The stale "it 'runs' do" opener left over from before the rename has been
  # dropped; it left the do/end pairs of this context unbalanced.
  it 'converts jpeg' do
    content = fixture('heathen/quickfox.jpg').read
    new_content = converter.convert 'ocr_text', content
    expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii'
  end

  it 'converts bmp' do
    content = fixture('heathen/quickfox.bmp').read
    new_content = converter.convert 'ocr_text', content
    expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii'
  end
end

context 'pdf' do
Expand All @@ -34,18 +40,52 @@
new_content = converter.convert 'pdf', content
expect(new_content.mime_type).to eq 'application/pdf; charset=binary'
end

# description suffix => fixture converted to PDF
[
  ['HTML documents', 'heathen/quickfox.html'],
  ['Office documents', 'heathen/msword.docx']
].each do |label, fixture_path|
  it "converts #{label}" do
    content = fixture(fixture_path).read
    new_content = converter.convert('pdf', content)
    expect(new_content.mime_type).to eq('application/pdf; charset=binary')
  end
end
end

context 'txt' do
  # [description suffix, fixture path, MIME type expected for the converted content]
  [
    ['odt', 'heathen/ooword.odt', 'text/plain; charset=utf-8'],
    ['docx', 'heathen/msword.docx', 'text/plain; charset=utf-8'],
    ['images', 'heathen/quickfox.jpg', 'text/plain; charset=us-ascii'],
    ['pdf', 'heathen/quickfox.pdf', 'text/plain; charset=us-ascii'],
    ['HTML documents', 'heathen/quickfox.html', 'text/plain; charset=us-ascii']
  ].each do |label, fixture_path, expected_type|
    it "converts #{label}" do
      content = fixture(fixture_path).read
      new_content = converter.convert('txt', content)
      expect(new_content.mime_type).to eq(expected_type)
    end
  end
end

context 'msoffice' do
it 'runs' do
content = fixture('heathen/ooword.odt').read
Expand Down

0 comments on commit df3757f

Please sign in to comment.