Skip to content

Commit

Permalink
Merge pull request #3 from ifad/develop
Browse files: browse the repository at this point in the history
libreoffice text conversion and specs fixes
  • Loading branch information
vjt committed Jan 15, 2016
2 parents df3757f + 457dfc1 commit 63d4fe0
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 24 deletions.
40 changes: 26 additions & 14 deletions lib/heathen/processor_methods/libreoffice.rb
Expand Up @@ -37,30 +37,42 @@ def libreoffice( format: )
}
}

conversion_methods = {
'txt' => 'txt:Text'
}

raise InvalidParameterInStep.new('format', format) unless suffixes[format.to_s]
to_suffix = nil
suffixes[format.to_s].each do |k,v|
to_suffix = v if job.mime_type =~ /#{k}/
end
raise InvalidMimeTypeInStep.new('(various document formats)', job.mime_type) unless to_suffix

conversion_method = conversion_methods[to_suffix] || to_suffix
target_file = "#{job.content_file}.#{to_suffix}"
executioner.execute(
'libreoffice',
'--convert-to', conversion_method,
'--outdir', sandbox_dir,
job.content_file,
'--headless',
)
target_file = "#{job.content_file}.#{to_suffix}"

if to_suffix == 'txt'
executioner.execute(
'libreoffice',
'--convert-to', 'pdf',
'--outdir', sandbox_dir,
job.content_file,
'--headless',
)

executioner.execute(
'pdftotext',
"#{job.content_file}.pdf",
target_file
)
else
executioner.execute(
'libreoffice',
'--convert-to', to_suffix,
'--outdir', sandbox_dir,
job.content_file,
'--headless',
)
end

raise ConversionFailed.new(executioner.last_messages) if executioner.last_exit_status != 0
raise ConversionFailed.new("Cannot find converted file (looking for #{File.basename(target_file)})" ) unless File.exist? target_file
c = File.read(target_file)

job.content = File.read(target_file)
File.unlink(target_file)
end
Expand Down
10 changes: 5 additions & 5 deletions spec/heathen/processor_methods/libreoffice_spec.rb
Expand Up @@ -55,7 +55,7 @@ def new_job content
it 'from OO word' do
new_job oo_word_content
@processor.libreoffice format: 'msoffice'
expect(@job.content.mime_type).to eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary'
expect(ms_word_mime_types).to include(@job.content.mime_type)
end
it 'from OO spreadsheet' do
new_job oo_spreadsheet_content
Expand All @@ -74,12 +74,12 @@ def new_job content
it 'from MS word' do
new_job ms_word_content
@processor.libreoffice format: 'ooffice'
expect(@job.content.mime_type).to eq 'application/xml; charset=us-ascii'
expect(oo_mime_types).to include(@job.content.mime_type)
end
it 'from MS spreadsheet' do
new_job ms_spreadsheet_content
@processor.libreoffice format: 'ooffice'
expect(@job.content.mime_type).to eq 'application/xml; charset=us-ascii'
expect(oo_mime_types).to include(@job.content.mime_type)
end
it 'from MS powerpoint' do
new_job ms_ppt_content
Expand All @@ -92,12 +92,12 @@ def new_job content
it 'from MS word' do
new_job ms_word_content
@processor.libreoffice format: 'txt'
expect(@job.content.mime_type).to eq 'text/plain; charset=utf-8'
expect(@job.content.mime_type).to eq 'text/plain; charset=us-ascii'
end
it 'from OO word' do
new_job oo_word_content
@processor.libreoffice format: 'txt'
expect(@job.content.mime_type).to eq 'text/plain; charset=utf-8'
expect(@job.content.mime_type).to eq 'text/plain; charset=us-ascii'
end
end
end
Expand Down
9 changes: 9 additions & 0 deletions spec/helpers/mime_types.rb
@@ -0,0 +1,9 @@
# Spec helper: the list of MIME type strings (including the charset suffix)
# that the specs accept as the result of a conversion to MS Word format.
#
# NOTE(review): the zip entry is presumably here because a .docx file is a
# zip container and may be reported as such by MIME detection - confirm.
#
# Returns a new Array of Strings on every call.
def ms_word_mime_types
  docx_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary'
  zip_type  = 'application/zip; charset=binary'

  [docx_type, zip_type]
end

# Spec helper: the list of MIME type strings (including the charset suffix)
# that the specs accept as the result of a conversion to OpenOffice format.
#
# NOTE(review): more than one value is listed presumably because the detected
# type differs between environments - confirm.
#
# Returns a new Array of Strings on every call.
def oo_mime_types
  xml_type    = 'application/xml; charset=us-ascii'
  binary_type = 'application/octet-stream; charset=binary'

  [xml_type, binary_type]
end
10 changes: 5 additions & 5 deletions spec/integration/standard_tasks_spec.rb
Expand Up @@ -58,13 +58,13 @@
it 'converts odt' do
content = fixture('heathen/ooword.odt').read
new_content = converter.convert 'txt', content
expect(new_content.mime_type).to eq 'text/plain; charset=utf-8'
expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii'
end

it 'converts docx' do
content = fixture('heathen/msword.docx').read
new_content = converter.convert 'txt', content
expect(new_content.mime_type).to eq 'text/plain; charset=utf-8'
expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii'
end

it 'converts images' do
Expand All @@ -90,23 +90,23 @@
it 'runs' do
content = fixture('heathen/ooword.odt').read
new_content = converter.convert 'msoffice', content
expect(new_content.mime_type).to eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary'
expect(ms_word_mime_types).to include(new_content.mime_type)
end
end

context 'ooffice' do
it 'runs' do
content = fixture('heathen/msword.docx').read
new_content = converter.convert 'ooffice', content
expect(new_content.mime_type).to eq 'application/xml; charset=us-ascii'
expect(oo_mime_types).to include(new_content.mime_type)
end
end

context 'doc' do
it 'runs' do
content = fixture('heathen/ooword.odt').read
new_content = converter.convert 'doc', content
expect(new_content.mime_type).to eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary'
expect(ms_word_mime_types).to include(new_content.mime_type)
end
end
end

0 comments on commit 63d4fe0

Please sign in to comment.