From bd38189bae98f4e9a06dab24c7b99bb59af62687 Mon Sep 17 00:00:00 2001 From: Antonio Delfin Martinez Date: Mon, 28 Dec 2015 16:52:29 +0100 Subject: [PATCH 1/6] -FIX problem in txt libreoffice conversion from presentations and spreadcheets --- lib/heathen/processor_methods/libreoffice.rb | 38 +++++++++++++------ .../processor_methods/libreoffice_spec.rb | 4 +- spec/integration/standard_tasks_spec.rb | 4 +- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/lib/heathen/processor_methods/libreoffice.rb b/lib/heathen/processor_methods/libreoffice.rb index 6d3aa1c..a4afe3c 100644 --- a/lib/heathen/processor_methods/libreoffice.rb +++ b/lib/heathen/processor_methods/libreoffice.rb @@ -37,10 +37,6 @@ def libreoffice( format: ) } } - conversion_methods = { - 'txt' => 'txt:Text' - } - raise InvalidParameterInStep.new('format', format) unless suffixes[format.to_s] to_suffix = nil suffixes[format.to_s].each do |k,v| @@ -48,18 +44,36 @@ def libreoffice( format: ) end raise InvalidMimeTypeInStep.new('(various document formats)', job.mime_type) unless to_suffix - conversion_method = conversion_methods[to_suffix] || to_suffix target_file = "#{job.content_file}.#{to_suffix}" - executioner.execute( - 'libreoffice', - '--convert-to', conversion_method, - '--outdir', sandbox_dir, - job.content_file, - '--headless', - ) + + if to_suffix == 'txt' + pdf_file = "#{job.content_file}.pdf" + executioner.execute( + 'libreoffice', + '--convert-to', 'pdf', + '--outdir', sandbox_dir, + job.content_file, + '--headless', + ) + + executioner.execute( + 'pdftotext', + pdf_file, + target_file + ) + else + executioner.execute( + 'libreoffice', + '--convert-to', to_suffix, + '--outdir', sandbox_dir, + job.content_file, + '--headless', + ) + end raise ConversionFailed.new(executioner.last_messages) if executioner.last_exit_status != 0 raise ConversionFailed.new("Cannot find converted file (looking for #{File.basename(target_file)})" ) unless File.exist? target_file + c = File.read(target_file) job.content = File.read(target_file) File.unlink(target_file) diff --git a/spec/heathen/processor_methods/libreoffice_spec.rb b/spec/heathen/processor_methods/libreoffice_spec.rb index 5ab00ae..b964276 100644 --- a/spec/heathen/processor_methods/libreoffice_spec.rb +++ b/spec/heathen/processor_methods/libreoffice_spec.rb @@ -92,12 +92,12 @@ def new_job content it 'from MS word' do new_job ms_word_content @processor.libreoffice format: 'txt' - expect(@job.content.mime_type).to eq 'text/plain; charset=utf-8' + expect(@job.content.mime_type).to eq 'text/plain; charset=us-ascii' end it 'from OO word' do new_job oo_word_content @processor.libreoffice format: 'txt' - expect(@job.content.mime_type).to eq 'text/plain; charset=utf-8' + expect(@job.content.mime_type).to eq 'text/plain; charset=us-ascii' end end end diff --git a/spec/integration/standard_tasks_spec.rb b/spec/integration/standard_tasks_spec.rb index d0f42f7..882bbfc 100644 --- a/spec/integration/standard_tasks_spec.rb +++ b/spec/integration/standard_tasks_spec.rb @@ -58,13 +58,13 @@ it 'converts odt' do content = fixture('heathen/ooword.odt').read new_content = converter.convert 'txt', content - expect(new_content.mime_type).to eq 'text/plain; charset=utf-8' + expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii' end it 'converts docx' do content = fixture('heathen/msword.docx').read new_content = converter.convert 'txt', content - expect(new_content.mime_type).to eq 'text/plain; charset=utf-8' + expect(new_content.mime_type).to eq 'text/plain; charset=us-ascii' end it 'converts images' do From 7fc10ceb8623db03858d8c78da186073e8ca2c3a Mon Sep 17 00:00:00 2001 From: Antonio Delfin Martinez Date: Mon, 28 Dec 2015 17:06:20 +0100 Subject: [PATCH 2/6] -FIX unuseful code and bad identation --- .../processor_methods/.libreoffice.rb.swp | Bin 0 -> 12288 bytes lib/heathen/processor_methods/libreoffice.rb | 3 +-- 2 files changed, 1 insertion(+), 2 deletions(-) create mode 100644 lib/heathen/processor_methods/.libreoffice.rb.swp diff --git a/lib/heathen/processor_methods/.libreoffice.rb.swp b/lib/heathen/processor_methods/.libreoffice.rb.swp new file mode 100644 index 0000000000000000000000000000000000000000..ead3e3a1c31620845b00acb79e353c8e3486de4b GIT binary patch literal 12288 zcmeI2%ZnUE9LFp1{fGv==s~FLfHNbRo{bT+1||j*Ln3anvbsUSGSp1h?DlSVS6fv* znayT{p7fH7-~;udc-2#mDkvxz74$DqL2~fmMerbqeybm|-MgEaT?7xY3O<>c?yBG8 z+f~1=B-NRmJ2g!YIgb&vy@Z^ee`Rhd_>5dTLr5T&n3T-i8QIw6DRspMkRlq7>6|f3e1*`&A0jq#jz$$QqD8O7ouHi5LM{Y(!`2T( z05jkucnTOT9|E7=M93%L3V0i=0v{{^+x~Y7a2~pZ()3S!Zuptpq-ZBRpF5*&=J!Th zh(q5Cmus5I1+LpY7_Gy*X-lHxbVaCnsA&svT)&kAE=OqW=aHDa)-%ge*lqwxzLGPy z9BX2dQpr5U=`j)Zxm2DAPcUTSIw3z_n>fb8P-xooLYE>T^w8YJPoLA8FT}DJF3_Hk zw6dD`=rF}Y7VwgvwfY3DQ^blxN15~kPw6LVLAddu@B#0}8sTy2_)KXWUQ&uJa8;*X zk8Zll2cB*#&2+5jz80M%nTJ$QNQss|E%UwFXyAO5YhtYKC+14p3Qsb5!o_0l<*@Ild5 zi&iyM%4bkP;_I2JtPY2-P2}U!Wp*m;Gv9Nky?{Tz67f^vtmaX&)~dBWlb*nQbwxK0 zOzFVN3Yb=POdJz$fI)4IQNz_Vbdv>MoZZ8Fh_Y`|Dw&|IOSD;8U0!QuL(^%=)w$HE zV}{Q=oOKrw552g&zkf07Su`Aaes;B#-VBqd0-DQWeyL&Pl71N`v8Cr+wYAJVrR(1? z3!F!)v~?tVFf?n)V=|Xl*9Vo>8H*y{>oQYCn*GpqM8v~E;Ab*7(i4!m`c5P{su@ZM zx(9ygXX;$IQF(6e%)WvH=Bh=`HOwcUx!rONs_xA0c_H2Gj3>%7Oy?Qp){841T5hx+ zs755ta~TOQ)cKe(nPN<;UfzGa4?gH}Kkq&!VVE~r<-JE357hNKRz)Gg?aYu1UX(h7 zeuFU&nki2u(<#?_i|GelOzph;H#ZmNy>b(V78IVzgraB$a18ELc--$v5mw> z8XmJOHV9J<)HbF|UWUqVspO?qtV4N)LnTfpp38f*@FJ^aCGl{!C8?N#CIqDoY`P@M zP$WXvY_*;>2er>62eq>yNu^8r2WoTwX|F2<63JgB>%9?iVm;Z?;cZ7}j-Q}47Otdm zWve+a)!DeCk*Vv4v#a#ntd@vso?;gi#kmiMPRB~q4yV|wQPDe`4TcZAOgtQ7r+N{q zCA*BxWB=n83O_xYa(;9ac5dADYi(0cW7&Cp$%cP6Vr=mDtpPr$|5q>wu aZun$+xSmM Date: Tue, 29 Dec 2015 13:31:17 +0100 Subject: [PATCH 3/6] -DELETE swp file commited by error --- .../processor_methods/.libreoffice.rb.swp | Bin 12288 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 lib/heathen/processor_methods/.libreoffice.rb.swp diff --git a/lib/heathen/processor_methods/.libreoffice.rb.swp b/lib/heathen/processor_methods/.libreoffice.rb.swp deleted file mode 100644 index ead3e3a1c31620845b00acb79e353c8e3486de4b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2%ZnUE9LFp1{fGv==s~FLfHNbRo{bT+1||j*Ln3anvbsUSGSp1h?DlSVS6fv* znayT{p7fH7-~;udc-2#mDkvxz74$DqL2~fmMerbqeybm|-MgEaT?7xY3O<>c?yBG8 z+f~1=B-NRmJ2g!YIgb&vy@Z^ee`Rhd_>5dTLr5T&n3T-i8QIw6DRspMkRlq7>6|f3e1*`&A0jq#jz$$QqD8O7ouHi5LM{Y(!`2T( z05jkucnTOT9|E7=M93%L3V0i=0v{{^+x~Y7a2~pZ()3S!Zuptpq-ZBRpF5*&=J!Th zh(q5Cmus5I1+LpY7_Gy*X-lHxbVaCnsA&svT)&kAE=OqW=aHDa)-%ge*lqwxzLGPy z9BX2dQpr5U=`j)Zxm2DAPcUTSIw3z_n>fb8P-xooLYE>T^w8YJPoLA8FT}DJF3_Hk zw6dD`=rF}Y7VwgvwfY3DQ^blxN15~kPw6LVLAddu@B#0}8sTy2_)KXWUQ&uJa8;*X zk8Zll2cB*#&2+5jz80M%nTJ$QNQss|E%UwFXyAO5YhtYKC+14p3Qsb5!o_0l<*@Ild5 zi&iyM%4bkP;_I2JtPY2-P2}U!Wp*m;Gv9Nky?{Tz67f^vtmaX&)~dBWlb*nQbwxK0 zOzFVN3Yb=POdJz$fI)4IQNz_Vbdv>MoZZ8Fh_Y`|Dw&|IOSD;8U0!QuL(^%=)w$HE zV}{Q=oOKrw552g&zkf07Su`Aaes;B#-VBqd0-DQWeyL&Pl71N`v8Cr+wYAJVrR(1? z3!F!)v~?tVFf?n)V=|Xl*9Vo>8H*y{>oQYCn*GpqM8v~E;Ab*7(i4!m`c5P{su@ZM zx(9ygXX;$IQF(6e%)WvH=Bh=`HOwcUx!rONs_xA0c_H2Gj3>%7Oy?Qp){841T5hx+ zs755ta~TOQ)cKe(nPN<;UfzGa4?gH}Kkq&!VVE~r<-JE357hNKRz)Gg?aYu1UX(h7 zeuFU&nki2u(<#?_i|GelOzph;H#ZmNy>b(V78IVzgraB$a18ELc--$v5mw> z8XmJOHV9J<)HbF|UWUqVspO?qtV4N)LnTfpp38f*@FJ^aCGl{!C8?N#CIqDoY`P@M zP$WXvY_*;>2er>62eq>yNu^8r2WoTwX|F2<63JgB>%9?iVm;Z?;cZ7}j-Q}47Otdm zWve+a)!DeCk*Vv4v#a#ntd@vso?;gi#kmiMPRB~q4yV|wQPDe`4TcZAOgtQ7r+N{q zCA*BxWB=n83O_xYa(;9ac5dADYi(0cW7&Cp$%cP6Vr=mDtpPr$|5q>wu aZun$+xSmM Date: Tue, 29 Dec 2015 17:03:38 +0100 Subject: [PATCH 4/6] -FIX specs failing for wrong mime_types --- spec/heathen/processor_methods/libreoffice_spec.rb | 6 +++--- spec/helpers/mime_types.rb | 9 +++++++++ spec/integration/standard_tasks_spec.rb | 6 +++--- 3 files changed, 15 insertions(+), 6 deletions(-) create mode 100644 spec/helpers/mime_types.rb diff --git a/spec/heathen/processor_methods/libreoffice_spec.rb b/spec/heathen/processor_methods/libreoffice_spec.rb index b964276..ce7df2e 100644 --- a/spec/heathen/processor_methods/libreoffice_spec.rb +++ b/spec/heathen/processor_methods/libreoffice_spec.rb @@ -55,7 +55,7 @@ def new_job content it 'from OO word' do new_job oo_word_content @processor.libreoffice format: 'msoffice' - expect(@job.content.mime_type).to eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary' + expect(ms_mime_types).to include(@job.content.mime_type) end it 'from OO spreadsheet' do new_job oo_spreadsheet_content @@ -74,12 +74,12 @@ def new_job content it 'from MS word' do new_job ms_word_content @processor.libreoffice format: 'ooffice' - expect(@job.content.mime_type).to eq 'application/xml; charset=us-ascii' + expect(oo_mime_types).to include(@job.content.mime_type) end it 'from MS spreadsheet' do new_job ms_spreadsheet_content @processor.libreoffice format: 'ooffice' - expect(@job.content.mime_type).to eq 'application/xml; charset=us-ascii' + expect(oo_mime_types).to include(@job.content.mime_type) end it 'from MS powerpoint' do new_job ms_ppt_content diff --git a/spec/helpers/mime_types.rb b/spec/helpers/mime_types.rb new file mode 100644 index 0000000..fbe4745 --- /dev/null +++ b/spec/helpers/mime_types.rb @@ -0,0 +1,9 @@ +def ms_mime_types + ['application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary', + 'application/zip; charset=binary'] +end + +def oo_mime_types + ['application/xml; charset=us-ascii', + 'application/octet-stream; charset=binary'] +end diff --git a/spec/integration/standard_tasks_spec.rb b/spec/integration/standard_tasks_spec.rb index 882bbfc..6ae7f7e 100644 --- a/spec/integration/standard_tasks_spec.rb +++ b/spec/integration/standard_tasks_spec.rb @@ -90,7 +90,7 @@ it 'runs' do content = fixture('heathen/ooword.odt').read new_content = converter.convert 'msoffice', content - expect(new_content.mime_type).to eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary' + expect(ms_mime_types).to include(new_content.mime_type) end end @@ -98,7 +98,7 @@ it 'runs' do content = fixture('heathen/msword.docx').read new_content = converter.convert 'ooffice', content - expect(new_content.mime_type).to eq 'application/xml; charset=us-ascii' + expect(oo_mime_types).to include(new_content.mime_type) end end @@ -106,7 +106,7 @@ it 'runs' do content = fixture('heathen/ooword.odt').read new_content = converter.convert 'doc', content - expect(new_content.mime_type).to eq 'application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary' + expect(ms_mime_types).to include(new_content.mime_type) end end end From 649f66bdc0fd860fd3cfa3de6c8fa9bedf5909ec Mon Sep 17 00:00:00 2001 From: Antonio Delfin Martinez Date: Thu, 31 Dec 2015 10:37:26 +0100 Subject: [PATCH 5/6] -FIX pull request specs comments --- spec/heathen/processor_methods/libreoffice_spec.rb | 2 +- spec/helpers/mime_types.rb | 2 +- spec/integration/standard_tasks_spec.rb | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/heathen/processor_methods/libreoffice_spec.rb b/spec/heathen/processor_methods/libreoffice_spec.rb index ce7df2e..8d18b8c 100644 --- a/spec/heathen/processor_methods/libreoffice_spec.rb +++ b/spec/heathen/processor_methods/libreoffice_spec.rb @@ -55,7 +55,7 @@ def new_job content it 'from OO word' do new_job oo_word_content @processor.libreoffice format: 'msoffice' - expect(ms_mime_types).to include(@job.content.mime_type) + expect(ms_word_mime_types).to include(@job.content.mime_type) end it 'from OO spreadsheet' do new_job oo_spreadsheet_content diff --git a/spec/helpers/mime_types.rb b/spec/helpers/mime_types.rb index fbe4745..163f2f9 100644 --- a/spec/helpers/mime_types.rb +++ b/spec/helpers/mime_types.rb @@ -1,4 +1,4 @@ -def ms_mime_types +def ms_word_mime_types ['application/vnd.openxmlformats-officedocument.wordprocessingml.document; charset=binary', 'application/zip; charset=binary'] end diff --git a/spec/integration/standard_tasks_spec.rb b/spec/integration/standard_tasks_spec.rb index 6ae7f7e..21619b3 100644 --- a/spec/integration/standard_tasks_spec.rb +++ b/spec/integration/standard_tasks_spec.rb @@ -90,7 +90,7 @@ it 'runs' do content = fixture('heathen/ooword.odt').read new_content = converter.convert 'msoffice', content - expect(ms_mime_types).to include(new_content.mime_type) + expect(ms_word_mime_types).to include(new_content.mime_type) end end @@ -106,7 +106,7 @@ it 'runs' do content = fixture('heathen/ooword.odt').read new_content = converter.convert 'doc', content - expect(ms_mime_types).to include(new_content.mime_type) + expect(ms_word_mime_types).to include(new_content.mime_type) end end end From 457dfc1847f94468ce879a2e8e1e1bc34db06a73 Mon Sep 17 00:00:00 2001 From: Antonio Delfin Martinez Date: Thu, 31 Dec 2015 10:38:44 +0100 Subject: [PATCH 6/6] -FIX remove unuseful variable after pull request comment --- lib/heathen/processor_methods/libreoffice.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/heathen/processor_methods/libreoffice.rb b/lib/heathen/processor_methods/libreoffice.rb index 7c1c917..5e826ec 100644 --- a/lib/heathen/processor_methods/libreoffice.rb +++ b/lib/heathen/processor_methods/libreoffice.rb @@ -47,7 +47,6 @@ def libreoffice( format: ) target_file = "#{job.content_file}.#{to_suffix}" if to_suffix == 'txt' - pdf_file = "#{job.content_file}.pdf" executioner.execute( 'libreoffice', '--convert-to', 'pdf', @@ -58,7 +57,7 @@ def libreoffice( format: ) executioner.execute( 'pdftotext', - pdf_file, + "#{job.content_file}.pdf", target_file ) else