Skip to content

Commit

Permalink
fix(plugins/pdf-to-txt): set meta and title for html results (#1380)
Browse files: browse the repository at this point in the history
  • Loading branch information
Fdawgs authored Mar 29, 2023
1 parent c1fa596 commit 391637e
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 3 deletions.
20 changes: 19 additions & 1 deletion src/plugins/pdf-to-txt/index.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
/* eslint-disable security/detect-non-literal-fs-filename */
const fixUtf8 = require("fix-utf8");
const fp = require("fastify-plugin");
const fs = require("fs/promises");
const { glob } = require("glob");
const { JSDOM } = require("jsdom");
const path = require("upath");
const { Poppler } = require("node-poppler");
const { randomUUID } = require("crypto");
Expand Down Expand Up @@ -119,6 +121,8 @@ async function plugin(server, options) {
query[value] = parseString(query[value]);
});

const id = `${config.tempFilePrefix}_${randomUUID()}`;

/**
* If the `ocr` query string param is passed, then use pdfToCairo and the Tesseract OCR engine.
* image-to-txt plugin adds the "tesseract" decorator to server instance,
Expand All @@ -133,7 +137,6 @@ async function plugin(server, options) {
});

// Build temp file pattern for Poppler to use for output
const id = `${config.tempFilePrefix}_${randomUUID()}`;
const tempFile = path.joinSafe(directory, id);

/**
Expand Down Expand Up @@ -215,6 +218,21 @@ async function plugin(server, options) {
query.generateHtmlMetaFile
) {
contentType = "text/html";
const dom = new JSDOM(req.conversionResults.body);
const meta = dom.window.document.createElement("meta");
meta.content = `text/html; charset=${config.pdfToTxtOptions.outputEncoding.toLowerCase()}`;
meta.httpEquiv = "content-type";
dom.window.document.head.prepend(meta);

// Overwrite content of remaining title element with temp file id
dom.window.document.title = id;

/**
* The `fixUtf8` function replaces the most common results of incorrect
* Windows-1252 to UTF-8 conversion with their HTML equivalents.
* Refer to https://i18nqa.com/debug/utf8-debug.html for more info
*/
req.conversionResults.body = fixUtf8(dom.serialize());
}
res.type(
`${contentType}; charset=${config.pdfToTxtOptions.outputEncoding.toLowerCase()}`
Expand Down
30 changes: 28 additions & 2 deletions src/plugins/pdf-to-txt/plugin.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
const fs = require("fs/promises");
const Fastify = require("fastify");
const isHtml = require("is-html");
const { JSDOM } = require("jsdom");
const sensible = require("@fastify/sensible");
const plugin = require(".");
const getConfig = require("../../config");
Expand Down Expand Up @@ -92,18 +93,43 @@ describe("PDF-to-TXT conversion plugin", () => {
"./test_resources/test_files/pdf_1.3_NHS_Constitution.pdf"
),
query: {
firstPageToConvert: 2,
generateHtmlMetaFile: true,
lastPageToConvert: 1,
lastPageToConvert: 2,
},
headers: {
"content-type": "application/pdf",
},
});

const { body } = JSON.parse(response.payload);
const dom = new JSDOM(body);

expect(body).toEqual(expect.stringContaining("for England"));
expect(isHtml(body)).toBe(true);
// Check only one meta and title element exists
expect(dom.window.document.querySelectorAll("meta")).toHaveLength(1);
expect(dom.window.document.querySelectorAll("title")).toHaveLength(1);
// Check that head element contains only a meta and title element in the correct order
expect(dom.window.document.head.firstChild.tagName).toBe("META");
expect(dom.window.document.head.firstChild).toEqual(
expect.objectContaining({
content: expect.stringMatching(/^text\/html; charset=utf-8$/im),
httpEquiv: expect.stringMatching(/^content-type$/im),
})
);
expect(
dom.window.document.head.querySelector("title").textContent
).toMatch(/^docsmith_pdf-to-txt_/m);
// String found at the start of the HTML document
expect(dom.window.document.querySelector("pre").textContent).toEqual(
expect.stringContaining("The NHS belongs to the people")
);
// String found at the end of the HTML document
expect(dom.window.document.querySelector("pre").textContent).toEqual(
expect.stringContaining(
"a full and transparent debate with the public, patients and staff."
)
);
});

test.each([
Expand Down

0 comments on commit 391637e

Please sign in to comment.