-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfToHtml.js
54 lines (45 loc) · 1.75 KB
/
pdfToHtml.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
`pdfToHtml.js` demonstrates how to use the PDFix SDK for Node.js to convert a PDF document
to HTML format. It opens a PDF, extracts its content (text, images, and layout), and
converts it to a structured HTML representation that either preserves the original
document's formatting, uses a responsive HTML layout, or follows the layout defined by PDF Tags.
*/
const Pdfix = require('pdfix-sdk');
const pdfixSdkWrapper = new Pdfix();

/**
 * Converts an opened PDF document to a single self-contained HTML string.
 *
 * The HTML is written to an in-memory stream, so every resource (CSS, JS,
 * images, fonts) is embedded rather than emitted as an external file.
 * The conversion object and the memory stream are always released, even
 * when a step fails.
 *
 * @param {object} pdfixSdk - SDK namespace returned by `getPdfixSdk()`.
 * @param {object} pdfix - Pdfix instance returned by `pdfixSdk.GetPdfix()`.
 * @param {object} pdfDoc - Opened document to convert.
 * @returns {string} The generated HTML markup.
 * @throws {Error} With the message from `pdfix.GetError()` when an SDK call fails.
 */
function convertToHtml(pdfixSdk, pdfix, pdfDoc) {
  const htmlConversion = pdfDoc.CreateHtmlConversion();
  // NOTE(review): assumes a falsy return signals failure — confirm against the SDK.
  if (!htmlConversion) {
    throw new Error(pdfix.GetError());
  }

  let stream = null;
  try {
    // prepare conversion params
    const params = new pdfixSdk.PdfHtmlParams();
    params.html_type = pdfixSdk.kPdfHtmlFixed;
    // all resources (css, js, img, font) must be embedded to save HTML into stream
    params.flags =
      pdfixSdk.kHtmlNoExternalCSS |
      pdfixSdk.kHtmlNoExternalJS |
      pdfixSdk.kHtmlNoExternalIMG |
      pdfixSdk.kHtmlNoExternalFONT;
    if (!htmlConversion.SetParams(params)) {
      throw new Error(pdfix.GetError());
    }

    // save HTML to memory stream
    stream = pdfix.CreateMemStream();
    if (!stream) {
      throw new Error(pdfix.GetError());
    }
    if (!htmlConversion.SaveToStream(stream)) {
      throw new Error(pdfix.GetError());
    }

    // copy data from stream to buffer and decode it as text
    const buffer = new ArrayBuffer(stream.GetSize());
    stream.ReadToArrayBuffer(0, buffer, buffer.byteLength);
    return new TextDecoder().decode(buffer);
  } finally {
    // release SDK objects on both the success and the failure path
    htmlConversion.Destroy();
    if (stream) {
      stream.Destroy();
    }
  }
}

pdfixSdkWrapper
  .loadPdfixSdk()
  .then(() => {
    const pdfixSdk = pdfixSdkWrapper.getPdfixSdk();
    const pdfix = pdfixSdk.GetPdfix();

    const pdfDoc = pdfixSdkWrapper.openDocumentFromPath("./pdf/test.pdf");
    // NOTE(review): assumes a falsy return signals that the document could not be opened.
    if (!pdfDoc) {
      throw new Error(pdfix.GetError());
    }

    try {
      console.log({
        numOfPages: pdfDoc.GetNumPages(),
        version: pdfDoc.GetVersion(),
        pdfStandard: pdfDoc.GetPdfStandard()
      });

      // display HTML
      console.log(convertToHtml(pdfixSdk, pdfix, pdfDoc));
    } finally {
      // close the document even if the conversion failed
      pdfDoc.Close();
    }
  })
  .catch((err) => {
    // report the failure instead of leaving an unhandled promise rejection
    console.error(err);
    process.exitCode = 1;
  });