-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfToJson.js
46 lines (37 loc) · 1.36 KB
/
pdfToJson.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/*
`pdfToJson.js` demonstrates how to use the PDFix SDK for Node.js to convert
a PDF document's content into JSON. It opens a PDF, logs basic document
information (page count, version, PDF standard), and exports the document's
structure tree and text as JSON for easy integration with other applications.
*/
const Pdfix = require('pdfix-sdk');
const pdfixSdkWrapper = new Pdfix();

/**
 * Serializes an open PDF document to a JSON string via an in-memory stream.
 *
 * The conversion object and the memory stream are released in `finally`
 * blocks so they are cleaned up even when an intermediate SDK call throws.
 *
 * @param {object} pdfixSdk - SDK namespace returned by `getPdfixSdk()`.
 * @param {object} pdfix - Pdfix instance returned by `pdfixSdk.GetPdfix()`.
 * @param {object} pdfDoc - An open document; the caller remains responsible
 *   for closing it.
 * @returns {string} The decoded JSON text written by the conversion.
 */
function convertDocToJsonString(pdfixSdk, pdfix, pdfDoc) {
  const stream = pdfix.CreateMemStream();
  try {
    const jsonConversion = pdfDoc.CreateJsonConversion();
    try {
      // prepare conversion params
      const params = new pdfixSdk.PdfJsonParams();
      params.struct_tree = 1; // export structure tree
      params.text = 1; // include text
      jsonConversion.SetParams(params);

      // save JSON to memory stream
      jsonConversion.SaveToStream(stream);
    } finally {
      jsonConversion.Destroy();
    }

    // copy the stream contents into a JS-owned buffer before the stream is
    // destroyed, then decode it (TextDecoder defaults to UTF-8)
    const buffer = new ArrayBuffer(stream.GetSize());
    stream.ReadToArrayBuffer(0, buffer, buffer.byteLength);
    return new TextDecoder().decode(buffer);
  } finally {
    stream.Destroy();
  }
}

pdfixSdkWrapper
  .loadPdfixSdk()
  .then(() => {
    const pdfixSdk = pdfixSdkWrapper.getPdfixSdk();
    const pdfix = pdfixSdk.GetPdfix();

    const pdfDoc = pdfixSdkWrapper.openDocumentFromPath("./pdf/test.pdf");
    if (!pdfDoc) {
      // Fail with a clear message instead of a TypeError on the first
      // method call below.
      throw new Error("Unable to open PDF document: ./pdf/test.pdf");
    }

    try {
      console.log({
        numOfPages: pdfDoc.GetNumPages(),
        version: pdfDoc.GetVersion(),
        pdfStandard: pdfDoc.GetPdfStandard(),
      });

      // display JSON
      console.log(convertDocToJsonString(pdfixSdk, pdfix, pdfDoc));
    } finally {
      // always close the document, even if the conversion failed
      pdfDoc.Close();
    }
  })
  .catch((err) => {
    // Surface SDK load / conversion failures instead of leaving an
    // unhandled promise rejection, and signal failure to the shell.
    console.error("PDF to JSON conversion failed:", err);
    process.exitCode = 1;
  });