-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathpdf.ts
60 lines (51 loc) · 1.56 KB
/
pdf.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// source/parsers/pdf.ts
// The text extractor for PDF files.
import { type Buffer } from 'buffer/'
// @ts-expect-error There are no types for this package.
import parsePdf from 'pdf-parse/lib/pdf-parse.js'
import type { TextExtractionMethod } from '../lib.js'
export class PdfExtractor implements TextExtractionMethod {
  /**
   * MIME types this extractor accepts as input.
   */
  mimes = ['application/pdf']

  /**
   * Extract the text of a PDF document.
   *
   * The buffer is handed to `parsePdf`, with `renderPage` supplied as the
   * per-page renderer, and the `text` field of the parsed result is returned.
   * Nothing is caught here, so a rejection from `parsePdf` propagates to the
   * caller.
   *
   * @param input The raw contents of the PDF file.
   * @returns The text extracted from the input.
   */
  apply = async (input: Buffer): Promise<string> => {
    // The parser is untyped; only the `text` field of its result is needed.
    const { text } = (await parsePdf(input, { pagerender: renderPage })) as {
      text: string
    }

    return text
  }
}
/**
 * A single run of text as found in the `items` of a page's text content.
 */
interface PdfTextItem {
  /** The text of the run. */
  str: string
  /** Transform matrix of the run; index 5 is used as its vertical position. */
  transform: number[]
}

/**
 * The subset of the page object passed to `pagerender` that is used here.
 */
interface PdfPageLike {
  getTextContent: (options: {
    normalizeWhitespace: boolean
    disableCombineTextItems: boolean
  }) => Promise<{ items: PdfTextItem[] }>
}

/**
 * We have to redefine this function to ensure that there are spaces between
 * words in the output text.
 *
 * Every text item is followed by a single space. A newline is inserted before
 * an item whose vertical position (`transform[5]`) differs from that of the
 * item before it.
 *
 * @param data The data stored in the PDF about the page.
 * @returns The text content on the page
 */
const renderPage = async (data: unknown): Promise<string> => {
  // The page object comes from an untyped library, so only the members that
  // are actually used are described.
  const page = data as PdfPageLike
  const textContent = await page.getTextContent({
    normalizeWhitespace: false,
    disableCombineTextItems: false,
  })

  // `undefined` means "no item seen yet". A falsy check must not be used for
  // this, because a vertical position of 0 is a legitimate value and would
  // otherwise suppress the line break that follows it.
  let lastY: number | undefined
  let text = ''

  for (const item of textContent.items) {
    const y = item.transform[5]
    if (lastY !== undefined && lastY !== y) text += '\n'

    // The word + a space
    text += item.str + ' '
    lastY = y
  }

  return text
}