From 8f4b5ba3f62463a0fa1c8a23f98b6b2242d2665f Mon Sep 17 00:00:00 2001 From: Henry Date: Wed, 14 Jun 2023 12:34:25 +0100 Subject: [PATCH 1/4] pdf loader add legacy option --- .../components/nodes/documentloaders/Pdf/Pdf.ts | 16 ++++++++++++++-- packages/components/package.json | 5 +++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/packages/components/nodes/documentloaders/Pdf/Pdf.ts b/packages/components/nodes/documentloaders/Pdf/Pdf.ts index bc36f8cb5cd..5bee0e65a26 100644 --- a/packages/components/nodes/documentloaders/Pdf/Pdf.ts +++ b/packages/components/nodes/documentloaders/Pdf/Pdf.ts @@ -49,6 +49,13 @@ class Pdf_DocumentLoaders implements INode { ], default: 'perPage' }, + { + label: 'Use Legacy Build', + name: 'legacyBuild', + type: 'boolean', + optional: true, + additionalParams: true + }, { label: 'Metadata', name: 'metadata', @@ -64,6 +71,7 @@ class Pdf_DocumentLoaders implements INode { const pdfFileBase64 = nodeData.inputs?.pdfFile as string const usage = nodeData.inputs?.usage as string const metadata = nodeData.inputs?.metadata + const legacyBuild = nodeData.inputs?.legacyBuild as boolean let alldocs = [] let files: string[] = [] @@ -82,7 +90,8 @@ class Pdf_DocumentLoaders implements INode { const loader = new PDFLoader(new Blob([bf]), { splitPages: false, // @ts-ignore - pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + pdfjs: () => + legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) if (textSplitter) { const docs = await loader.loadAndSplit(textSplitter) @@ -93,7 +102,10 @@ class Pdf_DocumentLoaders implements INode { } } else { // @ts-ignore - const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) + const loader = new PDFLoader(new Blob([bf]), { + pdfjs: () => + legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') + }) if (textSplitter) { const docs = await loader.loadAndSplit(textSplitter) alldocs.push(...docs) diff --git a/packages/components/package.json b/packages/components/package.json index 207d3e8975b..07275b08ebc 100644 --- a/packages/components/package.json +++ b/packages/components/package.json @@ -32,15 +32,16 @@ "faiss-node": "^0.2.1", "form-data": "^4.0.0", "graphql": "^16.6.0", + "html-to-text": "^9.0.5", "langchain": "^0.0.94", "linkifyjs": "^4.1.1", "mammoth": "^1.5.1", "moment": "^2.29.3", "node-fetch": "^2.6.11", "pdf-parse": "^1.1.1", + "pdfjs-dist": "^3.7.107", "weaviate-ts-client": "^1.1.0", - "ws": "^8.9.0", - "html-to-text": "^9.0.5" + "ws": "^8.9.0" }, "devDependencies": { "@types/gulp": "4.0.9", From 97ea2c405e0965180a7aa1b259855191c5bb5c5c Mon Sep 17 00:00:00 2001 From: Henry Date: Wed, 14 Jun 2023 13:30:07 +0100 Subject: [PATCH 2/4] yarn lint fix --- packages/components/nodes/documentloaders/Pdf/Pdf.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/components/nodes/documentloaders/Pdf/Pdf.ts b/packages/components/nodes/documentloaders/Pdf/Pdf.ts index 5bee0e65a26..ddb7edb8759 100644 --- a/packages/components/nodes/documentloaders/Pdf/Pdf.ts +++ b/packages/components/nodes/documentloaders/Pdf/Pdf.ts @@ -89,8 +89,8 @@ class Pdf_DocumentLoaders implements INode { if (usage === 'perFile') { const loader = new PDFLoader(new Blob([bf]), { splitPages: false, - // @ts-ignore pdfjs: () => + // @ts-ignore legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) if (textSplitter) { @@ -101,9 +101,9 @@ class Pdf_DocumentLoaders implements INode { alldocs.push(...docs) } } else { - // @ts-ignore const loader = new PDFLoader(new Blob([bf]), { pdfjs: () => + // @ts-ignore legacyBuild ? import('pdfjs-dist/legacy/build/pdf.js') : import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) if (textSplitter) { From a05e9921fa1b7a9b108d7007a652a537894f6912 Mon Sep 17 00:00:00 2001 From: Henry Date: Wed, 14 Jun 2023 19:38:30 +0100 Subject: [PATCH 3/4] add packages to Dockerfile --- Dockerfile | 2 ++ docker/Dockerfile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/Dockerfile b/Dockerfile index fc76cd00837..e9470c313e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,8 @@ FROM node:18-alpine RUN apk add --update libc6-compat python3 make g++ +# needed for pdfjs-dist +RUN apk add --no-cache build-base cairo-dev pango-dev WORKDIR /usr/src/packages diff --git a/docker/Dockerfile b/docker/Dockerfile index e4bf704a0e9..15c4e0ac6cc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -4,6 +4,8 @@ USER root RUN apk add --no-cache git RUN apk add --no-cache python3 py3-pip make g++ +# needed for pdfjs-dist +RUN apk add --no-cache build-base cairo-dev pango-dev # You can install a specific version like: flowise@1.0.0 RUN npm install -g flowise From 95d3dd3fafaf3782a21004ab34237856ef01a7f5 Mon Sep 17 00:00:00 2001 From: Henry Heng Date: Fri, 16 Jun 2023 15:17:12 +0100 Subject: [PATCH 4/4] Update bug_report.md --- .github/ISSUE_TEMPLATE/bug_report.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b8e2e8a5ae2..135312808b4 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -27,7 +27,6 @@ If applicable, add screenshots to help explain your problem. If applicable, add exported flow in order to help replicating the problem. **Setup** - - Installation [e.g. docker, `npx flowise start`, `yarn start`] - Flowise Version [e.g. 1.2.11] - OS: [e.g. macOS, Windows, Linux]