From 4fd73b86ed9ca198a560b821bb4cabf1ac1bdb8f Mon Sep 17 00:00:00 2001
From: atilgner
Date: Tue, 13 Jun 2023 13:13:49 -0700
Subject: [PATCH 1/2] feat: added puppeteer loader node

---
 .../documentloaders/Puppeteer/Puppeteer.ts    | 141 ++++++++++++++++++
 .../documentloaders/Puppeteer/puppeteer.svg   |  14 ++
 packages/components/package.json              |   5 +-
 3 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts
 create mode 100644 packages/components/nodes/documentloaders/Puppeteer/puppeteer.svg

diff --git a/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts
new file mode 100644
index 00000000000..1331c736e91
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts
@@ -0,0 +1,141 @@
+import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { TextSplitter } from 'langchain/text_splitter'
+import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer'
+import { test } from 'linkifyjs'
+import { getAvailableURLs } from '../../../src'
+
+/** Number of links to crawl when the limit input is empty, zero or not a number. */
+const DEFAULT_LINK_LIMIT = 10
+
+/**
+ * Document loader node that loads web pages through langchain's
+ * `PuppeteerWebBaseLoader`.
+ *
+ * Optionally expands the start URL into a list of links via
+ * `getAvailableURLs`, splits the loaded pages with a text splitter, and
+ * merges user-supplied metadata into every document.
+ */
+class Puppeteer_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Puppeteer Web Scraper'
+        this.name = 'puppeteerWebScraper'
+        this.type = 'Document'
+        this.icon = 'puppeteer.svg'
+        this.category = 'Document Loaders'
+        this.description = `Load data from webpages`
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'URL',
+                name: 'url',
+                type: 'string'
+            },
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            },
+            {
+                label: 'Web Scrape for Relative Links',
+                name: 'webScrape',
+                type: 'boolean',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Web Scrape Links Limit',
+                name: 'limit',
+                type: 'number',
+                default: DEFAULT_LINK_LIMIT,
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Metadata',
+                name: 'metadata',
+                type: 'json',
+                optional: true,
+                additionalParams: true
+            }
+        ]
+    }
+
+    /**
+     * Loads the configured URL, or every URL returned by `getAvailableURLs`
+     * when web scraping is enabled, and returns the resulting documents.
+     *
+     * @param nodeData - Node configuration; `url`, `textSplitter`, `webScrape`,
+     * `limit` and `metadata` are read from `nodeData.inputs`.
+     * @returns The loaded documents; when `metadata` is provided it is merged
+     * into each document's own metadata.
+     * @throws Error when the URL is missing or fails the linkify check, or when
+     * `metadata` is a string that is not valid JSON.
+     */
+    async init(nodeData: INodeData): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const metadata = nodeData.inputs?.metadata
+        const webScrape = nodeData.inputs?.webScrape as boolean
+
+        // A missing URL input is reported as invalid instead of crashing on `.trim()`.
+        const url = ((nodeData.inputs?.url as string | undefined) ?? '').trim()
+        if (!url || !test(url)) {
+            throw new Error('Invalid URL')
+        }
+
+        // Empty, zero or non-numeric limits fall back to the default so that
+        // NaN is never passed on to getAvailableURLs.
+        const rawLimit = nodeData.inputs?.limit
+        const parsedLimit = rawLimit ? parseInt(String(rawLimit), 10) : NaN
+        const limit = Number.isNaN(parsedLimit) ? DEFAULT_LINK_LIMIT : parsedLimit
+
+        // Loads one page, splitting it when a text splitter is connected.
+        const loadPage = async (pageUrl: string) => {
+            const loader = new PuppeteerWebBaseLoader(pageUrl)
+            return textSplitter ? loader.loadAndSplit(textSplitter) : loader.load()
+        }
+
+        let docs = []
+        if (webScrape) {
+            const availableUrls: string[] = await getAvailableURLs(url, limit)
+            for (const pageUrl of availableUrls) {
+                try {
+                    docs.push(...(await loadPage(pageUrl)))
+                } catch (error) {
+                    // Best effort: a page that fails to load is logged and skipped.
+                    console.error('Error loading url with puppeteer. URL: ', pageUrl, 'Error: ', error)
+                }
+            }
+        } else {
+            docs = await loadPage(url)
+        }
+
+        if (!metadata) return docs
+
+        let parsedMetadata = metadata
+        if (typeof metadata !== 'object') {
+            try {
+                parsedMetadata = JSON.parse(metadata)
+            } catch (error) {
+                const reason = error instanceof Error ? error.message : String(error)
+                throw new Error(`Invalid metadata JSON: ${reason}`)
+            }
+        }
+
+        return docs.map((doc) => ({
+            ...doc,
+            metadata: { ...doc.metadata, ...parsedMetadata }
+        }))
+    }
+}
+
+module.exports = { nodeClass: Puppeteer_DocumentLoaders }
diff --git a/packages/components/nodes/documentloaders/Puppeteer/puppeteer.svg b/packages/components/nodes/documentloaders/Puppeteer/puppeteer.svg
new file mode 100644
index 00000000000..8477fc52d55
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Puppeteer/puppeteer.svg
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/packages/components/package.json b/packages/components/package.json
index a778ea8fdb3..3bcac0d93a3 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -32,15 +32,16 @@
         "faiss-node": "^0.2.1",
         "form-data": "^4.0.0",
         "graphql": "^16.6.0",
+        "html-to-text": "^9.0.5",
         "langchain": "^0.0.91",
         "linkifyjs": "^4.1.1",
         "mammoth": "^1.5.1",
         "moment": "^2.29.3",
         "node-fetch": "^2.6.11",
         "pdf-parse": "^1.1.1",
+        "puppeteer": "^20.7.1",
         "weaviate-ts-client": "^1.1.0",
-        "ws": "^8.9.0",
-        "html-to-text": "^9.0.5"
+        "ws": "^8.9.0"
     },
     "devDependencies": {
         "@types/gulp": "4.0.9",

From 919d04930616c12065621f5acb01c7f4c33bb359 Mon Sep 17 00:00:00 2001
From: atilgner
Date: Wed, 14 Jun 2023 12:09:31 -0700
Subject: [PATCH 2/2] fix: accidentally deleted html to text

---
 packages/components/package.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/components/package.json b/packages/components/package.json
index 0aa161f8f0b..b0a581619e1 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -40,7 +40,8 @@
         "pdf-parse": "^1.1.1",
         "puppeteer": "^20.7.1",
         "weaviate-ts-client": "^1.1.0",
-        "ws": "^8.9.0"
+        "ws": "^8.9.0",
+        "html-to-text": "^9.0.5"
     },
     "devDependencies": {
         "@types/gulp": "4.0.9",