From 1bb65e78670bb56223035bb6d0cb35f2d1ab4ca7 Mon Sep 17 00:00:00 2001
From: disflyer
Date: Wed, 14 Jun 2023 22:20:17 +0800
Subject: [PATCH] feat: add playwright nodes

---
 .../documentloaders/Playwright/Playwright.ts  | 127 ++++++++++++++++++
 .../documentloaders/Playwright/playwright.svg |   9 ++
 packages/components/package.json              |   1 +
 3 files changed, 137 insertions(+)
 create mode 100644 packages/components/nodes/documentloaders/Playwright/Playwright.ts
 create mode 100644 packages/components/nodes/documentloaders/Playwright/playwright.svg

diff --git a/packages/components/nodes/documentloaders/Playwright/Playwright.ts b/packages/components/nodes/documentloaders/Playwright/Playwright.ts
new file mode 100644
index 00000000000..6b7790af163
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Playwright/Playwright.ts
@@ -0,0 +1,127 @@
+import { INode, INodeData, INodeParams } from '../../../src/Interface'
+import { TextSplitter } from 'langchain/text_splitter'
+import { PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright'
+import { test } from 'linkifyjs'
+import { getAvailableURLs } from '../../../src'
+
+/** Type that `PlaywrightWebBaseLoader.load()` resolves to, derived from the loader so no extra import is needed. */
+type LoadedDocuments = ReturnType<PlaywrightWebBaseLoader['load']> extends Promise<infer D> ? D : never
+
+/** Links limit used when the "Web Scrap Links Limit" input is empty or not a number. */
+const DEFAULT_LINKS_LIMIT = 10
+
+/**
+ * Document loader node that loads a web page through `PlaywrightWebBaseLoader`,
+ * optionally splitting the text, loading the URLs returned by `getAvailableURLs`,
+ * and merging user-supplied metadata into every resulting document.
+ */
+class Playwright_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Playwright Web Scraper'
+        this.name = 'playwrightWebScraper'
+        this.type = 'Document'
+        this.icon = 'playwright.svg'
+        this.category = 'Document Loaders'
+        this.description = `Load data from webpages`
+        this.baseClasses = [this.type]
+        this.inputs = [
+            {
+                label: 'URL',
+                name: 'url',
+                type: 'string'
+            },
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                optional: true
+            },
+            {
+                label: 'Web Scrap for Relative Links',
+                name: 'webScrap',
+                type: 'boolean',
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Web Scrap Links Limit',
+                name: 'limit',
+                type: 'number',
+                default: DEFAULT_LINKS_LIMIT,
+                optional: true,
+                additionalParams: true
+            },
+            {
+                label: 'Metadata',
+                name: 'metadata',
+                type: 'json',
+                optional: true,
+                additionalParams: true
+            }
+        ]
+    }
+
+    /**
+     * Loads the configured URL (or, when "Web Scrap for Relative Links" is on, the URLs
+     * returned by `getAvailableURLs`) and returns the resulting documents.
+     *
+     * @param nodeData - node configuration; reads `url`, `textSplitter`, `webScrap`, `limit` and `metadata` from `inputs`.
+     * @returns the loaded documents, with `metadata` merged into each document's metadata when provided.
+     * @throws Error('Invalid URL') when the URL is missing or rejected by linkifyjs `test`.
+     */
+    async init(nodeData: INodeData): Promise<any> {
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const metadata = nodeData.inputs?.metadata
+        const webScrap = nodeData.inputs?.webScrap as boolean
+        // The input is declared as a number but may arrive as a string, so accept both.
+        const rawLimit = nodeData.inputs?.limit as string | number | undefined
+
+        // A missing URL is reported as an invalid URL rather than crashing on `.trim()`.
+        const url = ((nodeData.inputs?.url as string | undefined) ?? '').trim()
+        if (!url || !test(url)) {
+            throw new Error('Invalid URL')
+        }
+
+        const loadPage = async (pageUrl: string): Promise<LoadedDocuments> => {
+            const loader = new PlaywrightWebBaseLoader(pageUrl)
+            return textSplitter ? loader.loadAndSplit(textSplitter) : loader.load()
+        }
+
+        let pageUrls = [url]
+        if (webScrap) {
+            // Empty/zero limits and unparsable values fall back to the default instead of passing NaN on.
+            const parsedLimit = rawLimit ? parseInt(String(rawLimit), 10) : DEFAULT_LINKS_LIMIT
+            const linksLimit = Number.isNaN(parsedLimit) ? DEFAULT_LINKS_LIMIT : parsedLimit
+            pageUrls = await getAvailableURLs(url, linksLimit)
+        }
+
+        // Pages are loaded one at a time, in the order the URLs were returned.
+        const docs: LoadedDocuments = []
+        for (const pageUrl of pageUrls) {
+            docs.push(...(await loadPage(pageUrl)))
+        }
+
+        if (!metadata) return docs
+
+        // Metadata may already be an object or a JSON string; user-supplied keys win on conflict.
+        const extraMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
+        return docs.map((doc) => ({
+            ...doc,
+            metadata: {
+                ...doc.metadata,
+                ...extraMetadata
+            }
+        }))
+    }
+}
+
+module.exports = { nodeClass: Playwright_DocumentLoaders }
diff --git a/packages/components/nodes/documentloaders/Playwright/playwright.svg b/packages/components/nodes/documentloaders/Playwright/playwright.svg
new file mode 100644
index 00000000000..0992832dc17
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Playwright/playwright.svg
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/packages/components/package.json b/packages/components/package.json
index 207d3e8975b..cbea948aa58 100644
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -38,6 +38,7 @@
         "moment": "^2.29.3",
         "node-fetch": "^2.6.11",
         "pdf-parse": "^1.1.1",
+        "playwright": "^1.35.0",
         "weaviate-ts-client": "^1.1.0",
         "ws": "^8.9.0",
         "html-to-text": "^9.0.5"