66 changes: 45 additions & 21 deletions packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
@@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
import { webCrawl, xmlScrape } from '../../../src'

class Cheerio_DocumentLoaders implements INode {
label: string
@@ -35,19 +35,34 @@ class Cheerio_DocumentLoaders implements INode {
optional: true
},
{
label: 'Web Scrap for Relative Links',
name: 'webScrap',
type: 'boolean',
label: 'Get Relative Links Method',
name: 'relativeLinksMethod',
type: 'options',
description: 'Select a method to retrieve relative links',
options: [
{
label: 'Web Crawl',
name: 'webCrawl',
description: 'Crawl relative links from HTML URL'
},
{
label: 'Scrape XML Sitemap',
name: 'scrapeXMLSitemap',
description: 'Scrape relative links from XML sitemap URL'
}
],
optional: true,
additionalParams: true
},
{
label: 'Web Scrap Links Limit',
label: 'Get Relative Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links; the default limit is 10.',
warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
},
{
label: 'Metadata',
@@ -62,7 +77,7 @@ class Cheerio_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrap = nodeData.inputs?.webScrap as boolean
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string

let url = nodeData.inputs?.url as string
@@ -71,25 +86,34 @@
throw new Error('Invalid URL')
}

const cheerioLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new CheerioWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
async function cheerioLoader(url: string): Promise<any> {
try {
let docs = []
const loader = new CheerioWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
}
return docs
}

let availableUrls: string[]
let docs = []
if (webScrap) {
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
docs.push(...(await cheerioLoader(availableUrls[i])))
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await cheerioLoader(url)
}
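Note: the `webCrawl` and `xmlScrape` helpers that replace `getAvailableURLs` live in `packages/components/src` and are not shown in this diff. A minimal sketch of a `webCrawl` matching the call sites above (URL plus numeric limit, with `0` meaning no limit) could look like the following — the internals here are assumptions, not the actual implementation:

```typescript
import axios from 'axios'
import * as cheerio from 'cheerio'

// Sketch only: breadth-first crawl of same-origin links, capped at `limit`
// pages (0 = unlimited), matching how the loaders above call webCrawl.
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
    const baseURL = new URL(stringURL)
    const visited = new Set<string>([baseURL.href])
    const queue: string[] = [baseURL.href]
    const pages: string[] = []

    while (queue.length > 0 && (limit === 0 || pages.length < limit)) {
        const current = queue.shift() as string
        try {
            const { data } = await axios.get(current)
            pages.push(current)
            const $ = cheerio.load(data)
            // Collect relative links that stay on the same host
            $('a[href]').each((_, el) => {
                const href = $(el).attr('href') as string
                const resolved = new URL(href, baseURL).href
                if (resolved.startsWith(baseURL.origin) && !visited.has(resolved)) {
                    visited.add(resolved)
                    queue.push(resolved)
                }
            })
        } catch {
            // Skip pages that fail to load, mirroring the loaders' soft error handling
        }
    }
    return pages
}
```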
66 changes: 45 additions & 21 deletions packages/components/nodes/documentloaders/Playwright/Playwright.ts
@@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
import { webCrawl, xmlScrape } from '../../../src'

class Playwright_DocumentLoaders implements INode {
label: string
@@ -35,19 +35,34 @@ class Playwright_DocumentLoaders implements INode {
optional: true
},
{
label: 'Web Scrap for Relative Links',
name: 'webScrap',
type: 'boolean',
label: 'Get Relative Links Method',
name: 'relativeLinksMethod',
type: 'options',
description: 'Select a method to retrieve relative links',
options: [
{
label: 'Web Crawl',
name: 'webCrawl',
description: 'Crawl relative links from HTML URL'
},
{
label: 'Scrape XML Sitemap',
name: 'scrapeXMLSitemap',
description: 'Scrape relative links from XML sitemap URL'
}
],
optional: true,
additionalParams: true
},
{
label: 'Web Scrap Links Limit',
label: 'Get Relative Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links; the default limit is 10.',
warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
},
{
label: 'Metadata',
@@ -62,7 +77,7 @@ class Playwright_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrap = nodeData.inputs?.webScrap as boolean
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string

let url = nodeData.inputs?.url as string
@@ -71,25 +86,34 @@
throw new Error('Invalid URL')
}

const playwrightLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new PlaywrightWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
async function playwrightLoader(url: string): Promise<any> {
try {
let docs = []
const loader = new PlaywrightWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`)
}
return docs
}

let availableUrls: string[]
let docs = []
if (webScrap) {
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
docs.push(...(await playwrightLoader(availableUrls[i])))
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await playwrightLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await playwrightLoader(url)
}
79 changes: 49 additions & 30 deletions packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts
@@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
import { webCrawl, xmlScrape } from '../../../src'

class Puppeteer_DocumentLoaders implements INode {
label: string
@@ -35,19 +35,34 @@ class Puppeteer_DocumentLoaders implements INode {
optional: true
},
{
label: 'Web Scrape for Relative Links',
name: 'webScrape',
type: 'boolean',
label: 'Get Relative Links Method',
name: 'relativeLinksMethod',
type: 'options',
description: 'Select a method to retrieve relative links',
options: [
{
label: 'Web Crawl',
name: 'webCrawl',
description: 'Crawl relative links from HTML URL'
},
{
label: 'Scrape XML Sitemap',
name: 'scrapeXMLSitemap',
description: 'Scrape relative links from XML sitemap URL'
}
],
optional: true,
additionalParams: true
},
{
label: 'Web Scrape Links Limit',
label: 'Get Relative Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links; the default limit is 10.',
warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
},
{
label: 'Metadata',
@@ -62,7 +77,7 @@ class Puppeteer_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrape = nodeData.inputs?.webScrape as boolean
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string

let url = nodeData.inputs?.url as string
@@ -71,35 +86,39 @@
throw new Error('Invalid URL')
}

const puppeteerLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new PuppeteerWebBaseLoader(url, {
launchOptions: {
args: ['--no-sandbox'],
headless: 'new'
async function puppeteerLoader(url: string): Promise<any> {
try {
let docs = []
const loader = new PuppeteerWebBaseLoader(url, {
launchOptions: {
args: ['--no-sandbox'],
headless: 'new'
}
})
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
})
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`)
}
return docs
}

let availableUrls: string[]
let docs = []
if (webScrape) {
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
try {
docs.push(...(await puppeteerLoader(availableUrls[i])))
} catch (error) {
console.error('Error loading url with puppeteer. URL: ', availableUrls[i], 'Error: ', error)
continue
}
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await puppeteerLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await puppeteerLoader(url)
}
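Puppeteer is the only one of the three loaders that passes launch options. For JavaScript-heavy pages, `PuppeteerWebBaseLoader` also accepts `gotoOptions`; the snippet below is illustrative and not part of this PR:

```typescript
import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer'

// Illustrative only: 'networkidle0' waits for the network to go quiet before
// scraping, which helps on client-side-rendered pages at the cost of speed.
const loader = new PuppeteerWebBaseLoader('https://example.com', {
    launchOptions: {
        args: ['--no-sandbox'],
        headless: 'new'
    },
    gotoOptions: {
        waitUntil: 'networkidle0'
    }
})
const docs = await loader.load()
```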
1 change: 1 addition & 0 deletions packages/components/package.json
@@ -24,6 +24,7 @@
"@qdrant/js-client-rest": "^1.2.2",
"@supabase/supabase-js": "^2.21.0",
"@types/js-yaml": "^4.0.5",
"@types/jsdom": "^21.1.1",
"axios": "^0.27.2",
"cheerio": "^1.0.0-rc.12",
"chromadb": "^1.4.2",
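The new `@types/jsdom` entry suggests the sitemap scraper parses XML with jsdom. A minimal sketch of an `xmlScrape` matching the call sites (`xmlScrape(url, limit)`, with `0` meaning no limit) — both the use of jsdom and the internals are assumptions:

```typescript
import axios from 'axios'
import { JSDOM } from 'jsdom'

// Sketch only: fetch the sitemap, parse it as XML, and collect every <loc>
// entry, capped at `limit` URLs (0 = unlimited).
export async function xmlScrape(sitemapURL: string, limit: number): Promise<string[]> {
    const { data } = await axios.get(sitemapURL)
    const dom = new JSDOM(data, { contentType: 'text/xml' })
    const pages = Array.from(dom.window.document.querySelectorAll('loc'))
        .map((el) => el.textContent ?? '')
        .filter((loc) => loc.length > 0)
    return limit === 0 ? pages : pages.slice(0, limit)
}
```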
1 change: 1 addition & 0 deletions packages/components/src/Interface.ts
@@ -57,6 +57,7 @@ export interface INodeParams {
type: NodeParamsType | string
default?: CommonType | ICommonObject | ICommonObject[]
description?: string
warning?: string
options?: Array<INodeOptionsValue>
optional?: boolean | INodeDisplay
rows?: number
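The optional `warning?: string` field is what lets the three loaders above surface the long-crawl caveat in the UI; because it is optional, existing node definitions compile unchanged. A param opting in looks like the limit input from the loader diffs:

```typescript
// Taken from the loader diffs above: a node param that opts into the new field.
const limitParam: INodeParams = {
    label: 'Get Relative Links Limit',
    name: 'limit',
    type: 'number',
    default: 10,
    optional: true,
    additionalParams: true,
    warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
}
```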