66 changes: 45 additions & 21 deletions packages/components/nodes/documentloaders/Cheerio/Cheerio.ts
@@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
import { webCrawl, xmlScrape } from '../../../src'

class Cheerio_DocumentLoaders implements INode {
label: string
@@ -35,19 +35,34 @@ class Cheerio_DocumentLoaders implements INode {
optional: true
},
{
label: 'Web Scrap for Relative Links',
name: 'webScrap',
type: 'boolean',
label: 'Get Relative Links Method',
name: 'relativeLinksMethod',
type: 'options',
description: 'Select a method to retrieve relative links',
options: [
{
label: 'Web Crawl',
name: 'webCrawl',
description: 'Crawl relative links from HTML URL'
},
{
label: 'Scrape XML Sitemap',
name: 'scrapeXMLSitemap',
description: 'Scrape relative links from XML sitemap URL'
}
],
optional: true,
additionalParams: true
},
{
label: 'Web Scrap Links Limit',
label: 'Get Relative Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links; the default limit is 10.',
warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
},
{
label: 'Metadata',
@@ -62,7 +77,7 @@ class Cheerio_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrap = nodeData.inputs?.webScrap as boolean
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string

let url = nodeData.inputs?.url as string
@@ -71,25 +86,34 @@
throw new Error('Invalid URL')
}

const cheerioLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new CheerioWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
async function cheerioLoader(url: string): Promise<any> {
try {
let docs = []
const loader = new CheerioWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
}
return docs
}

let availableUrls: string[]
let docs = []
if (webScrap) {
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
docs.push(...(await cheerioLoader(availableUrls[i])))
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await cheerioLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await cheerioLoader(url)
}
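Note: the `webCrawl` and `xmlScrape` helpers that replace `getAvailableURLs` live in `packages/components/src` and are not shown in this diff. A minimal sketch of a `webCrawl` matching the call sites above (URL plus numeric limit, with `0` meaning no limit) could look like the following — the internals here are assumptions, not the actual implementation:

```typescript
import axios from 'axios'
import * as cheerio from 'cheerio'

// Sketch only: breadth-first crawl of same-origin links, capped at `limit`
// pages (0 = unlimited), matching how the loaders above call webCrawl.
export async function webCrawl(stringURL: string, limit: number): Promise<string[]> {
    const baseURL = new URL(stringURL)
    const visited = new Set<string>([baseURL.href])
    const queue: string[] = [baseURL.href]
    const pages: string[] = []

    while (queue.length > 0 && (limit === 0 || pages.length < limit)) {
        const current = queue.shift() as string
        try {
            const { data } = await axios.get(current)
            pages.push(current)
            const $ = cheerio.load(data)
            // Collect relative links that stay on the same host
            $('a[href]').each((_, el) => {
                const href = $(el).attr('href') as string
                const resolved = new URL(href, baseURL).href
                if (resolved.startsWith(baseURL.origin) && !visited.has(resolved)) {
                    visited.add(resolved)
                    queue.push(resolved)
                }
            })
        } catch {
            // Skip pages that fail to load, mirroring the loaders' soft error handling
        }
    }
    return pages
}
```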
66 changes: 45 additions & 21 deletions packages/components/nodes/documentloaders/Playwright/Playwright.ts
@@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { PlaywrightWebBaseLoader } from 'langchain/document_loaders/web/playwright'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
import { webCrawl, xmlScrape } from '../../../src'

class Playwright_DocumentLoaders implements INode {
label: string
@@ -35,19 +35,34 @@ class Playwright_DocumentLoaders implements INode {
optional: true
},
{
label: 'Web Scrap for Relative Links',
name: 'webScrap',
type: 'boolean',
label: 'Get Relative Links Method',
name: 'relativeLinksMethod',
type: 'options',
description: 'Select a method to retrieve relative links',
options: [
{
label: 'Web Crawl',
name: 'webCrawl',
description: 'Crawl relative links from HTML URL'
},
{
label: 'Scrape XML Sitemap',
name: 'scrapeXMLSitemap',
description: 'Scrape relative links from XML sitemap URL'
}
],
optional: true,
additionalParams: true
},
{
label: 'Web Scrap Links Limit',
label: 'Get Relative Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links; the default limit is 10.',
warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
},
{
label: 'Metadata',
@@ -62,7 +77,7 @@ class Playwright_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrap = nodeData.inputs?.webScrap as boolean
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string

let url = nodeData.inputs?.url as string
@@ -71,25 +86,34 @@
throw new Error('Invalid URL')
}

const playwrightLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new PlaywrightWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
async function playwrightLoader(url: string): Promise<any> {
try {
let docs = []
const loader = new PlaywrightWebBaseLoader(url)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in PlaywrightWebBaseLoader: ${err.message}, on page: ${url}`)
}
return docs
}

let availableUrls: string[]
let docs = []
if (webScrap) {
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
docs.push(...(await playwrightLoader(availableUrls[i])))
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await playwrightLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await playwrightLoader(url)
}
79 changes: 49 additions & 30 deletions packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts
@@ -2,7 +2,7 @@ import { INode, INodeData, INodeParams } from '../../../src/Interface'
import { TextSplitter } from 'langchain/text_splitter'
import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer'
import { test } from 'linkifyjs'
import { getAvailableURLs } from '../../../src'
import { webCrawl, xmlScrape } from '../../../src'

class Puppeteer_DocumentLoaders implements INode {
label: string
@@ -35,19 +35,34 @@ class Puppeteer_DocumentLoaders implements INode {
optional: true
},
{
label: 'Web Scrape for Relative Links',
name: 'webScrape',
type: 'boolean',
label: 'Get Relative Links Method',
name: 'relativeLinksMethod',
type: 'options',
description: 'Select a method to retrieve relative links',
options: [
{
label: 'Web Crawl',
name: 'webCrawl',
description: 'Crawl relative links from HTML URL'
},
{
label: 'Scrape XML Sitemap',
name: 'scrapeXMLSitemap',
description: 'Scrape relative links from XML sitemap URL'
}
],
optional: true,
additionalParams: true
},
{
label: 'Web Scrape Links Limit',
label: 'Get Relative Links Limit',
name: 'limit',
type: 'number',
default: 10,
optional: true,
additionalParams: true
additionalParams: true,
description:
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links; the default limit is 10.',
warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
},
{
label: 'Metadata',
@@ -62,7 +77,7 @@ class Puppeteer_DocumentLoaders implements INode {
async init(nodeData: INodeData): Promise<any> {
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
const metadata = nodeData.inputs?.metadata
const webScrape = nodeData.inputs?.webScrape as boolean
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
let limit = nodeData.inputs?.limit as string

let url = nodeData.inputs?.url as string
@@ -71,35 +86,39 @@
throw new Error('Invalid URL')
}

const puppeteerLoader = async (url: string): Promise<any> => {
let docs = []
const loader = new PuppeteerWebBaseLoader(url, {
launchOptions: {
args: ['--no-sandbox'],
headless: 'new'
async function puppeteerLoader(url: string): Promise<any> {
try {
let docs = []
const loader = new PuppeteerWebBaseLoader(url, {
launchOptions: {
args: ['--no-sandbox'],
headless: 'new'
}
})
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
}
})
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
} else {
docs = await loader.load()
return docs
} catch (err) {
if (process.env.DEBUG === 'true') console.error(`error in PuppeteerWebBaseLoader: ${err.message}, on page: ${url}`)
}
return docs
}

let availableUrls: string[]
let docs = []
if (webScrape) {
if (relativeLinksMethod) {
if (process.env.DEBUG === 'true') console.info(`Start ${relativeLinksMethod}`)
if (!limit) limit = '10'
availableUrls = await getAvailableURLs(url, parseInt(limit))
for (let i = 0; i < availableUrls.length; i++) {
try {
docs.push(...(await puppeteerLoader(availableUrls[i])))
} catch (error) {
console.error('Error loading url with puppeteer. URL: ', availableUrls[i], 'Error: ', error)
continue
}
else if (parseInt(limit) < 0) throw new Error('Limit cannot be less than 0')
const pages: string[] =
relativeLinksMethod === 'webCrawl' ? await webCrawl(url, parseInt(limit)) : await xmlScrape(url, parseInt(limit))
if (process.env.DEBUG === 'true') console.info(`pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
if (!pages || pages.length === 0) throw new Error('No relative links found')
for (const page of pages) {
docs.push(...(await puppeteerLoader(page)))
}
if (process.env.DEBUG === 'true') console.info(`Finish ${relativeLinksMethod}`)
} else {
docs = await puppeteerLoader(url)
}
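Puppeteer is the only one of the three loaders that passes launch options. For JavaScript-heavy pages, `PuppeteerWebBaseLoader` also accepts `gotoOptions`; the snippet below is illustrative and not part of this PR:

```typescript
import { PuppeteerWebBaseLoader } from 'langchain/document_loaders/web/puppeteer'

// Illustrative only: 'networkidle0' waits for the network to go quiet before
// scraping, which helps on client-side-rendered pages at the cost of speed.
const loader = new PuppeteerWebBaseLoader('https://example.com', {
    launchOptions: {
        args: ['--no-sandbox'],
        headless: 'new'
    },
    gotoOptions: {
        waitUntil: 'networkidle0'
    }
})
const docs = await loader.load()
```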
1 change: 1 addition & 0 deletions packages/components/package.json
@@ -24,6 +24,7 @@
"@qdrant/js-client-rest": "^1.2.2",
"@supabase/supabase-js": "^2.21.0",
"@types/js-yaml": "^4.0.5",
"@types/jsdom": "^21.1.1",
"axios": "^0.27.2",
"cheerio": "^1.0.0-rc.12",
"chromadb": "^1.4.2",
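The new `@types/jsdom` entry suggests the sitemap scraper parses XML with jsdom. A minimal sketch of an `xmlScrape` matching the call sites (`xmlScrape(url, limit)`, with `0` meaning no limit) — both the use of jsdom and the internals are assumptions:

```typescript
import axios from 'axios'
import { JSDOM } from 'jsdom'

// Sketch only: fetch the sitemap, parse it as XML, and collect every <loc>
// entry, capped at `limit` URLs (0 = unlimited).
export async function xmlScrape(sitemapURL: string, limit: number): Promise<string[]> {
    const { data } = await axios.get(sitemapURL)
    const dom = new JSDOM(data, { contentType: 'text/xml' })
    const pages = Array.from(dom.window.document.querySelectorAll('loc'))
        .map((el) => el.textContent ?? '')
        .filter((loc) => loc.length > 0)
    return limit === 0 ? pages : pages.slice(0, limit)
}
```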
1 change: 1 addition & 0 deletions packages/components/src/Interface.ts
@@ -57,6 +57,7 @@ export interface INodeParams {
type: NodeParamsType | string
default?: CommonType | ICommonObject | ICommonObject[]
description?: string
warning?: string
options?: Array<INodeOptionsValue>
optional?: boolean | INodeDisplay
rows?: number
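The optional `warning?: string` field is what lets the three loaders above surface the long-crawl caveat in the UI; because it is optional, existing node definitions compile unchanged. A param opting in looks like the limit input from the loader diffs:

```typescript
// Taken from the loader diffs above: a node param that opts into the new field.
const limitParam: INodeParams = {
    label: 'Get Relative Links Limit',
    name: 'limit',
    type: 'number',
    default: 10,
    optional: true,
    additionalParams: true,
    warning: `Retrieving all links might take a long time, and all links will be upserted again if the flow's state changes (e.g. different URL, chunk size, etc.)`
}
```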