Site scraping utility for Node.js
const scrapper = require('site-scrapper')
const baseUrl = 'https://www.example.com/'
const sitePages = require('./site-map').pages
scrapper.scrapePagesList(baseUrl, sitePages, (err, pages) => {
if (err) return console.error(err)
console.log(pages)
})
Site map example
{
"pages": {
"home": {
"name": "home",
"contentSelector": ".home-content"
},
"contactUs": {
"name": "contactUs",
"url": "contact-us",
"contentSelector": ".contact-us-content"
},
"blog": {
"name": "blog",
"url": "blog",
"pages": {
"latest": {
"name": "latest",
"url": "latest",
"contentSelector": ".blog-posts"
},
"history": {
"name": "history",
"url": "history",
"contentSelector": ".blog-posts"
}
}
}
}
}
baseUrl
<String> Target site to scrape
sitePages
<Object> List of pages to scrape, in the format { pageName: pageOptions, ... }
pageOptions
<Object>
name
<String> Page name used for mapping of scraped content
url
<String> Page URL; it is automatically prefixed with baseUrl
contentSelector
<String>
pages
<sitePages> List of nested pages to scrape; for all nested pages, baseUrl will be baseUrl/url
callback
<Function> A callback function
The function returns the result in the following format:
{
"pageName1": "pageContent",
"pageName2": {
"subPageName1": "subPage1Content"
}
}
baseUrl
<String> Target site to scrape
pageOptions
<Object>
name
<String> Page name used for mapping of scraped content
url
<String> Page URL; it is automatically prefixed with baseUrl
contentSelector
<String>
callback
<Function> A callback function
The function returns the page content as a String.