Skip to content

Commit

Permalink
Merge f0d78ce into 1fd128b
Browse files Browse the repository at this point in the history
  • Loading branch information
tarrow committed Sep 14, 2016
2 parents 1fd128b + f0d78ce commit 15e49dc
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 31 deletions.
10 changes: 7 additions & 3 deletions lib/renderer/basic.js
Expand Up @@ -7,15 +7,19 @@ util.inherits(BasicRenderer, EventEmitter);

// Fetch a page with a plain HTTP request and report the outcome as events
// on this renderer.
//
// url       - address to request
// actions   - accepted for signature compatibility; not used by this renderer
// cookiejar - optional cookie jar, passed to the request as `jar` when truthy
//
// Emits 'renderer.urlRendered' (url, body) when the response status is 200.
// Emits 'renderer.error' with the request error, or with a message string
// naming the status code when the response is anything other than 200.
BasicRenderer.prototype.render = function(url, actions, cookiejar) {
  var renderer = this;
  // Single configuration object; the timeout is passed straight to request.
  var conf = {url: url, timeout: 10000};
  if (cookiejar) {
    conf.jar = cookiejar;
  }
  request(conf, function (error, response, body) {
    // Always emit through the captured `renderer`: inside this callback
    // `this` is not the renderer, so `this.emit` would not reach listeners
    // registered on it.
    if (error) {
      renderer.emit('renderer.error', error);
    } else if (response.statusCode === 200) {
      renderer.emit('renderer.urlRendered', url, body);
    } else {
      renderer.emit('renderer.error', 'page did not return a 200 instead returned ' + response.statusCode);
    }
  });
};
Expand Down
14 changes: 12 additions & 2 deletions lib/scraper.js
Expand Up @@ -65,6 +65,7 @@ var Scraper = function(definition, headless) {
scraper.elements = definition.elements;
scraper.followables = definition.followables || [];
scraper.actions = definition.actions || null;
scraper.generic = definition.generic || null;

// The renderer is chosen. Basic by default (see BasicRenderer),
// but if the user specifies headless rendering, or if there are
Expand Down Expand Up @@ -277,6 +278,11 @@ Scraper.prototype.scrapeUrl = function(theUrl, node) {
})
scraper.ticker.tick();
});

renderer.on('renderer.error', function (msg) {
scraper.emit('error', msg);
})

}

// Scrape a specific element
Expand Down Expand Up @@ -347,7 +353,7 @@ Scraper.prototype.startTicker = function() {
if (!scraper.ticker) {
scraper.ticker = new Ticker(0, function() {
var results = scraper.structureResults();
scraper.emit('end', scraper.results, results);
scraper.emit('end', scraper.results, results, scraper);
});
}
}
Expand Down Expand Up @@ -376,7 +382,11 @@ Scraper.prototype.downloadElement = function(element, res, scrapeUrl) {
scraper.ticker.tick();
down.removeAllListeners();
});
down.once('*Error', function(err) {
down.once('downloadError', function(err) {
scraper.emit(this.event, err);
scraper.ticker.tick();
});
down.once('fileSaveError', function(err) {
scraper.emit(this.event, err);
scraper.ticker.tick();
});
Expand Down
4 changes: 2 additions & 2 deletions lib/scraperBox.js
Expand Up @@ -96,8 +96,8 @@ ScraperBox.prototype.getScraper = function(url) {
// specific regex. Specificity is defined as the number of non-
// wildcard characters in the regex.
//
// If a is more specific return 1
// If b is more specific return -1
// If a is more specific return -1
// If b is more specific return 1
// If a and b have equal specificity return 0
var compareRegexSpecificity = function(a, b) {
var aSpec = (a.url.match(/[a-z0-9]/gi)||[]).length;
Expand Down
54 changes: 30 additions & 24 deletions lib/thresher.js
Expand Up @@ -39,6 +39,26 @@ var Thresher = function(scraperBox) {
});
}

// Shared handler for a scraper's 'end' event. Must be called with `this`
// bound to the Thresher instance, e.g.
//   scraper.on('end', scrapeEnd.bind(thresher))
//
// result     - flat results object; its own enumerable keys are counted as
//              the number of captured elements
// structured - structured results, forwarded unchanged with the 'result' event
// scraper    - the scraper that finished; scraper.elementsArray.length is
//              taken as the number of expected elements
// scrapeUrl  - optional; when provided and nothing was captured, the scrape
//              is retried through thresher.resolveScrape
// headless   - optional; forwarded to thresher.resolveScrape
//
// Emits 'info' when fewer elements than expected were captured, and 'result'
// (result, structured) in every case except when a resolved scrape is started.
var scrapeEnd = function(result, structured, scraper, scrapeUrl, headless) {
  var thresher = this;
  var keyscaptured = Object.keys(result).length;
  var keysexpected = scraper.elementsArray.length;

  // Strict comparison: an assignment here would zero the count, making this
  // branch unreachable and the shortfall message below report 0 every time.
  // A retry is only possible when the caller supplied the URL to resolve.
  if (keyscaptured === 0 && scrapeUrl) {
    thresher.emit('info', 'no elements captured - trying a resolved scrape');
    thresher.resolveScrape(scrapeUrl, headless, result);
    return;
  }

  if (keyscaptured < keysexpected) {
    // Some expected elements weren't captured. Note that this handler only
    // reports the shortfall; the partial result is still emitted below.
    var msg = 'only ' + keyscaptured +
      ' elements out of ' + keysexpected +
      ' were captured. Attempting URL resolve.';
    thresher.emit('info', msg);
  }

  thresher.emit('result', result, structured);
};

// Thresher inherits from EventEmitter
util.inherits(Thresher, EventEmitter2);

Expand All @@ -55,7 +75,7 @@ Thresher.prototype.scrape = function(scrapeUrl, headless) {

// set up the scraper
var scraper = thresher.scraperBox.getScraper(scrapeUrl);
if (!scraper) {
if (!scraper || scraper.generic) {
// maybe need to resolve the URL
thresher.resolveScrape(scrapeUrl, headless);
return;
Expand All @@ -69,23 +89,10 @@ Thresher.prototype.scrape = function(scrapeUrl, headless) {
thresher.emit('scraper.' + this.event, var1 || '', var2 || '');
});

scraper.on('end', function(result, structured) {
var keyscaptured = Object.keys(result).length;
var keysexpected = scraper.elementsArray.length;
if (keyscaptured = 0) {
thresher.emit('info', 'no elements captured - trying a resolved scrape');
thresher.resolveScrape(scrapeUrl, headless, result);
} else if (keyscaptured < keysexpected) {
// some expected elements weren't captured
// try resolving any redirects
var msg = 'only ' + keyscaptured +
' elements out of ' + keysexpected +
'were captured. Attempting URL resolve.';
thresher.emit('info', msg);
thresher.emit('result', result, structured);
} else {
thresher.emit('result', result, structured);
}
scraper.on('end', scrapeEnd.bind(thresher))

scraper.on('error', function(msg) {
thresher.emit('error', msg)
})

scraper.scrapeUrl(scrapeUrl);
Expand All @@ -97,7 +104,7 @@ Thresher.prototype.resolveScrape = function(scrapeUrl, headless, lastResult) {

// follow url redirects
url.resolveRedirects(scrapeUrl, function(err, resolvedUrl) {

if (err) thresher.emit('error', err)
// set up the scraper
var scraper = thresher.scraperBox.getScraper(resolvedUrl);
if (scraper === null) {
Expand All @@ -113,13 +120,12 @@ Thresher.prototype.resolveScrape = function(scrapeUrl, headless, lastResult) {
thresher.emit('scraper.' + this.event, var1, var2);
});

scraper.on('end', function(result, structured) {
if (lastResult) {
structured = _.deepExtend(lastResult, structured);
}
thresher.emit('result', result, structured);
scraper.on('error', function(msg) {
thresher.emit('error', msg)
})

scraper.on('end', scrapeEnd.bind(thresher))

scraper.scrapeUrl(resolvedUrl);
});
}
Expand Down
3 changes: 3 additions & 0 deletions lib/url.js
Expand Up @@ -57,6 +57,9 @@ url.cleanResourcePath = function(path, pageUrl) {
// Resolve HTTP redirects
//
// Issues a HEAD request for `url` and calls callback(err, resolvedUrl).
// resolvedUrl is response.request.href when the response provides it.
// When the response, its request, or the href is missing (for example
// because the request failed), the URL that was passed in is handed back
// instead, so the callback always receives a string-like URL rather than a
// placeholder object. `err` is forwarded unchanged in every case.
url.resolveRedirects = function(url, callback) {
  var requestedUrl = url;
  request({ url: requestedUrl, method: 'HEAD' }, function(err, response, body) {
    // Read defensively without mutating the response object.
    var finalRequest = response && response.request;
    var href = finalRequest && finalRequest.href;
    callback(err, href || requestedUrl);
  });
};
Empty file added test1234
Empty file.

0 comments on commit 15e49dc

Please sign in to comment.