diff --git a/README.markdown b/README.markdown index 32e8214..898d8ef 100644 --- a/README.markdown +++ b/README.markdown @@ -10,6 +10,7 @@ Then it crawls all articles and uses readability to extract the content of the r ```sh npm install readability + npm install node-expat ``` @@ -23,16 +24,6 @@ git clone git://github.com/4poc/feedability.git To start feedability just change in the working directory and execute `node feedability.py`. If everything works you should be able to view [[http://127.0.0.1:1912/]] with your browser. -```sh -% node feedability.py -Starting Feedability: NodeJS Feed Proxy With Readability - -load json settings: settings.json -http server started: http://127.0.0.1:1912/ - just append your feed url, for instance: - http://127.0.0.1:1912/http://example.com/feed.rss -``` - # Contact me for Feedback/Questions/Problems: - If you would like to contact me in private use apoc@sixserv.org. diff --git a/TODO b/TODO index 5e7f5b1..d4f6b2f 100644 --- a/TODO +++ b/TODO @@ -1,28 +1,25 @@ - -I've ordered the list after the importance. +Readability - readability returns: "Sorry, unable to parse article content. Please view the original page instead." if it was unable to parse it. Should detect - that and restore the original excerpt. -- reuse the excerpt found in the feed, for instance to improve the readability - detection or just include it in the feed somehow. (currently its thrown away) + that and restore the original excerpt. (maybe keep the excerpt anyways) +- manage to use the readability dom tree for filtering + +Proxy + - detect images (and maybe other content) and download it locally so that the - proxy can serve it from cache. *or* make sure the image links are pointing - to an absolute url. -- make cache files a little bit more accessible: use directories for domains - and timestamps etc. for the filename? How about creating a special - directory with more usable information as directories and softlinks? 
-- gzip the cache files -- currently I built a jsdom tree for the jquery selector filtering, but this - does not really make sense because readability is creating a jsdom tree - anyways, it would be cool to have pre and post hooks to manipulate the - dom tree that readability is using. -- ... -- fetch and respect the robots.txt file of the sites -- rewrite the feed parser/generator using jsdom etc. I do not like the way - the current implementation is building the feed xml with the &replace.. -- refactor the utils2 (or maybe use another library for this anyway) -- ... -- sort this list + proxy can serve it from cache. +- caching proxy that runs readability for text/html + +Cache + +- gzip the cache files (make generic api for that) + +Feed Parser + +- rewrite the feed parser/generator using a xml dom library + +Crawler +- fetch and respect the robots.txt file diff --git a/feedability.js b/feedability.js index 90fcd3a..4ee435d 100644 --- a/feedability.js +++ b/feedability.js @@ -19,32 +19,32 @@ console.log('Starting Feedability: NodeJS Feed Proxy With Readability\n'); // built in libraries var fs = require('fs'), - http = require('http'), - urllib = require('url'); + http = require('http'); // external libraries var readability = require('readability'); // internal libraries var tpl = require('./lib/tpl.js'), - utils2 = require('./lib/utils2.js'), + func = require('./lib/func.js'), + cfg = require('./lib/cfg.js'), + cache = require('./lib/cache.js'), urlopen = require('./lib/urlopen.js'), feed = require('./lib/feed.js'), crawler = require('./lib/crawler.js'), filter = require('./lib/filter.js'); - -var cache_path = utils2.settings['cache_path']; +var cache_path = cfg.get('cache_path'); -if(utils2.filestats(cache_path) == null) { +if(!func.file_exists(cache_path)) { console.log('create cache directory: '+cache_path); fs.mkdirSync(cache_path, 0755); } // some variables used for the http server var url_pattern = /^\/(http:\/\/.*)$/; -var bind = 
utils2.settings['http_server']['bind']; -var port = utils2.settings['http_server']['port']; +var bind = cfg.get('http_server')['bind']; +var port = cfg.get('http_server')['port']; // create the http server with the feed proxy http.createServer(function (client_request, client_response) { @@ -66,31 +66,32 @@ http.createServer(function (client_request, client_response) { crawl.fetch({ // the articles include a structure with url, orig_url, data, etc. finished: function(articles) { - var article_urls = utils2.hashkeys(articles); + var article_urls = func.array_keys(articles); for(var i = 0; i < article_urls.length; i++) { var article_url = article_urls[i]; var article_data = articles[article_url].data; if(!article_data || article_data.length <= 0) { - console.log('[WARNING] article not retreived: '+article_url+' ('+utils2.cache_file(article_url)+')'); + console.log('[WARNING] article not retreived: '+article_url); continue; } console.log('extract using readability for '+article_url+ ' ('+article_data.length+')'); - var cache_file = utils2.cache_file(article_url) + '.rdby'; + var cache_file = cache.filename('rdby', article_url); var article_text = null; // the extracted article text // check for readability cache: - if(utils2.filestats(cache_file) !== null) { + if(func.file_exists(cache_file)) { console.log('using readability cache file: '+cache_file); article_text = fs.readFileSync(cache_file).toString(); } // use readability to extract the article text else { - readability.parse(article_data, article_url, function(info) { + try { + readability.parse(article_data.toString(), article_url, function(info) { console.log('write readability cache file: '+cache_file); // replace relative urls with absolute ones: - info.content = utils2.rel2abs(info.content, articles[article_url].domain); + info.content = func.html_rel2abs(info.content, articles[article_url].domain); // it would be nice to do this directly in the dom, @TODO fs.writeFile(cache_file, info.content, 
function(error) { @@ -101,10 +102,15 @@ http.createServer(function (client_request, client_response) { article_text = info.content; }); + } + catch(e) { + fs.writeFileSync('catch.txt', e.toString()); + console.log(e); + } } // insert article text in feed: - var replace_entity = '&replaceurl:'+utils2.sha1(article_url)+';'; + var replace_entity = '&replaceurl:'+func.sha1(article_url)+';'; article_text = article_text.replace(/\x08/, ''); feedxml = feedxml.replace(replace_entity, article_text); } @@ -112,7 +118,7 @@ http.createServer(function (client_request, client_response) { console.log('send finished feed xml to client\n'); var server_headers = { 'Content-Type': feedxmlmime+'; charset=utf-8', - 'Server': utils2.settings['http_server']['banner'] + 'Server': cfg.get('http_server')['banner'] }; client_response.writeHead(200, server_headers); @@ -136,10 +142,6 @@ http.createServer(function (client_request, client_response) { page.render(client_response); } }).listen(port, bind); -if(bind == '0.0.0.0') { - bind = '127.0.0.1'; -} -console.log('http server started: http://'+bind+':'+port+'/'); -console.log(' just append your feed url, for instance:'); -console.log(' http://'+bind+':'+port+'/http://example.com/feed.rss'); +console.log('http server listening on '+bind+' port '+port); +console.log('open a browser and try: http://127.0.0.1:'+port+'/'); diff --git a/lib/cache.js b/lib/cache.js new file mode 100644 index 0000000..d7b0f85 --- /dev/null +++ b/lib/cache.js @@ -0,0 +1,20 @@ +var uri = require('url'), + fs = require('fs'); +var func = require('./func.js'), + cfg = require('./cfg.js'); + +// returns the name of the cache file for the supplied url +function filename(ext, url) +{ + var domain = uri.parse(url).hostname, + urlhash = func.sha1(url); + var cache_path = cfg.get('cache_path')+'/'+domain; + + if(!func.file_exists(cache_path)) { + console.log('create domain directory: '+cache_path); + fs.mkdirSync(cache_path, 0755); + } + + return cache_path + '/' + urlhash + 
'.' + ext; +} +exports.filename = filename; diff --git a/lib/cfg.js b/lib/cfg.js new file mode 100644 index 0000000..ee661e1 --- /dev/null +++ b/lib/cfg.js @@ -0,0 +1,31 @@ +/** + * Module to load and access settings. + */ +var fs = require('fs'); +var func = require('./func.js'); + +var settings = null; + +// load the configuration settings +function load() { + if(settings == null) { + console.log('load settings.json file'); + try { + settings = JSON.parse(fs.readFileSync('settings.json', 'utf8')); + if(func.file_exists('user_settings.json')) { + console.log('found and load the user_settings.json file'); + var user_settings = JSON.parse(fs.readFileSync('user_settings.json', 'utf8')); + settings = func.object_merge(settings, user_settings); + } + } + catch (error) { + console.log('[ERROR] loading settings: '+error); + } + } +} +load(); + +function get(key) { + return settings[key]; +} +exports.get = get; diff --git a/lib/crawler.js b/lib/crawler.js index 5daa3fe..15aebcc 100644 --- a/lib/crawler.js +++ b/lib/crawler.js @@ -1,9 +1,11 @@ // built in libraries var fs = require('fs'), - urllib = require('url'); + uri = require('url'); // internal libraries -var utils2 = require('./utils2.js'), +var func = require('./func.js'), + cfg = require('./cfg.js'), + cache = require('./cache.js'), urlopen = require('./urlopen.js'); var filter = null; @@ -20,11 +22,11 @@ var Crawler = function(urls) { Crawler.prototype = { // here in json cache methods the url is considered to be the real_url write_json_cache: function(url, real_url, data) { - var json_cache_filename = utils2.cache_file(url) + '.json'; + var json_cache_filename = cache.filename('json', url); var json_cache = { url: real_url, orig_url: url, - domain: urllib.parse(real_url).hostname, + domain: uri.parse(real_url).hostname, length: data.length, date: (new Date()).toLocaleString() }; @@ -33,7 +35,7 @@ Crawler.prototype = { return json_cache; }, load_json_cache: function(url) { - var json_cache_filename = 
utils2.cache_file(url) + '.json'; + var json_cache_filename = cache.filename('json', url); console.log('read json cache file: '+json_cache_filename); return JSON.parse(fs.readFileSync(json_cache_filename, 'utf8')); }, @@ -45,7 +47,7 @@ Crawler.prototype = { tasks--; if(tasks <= 0) { // filter the received articles: (jquery selector filtering) - if(utils2.settings['filter']['activate']) { + if(cfg.get('filter')['activate']) { if(filter == null) { filter = require('./filter.js'); console.log('loaded filter library'); @@ -69,8 +71,8 @@ Crawler.prototype = { var url = urls[i]; // first check if the url is in cache: - var cache_file = utils2.cache_file(url) + '.raw'; - if(utils2.filestats(cache_file) !== null) { + var cache_file = cache.filename('raw', url); + if(func.file_exists(cache_file, true)) { console.log('use cache file: '+cache_file); fs.readFile(cache_file, function(error, data) { if(error) { diff --git a/lib/feed.js b/lib/feed.js index e0ebccf..c1253c0 100644 --- a/lib/feed.js +++ b/lib/feed.js @@ -2,11 +2,13 @@ var expat = require('node-expat'); // internal libraries -var utils2 = require('./utils2.js'), +var func = require('./func.js'), + cfg = require('./cfg.js'), + cache = require('./cache.js'), urlopen = require('./urlopen.js'); -var item_elements = utils2.settings['feed_parser']['item_elements']; -var remove_elements = utils2.settings['feed_parser']['remove_elements']; +var item_elements = cfg.get('feed_parser')['item_elements']; +var remove_elements = cfg.get('feed_parser')['remove_elements']; /** * Fetch a feed by url, parse it and create new xml string. 
@@ -63,10 +65,10 @@ function parse(feed_url, callbacks) } // mark the elements as item - if(utils2.inarray(item_elements, name)) itemelm = true; + if(func.array_includes(item_elements, name)) itemelm = true; // ignore the remove elements - if(itemelm && utils2.inarray(remove_elements, name)) ign = true; + if(itemelm && func.array_includes(remove_elements, name)) ign = true; if(itemelm && name == 'link') { if(attrs['href']) { // @@ -79,8 +81,8 @@ function parse(feed_url, callbacks) if(!ign) { xml += '<'+name; - utils2.foreach(attrs, function(attr_name) { - xml += ' '+attr_name+'="'+utils2.encodexml(attrs[attr_name])+'"'; + func.array_foreach(attrs, function(attr_name) { + xml += ' '+attr_name+'="'+func.xml_encode(attrs[attr_name])+'"'; }); xml += '>'; } }); @@ -88,23 +90,23 @@ function parse(feed_url, callbacks) // End Elements etc. xml_parser.addListener('endElement', function(name) { if(textcue != '') { - textcue = utils2.trim(textcue); + textcue = func.string_trim(textcue); xml += textcue; if(itemurlelm && name == 'link') { - itemurl = utils2.decodexml(textcue); + itemurl = func.xml_decode(textcue); itemurlelm = false; } textcue = ''; } // the end of an item element - if(itemelm && utils2.inarray(item_elements, name)) { + if(itemelm && func.array_includes(item_elements, name)) { console.log('found item link: '+itemurl); articles.push(itemurl); if(type == 'atom') xml += ''; else xml += ''; - xml += ''; + xml += ''; if(type == 'atom') xml += ''; else xml += ''; } @@ -113,9 +115,9 @@ function parse(feed_url, callbacks) xml += '\n'; } - if(itemelm && utils2.inarray(remove_elements, name)) ign = false; + if(itemelm && func.array_includes(remove_elements, name)) ign = false; - if(utils2.inarray(item_elements, name)) itemelm = false; + if(func.array_includes(item_elements, name)) itemelm = false; // the endelement of the toplevel feed element ends the parsing if(root == name) { @@ -131,7 +133,7 @@ function parse(feed_url, callbacks) }); xml_parser.addListener('text', 
function(text) { if(!incdata) { - text = utils2.encodexml(text); + text = func.xml_encode(text); } if(!ign) { diff --git a/lib/filter.js b/lib/filter.js index e741bf9..e782c47 100644 --- a/lib/filter.js +++ b/lib/filter.js @@ -2,11 +2,13 @@ var jsdom = require('jsdom'), util = require('util'), fs = require('fs'); -var utils2 = require('./utils2.js'); +var func = require('./func.js'), + cfg = require('./cfg.js'), + cache = require('./cache.js'); // configuration and filter rules -var jquery_path = utils2.settings['jquery_path']; -var jquery_filters = utils2.settings['filter']['jquery_filters']; +var jquery_path = cfg.get('jquery_path'); +var jquery_filters = cfg.get('filter')['jquery_filters']; var url_patterns = Array(); for(var url_pattern_string in jquery_filters) { url_patterns.push(url_pattern_string); @@ -44,7 +46,7 @@ for(var url_pattern_string in jquery_filters) { * no effect on already cached articles. */ function filter(articles, callback) { - var article_urls = utils2.hashkeys(articles); + var article_urls = func.array_keys(articles); var tasks = article_urls.length; // this nice pattern is based on (3.1): @@ -64,8 +66,8 @@ function filter(articles, callback) { // most of the filtering is very slow, so make sure this is really // necessary before continue. // check for cached readability file: - var cache_file = utils2.cache_file(orig_url) + '.rdby'; - if(utils2.filestats(cache_file) !== null) { // cache file exists + var cache_file = cache.filename('rdby', orig_url); + if(func.file_exists(cache_file)) { // cache file exists console.log('readability cache file found, skip filtering'); next(); return; // == continue here diff --git a/lib/func.js b/lib/func.js new file mode 100644 index 0000000..1fcec30 --- /dev/null +++ b/lib/func.js @@ -0,0 +1,172 @@ +/** + * Node.js Utility Functions + * + * This file is a collection of javascript and nodejs helper + * functions. Besides node.js there are no external libraries + * required. 
+ */
+var fs = require('fs'),
+    crypto = require('crypto');
+
+/*****************************************************************************
+ * Object Functions
+ *****************************************************************************/
+
+// merge properties of two objects recursively together
+function object_merge(obj1, obj2) {
+  // @http://stackoverflow.com/questions/171251/how-can-i-merge-properties-
+  // of-two-javascript-objects-dynamically/383245#383245
+  for(var key in obj2) {
+    try {
+      // Property in destination object set; update its value.
+      if(obj2[key].constructor == Object) {
+        obj1[key] = object_merge(obj1[key], obj2[key]);
+      }
+      else {
+        obj1[key] = obj2[key];
+      }
+    }
+    catch(e) {
+      // Property in destination object not set; create it and set its value.
+      obj1[key] = obj2[key];
+    }
+  }
+  return obj1;
+}
+exports.object_merge = object_merge;
+
+/*****************************************************************************
+ * Array Functions
+ *****************************************************************************/
+
+// remove array item
+function array_remove(array, item) {
+  var j = 0;
+  while(j < array.length) {
+    if(array[j] == item) {
+      array.splice(j, 1);
+    }
+    else {
+      j++;
+    }
+  }
+  return array;
+}
+exports.array_remove = array_remove;
+
+// checks for item in array, returns true if found
+function array_includes(array, item) {
+  for(var key in array) {
+    if(array[key] == item) {
+      return true;
+    }
+  }
+  return false;
+}
+exports.array_includes = array_includes;
+
+// returns an array with all keys of the supplied array
+function array_keys(array) {
+  var keys_array = Array();
+  for(key in array) {
+    keys_array.push(key);
+  }
+  return keys_array;
+}
+exports.array_keys = array_keys;
+
+// iterates over array and calls the callback with key and value as params
+function array_foreach(array, callback) {
+  for(var key in array) {
+    if(typeof array[key] != 'function') {
+      callback(key, array[key]);
+    }
+  }
+}
+exports.array_foreach = array_foreach;
+
+/*****************************************************************************
+ * String Functions
+ *****************************************************************************/
+
+// removes leading and trailing whitespace characters from string and returns
+function string_trim(string) {
+  return (string || '').replace(/^\s+|\s+$/g, '');
+}
+exports.string_trim = string_trim;
+
+/*****************************************************************************
+ * Miscellaneous Functions
+ *****************************************************************************/
+
+// read and returns file stats information
+function file_stats(filename) {
+  try {
+    var stats = fs.lstatSync(filename);
+    return stats;
+  }
+  catch(e) {
+    return null;
+  }
+}
+exports.file_stats = file_stats;
+
+// return true if the supplied file exists, false otherwise
+// if delempty is set to true the file is deleted and false is returned
+// when the file has zero size
+function file_exists(filename, delempty) {
+  var stats = file_stats(filename);
+  if (stats === null) {
+    return false;
+  }
+
+  if (delempty != undefined && stats.size == 0) {
+    console.log('unlink empty file: '+filename);
+    fs.unlinkSync(filename);
+    return false;
+  }
+  return true;
+}
+exports.file_exists = file_exists;
+
+// calculates and returns the SHA-1 hash of the supplied string
+function sha1(string) {
+  return crypto.createHash('sha1').update(string).digest("hex");
+}
+exports.sha1 = sha1;
+
+// encodes special xml characters with entities
+var xml_encode_map = {
+  '&': '&',
+  '"': '"',
+  '<': '<',
+  '>': '>'
+};
+function xml_encode(string) {
+  return string.replace(/([\&"<>])/g, function(str, item) {
+    return xml_encode_map[item];
+  });
+}
+exports.xml_encode = xml_encode;
+
+// decodes some of the special xml entities
+var xml_decode_map = {};
+for(var char in xml_encode_map) {
+  xml_decode_map[xml_encode_map[char]] = char;
+}
+function xml_decode(string) {
+  return
string.replace(/("|<|>|&)/g, function(str, item) { + return xml_decode_map[item]; + }); +} +exports.xml_decode = xml_decode; + +// replaces relative links and resource urls with absolute urls +function html_rel2abs(html, domain) { + var search = /(src|href)=('|")?(\/)/ig, + replace = '$1=$2http://'+domain+'$3'; + + return html.replace(search, replace); +} +exports.html_rel2abs = html_rel2abs; + diff --git a/lib/tpl.js b/lib/tpl.js index b20f942..06c81a7 100644 --- a/lib/tpl.js +++ b/lib/tpl.js @@ -1,6 +1,8 @@ // a very simple template library: var fs = require('fs'); -var utils2 = require('./utils2.js'); +var func = require('./func.js'), + cfg = require('./cfg.js'), + cache = require('./cache.js'); /** * Simple Template Class. @@ -28,13 +30,13 @@ Template.prototype = { } var content = data.toString(); - utils2.foreach(variables, function(name) { + func.array_foreach(variables, function(name) { content = content.replace('&'+name+';', variables[name]); }); response.writeHead(200, { 'content-type': 'text/html', - 'server': utils2.settings['http_server']['banner'] + 'server': cfg.get('http_server')['banner'] }); response.end(content); }); diff --git a/lib/urlopen.js b/lib/urlopen.js index 57dfcce..5d834f6 100644 --- a/lib/urlopen.js +++ b/lib/urlopen.js @@ -1,8 +1,10 @@ -var urllib = require('url'), +var uri = require('url'), http = require('http'); -var utils2 = require('./utils2.js'); +var func = require('./func.js'), + cfg = require('./cfg.js'), + cache = require('./cache.js'); -var client_headers = utils2.settings['http_client']['headers']; +var client_headers = cfg.get('http_client')['headers']; /** * Fetches an URL. 
@@ -22,11 +24,11 @@ function fetch(url, callbacks) return; } console.log('urlopen fetch: '+url); - var p = urllib.parse(url, true); + var p = uri.parse(url, true); var client = http.createClient(p.port || 80, p.hostname); client.on('error', callbacks.error); - var headers = utils2.extend(client_headers, {'Host': p.hostname}); + var headers = func.object_merge(client_headers, {'Host': p.hostname}); var request = client.request('GET', p.pathname + (p.search || ''), headers); request.on('error', callbacks.error); request.on('response', function(response) { diff --git a/lib/utils2.js b/lib/utils2.js deleted file mode 100644 index 813744b..0000000 --- a/lib/utils2.js +++ /dev/null @@ -1,172 +0,0 @@ -var fs = require('fs'), - crypto = require('crypto'), - urllib = require('url'); - -var settings = null; -function load_settings(filename) { - if(settings == null) { - console.log('load json settings: '+filename); - try { - settings = JSON.parse(fs.readFileSync(filename, 'utf8')); - } - catch (error) { - console.log('[ERROR] loading settings: '+error); - } - } - exports.settings = settings; -} -load_settings('settings.json'); - -function cache_file(url) -{ - var domain = urllib.parse(url).hostname, - urlhash = sha1(url); - var cache_path = settings['cache_path']+'/'+domain; - - if(filestats(cache_path) == null) { - console.log('create domain directory: '+cache_path); - fs.mkdirSync(cache_path, 0755); - } - - return cache_path + '/' + urlhash; -} -exports.cache_file = cache_file; - -function extend(to, from) -{ - var l,i,o,j; - for (i = 1, l = arguments.length; i < l; i++) { - o = arguments[i]; - for (j in o) { - to[j] = o[j]; - } - } - return to; -} -exports.extend = extend; - -function removeitem(array, toremove) -{ - var j = 0; - while(j < array.length) { - if(array[j] == toremove) { - array.splice(j, 1); - } - else { - j++; - } - } - return array; -} -exports.removeitem = removeitem; - -function inarray(array, search) -{ - for(var i=0; i': '>' -}; - -var 
escaped_one_to_xml_special_map = { - '&': '&', - '"': '"', - '<': '<', - '>': '>' -}; - -function encodexml(string) { - return string.replace(/([\&"<>])/g, function(str, item) { - return xml_special_to_escaped_one_map[item]; - }); -}; -function decodexml(string) { - return string.replace(/("|<|>|&)/g, - function(str, item) { - return escaped_one_to_xml_special_map[item]; - }); -} -exports.encodexml = encodexml; -exports.decodexml = decodexml; - -function rel2abs(content, domain) { - var search = /(src|href)=('|")?(\/)/ig, - replace = '$1=$2http://'+domain+'$3'; - - return content.replace(search, replace); -} -exports.rel2abs = rel2abs; - diff --git a/settings.json b/settings.json index 643d478..0bed9ce 100644 --- a/settings.json +++ b/settings.json @@ -1,6 +1,6 @@ { "http_server": { - "bind": "0.0.0.0", + "bind": "127.0.0.1", "port": 1912, "banner": "Feedability/0" }, @@ -21,8 +21,8 @@ "activate": true, "jquery_path": "./ext/jquery-1.5.min.js", "jquery_filters": { - "carta-standard-rss": ["#commentblock"], - "heise\\.de": ["#mitte_rechts"] + "carta.info": ["#commentblock"], + "heise.de": ["#mitte_rechts"] } }, "cache_path": "./cache"