Commit
removed utils2, separated functions, cache, config
Matthias -apoc- Hecker committed Feb 15, 2011
1 parent 4287bad commit eb8f43c
Showing 13 changed files with 316 additions and 265 deletions.
11 changes: 1 addition & 10 deletions README.markdown
@@ -10,6 +10,7 @@ Then it crawls all articles and uses readability to extract the content of the r

```sh
npm install readability

npm install node-expat
```

@@ -23,16 +24,6 @@ git clone git://github.com/4poc/feedability.git

To start feedability just change into the working directory and execute `node feedability.js`. If everything works you should be able to view http://127.0.0.1:1912/ with your browser.

```sh
% node feedability.py
Starting Feedability: NodeJS Feed Proxy With Readability

load json settings: settings.json
http server started: http://127.0.0.1:1912/
just append your feed url, for instance:
http://127.0.0.1:1912/http://example.com/feed.rss
```
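
For example (a hypothetical check, assuming the default port 1912 and an example feed URL), a feed can be requested through the proxy like this:

```sh
# request a feed through the proxy: append the original feed url to the proxy url
curl 'http://127.0.0.1:1912/http://example.com/feed.rss'
```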

# Contact me for Feedback/Questions/Problems:

- If you would like to contact me in private use apoc@sixserv.org.
41 changes: 19 additions & 22 deletions TODO
@@ -1,28 +1,25 @@

I've ordered the list after the importance.
Readability

- readability returns: "Sorry, unable to parse article content. Please view
the original page instead." if it was unable to parse it. Should detect
that and restore the original excerpt.
- reuse the excerpt found in the feed, for instance to improve the readability
detection or just include it in the feed somehow. (currently its thrown away)
that and restore the original excerpt. (maybe keep the excerpt anyways)
- manage to use the readability dom tree for filtering

Proxy

- detect images (and maybe other content) and download it locally so that the
proxy can serve it from cache. *or* make sure the image links are pointing
to an absolute url.
- make cache files a little bit more accessible: use directories for domains
and timestamps etc. for the filename? How about creating a special
directory with more usable information as directories and softlinks?
- gzip the cache files
- currently I built a jsdom tree for the jquery selector filtering, but this
does not really make sense because readability is creating a jsdom tree
anyways, it would be cool to have pre and post hooks to manipulate the
dom tree that readability is using.
- ...
- fetch and respect the robots.txt file of the sites
- rewrite the feed parser/generator using jsdom etc. I do not like the way
the current implementation is building the feed xml with the &replace..
- refactor the utils2 (or maybe use another library for this anyway)
- ...
- sort this list
proxy can serve it from cache.
- caching proxy that runs readability for text/html

Cache

- gzip the cache files (make generic api for that)

Feed Parser

- rewrite the feed parser/generator using a xml dom library

Crawler

- fetch and respect the robots.txt file

46 changes: 24 additions & 22 deletions feedability.js
@@ -19,32 +19,32 @@ console.log('Starting Feedability: NodeJS Feed Proxy With Readability\n');

// built in libraries
var fs = require('fs'),
http = require('http'),
urllib = require('url');
http = require('http');

// external libraries
var readability = require('readability');

// internal libraries
var tpl = require('./lib/tpl.js'),
utils2 = require('./lib/utils2.js'),
func = require('./lib/func.js'),
cfg = require('./lib/cfg.js'),
cache = require('./lib/cache.js'),
urlopen = require('./lib/urlopen.js'),
feed = require('./lib/feed.js'),
crawler = require('./lib/crawler.js'),
filter = require('./lib/filter.js');


var cache_path = utils2.settings['cache_path'];
var cache_path = cfg.get('cache_path');

if(utils2.filestats(cache_path) == null) {
if(!func.file_exists(cache_path)) {
console.log('create cache directory: '+cache_path);
fs.mkdirSync(cache_path, 0755);
}

// some variables used for the http server
var url_pattern = /^\/(http:\/\/.*)$/;
var bind = utils2.settings['http_server']['bind'];
var port = utils2.settings['http_server']['port'];
var bind = cfg.get('http_server')['bind'];
var port = cfg.get('http_server')['port'];

// create the http server with the feed proxy
http.createServer(function (client_request, client_response) {
@@ -66,31 +66,32 @@ http.createServer(function (client_request, client_response) {
crawl.fetch({
// the articles include a structure with url, orig_url, data, etc.
finished: function(articles) {
var article_urls = utils2.hashkeys(articles);
var article_urls = func.array_keys(articles);
for(var i = 0; i < article_urls.length; i++) {
var article_url = article_urls[i];
var article_data = articles[article_url].data;
if(!article_data || article_data.length <= 0) {
console.log('[WARNING] article not retreived: '+article_url+' ('+utils2.cache_file(article_url)+')');
console.log('[WARNING] article not retreived: '+article_url);
continue;
}
console.log('extract using readability for '+article_url+
' ('+article_data.length+')');

var cache_file = utils2.cache_file(article_url) + '.rdby';
var cache_file = cache.filename('rdby', article_url);
var article_text = null; // the extracted article text
// check for readability cache:
if(utils2.filestats(cache_file) !== null) {
if(func.file_exists(cache_file)) {
console.log('using readability cache file: '+cache_file);
article_text = fs.readFileSync(cache_file).toString();
}
// use readability to extract the article text
else {
readability.parse(article_data, article_url, function(info) {
try {
readability.parse(article_data.toString(), article_url, function(info) {
console.log('write readability cache file: '+cache_file);

// replace relative urls with absolute ones:
info.content = utils2.rel2abs(info.content, articles[article_url].domain);
info.content = func.html_rel2abs(info.content, articles[article_url].domain);
// it would be nice to do this directly in the dom, @TODO

fs.writeFile(cache_file, info.content, function(error) {
@@ -101,18 +102,23 @@ http.createServer(function (client_request, client_response) {

article_text = info.content;
});
}
catch(e) {
fs.writeFileSync('catch.txt', e.toString());
console.log(e);
}
}

// insert article text in feed:
var replace_entity = '&replaceurl:'+utils2.sha1(article_url)+';';
var replace_entity = '&replaceurl:'+func.sha1(article_url)+';';
article_text = article_text.replace(/\x08/, '');
feedxml = feedxml.replace(replace_entity, article_text);
}

console.log('send finished feed xml to client\n');
var server_headers = {
'Content-Type': feedxmlmime+'; charset=utf-8',
'Server': utils2.settings['http_server']['banner']
'Server': cfg.get('http_server')['banner']
};

client_response.writeHead(200, server_headers);
@@ -136,10 +142,6 @@ http.createServer(function (client_request, client_response) {
page.render(client_response);
}
}).listen(port, bind);
if(bind == '0.0.0.0') {
bind = '127.0.0.1';
}
console.log('http server started: http://'+bind+':'+port+'/');
console.log(' just append your feed url, for instance:');
console.log(' http://'+bind+':'+port+'/http://example.com/feed.rss');
console.log('http server listening on '+bind+' port '+port);
console.log('open a browser and try: http://127.0.0.1:'+port+'/');
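
The `&replaceurl:...;` entity seen above is the hand-off between the feed parser and this proxy code: lib/feed.js plants one placeholder per article and feedability.js swaps it for the extracted text. A minimal sketch of that substitution, with `func.sha1` emulated via Node's crypto module (the URL and article text are made up):

```js
var crypto = require('crypto');

// stand-in for func.sha1(url): hex sha1 digest of the url
function sha1(text) {
  return crypto.createHash('sha1').update(text).digest('hex');
}

var article_url = 'http://example.com/article-1';

// the feed parser emits one placeholder entity per article into the feed xml
var feedxml = '<content:encoded><![CDATA[&replaceurl:' + sha1(article_url) + ';]]></content:encoded>';

// the proxy later replaces that placeholder with the readability output
var replace_entity = '&replaceurl:' + sha1(article_url) + ';';
var article_text = '<p>extracted article text</p>';
feedxml = feedxml.replace(replace_entity, article_text);

console.log(feedxml);
```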

20 changes: 20 additions & 0 deletions lib/cache.js
@@ -0,0 +1,20 @@
var uri = require('url'),
fs = require('fs');
var func = require('./func.js'),
cfg = require('./cfg.js');

// returns the name of the cache file for the supplied url
function filename(ext, url)
{
var domain = uri.parse(url).hostname,
urlhash = func.sha1(url);
var cache_path = cfg.get('cache_path')+'/'+domain;

if(!func.file_exists(cache_path)) {
console.log('create domain directory: '+cache_path);
fs.mkdirSync(cache_path, 0755);
}

return cache_path + '/' + urlhash + '.' + ext;
}
exports.filename = filename;
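
A short usage sketch for the new cache module (the sample URL is illustrative; `cache_path` comes from settings.json):

```js
var cache = require('./lib/cache.js');

// with cache_path set to e.g. "cache", this returns something like
// cache/example.com/<sha1-of-url>.raw and creates the domain directory on demand
var raw_file = cache.filename('raw', 'http://example.com/article-1');
console.log(raw_file);
```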
31 changes: 31 additions & 0 deletions lib/cfg.js
@@ -0,0 +1,31 @@
/**
* Module to load and access settings.
*/
var fs = require('fs');
var func = require('./func.js');

var settings = null;

// load the configuration settings
function load() {
if(settings == null) {
console.log('load settings.json file');
try {
settings = JSON.parse(fs.readFileSync('settings.json', 'utf8'));
if(func.file_exists('user_settings.json')) {
console.log('found and load the user_settings.json file');
var user_settings = JSON.parse(fs.readFileSync('user_settings.json', 'utf8'));
settings = func.object_merge(settings, user_settings);
}
}
catch (error) {
console.log('[ERROR] loading settings: '+error);
}
}
}
load();

function get(key) {
return settings[key];
}
exports.get = get;
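
And a usage sketch for the configuration module; the keys are the ones referenced elsewhere in this commit, with settings.json providing the defaults and user_settings.json overriding them:

```js
var cfg = require('./lib/cfg.js');

// top-level keys from settings.json (entries in user_settings.json override them)
var cache_path = cfg.get('cache_path');
var http_server = cfg.get('http_server');

console.log('caching to ' + cache_path);
console.log('listening on ' + http_server['bind'] + ':' + http_server['port']);
```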
18 changes: 10 additions & 8 deletions lib/crawler.js
@@ -1,9 +1,11 @@
// built in libraries
var fs = require('fs'),
urllib = require('url');
uri = require('url');

// internal libraries
var utils2 = require('./utils2.js'),
var func = require('./func.js'),
cfg = require('./cfg.js'),
cache = require('./cache.js'),
urlopen = require('./urlopen.js');

var filter = null;
@@ -20,11 +22,11 @@ var Crawler = function(urls) {
Crawler.prototype = {
// here in json cache methods the url is considered to be the real_url
write_json_cache: function(url, real_url, data) {
var json_cache_filename = utils2.cache_file(url) + '.json';
var json_cache_filename = cache.filename('json', url);
var json_cache = {
url: real_url,
orig_url: url,
domain: urllib.parse(real_url).hostname,
domain: uri.parse(real_url).hostname,
length: data.length,
date: (new Date()).toLocaleString()
};
@@ -33,7 +35,7 @@ Crawler.prototype = {
return json_cache;
},
load_json_cache: function(url) {
var json_cache_filename = utils2.cache_file(url) + '.json';
var json_cache_filename = cache.filename('json', url);
console.log('read json cache file: '+json_cache_filename);
return JSON.parse(fs.readFileSync(json_cache_filename, 'utf8'));
},
@@ -45,7 +47,7 @@ Crawler.prototype = {
tasks--;
if(tasks <= 0) {
// filter the received articles: (jquery selector filtering)
if(utils2.settings['filter']['activate']) {
if(cfg.get('filter')['activate']) {
if(filter == null) {
filter = require('./filter.js');
console.log('loaded filter library');
@@ -69,8 +71,8 @@ Crawler.prototype = {
var url = urls[i];

// first check if the url is in cache:
var cache_file = utils2.cache_file(url) + '.raw';
if(utils2.filestats(cache_file) !== null) {
var cache_file = cache.filename('raw', url);
if(func.file_exists(cache_file, true)) {
console.log('use cache file: '+cache_file);
fs.readFile(cache_file, function(error, data) {
if(error) {
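
For reference, a hypothetical example of the metadata record that write_json_cache stores next to the raw article data (the field names follow the code above; the values are invented):

```js
// rough shape of cache/example.com/<sha1>.json,
// written by write_json_cache and read back by load_json_cache
var json_cache_example = {
  url: 'http://example.com/article-1?ref=feed',   // the resolved (real) url
  orig_url: 'http://example.com/article-1',       // the url as listed in the feed
  domain: 'example.com',                          // uri.parse(real_url).hostname
  length: 48213,                                  // byte length of the fetched data
  date: (new Date()).toLocaleString()             // timestamp of the fetch
};
console.log(JSON.stringify(json_cache_example, null, 2));
```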
30 changes: 16 additions & 14 deletions lib/feed.js
@@ -2,11 +2,13 @@
var expat = require('node-expat');

// internal libraries
var utils2 = require('./utils2.js'),
var func = require('./func.js'),
cfg = require('./cfg.js'),
cache = require('./cache.js'),
urlopen = require('./urlopen.js');

var item_elements = utils2.settings['feed_parser']['item_elements'];
var remove_elements = utils2.settings['feed_parser']['remove_elements'];
var item_elements = cfg.get('feed_parser')['item_elements'];
var remove_elements = cfg.get('feed_parser')['remove_elements'];

/**
* Fetch a feed by url, parse it and create new xml string.
@@ -63,10 +65,10 @@ function parse(feed_url, callbacks)
}

// mark the elements as item
if(utils2.inarray(item_elements, name)) itemelm = true;
if(func.array_includes(item_elements, name)) itemelm = true;

// ignore the remove elements
if(itemelm && utils2.inarray(remove_elements, name)) ign = true;
if(itemelm && func.array_includes(remove_elements, name)) ign = true;

if(itemelm && name == 'link') {
if(attrs['href']) { // <link href="[itemurl]" />
@@ -79,32 +81,32 @@

if(!ign) {
xml += '<'+name;
utils2.foreach(attrs, function(attr_name) {
xml += ' '+attr_name+'="'+utils2.encodexml(attrs[attr_name])+'"';
func.array_foreach(attrs, function(attr_name) {
xml += ' '+attr_name+'="'+func.xml_encode(attrs[attr_name])+'"';
}); xml += '>';
}
});

// End Elements </entry> etc.
xml_parser.addListener('endElement', function(name) {
if(textcue != '') {
textcue = utils2.trim(textcue);
textcue = func.string_trim(textcue);
xml += textcue;
if(itemurlelm && name == 'link') {
itemurl = utils2.decodexml(textcue);
itemurl = func.xml_decode(textcue);
itemurlelm = false;
}
textcue = '';
}

// the end of an item element </item> </entry>
if(itemelm && utils2.inarray(item_elements, name)) {
if(itemelm && func.array_includes(item_elements, name)) {
console.log('found item link: '+itemurl);
articles.push(itemurl);

if(type == 'atom') xml += '<content type="html">';
else xml += '<content:encoded>';
xml += '<![CDATA[&replaceurl:'+utils2.sha1(itemurl)+';]]>';
xml += '<![CDATA[&replaceurl:'+func.sha1(itemurl)+';]]>';
if(type == 'atom') xml += '</content>';
else xml += '</content:encoded>';
}
@@ -113,9 +115,9 @@
xml += '</'+name+'>\n';
}

if(itemelm && utils2.inarray(remove_elements, name)) ign = false;
if(itemelm && func.array_includes(remove_elements, name)) ign = false;

if(utils2.inarray(item_elements, name)) itemelm = false;
if(func.array_includes(item_elements, name)) itemelm = false;

// the endelement of the toplevel feed element ends the parsing
if(root == name) {
@@ -131,7 +133,7 @@
});
xml_parser.addListener('text', function(text) {
if(!incdata) {
text = utils2.encodexml(text);
text = func.xml_encode(text);
}

if(!ign) {
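
lib/feed.js rebuilds the feed XML from a stream of node-expat events and injects a CDATA placeholder into every item. A heavily reduced sketch of that event flow (the sample feed string is invented, and the real parser also handles attributes, entity encoding and the remove_elements list):

```js
var expat = require('node-expat');

var parser = new expat.Parser('UTF-8');
var xml = '';
var itemelm = false; // are we inside an <item>/<entry>?

parser.addListener('startElement', function(name, attrs) {
  if(name == 'item' || name == 'entry') itemelm = true;
  xml += '<' + name + '>';
});
parser.addListener('text', function(text) {
  xml += text;
});
parser.addListener('endElement', function(name) {
  if(itemelm && (name == 'item' || name == 'entry')) {
    // the real parser appends <![CDATA[&replaceurl:<sha1>;]]> here
    itemelm = false;
  }
  xml += '</' + name + '>';
});

parser.parse('<rss><channel><item><title>hello</title></item></channel></rss>');
console.log(xml);
```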
