Commit
removed utils2, separated functions, cache, config
Matthias -apoc- Hecker committed Feb 15, 2011
1 parent 4287bad commit eb8f43c
Showing 13 changed files with 316 additions and 265 deletions.
11 changes: 1 addition & 10 deletions README.markdown
@@ -10,6 +10,7 @@ Then it crawls all articles and uses readability to extract the content of the r

```sh
npm install readability

npm install node-expat
```

@@ -23,16 +24,6 @@ git clone git://github.com/4poc/feedability.git

To start feedability just change into the working directory and execute `node feedability.js`. If everything works you should be able to view http://127.0.0.1:1912/ with your browser.

```sh
% node feedability.py
Starting Feedability: NodeJS Feed Proxy With Readability

load json settings: settings.json
http server started: http://127.0.0.1:1912/
just append your feed url, for instance:
http://127.0.0.1:1912/http://example.com/feed.rss
```
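
For example (a hypothetical check, assuming the default port 1912 and an example feed URL), a feed can be requested through the proxy like this:

```sh
# request a feed through the proxy: append the original feed url to the proxy url
curl 'http://127.0.0.1:1912/http://example.com/feed.rss'
```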

# Contact me for Feedback/Questions/Problems:

- If you would like to contact me in private use apoc@sixserv.org.
41 changes: 19 additions & 22 deletions TODO
@@ -1,28 +1,25 @@

I've ordered the list after the importance.
Readability

- readability returns: "Sorry, unable to parse article content. Please view
the original page instead." if it was unable to parse it. Should detect
that and restore the original excerpt.
- reuse the excerpt found in the feed, for instance to improve the readability
detection or just include it in the feed somehow. (currently its thrown away)
that and restore the original excerpt. (maybe keep the excerpt anyways)
- manage to use the readability dom tree for filtering

Proxy

- detect images (and maybe other content) and download it locally so that the
proxy can serve it from cache. *or* make sure the image links are pointing
to an absolute url.
- make cache files a little bit more accessible: use directories for domains
and timestamps etc. for the filename? How about creating a special
directory with more usable information as directories and softlinks?
- gzip the cache files
- currently I built a jsdom tree for the jquery selector filtering, but this
does not really make sense because readability is creating a jsdom tree
anyways, it would be cool to have pre and post hooks to manipulate the
dom tree that readability is using.
- ...
- fetch and respect the robots.txt file of the sites
- rewrite the feed parser/generator using jsdom etc. I do not like the way
the current implementation is building the feed xml with the &replace..
- refactor the utils2 (or maybe use another library for this anyway)
- ...
- sort this list
proxy can serve it from cache.
- caching proxy that runs readability for text/html

Cache

- gzip the cache files (make generic api for that)

Feed Parser

- rewrite the feed parser/generator using a xml dom library

Crawler

- fetch and respect the robots.txt file

46 changes: 24 additions & 22 deletions feedability.js
@@ -19,32 +19,32 @@ console.log('Starting Feedability: NodeJS Feed Proxy With Readability\n');

// built in libraries
var fs = require('fs'),
http = require('http'),
urllib = require('url');
http = require('http');

// external libraries
var readability = require('readability');

// internal libraries
var tpl = require('./lib/tpl.js'),
utils2 = require('./lib/utils2.js'),
func = require('./lib/func.js'),
cfg = require('./lib/cfg.js'),
cache = require('./lib/cache.js'),
urlopen = require('./lib/urlopen.js'),
feed = require('./lib/feed.js'),
crawler = require('./lib/crawler.js'),
filter = require('./lib/filter.js');


var cache_path = utils2.settings['cache_path'];
var cache_path = cfg.get('cache_path');

if(utils2.filestats(cache_path) == null) {
if(!func.file_exists(cache_path)) {
console.log('create cache directory: '+cache_path);
fs.mkdirSync(cache_path, 0755);
}

// some variables used for the http server
var url_pattern = /^\/(http:\/\/.*)$/;
var bind = utils2.settings['http_server']['bind'];
var port = utils2.settings['http_server']['port'];
var bind = cfg.get('http_server')['bind'];
var port = cfg.get('http_server')['port'];

// create the http server with the feed proxy
http.createServer(function (client_request, client_response) {
@@ -66,31 +66,32 @@ http.createServer(function (client_request, client_response) {
crawl.fetch({
// the articles include a structure with url, orig_url, data, etc.
finished: function(articles) {
var article_urls = utils2.hashkeys(articles);
var article_urls = func.array_keys(articles);
for(var i = 0; i < article_urls.length; i++) {
var article_url = article_urls[i];
var article_data = articles[article_url].data;
if(!article_data || article_data.length <= 0) {
console.log('[WARNING] article not retreived: '+article_url+' ('+utils2.cache_file(article_url)+')');
console.log('[WARNING] article not retreived: '+article_url);
continue;
}
console.log('extract using readability for '+article_url+
' ('+article_data.length+')');

var cache_file = utils2.cache_file(article_url) + '.rdby';
var cache_file = cache.filename('rdby', article_url);
var article_text = null; // the extracted article text
// check for readability cache:
if(utils2.filestats(cache_file) !== null) {
if(func.file_exists(cache_file)) {
console.log('using readability cache file: '+cache_file);
article_text = fs.readFileSync(cache_file).toString();
}
// use readability to extract the article text
else {
readability.parse(article_data, article_url, function(info) {
try {
readability.parse(article_data.toString(), article_url, function(info) {
console.log('write readability cache file: '+cache_file);

// replace relative urls with absolute ones:
info.content = utils2.rel2abs(info.content, articles[article_url].domain);
info.content = func.html_rel2abs(info.content, articles[article_url].domain);
// it would be nice to do this directly in the dom, @TODO

fs.writeFile(cache_file, info.content, function(error) {
@@ -101,18 +102,23 @@ http.createServer(function (client_request, client_response) {

article_text = info.content;
});
}
catch(e) {
fs.writeFileSync('catch.txt', e.toString());
console.log(e);
}
}

// insert article text in feed:
var replace_entity = '&replaceurl:'+utils2.sha1(article_url)+';';
var replace_entity = '&replaceurl:'+func.sha1(article_url)+';';
article_text = article_text.replace(/\x08/, '');
feedxml = feedxml.replace(replace_entity, article_text);
}

console.log('send finished feed xml to client\n');
var server_headers = {
'Content-Type': feedxmlmime+'; charset=utf-8',
'Server': utils2.settings['http_server']['banner']
'Server': cfg.get('http_server')['banner']
};

client_response.writeHead(200, server_headers);
@@ -136,10 +142,6 @@ http.createServer(function (client_request, client_response) {
page.render(client_response);
}
}).listen(port, bind);
if(bind == '0.0.0.0') {
bind = '127.0.0.1';
}
console.log('http server started: http://'+bind+':'+port+'/');
console.log(' just append your feed url, for instance:');
console.log(' http://'+bind+':'+port+'/http://example.com/feed.rss');
console.log('http server listening on '+bind+' port '+port);
console.log('open a browser and try: http://127.0.0.1:'+port+'/');
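
The `&replaceurl:...;` entity seen above is the hand-off between the feed parser and this proxy code: lib/feed.js plants one placeholder per article and feedability.js swaps it for the extracted text. A minimal sketch of that substitution, with `func.sha1` emulated via Node's crypto module (the URL and article text are made up):

```js
var crypto = require('crypto');

// stand-in for func.sha1(url): hex sha1 digest of the url
function sha1(text) {
  return crypto.createHash('sha1').update(text).digest('hex');
}

var article_url = 'http://example.com/article-1';

// the feed parser emits one placeholder entity per article into the feed xml
var feedxml = '<content:encoded><![CDATA[&replaceurl:' + sha1(article_url) + ';]]></content:encoded>';

// the proxy later replaces that placeholder with the readability output
var replace_entity = '&replaceurl:' + sha1(article_url) + ';';
var article_text = '<p>extracted article text</p>';
feedxml = feedxml.replace(replace_entity, article_text);

console.log(feedxml);
```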

20 changes: 20 additions & 0 deletions lib/cache.js
@@ -0,0 +1,20 @@
var uri = require('url'),
fs = require('fs');
var func = require('./func.js'),
cfg = require('./cfg.js');

// returns the name of the cache file for the supplied url
function filename(ext, url)
{
var domain = uri.parse(url).hostname,
urlhash = func.sha1(url);
var cache_path = cfg.get('cache_path')+'/'+domain;

if(!func.file_exists(cache_path)) {
console.log('create domain directory: '+cache_path);
fs.mkdirSync(cache_path, 0755);
}

return cache_path + '/' + urlhash + '.' + ext;
}
exports.filename = filename;
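
A short usage sketch for the new cache module (the sample URL is illustrative; `cache_path` comes from settings.json):

```js
var cache = require('./lib/cache.js');

// with cache_path set to e.g. "cache", this returns something like
// cache/example.com/<sha1-of-url>.raw and creates the domain directory on demand
var raw_file = cache.filename('raw', 'http://example.com/article-1');
console.log(raw_file);
```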
31 changes: 31 additions & 0 deletions lib/cfg.js
@@ -0,0 +1,31 @@
/**
* Module to load and access settings.
*/
var fs = require('fs');
var func = require('./func.js');

var settings = null;

// load the configuration settings
function load() {
if(settings == null) {
console.log('load settings.json file');
try {
settings = JSON.parse(fs.readFileSync('settings.json', 'utf8'));
if(func.file_exists('user_settings.json')) {
console.log('found and load the user_settings.json file');
var user_settings = JSON.parse(fs.readFileSync('user_settings.json', 'utf8'));
settings = func.object_merge(settings, user_settings);
}
}
catch (error) {
console.log('[ERROR] loading settings: '+error);
}
}
}
load();

function get(key) {
return settings[key];
}
exports.get = get;
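
And a usage sketch for the configuration module; the keys are the ones referenced elsewhere in this commit, with settings.json providing the defaults and user_settings.json overriding them:

```js
var cfg = require('./lib/cfg.js');

// top-level keys from settings.json (entries in user_settings.json override them)
var cache_path = cfg.get('cache_path');
var http_server = cfg.get('http_server');

console.log('caching to ' + cache_path);
console.log('listening on ' + http_server['bind'] + ':' + http_server['port']);
```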
18 changes: 10 additions & 8 deletions lib/crawler.js
@@ -1,9 +1,11 @@
// built in libraries
var fs = require('fs'),
urllib = require('url');
uri = require('url');

// internal libraries
var utils2 = require('./utils2.js'),
var func = require('./func.js'),
cfg = require('./cfg.js'),
cache = require('./cache.js'),
urlopen = require('./urlopen.js');

var filter = null;
@@ -20,11 +22,11 @@ var Crawler = function(urls) {
Crawler.prototype = {
// here in json cache methods the url is considered to be the real_url
write_json_cache: function(url, real_url, data) {
var json_cache_filename = utils2.cache_file(url) + '.json';
var json_cache_filename = cache.filename('json', url);
var json_cache = {
url: real_url,
orig_url: url,
domain: urllib.parse(real_url).hostname,
domain: uri.parse(real_url).hostname,
length: data.length,
date: (new Date()).toLocaleString()
};
@@ -33,7 +35,7 @@ Crawler.prototype = {
return json_cache;
},
load_json_cache: function(url) {
var json_cache_filename = utils2.cache_file(url) + '.json';
var json_cache_filename = cache.filename('json', url);
console.log('read json cache file: '+json_cache_filename);
return JSON.parse(fs.readFileSync(json_cache_filename, 'utf8'));
},
@@ -45,7 +47,7 @@ Crawler.prototype = {
tasks--;
if(tasks <= 0) {
// filter the received articles: (jquery selector filtering)
if(utils2.settings['filter']['activate']) {
if(cfg.get('filter')['activate']) {
if(filter == null) {
filter = require('./filter.js');
console.log('loaded filter library');
@@ -69,8 +71,8 @@ Crawler.prototype = {
var url = urls[i];

// first check if the url is in cache:
var cache_file = utils2.cache_file(url) + '.raw';
if(utils2.filestats(cache_file) !== null) {
var cache_file = cache.filename('raw', url);
if(func.file_exists(cache_file, true)) {
console.log('use cache file: '+cache_file);
fs.readFile(cache_file, function(error, data) {
if(error) {
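
For reference, a hypothetical example of the metadata record that write_json_cache stores next to the raw article data (the field names follow the code above; the values are invented):

```js
// rough shape of cache/example.com/<sha1>.json,
// written by write_json_cache and read back by load_json_cache
var json_cache_example = {
  url: 'http://example.com/article-1?ref=feed',   // the resolved (real) url
  orig_url: 'http://example.com/article-1',       // the url as listed in the feed
  domain: 'example.com',                          // uri.parse(real_url).hostname
  length: 48213,                                  // byte length of the fetched data
  date: (new Date()).toLocaleString()             // timestamp of the fetch
};
console.log(JSON.stringify(json_cache_example, null, 2));
```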
30 changes: 16 additions & 14 deletions lib/feed.js
@@ -2,11 +2,13 @@
var expat = require('node-expat');

// internal libraries
var utils2 = require('./utils2.js'),
var func = require('./func.js'),
cfg = require('./cfg.js'),
cache = require('./cache.js'),
urlopen = require('./urlopen.js');

var item_elements = utils2.settings['feed_parser']['item_elements'];
var remove_elements = utils2.settings['feed_parser']['remove_elements'];
var item_elements = cfg.get('feed_parser')['item_elements'];
var remove_elements = cfg.get('feed_parser')['remove_elements'];

/**
* Fetch a feed by url, parse it and create new xml string.
@@ -63,10 +65,10 @@ function parse(feed_url, callbacks)
}

// mark the elements as item
if(utils2.inarray(item_elements, name)) itemelm = true;
if(func.array_includes(item_elements, name)) itemelm = true;

// ignore the remove elements
if(itemelm && utils2.inarray(remove_elements, name)) ign = true;
if(itemelm && func.array_includes(remove_elements, name)) ign = true;

if(itemelm && name == 'link') {
if(attrs['href']) { // <link href="[itemurl]" />
@@ -79,32 +81,32 @@

if(!ign) {
xml += '<'+name;
utils2.foreach(attrs, function(attr_name) {
xml += ' '+attr_name+'="'+utils2.encodexml(attrs[attr_name])+'"';
func.array_foreach(attrs, function(attr_name) {
xml += ' '+attr_name+'="'+func.xml_encode(attrs[attr_name])+'"';
}); xml += '>';
}
});

// End Elements </entry> etc.
xml_parser.addListener('endElement', function(name) {
if(textcue != '') {
textcue = utils2.trim(textcue);
textcue = func.string_trim(textcue);
xml += textcue;
if(itemurlelm && name == 'link') {
itemurl = utils2.decodexml(textcue);
itemurl = func.xml_decode(textcue);
itemurlelm = false;
}
textcue = '';
}

// the end of an item element </item> </entry>
if(itemelm && utils2.inarray(item_elements, name)) {
if(itemelm && func.array_includes(item_elements, name)) {
console.log('found item link: '+itemurl);
articles.push(itemurl);

if(type == 'atom') xml += '<content type="html">';
else xml += '<content:encoded>';
xml += '<![CDATA[&replaceurl:'+utils2.sha1(itemurl)+';]]>';
xml += '<![CDATA[&replaceurl:'+func.sha1(itemurl)+';]]>';
if(type == 'atom') xml += '</content>';
else xml += '</content:encoded>';
}
@@ -113,9 +115,9 @@
xml += '</'+name+'>\n';
}

if(itemelm && utils2.inarray(remove_elements, name)) ign = false;
if(itemelm && func.array_includes(remove_elements, name)) ign = false;

if(utils2.inarray(item_elements, name)) itemelm = false;
if(func.array_includes(item_elements, name)) itemelm = false;

// the endelement of the toplevel feed element ends the parsing
if(root == name) {
@@ -131,7 +133,7 @@
});
xml_parser.addListener('text', function(text) {
if(!incdata) {
text = utils2.encodexml(text);
text = func.xml_encode(text);
}

if(!ign) {
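
lib/feed.js rebuilds the feed XML from a stream of node-expat events and injects a CDATA placeholder into every item. A heavily reduced sketch of that event flow (the sample feed string is invented, and the real parser also handles attributes, entity encoding and the remove_elements list):

```js
var expat = require('node-expat');

var parser = new expat.Parser('UTF-8');
var xml = '';
var itemelm = false; // are we inside an <item>/<entry>?

parser.addListener('startElement', function(name, attrs) {
  if(name == 'item' || name == 'entry') itemelm = true;
  xml += '<' + name + '>';
});
parser.addListener('text', function(text) {
  xml += text;
});
parser.addListener('endElement', function(name) {
  if(itemelm && (name == 'item' || name == 'entry')) {
    // the real parser appends <![CDATA[&replaceurl:<sha1>;]]> here
    itemelm = false;
  }
  xml += '</' + name + '>';
});

parser.parse('<rss><channel><item><title>hello</title></item></channel></rss>');
console.log(xml);
```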
