Skip to content
This repository has been archived by the owner on Nov 6, 2023. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'jsha/sqlite'
Browse files Browse the repository at this point in the history
Conflicts:
	src/chrome/content/code/HTTPSRules.js
  • Loading branch information
pde committed Jan 22, 2014
2 parents 5fba110 + ba496d7 commit 414c3e9
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 43 deletions.
16 changes: 7 additions & 9 deletions makexpi.sh
Expand Up @@ -15,6 +15,7 @@ APP_NAME=https-everywhere
# ./makexpi.sh 0.2.3.development.2

cd "`dirname $0`"
RULESETS_SQLITE="$PWD/src/defaults/rulesets.sqlite"

[ -d pkg ] || mkdir pkg

Expand Down Expand Up @@ -97,6 +98,11 @@ if [ "$1" != "--fast" ] ; then
fi
# =============== END VALIDATION ================

# Regenerate the ruleset sqlite DB unless we are in --fast mode AND a DB
# already exists. Note: POSIX marks the `-o` operand of test(1) as
# obsolescent/ambiguous, so join two separate tests with `||` instead.
if [ "$1" != "--fast" ] || [ ! -f "$RULESETS_SQLITE" ] ; then
	echo "Generating sqlite DB"
	./utils/make-sqlite.py src/chrome/content/rules
fi

# The name/version of the XPI we're building comes from src/install.rdf
XPI_NAME="pkg/$APP_NAME-`grep em:version src/install.rdf | sed -e 's/[<>]/ /g' | cut -f3`"
if [ "$1" ] && [ "$1" != "--fast" ] ; then
Expand All @@ -114,14 +120,6 @@ if [ -e "$GIT_OBJECT_FILE" ]; then
export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE")
fi

# Unless we're in a hurry and there's already a ruleset library, build it from
# the ruleset .xml files

if [ "$1" = "--fast" ] ; then
FAST="--fast"
fi
python ./utils/merge-rulesets.py $FAST

cd src

# Build the XPI!
Expand All @@ -135,7 +133,7 @@ if [ "$ret" != 0 ]; then
rm -f "../$XPI_NAME"
exit "$?"
else
echo >&2 "Total included rules: `find chrome/content/rules -name "*.xml" | wc -l`"
echo >&2 "Total included rules: `sqlite3 $RULESETS_SQLITE 'select count(*) from rulesets'`"
echo >&2 "Rules disabled by default: `find chrome/content/rules -name "*.xml" | xargs grep -F default_off | wc -l`"
echo >&2 "Created $XPI_NAME"
if [ -n "$BRANCH" ]; then
Expand Down
112 changes: 78 additions & 34 deletions src/chrome/content/code/HTTPSRules.js
Expand Up @@ -280,6 +280,12 @@ const RuleWriter = {

sstream.close();
fstream.close();
return this.readFromString(data, rule_store, file);
},

readFromString: function(data, rule_store, file) {
if (typeof file === 'undefined') file = {path: 'fromString'};

// XXX: With DOMParser, we probably do not need to throw away the XML
// declaration anymore nowadays.
data = data.replace(/<\?xml[^>]*\?>/, "");
Expand Down Expand Up @@ -414,30 +420,29 @@ const HTTPSRules = {
this.rulesetsByName = {};
var t1 = new Date().getTime();
this.checkMixedContentHandling();
var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir());
this.scanRulefiles(rulefiles);
rulefiles = RuleWriter.enumerate(RuleWriter.getRuleDir());
this.scanRulefiles(rulefiles);
var t,i;
for (t in this.targets) {
for (i = 0 ; i < this.targets[t].length ; i++) {
this.log(INFO, t + " -> " + this.targets[t][i].name);
}
}

// for any rulesets with <target host="*">
// every URI needs to be checked against these rulesets
// (though currently we don't ship any)
this.global_rulesets = this.targets["*"] ? this.targets["*"] : [];

this.rulesets.sort(
function(r1,r2) {
if (r1.name.toLowerCase() < r2.name.toLowerCase()) return -1;
else return 1;
}
);
// Initialize database connection.
var dbFile = FileUtils.getFile("ProfD",
["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
var rulesetDBConn = Services.storage.openDatabase(dbFile);
this.queryForRuleset = rulesetDBConn.createStatement(
"select contents from rulesets where id = :id");

// Preload the list of which targets are available in the DB.
// This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD),
// but is faster than loading all of the rulesets. If this becomes a
// bottleneck, change it to load in a background webworker, or load
// a smaller bloom filter instead.
this.targetsAvailable = {};
var targetsQuery = rulesetDBConn.createStatement("select host, ruleset_id from targets");
this.log(DBUG, "Adding targets...");
while (targetsQuery.executeStep()) {
var host = targetsQuery.row.host;
this.targetsAvailable[host] = targetsQuery.row.ruleset_id;
}
this.log(DBUG, "Done adding targets.");
} catch(e) {
this.log(WARN,"Rules Failed: "+e);
this.log(DBUG,"Rules Failed: "+e);
}
var t2 = new Date().getTime();
this.log(NOTE,"Loading rulesets took " + (t2 - t1) / 1000.0 + " seconds");
Expand Down Expand Up @@ -498,6 +503,8 @@ const HTTPSRules = {
}
},

httpMatch: /^http/i,

rewrittenURI: function(alist, input_uri) {
// This function oversees the task of working out if a uri should be
// rewritten, what it should be rewritten to, and recordkeeping of which
Expand All @@ -518,7 +525,7 @@ const HTTPSRules = {
try {
var rs = this.potentiallyApplicableRulesets(uri.host);
} catch(e) {
this.log(WARN, 'Could not check applicable rules for '+uri.spec);
this.log(WARN, 'Could not check applicable rules for '+uri.spec + '\n'+e);
return null;
}

Expand Down Expand Up @@ -602,17 +609,54 @@ const HTTPSRules = {
intoList.push(fromList[i]);
},

// Try to find a ruleset in the SQLite database for a given target (e.g.
// '*.openssl.org')
// NOTE: This call runs synchronously, which can lock up the browser UI. Is
// there any way to fix that, given that we need to run blocking in the request
// flow? Perhaps we can preload all targets from the DB into memory at startup
// so we only hit the DB when we know there is something to be had.
queryTarget: function(target) {
this.log(DBUG, "Querying DB for " + target);
var output = [];

this.queryForRuleset.params.id = this.targetsAvailable[target];

try {
while (this.queryForRuleset.executeStep())
output.push(this.queryForRuleset.row.contents);
} finally {
this.queryForRuleset.reset();
}
return output;
},

potentiallyApplicableRulesets: function(host) {
// Return a list of rulesets that declare targets matching this host
var i, tmp, t;
var results = this.global_rulesets.slice(0); // copy global_rulesets
try {
if (this.targets[host])
results = results.concat(this.targets[host]);
} catch(e) {
this.log(DBUG,"Couldn't check for ApplicableRulesets: " + e);
return [];
}
var results = [];

var attempt = function(target) {
// First try the in-memory rulesets
if (this.targets[target] &&
this.targets[target].length > 0) {
this.setInsert(results, this.targets[target]);
} else if (this.targetsAvailable[target]) {
// If not found there, check the DB and load the ruleset as appropriate
var rulesets = this.queryTarget(target);
if (rulesets.length > 0) {
for (var i = 0; i < rulesets.length; i++) {
var ruleset = rulesets[i];
this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
RuleWriter.readFromString(ruleset, this);
this.setInsert(results, this.targets[target]);
}
} else {
this.nonTargets[target] = 1;
}
}
}.bind(this);

attempt(host);

// replace each portion of the domain with a * in turn
var segmented = host.split(".");
Expand All @@ -621,13 +665,13 @@ const HTTPSRules = {
segmented[i] = "*";
t = segmented.join(".");
segmented[i] = tmp;
this.setInsert(results, this.targets[t]);
attempt(t);
}
// now eat away from the left, with *, so that for x.y.z.google.com we
// check *.z.google.com and *.google.com (we did *.y.z.google.com above)
for (i = 1; i <= segmented.length - 2; ++i) {
for (i = 2; i <= segmented.length - 2; ++i) {
t = "*." + segmented.slice(i,segmented.length).join(".");
this.setInsert(results, this.targets[t]);
attempt(t);
}
this.log(DBUG,"Potentially applicable rules for " + host + ":");
for (i = 0; i < results.length; ++i)
Expand Down
3 changes: 3 additions & 0 deletions src/components/https-everywhere.js
Expand Up @@ -31,6 +31,9 @@ const Cc = Components.classes;
const Cu = Components.utils;
const Cr = Components.results;

Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/FileUtils.jsm");

const CP_SHOULDPROCESS = 4;

const SERVICE_CTRID = "@eff.org/https-everywhere;1";
Expand Down
72 changes: 72 additions & 0 deletions utils/make-sqlite.py
@@ -0,0 +1,72 @@
#!/usr/bin/python2.7
#
# Builds an sqlite DB containing all the rulesets, indexed by target.
#
# The DB has two tables:
#   rulesets(id, name, contents) -- the XML text of each ruleset
#   targets(host, ruleset_id)    -- one row per <target host="..."> entry

import sqlite3
import argparse
import sys, re, os

from lxml import etree

# Keep a distinct name for the CLI parser so it is not shadowed by the
# lxml XML parser created below.
arg_parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Build the ruleset sqlite database.")
# With nargs="*" the default must be a list, not a bare string: a string
# default would later be iterated character-by-character.
arg_parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
    default=["src/chrome/content/rules"],
    help='Directories (or files) of ruleset XML files to include.')

args = arg_parser.parse_args()

def nomes_all(where=None):
    """Returns generator to extract all files from a list of files/dirs.

    Defaults to the parsed command-line arguments (previously this read
    sys.argv directly, bypassing argparse).
    """
    if where is None:
        where = args.ruleset
    if not where:
        where = ['.']
    for i in where:
        if os.path.isfile(i):
            yield i
        elif os.path.isdir(i):
            for r, d, f in os.walk(i):
                for fi in f:
                    yield os.path.join(r, fi)


conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
c = conn.cursor()
c.execute('''DROP TABLE IF EXISTS rulesets''')
c.execute('''CREATE TABLE rulesets
             (id INTEGER PRIMARY KEY,
              name TEXT,
              contents TEXT)''')
c.execute('''DROP TABLE IF EXISTS targets''')
c.execute('''CREATE TABLE targets
             (host TEXT,
              ruleset_id INTEGER)''')

xml_parser = etree.XMLParser(remove_blank_text=True)

for fi in nomes_all():
    try:
        tree = etree.parse(fi, xml_parser)
    except Exception as oops:
        # Only report parse failures for files that look like rulesets;
        # either way we must skip this file. (Previously execution fell
        # through here and reused a stale/undefined `tree`.)
        if fi[-4:] == ".xml":
            print("%s failed XML validity: %s\n" % (fi, oops))
        continue
    if not tree.xpath("/ruleset"):
        continue

    # Remove comments to save space.
    etree.strip_tags(tree, etree.Comment)

    targets = tree.xpath("/ruleset/target/@host")
    # TODO: Strip target tags too. Right now the JS code requires there be a
    # target tag.
    #etree.strip_tags(tree,'target')

    # TODO: filter out comments and targets to save storage bytes
    ruleset_name = tree.xpath("/ruleset/@name")[0]
    c.execute('''INSERT INTO rulesets (name, contents) VALUES(?, ?)''',
              (ruleset_name, etree.tostring(tree)))
    ruleset_id = c.lastrowid
    for target in targets:
        c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''',
                  (target, ruleset_id))

conn.commit()
conn.close()

0 comments on commit 414c3e9

Please sign in to comment.