From 915f9149c73507e44680d84f2fcabab99effe704 Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <github@hoffman-andrews.com>
Date: Sun, 12 Jan 2014 13:04:24 -0800
Subject: [PATCH 1/5] Use an SQLite ruleset DB to speed Firefox startup.

Note that this queries the DB synchronously on
many requests, potentially slowing down browsing.
Needs additional work.
---
 makexpi.sh                            | 16 ++---
 src/chrome/content/code/HTTPSRules.js | 94 ++++++++++++++++++---------
 src/components/https-everywhere.js    |  3 +
 utils/make-sqlite.py                  | 73 +++++++++++++++++++++
 4 files changed, 145 insertions(+), 41 deletions(-)
 create mode 100755 utils/make-sqlite.py
diff --git a/makexpi.sh b/makexpi.sh
index b138e8fe6681..4c38159228aa 100755
--- a/makexpi.sh
+++ b/makexpi.sh
@@ -15,6 +15,7 @@ APP_NAME=https-everywhere
 #  ./makexpi.sh 0.2.3.development.2
 
 cd "`dirname $0`"
+RULESETS_SQLITE="$PWD/src/defaults/rulesets.sqlite"
 
 [ -d pkg ] || mkdir pkg
 
@@ -97,6 +98,11 @@ if [ "$1" != "--fast" ] ; then
 fi
 # =============== END VALIDATION ================
 
+if [ "$1" != "--fast" ] || [ ! -f "$RULESETS_SQLITE" ] ; then  # rebuild unless --fast and a DB already exists
+  echo "Generating sqlite DB"
+  ./utils/make-sqlite.py src/chrome/content/rules
+fi
+
 # The name/version of the XPI we're building comes from src/install.rdf
 XPI_NAME="pkg/$APP_NAME-`grep em:version src/install.rdf | sed -e 's/[<>]/	/g' | cut -f3`"
 if [ "$1" ] && [ "$1" != "--fast" ] ; then
@@ -114,14 +120,6 @@ if [ -e "$GIT_OBJECT_FILE" ]; then
 	export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE")
 fi
 
-# Unless we're in a hurry and there's already a ruleset library, build it from
-# the ruleset .xml files
-
-if [ "$1" = "--fast" ] ; then
-  FAST="--fast"
-fi
-python ./utils/merge-rulesets.py $FAST
-
 cd src
 
 # Build the XPI!
@@ -135,7 +133,7 @@ if [ "$ret" != 0 ]; then
     rm -f "../$XPI_NAME"
     exit "$?"
 else
-  echo >&2 "Total included rules: `find chrome/content/rules -name "*.xml" | wc -l`"
+  echo >&2 "Total included rules: `sqlite3 $RULESETS_SQLITE 'select count(*) from rulesets'`"
   echo >&2 "Rules disabled by default: `find chrome/content/rules -name "*.xml" | xargs grep -F default_off | wc -l`"
   echo >&2 "Created $XPI_NAME"
   if [ -n "$BRANCH" ]; then
diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
index 23bc2332244c..635c826bd684 100644
--- a/src/chrome/content/code/HTTPSRules.js
+++ b/src/chrome/content/code/HTTPSRules.js
@@ -280,6 +280,12 @@ const RuleWriter = {
 
     sstream.close();
     fstream.close();
+    return this.readFromString(data, rule_store, file);
+  },
+
+  readFromString: function(data, rule_store, file) {
+    if (typeof file === 'undefined') file = {path: 'fromString'};
+
     // XXX: With DOMParser, we probably do not need to throw away the XML
     // declaration anymore nowadays.
     data = data.replace(/<\?xml[^>]*\?>/, ""); 
@@ -410,32 +416,19 @@ const HTTPSRules = {
       this.rulesets = [];
       this.targets = {};  // dict mapping target host patterns -> lists of
                           // applicable rules
+      // dict listing target host patterns that don't exist in the DB
+      // (aka negative cache)
+      // TODO: Make this an LRU cache; clear it on history clear
+      this.nonTargets = {};
       this.rulesetsByID = {};
       this.rulesetsByName = {};
       var t1 = new Date().getTime();
       this.checkMixedContentHandling();
-      var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir());
-      this.scanRulefiles(rulefiles);
-      rulefiles = RuleWriter.enumerate(RuleWriter.getRuleDir());
-      this.scanRulefiles(rulefiles);
-      var t,i;
-      for (t in this.targets) {
-        for (i = 0 ; i < this.targets[t].length ; i++) {
-          this.log(INFO, t + " -> " + this.targets[t][i].name);
-        }
-      }
 
-      // for any rulesets with <target host="*">
-      // every URI needs to be checked against these rulesets
-      // (though currently we don't ship any)
-      this.global_rulesets = this.targets["*"] ? this.targets["*"] : [];
-
-      this.rulesets.sort(
-        function(r1,r2) {
-            if (r1.name.toLowerCase() < r2.name.toLowerCase()) return -1;
-            else return 1;
-        }
-      );
+      // Initialize database connection.
+      var dbFile = FileUtils.getFile("ProfD", ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
+      var mDBConn = Services.storage.openDatabase(dbFile);
+      this.queryForTarget = mDBConn.createStatement("select id, contents from targets, rulesets where targets.ruleset_id = rulesets.id and host = :target;");
     } catch(e) {
       this.log(WARN,"Rules Failed: "+e);
     }
@@ -491,6 +484,8 @@ const HTTPSRules = {
     }
   },
 
+  httpMatch: /^http/i,
+
   rewrittenURI: function(alist, input_uri) {
     // This function oversees the task of working out if a uri should be
     // rewritten, what it should be rewritten to, and recordkeeping of which
@@ -511,7 +506,7 @@ const HTTPSRules = {
     try {
       var rs = this.potentiallyApplicableRulesets(uri.host);
     } catch(e) {
-      this.log(WARN, 'Could not check applicable rules for '+uri.spec);
+      this.log(WARN, 'Could not check applicable rules for '+uri.spec + '\n'+e);
       return null;
     }
 
@@ -595,17 +590,52 @@ const HTTPSRules = {
         intoList.push(fromList[i]);
   },
 
+  // Try to find a ruleset in the SQLite database for a given target (e.g.
+  // '*.openssl.org')
+  // NOTE: This call runs synchronously, which can lock up the browser UI. Is
+  // there any way to fix that, given that we need to run blocking in the request
+  // flow? Perhaps we can preload all targets from the DB into memory at startup
+  // so we only hit the DB when we know there is something to be had.
+  queryTarget: function(target) {
+    this.log(WARN, "Querying DB for " + target);
+    var statement = this.queryForTarget.clone();
+    statement.params.target = target;
+
+    try {
+      if (statement.executeStep())
+        return statement.row.contents;
+    } finally {
+      statement.reset();
+    }
+  },
+
   potentiallyApplicableRulesets: function(host) {
     // Return a list of rulesets that declare targets matching this host
     var i, tmp, t;
-    var results = this.global_rulesets.slice(0); // copy global_rulesets
-    try {
-      if (this.targets[host])
-        results = results.concat(this.targets[host]);
-    } catch(e) {   
-      this.log(DBUG,"Couldn't check for ApplicableRulesets: " + e);
-      return [];
-    }
+    var results = [];
+
+    var attempt = function(target) {
+      // First check for this target in our in-memory negative cache
+      if (this.nonTargets[target]) {
+        return;
+      } else if (this.targets[target] && // Then our positive cache
+          this.targets[target].length > 0) {
+        this.setInsert(results, this.targets[target]);
+      } else {
+        // If not found there, check the DB and load the ruleset as appropriate
+        // TODO: Add negative caching so we don't repeatedly query the DB for
+        // things that aren't there.
+        var ruleset = this.queryTarget(target);
+        if (ruleset != null) {
+          this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
+          RuleWriter.readFromString(ruleset, this);
+          this.setInsert(results, this.targets[target]);
+        } else {
+          this.nonTargets[target] = 1;
+        }
+      }
+    }.bind(this);
+
     // replace each portion of the domain with a * in turn
     var segmented = host.split(".");
     for (i = 0; i < segmented.length; ++i) {
@@ -613,13 +643,13 @@ const HTTPSRules = {
       segmented[i] = "*";
       t = segmented.join(".");
       segmented[i] = tmp;
-      this.setInsert(results, this.targets[t]);
+      attempt(t);
     }
     // now eat away from the left, with *, so that for x.y.z.google.com we
     // check *.z.google.com and *.google.com (we did *.y.z.google.com above)
     for (i = 1; i <= segmented.length - 2; ++i) {
       t = "*." + segmented.slice(i,segmented.length).join(".");
-      this.setInsert(results, this.targets[t]);
+      attempt(t);
     }
     this.log(DBUG,"Potentially applicable rules for " + host + ":");
     for (i = 0; i < results.length; ++i)
diff --git a/src/components/https-everywhere.js b/src/components/https-everywhere.js
index c7704d7f140a..fbf067804d38 100644
--- a/src/components/https-everywhere.js
+++ b/src/components/https-everywhere.js
@@ -31,6 +31,9 @@ const Cc = Components.classes;
 const Cu = Components.utils;
 const Cr = Components.results;
 
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource://gre/modules/FileUtils.jsm");
+
 const CP_SHOULDPROCESS = 4;
 
 const SERVICE_CTRID = "@eff.org/https-everywhere;1";
diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
new file mode 100755
index 000000000000..926143119dd8
--- /dev/null
+++ b/utils/make-sqlite.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python2.7
+#
+# Builds an sqlite DB containing all the rulesets, indexed by target.
+
+import sqlite3
+import argparse
+import sys, re, os
+
+from lxml import etree
+
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description="Builds an sqlite DB containing all the rulesets, indexed by target.")
+parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
+    default=["src/chrome/content/rules"],  # nargs="*" yields a list, so the default is a list too
+    help='Files or directories of ruleset XML to load into the DB.')
+
+args = parser.parse_args()
+
+def nomes_all(where=sys.argv[1:]):
+    """Returns generator to extract all files from a list of files/dirs"""
+    if not where: where=['.']
+    for i in where:
+        if os.path.isfile(i):
+            yield i
+        elif os.path.isdir(i):
+            for r, d, f in os.walk(i):
+                for fi in f:
+                    yield os.path.join(r, fi)
+
+
+conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
+c = conn.cursor()
+c.execute('''DROP TABLE IF EXISTS rulesets''')
+c.execute('''CREATE TABLE rulesets
+             (id INTEGER PRIMARY KEY,
+              name TEXT,
+              contents TEXT)''')
+c.execute('''DROP TABLE IF EXISTS targets''')
+c.execute('''CREATE TABLE targets
+             (
+              host TEXT,
+              ruleset_id INTEGER)''')
+
+parser = etree.XMLParser(remove_blank_text=True)
+
+for fi in nomes_all():
+    try:
+        tree = etree.parse(fi, parser)
+    except Exception as oops:
+        if fi[-4:] == ".xml":
+            print("%s failed XML validity: %s\n" % (fi, oops))
+        continue  # parse failed: never fall through with a stale or undefined `tree`
+    if not tree.xpath("/ruleset"):
+        continue
+
+    # Remove comments to save space.
+    etree.strip_tags(tree,etree.Comment)
+
+    targets = tree.xpath("/ruleset/target/@host")
+    # TODO: Strip target tags too. Right now the JS code requires there be a
+    # target tag.
+    #etree.strip_tags(tree,'target')
+
+    # TODO: filter out targets too to save storage bytes (comments are stripped above)
+    ruleset_name = tree.xpath("/ruleset/@name")[0]
+    c.execute('''INSERT INTO rulesets (name, contents) VALUES(?, ?)''', (ruleset_name, etree.tostring(tree)));
+    ruleset_id = c.lastrowid
+    for target in targets:
+        c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''', (target, ruleset_id));
+
+conn.commit()
+conn.close()

From d4905a384fc8f6e201421971311f25c62d102808 Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <github@hoffman-andrews.com>
Date: Sun, 12 Jan 2014 19:46:07 -0800
Subject: [PATCH 2/5] Handle multiple targets and preload a list of available
 targets

---
 src/chrome/content/code/HTTPSRules.js | 53 ++++++++++++++++++---------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
index 635c826bd684..33e77aede9d1 100644
--- a/src/chrome/content/code/HTTPSRules.js
+++ b/src/chrome/content/code/HTTPSRules.js
@@ -416,19 +416,32 @@ const HTTPSRules = {
       this.rulesets = [];
       this.targets = {};  // dict mapping target host patterns -> lists of
                           // applicable rules
-      // dict listing target host patterns that don't exist in the DB
-      // (aka negative cache)
-      // TODO: Make this an LRU cache; clear it on history clear
-      this.nonTargets = {};
       this.rulesetsByID = {};
       this.rulesetsByName = {};
       var t1 = new Date().getTime();
       this.checkMixedContentHandling();
 
       // Initialize database connection.
-      var dbFile = FileUtils.getFile("ProfD", ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
-      var mDBConn = Services.storage.openDatabase(dbFile);
-      this.queryForTarget = mDBConn.createStatement("select id, contents from targets, rulesets where targets.ruleset_id = rulesets.id and host = :target;");
+      var dbFile = FileUtils.getFile("ProfD",
+        ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
+      var rulesetDBConn = Services.storage.openDatabase(dbFile);
+      this.queryForTarget = rulesetDBConn.createStatement(
+        "select id, contents from targets, rulesets " +
+        "where targets.ruleset_id = rulesets.id and host = :target;");
+
+      // Preload the list of which targets are available in the DB.
+      // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD),
+      // but is faster than loading all of the rulesets. If this becomes a
+      // bottleneck, change it to load in a background webworker, or load
+      // a smaller bloom filter instead.
+      this.targetsAvailable = new Set(); // Firefox-specific
+      var targetsQuery = rulesetDBConn.createStatement("select host from targets");
+      this.log(WARN, "Adding targets...");
+      while (targetsQuery.executeStep()) {
+        var host = targetsQuery.row.host;
+        this.targetsAvailable.add(host);
+      }
+      this.log(WARN, "Done adding targets.");
     } catch(e) {
       this.log(WARN,"Rules Failed: "+e);
     }
@@ -598,15 +611,18 @@ const HTTPSRules = {
   // so we only hit the DB when we know there is something to be had.
   queryTarget: function(target) {
     this.log(WARN, "Querying DB for " + target);
+    var output = [];
+
     var statement = this.queryForTarget.clone();
     statement.params.target = target;
 
     try {
-      if (statement.executeStep())
-        return statement.row.contents;
+      while (statement.executeStep())
+        output.push(statement.row.contents);
     } finally {
       statement.reset();
     }
+    return output;
   },
 
   potentiallyApplicableRulesets: function(host) {
@@ -616,20 +632,21 @@ const HTTPSRules = {
 
     var attempt = function(target) {
       // First check for this target in our in-memory negative cache
-      if (this.nonTargets[target]) {
-        return;
-      } else if (this.targets[target] && // Then our positive cache
+      if (this.targets[target] && // Then our positive cache
           this.targets[target].length > 0) {
         this.setInsert(results, this.targets[target]);
-      } else {
+      } else if (this.targetsAvailable.has(target)) {
         // If not found there, check the DB and load the ruleset as appropriate
         // TODO: Add negative caching so we don't repeatedly query the DB for
         // things that aren't there.
-        var ruleset = this.queryTarget(target);
-        if (ruleset != null) {
-          this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
-          RuleWriter.readFromString(ruleset, this);
-          this.setInsert(results, this.targets[target]);
+        var rulesets = this.queryTarget(target);
+        if (rulesets.length > 0) {
+          for (var i = 0; i < rulesets.length; i++) {
+            var ruleset = rulesets[i];
+            this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
+            RuleWriter.readFromString(ruleset, this);
+            this.setInsert(results, this.targets[target]);
+          }
         } else {
           this.nonTargets[target] = 1;
         }

From dd607fb7e3af570383c40f5a2beff0ba70f73b4d Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <github@hoffman-andrews.com>
Date: Tue, 14 Jan 2014 23:05:40 -0800
Subject: [PATCH 3/5] Create host index on targets table

---
 utils/make-sqlite.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
index 926143119dd8..9d15350f466f 100755
--- a/utils/make-sqlite.py
+++ b/utils/make-sqlite.py
@@ -38,9 +38,9 @@ def nomes_all(where=sys.argv[1:]):
               contents TEXT)''')
 c.execute('''DROP TABLE IF EXISTS targets''')
 c.execute('''CREATE TABLE targets
-             (
-              host TEXT,
+             (host TEXT,
               ruleset_id INTEGER)''')
+c.execute('''CREATE INDEX host_index on targets(host)''')
 
 parser = etree.XMLParser(remove_blank_text=True)
 

From 2e16492a8f3e4aac9c69542bcc1917c5f830f82c Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <github@hoffman-andrews.com>
Date: Fri, 17 Jan 2014 23:21:30 -0800
Subject: [PATCH 4/5] Fix bug where bare hostname isn't tried.

Also fix a case of double-lookup for "*.foo.com" and fix some comments.
---
 src/chrome/content/code/HTTPSRules.js | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
index 33e77aede9d1..eae7dc444e67 100644
--- a/src/chrome/content/code/HTTPSRules.js
+++ b/src/chrome/content/code/HTTPSRules.js
@@ -631,14 +631,12 @@ const HTTPSRules = {
     var results = [];
 
     var attempt = function(target) {
-      // First check for this target in our in-memory negative cache
-      if (this.targets[target] && // Then our positive cache
+      // First try the in-memory rulesets
+      if (this.targets[target] &&
           this.targets[target].length > 0) {
         this.setInsert(results, this.targets[target]);
       } else if (this.targetsAvailable.has(target)) {
         // If not found there, check the DB and load the ruleset as appropriate
-        // TODO: Add negative caching so we don't repeatedly query the DB for
-        // things that aren't there.
         var rulesets = this.queryTarget(target);
         if (rulesets.length > 0) {
           for (var i = 0; i < rulesets.length; i++) {
@@ -653,6 +651,8 @@ const HTTPSRules = {
       }
     }.bind(this);
 
+    attempt(host);
+
     // replace each portion of the domain with a * in turn
     var segmented = host.split(".");
     for (i = 0; i < segmented.length; ++i) {
@@ -664,7 +664,7 @@ const HTTPSRules = {
     }
     // now eat away from the left, with *, so that for x.y.z.google.com we
     // check *.z.google.com and *.google.com (we did *.y.z.google.com above)
-    for (i = 1; i <= segmented.length - 2; ++i) {
+    for (i = 2; i <= segmented.length - 2; ++i) {
       t = "*." + segmented.slice(i,segmented.length).join(".");
       attempt(t);
     }

From ba496d789be3cd6f8d97fb40d11df8533768b701 Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <github@hoffman-andrews.com>
Date: Sat, 18 Jan 2014 23:20:46 -0800
Subject: [PATCH 5/5] Remove index on targets table and change JS-side query to
 match.

---
 src/chrome/content/code/HTTPSRules.js | 30 +++++++++++++--------------
 utils/make-sqlite.py                  |  1 -
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
index eae7dc444e67..3095de86ba09 100644
--- a/src/chrome/content/code/HTTPSRules.js
+++ b/src/chrome/content/code/HTTPSRules.js
@@ -425,25 +425,24 @@ const HTTPSRules = {
       var dbFile = FileUtils.getFile("ProfD",
         ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
       var rulesetDBConn = Services.storage.openDatabase(dbFile);
-      this.queryForTarget = rulesetDBConn.createStatement(
-        "select id, contents from targets, rulesets " +
-        "where targets.ruleset_id = rulesets.id and host = :target;");
+      this.queryForRuleset = rulesetDBConn.createStatement(
+        "select contents from rulesets where id = :id");
 
       // Preload the list of which targets are available in the DB.
       // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD),
       // but is faster than loading all of the rulesets. If this becomes a
       // bottleneck, change it to load in a background webworker, or load
       // a smaller bloom filter instead.
-      this.targetsAvailable = new Set(); // Firefox-specific
-      var targetsQuery = rulesetDBConn.createStatement("select host from targets");
-      this.log(WARN, "Adding targets...");
+      this.targetsAvailable = {};
+      var targetsQuery = rulesetDBConn.createStatement("select host, ruleset_id from targets");
+      this.log(DBUG, "Adding targets...");
       while (targetsQuery.executeStep()) {
         var host = targetsQuery.row.host;
-        this.targetsAvailable.add(host);
+        this.targetsAvailable[host] = targetsQuery.row.ruleset_id;
       }
-      this.log(WARN, "Done adding targets.");
+      this.log(DBUG, "Done adding targets.");
     } catch(e) {
-      this.log(WARN,"Rules Failed: "+e);
+      this.log(DBUG,"Rules Failed: "+e);
     }
     var t2 =  new Date().getTime();
     this.log(NOTE,"Loading rulesets took " + (t2 - t1) / 1000.0 + " seconds");
@@ -610,17 +609,16 @@ const HTTPSRules = {
   // flow? Perhaps we can preload all targets from the DB into memory at startup
   // so we only hit the DB when we know there is something to be had.
   queryTarget: function(target) {
-    this.log(WARN, "Querying DB for " + target);
+    this.log(DBUG, "Querying DB for " + target);
     var output = [];
 
-    var statement = this.queryForTarget.clone();
-    statement.params.target = target;
+    this.queryForRuleset.params.id = this.targetsAvailable[target];
 
     try {
-      while (statement.executeStep())
-        output.push(statement.row.contents);
+      while (this.queryForRuleset.executeStep())
+        output.push(this.queryForRuleset.row.contents);
     } finally {
-      statement.reset();
+      this.queryForRuleset.reset();
     }
     return output;
   },
@@ -635,7 +633,7 @@ const HTTPSRules = {
       if (this.targets[target] &&
           this.targets[target].length > 0) {
         this.setInsert(results, this.targets[target]);
-      } else if (this.targetsAvailable.has(target)) {
+      } else if (this.targetsAvailable[target]) {
         // If not found there, check the DB and load the ruleset as appropriate
         var rulesets = this.queryTarget(target);
         if (rulesets.length > 0) {
diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
index 9d15350f466f..501c74f5953b 100755
--- a/utils/make-sqlite.py
+++ b/utils/make-sqlite.py
@@ -40,7 +40,6 @@ def nomes_all(where=sys.argv[1:]):
 c.execute('''CREATE TABLE targets
              (host TEXT,
               ruleset_id INTEGER)''')
-c.execute('''CREATE INDEX host_index on targets(host)''')
 
 parser = etree.XMLParser(remove_blank_text=True)