From 915f9149c73507e44680d84f2fcabab99effe704 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Sun, 12 Jan 2014 13:04:24 -0800 Subject: [PATCH 1/5] Use an SQLite ruleset DB to speed Firefox startup. Note that this queries the DB synchronously on many requests, potentially slowing down browsing. Needs additional work. --- makexpi.sh | 16 ++--- src/chrome/content/code/HTTPSRules.js | 94 ++++++++++++++++++--------- src/components/https-everywhere.js | 3 + utils/make-sqlite.py | 73 +++++++++++++++++++++ 4 files changed, 145 insertions(+), 41 deletions(-) create mode 100755 utils/make-sqlite.py diff --git a/makexpi.sh b/makexpi.sh index b138e8fe6681..4c38159228aa 100755 --- a/makexpi.sh +++ b/makexpi.sh @@ -15,6 +15,7 @@ APP_NAME=https-everywhere # ./makexpi.sh 0.2.3.development.2 cd "`dirname $0`" +RULESETS_SQLITE="$PWD/src/defaults/rulesets.sqlite" [ -d pkg ] || mkdir pkg @@ -97,6 +98,11 @@ if [ "$1" != "--fast" ] ; then fi # =============== END VALIDATION ================ +if [ "$1" != "--fast" -o ! -f "$RULESETS_SQLITE" ] ; then + echo "Generating sqlite DB" + ./utils/make-sqlite.py src/chrome/content/rules +fi + # The name/version of the XPI we're building comes from src/install.rdf XPI_NAME="pkg/$APP_NAME-`grep em:version src/install.rdf | sed -e 's/[<>]/ /g' | cut -f3`" if [ "$1" ] && [ "$1" != "--fast" ] ; then @@ -114,14 +120,6 @@ if [ -e "$GIT_OBJECT_FILE" ]; then export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE") fi -# Unless we're in a hurry and there's already a ruleset library, build it from -# the ruleset .xml files - -if [ "$1" = "--fast" ] ; then - FAST="--fast" -fi -python ./utils/merge-rulesets.py $FAST - cd src # Build the XPI! @@ -135,7 +133,7 @@ if [ "$ret" != 0 ]; then rm -f "../$XPI_NAME" exit "$?" 
else - echo >&2 "Total included rules: `find chrome/content/rules -name "*.xml" | wc -l`" + echo >&2 "Total included rules: `sqlite3 $RULESETS_SQLITE 'select count(*) from rulesets'`" echo >&2 "Rules disabled by default: `find chrome/content/rules -name "*.xml" | xargs grep -F default_off | wc -l`" echo >&2 "Created $XPI_NAME" if [ -n "$BRANCH" ]; then diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js index 23bc2332244c..635c826bd684 100644 --- a/src/chrome/content/code/HTTPSRules.js +++ b/src/chrome/content/code/HTTPSRules.js @@ -280,6 +280,12 @@ const RuleWriter = { sstream.close(); fstream.close(); + return this.readFromString(data, rule_store, file); + }, + + readFromString: function(data, rule_store, file) { + if (typeof file === 'undefined') file = {path: 'fromString'}; + // XXX: With DOMParser, we probably do not need to throw away the XML // declaration anymore nowadays. data = data.replace(/<\?xml[^>]*\?>/, ""); @@ -410,32 +416,19 @@ const HTTPSRules = { this.rulesets = []; this.targets = {}; // dict mapping target host patterns -> lists of // applicable rules + // dict listing target host patterns that don't exist in the DB + // (aka negative cache) + // TODO: Make this an LRU cache; clear it on history clear + this.nonTargets = {}; this.rulesetsByID = {}; this.rulesetsByName = {}; var t1 = new Date().getTime(); this.checkMixedContentHandling(); - var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir()); - this.scanRulefiles(rulefiles); - rulefiles = RuleWriter.enumerate(RuleWriter.getRuleDir()); - this.scanRulefiles(rulefiles); - var t,i; - for (t in this.targets) { - for (i = 0 ; i < this.targets[t].length ; i++) { - this.log(INFO, t + " -> " + this.targets[t][i].name); - } - } - // for any rulesets with - // every URI needs to be checked against these rulesets - // (though currently we don't ship any) - this.global_rulesets = this.targets["*"] ? 
this.targets["*"] : []; - - this.rulesets.sort( - function(r1,r2) { - if (r1.name.toLowerCase() < r2.name.toLowerCase()) return -1; - else return 1; - } - ); + // Initialize database connection. + var dbFile = FileUtils.getFile("ProfD", ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]); + var mDBConn = Services.storage.openDatabase(dbFile); + this.queryForTarget = mDBConn.createStatement("select id, contents from targets, rulesets where targets.ruleset_id = rulesets.id and host = :target;"); } catch(e) { this.log(WARN,"Rules Failed: "+e); } @@ -491,6 +484,8 @@ const HTTPSRules = { } }, + httpMatch: /^http/i, + rewrittenURI: function(alist, input_uri) { // This function oversees the task of working out if a uri should be // rewritten, what it should be rewritten to, and recordkeeping of which @@ -511,7 +506,7 @@ const HTTPSRules = { try { var rs = this.potentiallyApplicableRulesets(uri.host); } catch(e) { - this.log(WARN, 'Could not check applicable rules for '+uri.spec); + this.log(WARN, 'Could not check applicable rules for '+uri.spec + '\n'+e); return null; } @@ -595,17 +590,52 @@ const HTTPSRules = { intoList.push(fromList[i]); }, + // Try to find a ruleset in the SQLite database for a given target (e.g. + // '*.openssl.org') + // NOTE: This call runs synchronously, which can lock up the browser UI. Is + // there any way to fix that, given that we need to run blocking in the request + // flow? Perhaps we can preload all targets from the DB into memory at startup + // so we only hit the DB when we know there is something to be had. 
+ queryTarget: function(target) { + this.log(WARN, "Querying DB for " + target); + var statement = this.queryForTarget.clone(); + statement.params.target = target; + + try { + if (statement.executeStep()) + return statement.row.contents; + } finally { + statement.reset(); + } + }, + potentiallyApplicableRulesets: function(host) { // Return a list of rulesets that declare targets matching this host var i, tmp, t; - var results = this.global_rulesets.slice(0); // copy global_rulesets - try { - if (this.targets[host]) - results = results.concat(this.targets[host]); - } catch(e) { - this.log(DBUG,"Couldn't check for ApplicableRulesets: " + e); - return []; - } + var results = []; + + var attempt = function(target) { + // First check for this target in our in-memory negative cache + if (this.nonTargets[target]) { + return; + } else if (this.targets[target] && // Then our positive cache + this.targets[target].length > 0) { + this.setInsert(results, this.targets[target]); + } else { + // If not found there, check the DB and load the ruleset as appropriate + // TODO: Add negative caching so we don't repeatedly query the DB for + // things that aren't there. + var ruleset = this.queryTarget(target); + if (ruleset != null) { + this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset); + RuleWriter.readFromString(ruleset, this); + this.setInsert(results, this.targets[target]); + } else { + this.nonTargets[target] = 1; + } + } + }.bind(this); + // replace each portion of the domain with a * in turn var segmented = host.split("."); for (i = 0; i < segmented.length; ++i) { @@ -613,13 +643,13 @@ const HTTPSRules = { segmented[i] = "*"; t = segmented.join("."); segmented[i] = tmp; - this.setInsert(results, this.targets[t]); + attempt(t); } // now eat away from the left, with *, so that for x.y.z.google.com we // check *.z.google.com and *.google.com (we did *.y.z.google.com above) for (i = 1; i <= segmented.length - 2; ++i) { t = "*." 
+ segmented.slice(i,segmented.length).join("."); - this.setInsert(results, this.targets[t]); + attempt(t); } this.log(DBUG,"Potentially applicable rules for " + host + ":"); for (i = 0; i < results.length; ++i) diff --git a/src/components/https-everywhere.js b/src/components/https-everywhere.js index c7704d7f140a..fbf067804d38 100644 --- a/src/components/https-everywhere.js +++ b/src/components/https-everywhere.js @@ -31,6 +31,9 @@ const Cc = Components.classes; const Cu = Components.utils; const Cr = Components.results; +Cu.import("resource://gre/modules/Services.jsm"); +Cu.import("resource://gre/modules/FileUtils.jsm"); + const CP_SHOULDPROCESS = 4; const SERVICE_CTRID = "@eff.org/https-everywhere;1"; diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py new file mode 100755 index 000000000000..926143119dd8 --- /dev/null +++ b/utils/make-sqlite.py @@ -0,0 +1,73 @@ +#!/usr/bin/python2.7 +# +# Builds an sqlite DB containing all the rulesets, indexed by target. + +import sqlite3 +import argparse +import sys, re, os + +from lxml import etree + +parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Ruleset validation script.") +parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*", + default="src/chrome/content/rules", + help='Directory of XML files to validate.') + +args = parser.parse_args() + +def nomes_all(where=sys.argv[1:]): + """Returns generator to extract all files from a list of files/dirs""" + if not where: where=['.'] + for i in where: + if os.path.isfile(i): + yield i + elif os.path.isdir(i): + for r, d, f in os.walk(i): + for fi in f: + yield os.path.join(r, fi) + + +conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite')) +c = conn.cursor() +c.execute('''DROP TABLE IF EXISTS rulesets''') +c.execute('''CREATE TABLE rulesets + (id INTEGER PRIMARY KEY, + name TEXT, + contents TEXT)''') +c.execute('''DROP TABLE IF EXISTS targets''') 
+c.execute('''CREATE TABLE targets + ( + host TEXT, + ruleset_id INTEGER)''') + +parser = etree.XMLParser(remove_blank_text=True) + +for fi in nomes_all(): + try: + tree = etree.parse(fi, parser) + except Exception as oops: + if fi[-4:] != ".xml": + continue + print("%s failed XML validity: %s\n" % (fi, oops)) + if not tree.xpath("/ruleset"): + continue + + # Remove comments to save space. + etree.strip_tags(tree,etree.Comment) + + targets = tree.xpath("/ruleset/target/@host") + # TODO: Strip target tags too. Right now the JS code requires there be a + # target tag. + #etree.strip_tags(tree,'target') + + # TODO: filter out comments and targets to save storage bytes + ruleset_name = tree.xpath("/ruleset/@name")[0] + c.execute('''INSERT INTO rulesets (name, contents) VALUES(?, ?)''', (ruleset_name, etree.tostring(tree))); + ruleset_id = c.lastrowid + for target in targets: + c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''', (target, ruleset_id)); + +conn.commit() +conn.close() From d4905a384fc8f6e201421971311f25c62d102808 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Sun, 12 Jan 2014 19:46:07 -0800 Subject: [PATCH 2/5] Handle multiple targets and preload a list of available targets --- src/chrome/content/code/HTTPSRules.js | 53 ++++++++++++++++++--------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js index 635c826bd684..33e77aede9d1 100644 --- a/src/chrome/content/code/HTTPSRules.js +++ b/src/chrome/content/code/HTTPSRules.js @@ -416,19 +416,32 @@ const HTTPSRules = { this.rulesets = []; this.targets = {}; // dict mapping target host patterns -> lists of // applicable rules - // dict listing target host patterns that don't exist in the DB - // (aka negative cache) - // TODO: Make this an LRU cache; clear it on history clear - this.nonTargets = {}; this.rulesetsByID = {}; this.rulesetsByName = {}; var t1 = new Date().getTime(); 
this.checkMixedContentHandling(); // Initialize database connection. - var dbFile = FileUtils.getFile("ProfD", ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]); - var mDBConn = Services.storage.openDatabase(dbFile); - this.queryForTarget = mDBConn.createStatement("select id, contents from targets, rulesets where targets.ruleset_id = rulesets.id and host = :target;"); + var dbFile = FileUtils.getFile("ProfD", + ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]); + var rulesetDBConn = Services.storage.openDatabase(dbFile); + this.queryForTarget = rulesetDBConn.createStatement( + "select id, contents from targets, rulesets " + + "where targets.ruleset_id = rulesets.id and host = :target;"); + + // Preload the list of which targets are available in the DB. + // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD), + // but is faster than loading all of the rulesets. If this becomes a + // bottleneck, change it to load in a background webworker, or load + // a smaller bloom filter instead. + this.targetsAvailable = new Set(); // Firefox-specific + var targetsQuery = rulesetDBConn.createStatement("select host from targets"); + this.log(WARN, "Adding targets..."); + while (targetsQuery.executeStep()) { + var host = targetsQuery.row.host; + this.targetsAvailable.add(host); + } + this.log(WARN, "Done adding targets."); } catch(e) { this.log(WARN,"Rules Failed: "+e); } @@ -598,15 +611,18 @@ const HTTPSRules = { // so we only hit the DB when we know there is something to be had. 
queryTarget: function(target) { this.log(WARN, "Querying DB for " + target); + var output = []; + var statement = this.queryForTarget.clone(); statement.params.target = target; try { - if (statement.executeStep()) - return statement.row.contents; + while (statement.executeStep()) + output.push(statement.row.contents); } finally { statement.reset(); } + return output; }, potentiallyApplicableRulesets: function(host) { @@ -616,20 +632,21 @@ const HTTPSRules = { var attempt = function(target) { // First check for this target in our in-memory negative cache - if (this.nonTargets[target]) { - return; - } else if (this.targets[target] && // Then our positive cache + if (this.targets[target] && // Then our positive cache this.targets[target].length > 0) { this.setInsert(results, this.targets[target]); - } else { + } else if (this.targetsAvailable.has(target)) { // If not found there, check the DB and load the ruleset as appropriate // TODO: Add negative caching so we don't repeatedly query the DB for // things that aren't there. 
- var ruleset = this.queryTarget(target); - if (ruleset != null) { - this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset); - RuleWriter.readFromString(ruleset, this); - this.setInsert(results, this.targets[target]); + var rulesets = this.queryTarget(target); + if (rulesets.length > 0) { + for (var i = 0; i < rulesets.length; i++) { + var ruleset = rulesets[i]; + this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset); + RuleWriter.readFromString(ruleset, this); + this.setInsert(results, this.targets[target]); + } } else { this.nonTargets[target] = 1; } From dd607fb7e3af570383c40f5a2beff0ba70f73b4d Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Tue, 14 Jan 2014 23:05:40 -0800 Subject: [PATCH 3/5] Create host index on targets table --- utils/make-sqlite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py index 926143119dd8..9d15350f466f 100755 --- a/utils/make-sqlite.py +++ b/utils/make-sqlite.py @@ -38,9 +38,9 @@ def nomes_all(where=sys.argv[1:]): contents TEXT)''') c.execute('''DROP TABLE IF EXISTS targets''') c.execute('''CREATE TABLE targets - ( - host TEXT, + (host TEXT, ruleset_id INTEGER)''') +c.execute('''CREATE INDEX host_index on targets(host)''') parser = etree.XMLParser(remove_blank_text=True) From 2e16492a8f3e4aac9c69542bcc1917c5f830f82c Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Fri, 17 Jan 2014 23:21:30 -0800 Subject: [PATCH 4/5] Fix bug where bare hostname isn't tried. Also fix a case of double-lookup for "*.foo.com" and fix some comments. 
--- src/chrome/content/code/HTTPSRules.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js index 33e77aede9d1..eae7dc444e67 100644 --- a/src/chrome/content/code/HTTPSRules.js +++ b/src/chrome/content/code/HTTPSRules.js @@ -631,14 +631,12 @@ const HTTPSRules = { var results = []; var attempt = function(target) { - // First check for this target in our in-memory negative cache - if (this.targets[target] && // Then our positive cache + // First try the in-memory rulesets + if (this.targets[target] && this.targets[target].length > 0) { this.setInsert(results, this.targets[target]); } else if (this.targetsAvailable.has(target)) { // If not found there, check the DB and load the ruleset as appropriate - // TODO: Add negative caching so we don't repeatedly query the DB for - // things that aren't there. var rulesets = this.queryTarget(target); if (rulesets.length > 0) { for (var i = 0; i < rulesets.length; i++) { @@ -653,6 +651,8 @@ const HTTPSRules = { } }.bind(this); + attempt(host); + // replace each portion of the domain with a * in turn var segmented = host.split("."); for (i = 0; i < segmented.length; ++i) { @@ -664,7 +664,7 @@ const HTTPSRules = { } // now eat away from the left, with *, so that for x.y.z.google.com we // check *.z.google.com and *.google.com (we did *.y.z.google.com above) - for (i = 1; i <= segmented.length - 2; ++i) { + for (i = 2; i <= segmented.length - 2; ++i) { t = "*." + segmented.slice(i,segmented.length).join("."); attempt(t); } From ba496d789be3cd6f8d97fb40d11df8533768b701 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Sat, 18 Jan 2014 23:20:46 -0800 Subject: [PATCH 5/5] Remove index on targets table and change JS-side query to match.
--- src/chrome/content/code/HTTPSRules.js | 30 +++++++++++++-------------- utils/make-sqlite.py | 1 - 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js index eae7dc444e67..3095de86ba09 100644 --- a/src/chrome/content/code/HTTPSRules.js +++ b/src/chrome/content/code/HTTPSRules.js @@ -425,25 +425,24 @@ const HTTPSRules = { var dbFile = FileUtils.getFile("ProfD", ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]); var rulesetDBConn = Services.storage.openDatabase(dbFile); - this.queryForTarget = rulesetDBConn.createStatement( - "select id, contents from targets, rulesets " + - "where targets.ruleset_id = rulesets.id and host = :target;"); + this.queryForRuleset = rulesetDBConn.createStatement( + "select contents from rulesets where id = :id"); // Preload the list of which targets are available in the DB. // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD), // but is faster than loading all of the rulesets. If this becomes a // bottleneck, change it to load in a background webworker, or load // a smaller bloom filter instead. 
- this.targetsAvailable = new Set(); // Firefox-specific - var targetsQuery = rulesetDBConn.createStatement("select host from targets"); - this.log(WARN, "Adding targets..."); + this.targetsAvailable = {}; + var targetsQuery = rulesetDBConn.createStatement("select host, ruleset_id from targets"); + this.log(DBUG, "Adding targets..."); while (targetsQuery.executeStep()) { var host = targetsQuery.row.host; - this.targetsAvailable.add(host); + this.targetsAvailable[host] = targetsQuery.row.ruleset_id; } - this.log(WARN, "Done adding targets."); + this.log(DBUG, "Done adding targets."); } catch(e) { - this.log(WARN,"Rules Failed: "+e); + this.log(DBUG,"Rules Failed: "+e); } var t2 = new Date().getTime(); this.log(NOTE,"Loading rulesets took " + (t2 - t1) / 1000.0 + " seconds"); @@ -610,17 +609,16 @@ const HTTPSRules = { // flow? Perhaps we can preload all targets from the DB into memory at startup // so we only hit the DB when we know there is something to be had. queryTarget: function(target) { - this.log(WARN, "Querying DB for " + target); + this.log(DBUG, "Querying DB for " + target); var output = []; - var statement = this.queryForTarget.clone(); - statement.params.target = target; + this.queryForRuleset.params.id = this.targetsAvailable[target]; try { - while (statement.executeStep()) - output.push(statement.row.contents); + while (this.queryForRuleset.executeStep()) + output.push(this.queryForRuleset.row.contents); } finally { - statement.reset(); + this.queryForRuleset.reset(); } return output; }, @@ -635,7 +633,7 @@ const HTTPSRules = { if (this.targets[target] && this.targets[target].length > 0) { this.setInsert(results, this.targets[target]); - } else if (this.targetsAvailable.has(target)) { + } else if (this.targetsAvailable[target]) { // If not found there, check the DB and load the ruleset as appropriate var rulesets = this.queryTarget(target); if (rulesets.length > 0) { diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py index 
9d15350f466f..501c74f5953b 100755 --- a/utils/make-sqlite.py +++ b/utils/make-sqlite.py @@ -40,7 +40,6 @@ def nomes_all(where=sys.argv[1:]): c.execute('''CREATE TABLE targets (host TEXT, ruleset_id INTEGER)''') -c.execute('''CREATE INDEX host_index on targets(host)''') parser = etree.XMLParser(remove_blank_text=True)