diff --git a/makexpi.sh b/makexpi.sh
index b138e8fe6681..4c38159228aa 100755
--- a/makexpi.sh
+++ b/makexpi.sh
@@ -15,6 +15,7 @@ APP_NAME=https-everywhere
 # ./makexpi.sh 0.2.3.development.2
 
 cd "`dirname $0`"
+RULESETS_SQLITE="$PWD/src/defaults/rulesets.sqlite"
 
 [ -d pkg ] || mkdir pkg
 
@@ -97,6 +98,11 @@ if [ "$1" != "--fast" ] ; then
 fi
 # =============== END VALIDATION ================
 
+if [ "$1" != "--fast" -o ! -f "$RULESETS_SQLITE" ] ; then
+  echo "Generating sqlite DB"
+  ./utils/make-sqlite.py src/chrome/content/rules
+fi
+
 # The name/version of the XPI we're building comes from src/install.rdf
 XPI_NAME="pkg/$APP_NAME-`grep em:version src/install.rdf | sed -e 's/[<>]/ /g' | cut -f3`"
 if [ "$1" ] && [ "$1" != "--fast" ] ; then
@@ -114,14 +120,6 @@ if [ -e "$GIT_OBJECT_FILE" ]; then
   export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE")
 fi
 
-# Unless we're in a hurry and there's already a ruleset library, build it from
-# the ruleset .xml files
-
-if [ "$1" = "--fast" ] ; then
-  FAST="--fast"
-fi
-python ./utils/merge-rulesets.py $FAST
-
 cd src
 
 # Build the XPI!
@@ -135,7 +133,7 @@ if [ "$ret" != 0 ]; then
   rm -f "../$XPI_NAME"
   exit "$?"
 else
-  echo >&2 "Total included rules: `find chrome/content/rules -name "*.xml" | wc -l`"
+  echo >&2 "Total included rules: `sqlite3 \"$RULESETS_SQLITE\" 'select count(*) from rulesets'`"
   echo >&2 "Rules disabled by default: `find chrome/content/rules -name "*.xml" | xargs grep -F default_off | wc -l`"
   echo >&2 "Created $XPI_NAME"
   if [ -n "$BRANCH" ]; then
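A quick way to sanity-check the database this step produces (a minimal sketch, not part of the patch; it assumes the default output path used by utils/make-sqlite.py below, and "www.eff.org" is just an example host):

    import sqlite3

    conn = sqlite3.connect("src/defaults/rulesets.sqlite")
    c = conn.cursor()

    # The same count that makexpi.sh now reports as "Total included rules".
    print(c.execute("select count(*) from rulesets").fetchone()[0])

    # Two-step lookup mirroring what the extension does: resolve a target
    # host to a ruleset_id, then fetch the ruleset XML by id.
    row = c.execute(
        "select r.name, r.contents from targets t "
        "join rulesets r on r.id = t.ruleset_id "
        "where t.host = ?", ("www.eff.org",)).fetchone()
    print(row)
    conn.close()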
diff --git a/src/chrome/content/code/HTTPSRules.js b/src/chrome/content/code/HTTPSRules.js
index 23bc2332244c..3095de86ba09 100644
--- a/src/chrome/content/code/HTTPSRules.js
+++ b/src/chrome/content/code/HTTPSRules.js
@@ -280,6 +280,12 @@ const RuleWriter = {
     sstream.close();
     fstream.close();
 
+    return this.readFromString(data, rule_store, file);
+  },
+
+  readFromString: function(data, rule_store, file) {
+    if (typeof file === 'undefined') file = {path: 'fromString'};
+
     // XXX: With DOMParser, we probably do not need to throw away the XML
     // declaration anymore nowadays.
     data = data.replace(/<\?xml[^>]*\?>/, "");
@@ -414,30 +420,29 @@ const HTTPSRules = {
     this.rulesetsByName = {};
     var t1 = new Date().getTime();
     this.checkMixedContentHandling();
-      var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir());
-      this.scanRulefiles(rulefiles);
-      rulefiles = RuleWriter.enumerate(RuleWriter.getRuleDir());
-      this.scanRulefiles(rulefiles);
-      var t,i;
-      for (t in this.targets) {
-        for (i = 0 ; i < this.targets[t].length ; i++) {
-          this.log(INFO, t + " -> " + this.targets[t][i].name);
-        }
-      }
-
-      // for any rulesets with <target host="*">,
-      // every URI needs to be checked against these rulesets
-      // (though currently we don't ship any)
-      this.global_rulesets = this.targets["*"] ? this.targets["*"] : [];
-
-      this.rulesets.sort(
-        function(r1,r2) {
-          if (r1.name.toLowerCase() < r2.name.toLowerCase()) return -1;
-          else return 1;
-        }
-      );
+      // Initialize database connection.
+      var dbFile = FileUtils.getFile("ProfD",
+        ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
+      var rulesetDBConn = Services.storage.openDatabase(dbFile);
+      this.queryForRuleset = rulesetDBConn.createStatement(
+        "select contents from rulesets where id = :id");
+
+      // Preload the list of which targets are available in the DB.
+      // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD),
+      // but is faster than loading all of the rulesets. If this becomes a
+      // bottleneck, change it to load in a background webworker, or load
+      // a smaller bloom filter instead.
+      this.targetsAvailable = {};
+      var targetsQuery = rulesetDBConn.createStatement("select host, ruleset_id from targets");
+      this.log(DBUG, "Adding targets...");
+      while (targetsQuery.executeStep()) {
+        var host = targetsQuery.row.host;
+        this.targetsAvailable[host] = targetsQuery.row.ruleset_id;
+      }
+      this.log(DBUG, "Done adding targets.");
     } catch(e) {
-      this.log(WARN,"Rules Failed: "+e);
+      this.log(DBUG,"Rules Failed: "+e);
     }
     var t2 = new Date().getTime();
     this.log(NOTE,"Loading rulesets took " + (t2 - t1) / 1000.0 + " seconds");
@@ -491,6 +496,8 @@ const HTTPSRules = {
     }
   },
 
+  httpMatch: /^http/i,
+
   rewrittenURI: function(alist, input_uri) {
     // This function oversees the task of working out if a uri should be
     // rewritten, what it should be rewritten to, and recordkeeping of which
@@ -511,7 +518,7 @@ const HTTPSRules = {
     try {
       var rs = this.potentiallyApplicableRulesets(uri.host);
     } catch(e) {
-      this.log(WARN, 'Could not check applicable rules for '+uri.spec);
+      this.log(WARN, 'Could not check applicable rules for '+uri.spec + '\n'+e);
       return null;
     }
 
@@ -595,17 +602,55 @@ const HTTPSRules = {
       intoList.push(fromList[i]);
   },
 
+  // Try to find a ruleset in the SQLite database for a given target
+  // (e.g. '*.openssl.org').
+  // NOTE: This call runs synchronously, which can lock up the browser UI. Is
+  // there any way to fix that, given that we need to run blocking in the
+  // request flow? Perhaps we can preload all targets from the DB into memory
+  // at startup so we only hit the DB when we know there is something to be had.
+  queryTarget: function(target) {
+    this.log(DBUG, "Querying DB for " + target);
+    var output = [];
+
+    this.queryForRuleset.params.id = this.targetsAvailable[target];
+
+    try {
+      while (this.queryForRuleset.executeStep())
+        output.push(this.queryForRuleset.row.contents);
+    } finally {
+      this.queryForRuleset.reset();
+    }
+    return output;
+  },
+
   potentiallyApplicableRulesets: function(host) {
     // Return a list of rulesets that declare targets matching this host
     var i, tmp, t;
-    var results = this.global_rulesets.slice(0); // copy global_rulesets
-    try {
-      if (this.targets[host])
-        results = results.concat(this.targets[host]);
-    } catch(e) {
-      this.log(DBUG,"Couldn't check for ApplicableRulesets: " + e);
-      return [];
-    }
+    var results = [];
+
+    var attempt = function(target) {
+      // First try the in-memory rulesets
+      if (this.targets[target] &&
+          this.targets[target].length > 0) {
+        this.setInsert(results, this.targets[target]);
+      } else if (this.targetsAvailable[target]) {
+        // If not found there, check the DB and load the ruleset as appropriate
+        var rulesets = this.queryTarget(target);
+        if (rulesets.length > 0) {
+          for (var i = 0; i < rulesets.length; i++) {
+            var ruleset = rulesets[i];
+            this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
+            RuleWriter.readFromString(ruleset, this);
+            this.setInsert(results, this.targets[target]);
+          }
+        } else {
+          this.nonTargets[target] = 1;
+        }
+      }
+    }.bind(this);
+
+    attempt(host);
+
     // replace each portion of the domain with a * in turn
     var segmented = host.split(".");
     for (i = 0; i < segmented.length; ++i) {
@@ -613,13 +658,13 @@ const HTTPSRules = {
       tmp = segmented[i];
       segmented[i] = "*";
       t = segmented.join(".");
       segmented[i] = tmp;
-      this.setInsert(results, this.targets[t]);
+      attempt(t);
     }
 
     // now eat away from the left, with *, so that for x.y.z.google.com we
     // check *.z.google.com and *.google.com (we did *.y.z.google.com above)
-    for (i = 1; i <= segmented.length - 2; ++i) {
+    for (i = 2; i <= segmented.length - 2; ++i) {
       t = "*." + segmented.slice(i,segmented.length).join(".");
-      this.setInsert(results, this.targets[t]);
+      attempt(t);
     }
     this.log(DBUG,"Potentially applicable rules for " + host + ":");
     for (i = 0; i < results.length; ++i)
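The wildcard expansion above is easier to follow spelled out. A minimal Python sketch of the same candidate-target generation (illustrative only; candidate_targets is not a function in this patch):

    def candidate_targets(host):
        labels = host.split(".")
        candidates = [host]
        # Replace each label with * in turn.
        for i in range(len(labels)):
            starred = labels[:]
            starred[i] = "*"
            candidates.append(".".join(starred))
        # Eat away from the left; start at 2 because "*." + labels[1:]
        # was already produced by the single-label pass above.
        for i in range(2, len(labels) - 1):
            candidates.append("*." + ".".join(labels[i:]))
        return candidates

    # candidate_targets("x.y.z.google.com") yields, in order:
    #   x.y.z.google.com,
    #   *.y.z.google.com, x.*.z.google.com, x.y.*.google.com,
    #   x.y.z.*.com, x.y.z.google.*,
    #   *.z.google.com, *.google.com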
diff --git a/src/components/https-everywhere.js b/src/components/https-everywhere.js
index db1b42d4a1fc..d45fb214904d 100644
--- a/src/components/https-everywhere.js
+++ b/src/components/https-everywhere.js
@@ -31,6 +31,9 @@ const Cc = Components.classes;
 const Cu = Components.utils;
 const Cr = Components.results;
 
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource://gre/modules/FileUtils.jsm");
+
 const CP_SHOULDPROCESS = 4;
 
 const SERVICE_CTRID = "@eff.org/https-everywhere;1";
diff --git a/utils/make-sqlite.py b/utils/make-sqlite.py
new file mode 100755
index 000000000000..501c74f5953b
--- /dev/null
+++ b/utils/make-sqlite.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python2.7
+#
+# Builds an sqlite DB containing all the rulesets, indexed by target.
+
+import sqlite3
+import argparse
+import os
+
+from lxml import etree
+
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description="Build the rulesets.sqlite database from ruleset XML files.")
+parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
+    default=["src/chrome/content/rules"],
+    help='Directories (or files) of ruleset XML to include.')
+
+args = parser.parse_args()
+
+def nomes_all(where=args.ruleset):
+    """Returns generator to extract all files from a list of files/dirs"""
+    if not where: where = ['.']
+    for i in where:
+        if os.path.isfile(i):
+            yield i
+        elif os.path.isdir(i):
+            for r, d, f in os.walk(i):
+                for fi in f:
+                    yield os.path.join(r, fi)
+
+conn = sqlite3.connect(os.path.join(os.path.dirname(__file__),
+    '../src/defaults/rulesets.sqlite'))
+c = conn.cursor()
+c.execute('''DROP TABLE IF EXISTS rulesets''')
+c.execute('''CREATE TABLE rulesets
+             (id INTEGER PRIMARY KEY,
+              name TEXT,
+              contents TEXT)''')
+c.execute('''DROP TABLE IF EXISTS targets''')
+c.execute('''CREATE TABLE targets
+             (host TEXT,
+              ruleset_id INTEGER)''')
+
+xml_parser = etree.XMLParser(remove_blank_text=True)
+
+for fi in nomes_all():
+    try:
+        tree = etree.parse(fi, xml_parser)
+    except Exception as oops:
+        # Skip non-XML files quietly; report XML files that fail to parse.
+        if fi[-4:] == ".xml":
+            print("%s failed XML validity: %s\n" % (fi, oops))
+        continue
+    if not tree.xpath("/ruleset"):
+        continue
+
+    # Remove comments to save space.
+    etree.strip_tags(tree, etree.Comment)
+
+    targets = tree.xpath("/ruleset/target/@host")
+    # TODO: Strip target tags too, to save more bytes. Right now the JS code
+    # requires each ruleset to keep its target tags.
+    #etree.strip_tags(tree, 'target')
+
+    ruleset_name = tree.xpath("/ruleset/@name")[0]
+    c.execute('''INSERT INTO rulesets (name, contents) VALUES(?, ?)''',
+              (ruleset_name, etree.tostring(tree)))
+    ruleset_id = c.lastrowid
+    for target in targets:
+        c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''',
+                  (target, ruleset_id))
+
+conn.commit()
+conn.close()
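As a round-trip check on the script's output (a hypothetical test, not part of the patch), every stored contents blob should still parse as a ruleset and keep its target hosts, since the extension's readFromString() path depends on both:

    import sqlite3
    from lxml import etree

    conn = sqlite3.connect("src/defaults/rulesets.sqlite")
    for name, contents in conn.execute("select name, contents from rulesets"):
        ruleset = etree.fromstring(contents)
        assert ruleset.tag == "ruleset", name
        # Comments were stripped, but <target> tags must survive.
        assert ruleset.xpath("/ruleset/target/@host"), name
    conn.close()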