This repository has been archived by the owner on Nov 6, 2023. It is now read-only.

Firefox startup load time: Use SQLite for ruleset DB #90

Merged
Merged 6 commits on Jan 22, 2014
16 changes: 7 additions & 9 deletions makexpi.sh
@@ -15,6 +15,7 @@ APP_NAME=https-everywhere
# ./makexpi.sh 0.2.3.development.2

cd "`dirname $0`"
RULESETS_SQLITE="$PWD/src/defaults/rulesets.sqlite"

[ -d pkg ] || mkdir pkg

@@ -97,6 +98,11 @@ if [ "$1" != "--fast" ] ; then
fi
# =============== END VALIDATION ================

if [ "$1" != "--fast" -o ! -f "$RULESETS_SQLITE" ] ; then
echo "Generating sqlite DB"
./utils/make-sqlite.py src/chrome/content/rules
fi

# The name/version of the XPI we're building comes from src/install.rdf
XPI_NAME="pkg/$APP_NAME-`grep em:version src/install.rdf | sed -e 's/[<>]/ /g' | cut -f3`"
if [ "$1" ] && [ "$1" != "--fast" ] ; then
@@ -114,14 +120,6 @@ if [ -e "$GIT_OBJECT_FILE" ]; then
  export GIT_COMMIT_ID=$(cat "$GIT_OBJECT_FILE")
fi

# Unless we're in a hurry and there's already a ruleset library, build it from
# the ruleset .xml files

if [ "$1" = "--fast" ] ; then
  FAST="--fast"
fi
python ./utils/merge-rulesets.py $FAST

cd src

# Build the XPI!
@@ -135,7 +133,7 @@ if [ "$ret" != 0 ]; then
  rm -f "../$XPI_NAME"
  exit "$?"
else
  echo >&2 "Total included rules: `find chrome/content/rules -name "*.xml" | wc -l`"
  echo >&2 "Total included rules: `sqlite3 $RULESETS_SQLITE 'select count(*) from rulesets'`"
  echo >&2 "Rules disabled by default: `find chrome/content/rules -name "*.xml" | xargs grep -F default_off | wc -l`"
  echo >&2 "Created $XPI_NAME"
  if [ -n "$BRANCH" ]; then
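For reference, the count the build now reports can be reproduced outside the script with a few lines of Python (a minimal sketch, assuming the `rulesets`/`targets` schema created by `utils/make-sqlite.py` below, and that the DB has already been generated):

```python
import sqlite3

conn = sqlite3.connect("src/defaults/rulesets.sqlite")
c = conn.cursor()
# The same query makexpi.sh runs for its "Total included rules" line.
n_rulesets = c.execute("SELECT count(*) FROM rulesets").fetchone()[0]
n_targets = c.execute("SELECT count(*) FROM targets").fetchone()[0]
print("Total included rules: %d" % n_rulesets)
print("Total target hosts: %d" % n_targets)
conn.close()
```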
113 changes: 79 additions & 34 deletions src/chrome/content/code/HTTPSRules.js
@@ -280,6 +280,12 @@ const RuleWriter = {

    sstream.close();
    fstream.close();
    return this.readFromString(data, rule_store, file);
  },

  readFromString: function(data, rule_store, file) {
    if (typeof file === 'undefined') file = {path: 'fromString'};

    // XXX: With DOMParser, we probably no longer need to throw away the
    // XML declaration.
    data = data.replace(/<\?xml[^>]*\?>/, "");
@@ -414,30 +420,29 @@ const HTTPSRules = {
    this.rulesetsByName = {};
    var t1 = new Date().getTime();
    this.checkMixedContentHandling();
    var rulefiles = RuleWriter.enumerate(RuleWriter.getCustomRuleDir());
    this.scanRulefiles(rulefiles);
    rulefiles = RuleWriter.enumerate(RuleWriter.getRuleDir());
    this.scanRulefiles(rulefiles);
    var t,i;
    for (t in this.targets) {
      for (i = 0 ; i < this.targets[t].length ; i++) {
        this.log(INFO, t + " -> " + this.targets[t][i].name);
      }
    }

    // for any rulesets with <target host="*">
    // every URI needs to be checked against these rulesets
    // (though currently we don't ship any)
    this.global_rulesets = this.targets["*"] ? this.targets["*"] : [];

    this.rulesets.sort(
      function(r1,r2) {
        if (r1.name.toLowerCase() < r2.name.toLowerCase()) return -1;
        else return 1;
      }
    );
    // Initialize database connection.
    var dbFile = FileUtils.getFile("ProfD",
      ["extensions", "https-everywhere@eff.org", "defaults", "rulesets.sqlite"]);
    var rulesetDBConn = Services.storage.openDatabase(dbFile);
    this.queryForRuleset = rulesetDBConn.createStatement(
      "select contents from rulesets where id = :id");

    // Preload the list of which targets are available in the DB.
    // This is a little slow (287 ms on a Core2 Duo @ 2.2GHz with SSD),
    // but is faster than loading all of the rulesets. If this becomes a
    // bottleneck, change it to load in a background webworker, or load
    // a smaller bloom filter instead.
    this.targetsAvailable = {};
    var targetsQuery = rulesetDBConn.createStatement("select host, ruleset_id from targets");
    this.log(DBUG, "Adding targets...");
    while (targetsQuery.executeStep()) {
      var host = targetsQuery.row.host;
      this.targetsAvailable[host] = targetsQuery.row.ruleset_id;
    }
    this.log(DBUG, "Done adding targets.");
  } catch(e) {
    this.log(WARN,"Rules Failed: "+e);
    this.log(DBUG,"Rules Failed: "+e);
  }
  var t2 = new Date().getTime();
  this.log(NOTE,"Loading rulesets took " + (t2 - t1) / 1000.0 + " seconds");
@@ -491,6 +496,8 @@ const HTTPSRules = {
    }
  },

  httpMatch: /^http/i,

  rewrittenURI: function(alist, input_uri) {
    // This function oversees the task of working out if a uri should be
    // rewritten, what it should be rewritten to, and recordkeeping of which
@@ -511,7 +518,7 @@ try {
    try {
      var rs = this.potentiallyApplicableRulesets(uri.host);
    } catch(e) {
      this.log(WARN, 'Could not check applicable rules for '+uri.spec);
      this.log(WARN, 'Could not check applicable rules for '+uri.spec + '\n'+e);
      return null;
    }

@@ -595,31 +602,69 @@ intoList.push(fromList[i]);
      intoList.push(fromList[i]);
  },

  // Try to find a ruleset in the SQLite database for a given target (e.g.
  // '*.openssl.org').
  // NOTE: This call runs synchronously, which can lock up the browser UI. Is
  // there any way to fix that, given that we need to run blocking in the request
  // flow? Perhaps we can preload all targets from the DB into memory at startup
  // so we only hit the DB when we know there is something to be had.
  queryTarget: function(target) {
    this.log(DBUG, "Querying DB for " + target);
    var output = [];

    this.queryForRuleset.params.id = this.targetsAvailable[target];

    try {
      while (this.queryForRuleset.executeStep())
        output.push(this.queryForRuleset.row.contents);
    } finally {
      this.queryForRuleset.reset();
    }
    return output;
  },

  potentiallyApplicableRulesets: function(host) {
    // Return a list of rulesets that declare targets matching this host
    var i, tmp, t;
    var results = this.global_rulesets.slice(0); // copy global_rulesets
    try {
      if (this.targets[host])
        results = results.concat(this.targets[host]);
    } catch(e) {
      this.log(DBUG,"Couldn't check for ApplicableRulesets: " + e);
      return [];
    }
    var results = [];

    var attempt = function(target) {
      // First try the in-memory rulesets
      if (this.targets[target] &&
          this.targets[target].length > 0) {
        this.setInsert(results, this.targets[target]);
      } else if (this.targetsAvailable[target]) {
        // If not found there, check the DB and load the ruleset as appropriate
        var rulesets = this.queryTarget(target);
        if (rulesets.length > 0) {
          for (var i = 0; i < rulesets.length; i++) {
            var ruleset = rulesets[i];
            this.log(INFO, "Found ruleset in DB for " + host + ": " + ruleset);
            RuleWriter.readFromString(ruleset, this);
            this.setInsert(results, this.targets[target]);
          }
        } else {
          this.nonTargets[target] = 1;
        }
      }
    }.bind(this);

    attempt(host);

    // replace each portion of the domain with a * in turn
    var segmented = host.split(".");
    for (i = 0; i < segmented.length; ++i) {
      tmp = segmented[i];
      segmented[i] = "*";
      t = segmented.join(".");
      segmented[i] = tmp;
      this.setInsert(results, this.targets[t]);
      attempt(t);
    }
    // now eat away from the left, with *, so that for x.y.z.google.com we
    // check *.z.google.com and *.google.com (we did *.y.z.google.com above)
    for (i = 1; i <= segmented.length - 2; ++i) {
    for (i = 2; i <= segmented.length - 2; ++i) {
      t = "*." + segmented.slice(i,segmented.length).join(".");
      this.setInsert(results, this.targets[t]);
      attempt(t);
    }
    this.log(DBUG,"Potentially applicable rules for " + host + ":");
    for (i = 0; i < results.length; ++i)
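To make the matching order concrete, here is a small Python sketch (illustrative only, not part of the PR) of the candidate targets that `potentiallyApplicableRulesets` tries for a host: the exact host, each label replaced by `*` in turn, then left-anchored wildcards that eat labels from the left:

```python
def candidate_targets(host):
    """Enumerate the target patterns checked for a host, in order.

    Mirrors the lookup in HTTPSRules.potentiallyApplicableRulesets:
    exact host, single-label wildcards, then left wildcards. The left
    wildcard loop starts at index 2 because "*.y.z.google.com" was
    already produced by the single-label pass.
    """
    yield host
    labels = host.split(".")
    for i in range(len(labels)):
        yield ".".join(labels[:i] + ["*"] + labels[i+1:])
    for i in range(2, len(labels) - 1):
        yield "*." + ".".join(labels[i:])

print(list(candidate_targets("x.y.z.google.com")))
# ['x.y.z.google.com', '*.y.z.google.com', 'x.*.z.google.com',
#  'x.y.*.google.com', 'x.y.z.*.com', 'x.y.z.google.*',
#  '*.z.google.com', '*.google.com']
```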
3 changes: 3 additions & 0 deletions src/components/https-everywhere.js
@@ -31,6 +31,9 @@ const Cc = Components.classes;
const Cu = Components.utils;
const Cr = Components.results;

Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/FileUtils.jsm");

const CP_SHOULDPROCESS = 4;

const SERVICE_CTRID = "@eff.org/https-everywhere;1";
72 changes: 72 additions & 0 deletions utils/make-sqlite.py
@@ -0,0 +1,72 @@
#!/usr/bin/python2.7
#
# Builds an sqlite DB containing all the rulesets, indexed by target.

import sqlite3
import argparse
import sys, re, os

from lxml import etree

parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Build an SQLite DB of rulesets, indexed by target host.")
parser.add_argument('ruleset', metavar='XML directory', type=str, nargs="*",
    default="src/chrome/content/rules",
    help='Directory of ruleset XML files to include.')

args = parser.parse_args()

def nomes_all(where=sys.argv[1:]):
    """Returns a generator that yields all files from a list of files/dirs"""
    if not where: where = ['.']
    for i in where:
        if os.path.isfile(i):
            yield i
        elif os.path.isdir(i):
            for r, d, f in os.walk(i):
                for fi in f:
                    yield os.path.join(r, fi)


conn = sqlite3.connect(os.path.join(os.path.dirname(__file__), '../src/defaults/rulesets.sqlite'))
c = conn.cursor()
c.execute('''DROP TABLE IF EXISTS rulesets''')
c.execute('''CREATE TABLE rulesets
             (id INTEGER PRIMARY KEY,
              name TEXT,
              contents TEXT)''')
c.execute('''DROP TABLE IF EXISTS targets''')
c.execute('''CREATE TABLE targets
             (host TEXT,
              ruleset_id INTEGER)''')

parser = etree.XMLParser(remove_blank_text=True)

for fi in nomes_all():
    try:
        tree = etree.parse(fi, parser)
    except Exception as oops:
        if fi[-4:] != ".xml":
            continue
        print("%s failed XML validity: %s\n" % (fi, oops))
        # Skip the malformed file rather than falling through with a
        # stale (or unbound) tree from a previous iteration.
        continue
    if not tree.xpath("/ruleset"):
        continue

    # Remove comments to save space.
    etree.strip_tags(tree, etree.Comment)

    targets = tree.xpath("/ruleset/target/@host")
    # TODO: Strip target tags too. Right now the JS code requires there be a
    # target tag.
    #etree.strip_tags(tree,'target')

    # TODO: filter out comments and targets to save storage bytes
    ruleset_name = tree.xpath("/ruleset/@name")[0]
    c.execute('''INSERT INTO rulesets (name, contents) VALUES(?, ?)''',
              (ruleset_name, etree.tostring(tree)))
    ruleset_id = c.lastrowid
    for target in targets:
        c.execute('''INSERT INTO targets (host, ruleset_id) VALUES(?, ?)''',
                  (target, ruleset_id))

conn.commit()
conn.close()
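For illustration, the lookup side in HTTPSRules.js maps onto plain sqlite3 calls like the following (a minimal sketch, assuming the schema above and a DB at `src/defaults/rulesets.sqlite`; the real extension goes through mozIStorage statements rather than Python):

```python
import sqlite3

conn = sqlite3.connect("src/defaults/rulesets.sqlite")
c = conn.cursor()

# Startup: preload the host -> ruleset_id map, like this.targetsAvailable.
targets_available = dict(c.execute("SELECT host, ruleset_id FROM targets"))

def query_target(target):
    """Fetch ruleset XML for a target host, like HTTPSRules.queryTarget."""
    ruleset_id = targets_available.get(target)
    if ruleset_id is None:
        return []
    c.execute("SELECT contents FROM rulesets WHERE id = ?", (ruleset_id,))
    return [row[0] for row in c.fetchall()]

for contents in query_target("*.openssl.org"):
    print(contents[:60])
```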