diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4cd3289
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# Synopsis
+A general-purpose tokenizer for JavaScript. The interface more or less follows
+the WriteStream from [node.js](http://nodejs.org).
+
+## How to
+* require the Tokenizer constructor
+    var Tokenizer = require('tokenizer');
+* construct one (we'll see what the callback is used for)
+    var t = new Tokenizer(mycallback);
+* add rules
+    t.addRule(/myawesome regex/, 'type');
+* write or pump to it (sketched in examples/pipe.js)
+    t.write(data);
+    // or
+    stream.pipe(t);
+* listen for new tokens
+    t.on('token', function(token, type) {
+        // do something useful
+        // type is the type of the token (specified with addRule)
+        // token is the actual matching string
+    });
+    // alternatively you can listen on the 'data' event
+* look out for the end
+    t.on('end', callback);
+
+The optional callback argument to the constructor is a function that will
+be called for each token; it can assign the token a different type by
+returning a string. The parameters passed to the function are token (the
+matching string that was found) and match, an object like this:
+    {
+        regex: /whatever/, // the regex that matched the token
+        type: 'type' // the type of the token
+    }
+
+## Rules
+Rules are regular expressions associated with a type name.
+The tokenizer tries to find the longest string matching one or more rules.
+When several rules match the same string, priority is given to the rule
+which was added first. (This may change.) See examples/priority.js for a sketch.
+
+## To do
+* a lot of optimisation
+* being able to share rules across several tokenizers
+  (although this can be achieved with inheritance)
+* probably more hooks
+* more checking
diff --git a/examples/test.js b/examples/test.js
new file mode 100644
index 0000000..d936bbe
--- /dev/null
+++ b/examples/test.js
@@ -0,0 +1,21 @@
+var Tokenizer = require('..'); // assumes the package root exports the constructor
+var t = new Tokenizer(function(token, match) {
+    // change the type of the token before emitting it
+    if(match.type == 'word' && token == "coucou") return "coucou";
+    // this helps reduce the number of RegExps needed
+});
+
+// print each token as it is emitted
+t.on('token', function(token, type) {
+    console.log('%s(%s)', token, type);
+});
+
+t.addRule(/^"[^"]*"$/, 'citation');
+t.addRule(/^"[^"]*$/, 'maybe-citation'); // same as above without the closing "
+t.addRule(/^salut$/i, 'salut');
+t.addRule(/^[',;.:!?-]$/, 'punctuation');
+t.addRule(/^\w+$/, 'word');
+t.addRule(/^(\s)+$/, 'whitespace');
+
+t.write("coucou Salut\t les \n amis. On m'a dit \"ca va bien?\" ");
+t.end();
diff --git a/examples/test2.js b/examples/test2.js
new file mode 100644
index 0000000..c5ed5bd
--- /dev/null
+++ b/examples/test2.js
@@ -0,0 +1,39 @@
+var Tokenizer = require('..'); // assumes the package root exports the constructor
+var t = new Tokenizer();
+t.on('token', function(token, type) {
+    console.log('%s(%s)', token, type);
+});
+t.addRule(/^"([^"\\]|\\.)*"$/, 'string'); // handles escaped characters such as \"
+t.addRule(/^"([^"\\]|\\.)*$/, 'maybe-string'); // same as above without the closing "
+t.addRule(/^\d+(\.\d+)?$/, 'number');
+t.addRule(/^\d+\.$/, 'maybe-float');
+t.addRule(/^(true|false)$/, 'bool');
+t.addRule(/^null$/, 'null');
+t.addRule(/^\{$/, 'begin-object');
+t.addRule(/^\}$/, 'end-object');
+t.addRule(/^\[$/, 'begin-array');
+t.addRule(/^\]$/, 'end-array');
+t.addRule(/^:$/, 'end-label');
+t.addRule(/^,$/, 'comma');
+t.addRule(/^\w+$/, 'symbol');
+t.addRule(/^(\s)+$/, 'whitespace');
+
+var o = {
+    coucou: 'salut',
+    complicated: "haha 안녕,; :! {fdf} ' \' \" ",
+    nombre: 8,
+    bool: false,
+    gn: null,
+    oo: {
+        a: [
+            'coucou',
+            888.3,
+            false
+        ]
+    }
+};
+
+var str = JSON.stringify(o);
+console.log('parsing %s', str);
+t.write(str);
+t.end();
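diff --git a/examples/pipe.js b/examples/pipe.js
new file mode 100644
--- /dev/null
+++ b/examples/pipe.js
@@ -0,0 +1,22 @@
+// A minimal sketch of the "write or pump to it" bullet from the README,
+// assuming the Tokenizer honours the writable stream interface the README
+// describes and that the package root exports the constructor. The file
+// name, rules and type names here are illustrative, not part of the package.
+var fs = require('fs');
+var Tokenizer = require('..');
+
+var t = new Tokenizer();
+t.addRule(/^\w+$/, 'word');
+t.addRule(/^\s+$/, 'whitespace');
+t.addRule(/^[^\w\s]+$/, 'other');
+
+t.on('token', function(token, type) {
+    console.log('%s(%s)', token, type);
+});
+t.on('end', function() {
+    console.log('done');
+});
+
+// pipe any readable stream into the tokenizer; per the README, 'end'
+// fires once the source stream is exhausted
+fs.createReadStream(__filename).pipe(t);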
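diff --git a/examples/priority.js b/examples/priority.js
new file mode 100644
--- /dev/null
+++ b/examples/priority.js
@@ -0,0 +1,20 @@
+// A sketch of the matching behaviour described in the README's Rules
+// section: the longest match wins, and among rules matching the same
+// string the rule added first takes priority. The expected output noted
+// below follows the README's description, not verified behaviour.
+var Tokenizer = require('..');
+var t = new Tokenizer();
+
+// 'keyword' is added before 'word', so "var" should be typed as keyword
+t.addRule(/^(var|function|return)$/, 'keyword');
+t.addRule(/^\w+$/, 'word');
+t.addRule(/^\s+$/, 'whitespace');
+
+t.on('token', function(token, type) {
+    console.log('%s(%s)', token, type);
+});
+
+// "variable" should come out as variable(word), not var(keyword) followed
+// by iable(word), because the tokenizer prefers the longest matching string
+t.write('var variable');
+t.end();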