Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Add readme and examples

  • Loading branch information...
commit 96ebfdf7767691acfd7c97ee03fc774d7717f7b3 1 parent de04cc1
@Floby authored
Showing with 101 additions and 0 deletions.
  1. +46 −0 README.md
  2. +17 −0 examples/test.js
  3. +38 −0 examples/test2.js
View
46 README.md
@@ -0,0 +1,46 @@
+# Synopsis
+A general-purpose tokenizer for JavaScript. The interface more or less follows
+the WriteStream from [node.js](http://nodejs.org).
+
+## How to
+* require the Tokenizer constructor
+ var Tokenizer = require('tokenizer');
+* construct one (we'll see what the callback is used for)
+ var t = new Tokenizer(mycallback);
+* add rules
+ t.addRule(/myawesome regex/, 'type');
+* write or pump to it
+ t.write(data);
+ // or
+ stream.pipe(t);
+* listen for new tokens
+ t.on('token', function(token, type) {
+ // do something useful
+ // type is the type of the token (specified with addRule)
+ // token is the actual matching string
+ })
+ // alternatively you can listen on the 'data' event
+* look out for the end
+ t.on('end', callback);
+
+The optional callback argument for the constructor is a function that is
+called for each token and can assign it a different type by returning
+a string. The parameters passed to the function are `token` (the matched
+token that was found) and `match`, an object like this:
+ {
+ regex: /whatever/, // the regex that matched the token
+ type: 'type' // the type of the token
+ }
+
+## Rules
+Rules are regular expressions, each associated with a type name.
+The tokenizer tries to find the longest string matching one or more rules.
+When several rules match the same string, priority is given to the rule
+which was added first. (this may change)
+
+## To do
+* a lot of optimisation
+* being able to share rules across several tokenizers
+ (although this can be achieved with inheritance)
+* probably more hooks
+* more checking
View
17 examples/test.js
@@ -0,0 +1,17 @@
// Example: tokenize a French sentence with a type-refinement callback.
var Tokenizer = require('./Tokenizer');

// The constructor callback runs for every token before it is emitted;
// returning a string overrides the token's type (see README).
var t = new Tokenizer(function(token, match) {
  // change the type of the token before emitting it
  // (use strict equality; both operands are strings)
  if (match.type === 'word' && token === "coucou") return "coucou";
  // this helps reduce the number of RegExps needed
});

// Rules are tried in insertion order: earlier rules win ties.
t.addRule(/^"[^"]*"$/, 'citation');
t.addRule(/^"[^"]*$/, 'maybe citation'); // fixed: was missing a terminating semicolon
t.addRule(/^salut$/i, 'salut');
t.addRule(/^[',;.:!?-]$/, 'ponctuation');
t.addRule(/^\w+$/, "word");
t.addRule(/^(\s)+$/, 'whitespace');

t.write("coucou Salut\t les \n amis. On m'a dit \"ca va bien?\" ");
t.end();
View
38 examples/test2.js
@@ -0,0 +1,38 @@
// Example: tokenize the JSON serialization of a sample object.
var Tokenizer = require('./Tokenizer');

// Log every emitted token alongside its resolved type.
var t = new Tokenizer();
t.on('token', function(token, type) {
  console.log('%s(%s)', token, type);
});

// JSON-flavoured rule table. Registration order matters: when several
// rules match the same string, the one added first takes priority.
var rules = [
  [/^"([^"]|\\")*"$/, 'string'],
  [/^"([^"]|\\")*$/, 'maybe-string'], // same as above without the ending "
  [/^\d+(\.\d+)?$/, 'number'],
  [/^\d+\.$/, 'maybe-float'],
  [/^(true|false)$/, 'bool'],
  [/^null$/, 'null'],
  [/^\{$/, 'begin-object'],
  [/^\}$/, 'end-object'],
  [/^\[$/, 'begin-array'],
  [/^\]$/, 'end-array'],
  [/^:$/, 'end-label'],
  [/^,$/, 'comma'],
  [/^\w+$/, "symbol"],
  [/^(\s)+$/, 'whitespace']
];
for (var i = 0; i < rules.length; i++) {
  t.addRule(rules[i][0], rules[i][1]);
}

// Sample data exercising strings with escapes and non-ASCII text,
// numbers, booleans, null, and nested objects/arrays.
var sample = {
  coucou: 'salut',
  complicated: "haha 안녕,; :! {fdf} ' \' \" ",
  nombre: 8,
  bool: false,
  gn: null,
  oo: {
    a: [
      'coucou',
      888.3,
      false
    ]
  }
};

var str = JSON.stringify(sample);
console.log('parsing %s', str);
t.write(str);
Please sign in to comment.
Something went wrong with that request. Please try again.