Add readme and examples

Floby · Apr 10, 2011 · 96ebfdf · 96ebfdf
1 parent de04cc1
commit 96ebfdf
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,46 @@
+# Synopsis
+A general-purpose tokenizer for JavaScript. The interface more or less follows
+the WriteStream interface from [node.js](http://nodejs.org).
+
+## How to
+* require the Tokenizer constructor
+    var Tokenizer = require('tokenizer');
+* construct one (we'll see what the callback is used for)
+    var t = new Tokenizer(mycallback);
+* add rules
+    t.addRule(/myawesome regex/, 'type');
+* write or pump to it
+    t.write(data);
+    // or
+    stream.pipe(t);
+* listen for new tokens
+    t.on('token', function(token, type) {
+        // do something useful
+        // type is the type of the token (specified with addRule)
+        // token is the actual matching string
+    })
+    // alternatively you can listen on the 'data' event
+* look out for the end
+    t.on('end', callback);
+
+The optional callback argument for the constructor is a function that will
+be called for each token in order to specify a different type by returning
+a string. The parameters passed to the function are `token` (the token that
+was found) and `match`, an object like this:
+    {
+        regex: /whatever/, // the regex that matched the token
+        type: 'type' // the type of the token
+    }
+
+## Rules
+Rules are regular expressions associated with a type name.
+The tokenizer tries to find the longest string matching one or more rules.
+When several rules match the same string, priority is given to the rule
+which was added first. (This may change.)
+
+## To do
+* a lot of optimisation
+* being able to share rules across several tokenizers
+    (although this can be achieved with inheritance)
+* probably more hooks
+* more checking
diff --git a/examples/test.js b/examples/test.js
@@ -0,0 +1,17 @@
+var Tokenizer = require('./Tokenizer'); // example: tokenize a sample sentence and retype one word through the constructor callback
+var t = new Tokenizer(function(token, match) {
+    // change the type of the token before emitting it
+    if(match.type == 'word' && token == "coucou") return "coucou";
+    // this helps reduce the number of RegExps needed
+});
+
+t.addRule(/^"[^"]*"$/, 'citation'); // a complete double-quoted span
+t.addRule(/^"[^"]*$/, 'maybe citation') // an opening quote with no closing quote yet
+t.addRule(/^salut$/i, 'salut'); // case-insensitive, so "Salut" in the input below also matches
+t.addRule(/^[',;.:!?-]$/, 'ponctuation'); // exactly one punctuation character
+t.addRule(/^\w+$/, "word");
+t.addRule(/^(\s)+$/, 'whitespace');
+
+t.write("coucou Salut\t les \n amis. On m'a dit \"ca va bien?\" "); // NOTE(review): this script attaches no 'token' or 'data' listener, so nothing here consumes the tokens; confirm that is intended
+t.end();
+
diff --git a/examples/test2.js b/examples/test2.js
@@ -0,0 +1,38 @@
+var Tokenizer = require('./Tokenizer'); // example: tokenize the JSON text of a sample object and print every token with its type
+var t = new Tokenizer();
+t.on('token', function(token, type) {
+    console.log('%s(%s)', token, type); // printed as token(type)
+});
+t.addRule(/^"([^"]|\\")*"$/, 'string');
+t.addRule(/^"([^"]|\\")*$/, 'maybe-string'); // same as above without the ending "
+t.addRule(/^\d+(\.\d+)?$/, 'number'); // integer, optionally followed by a fractional part
+t.addRule(/^\d+\.$/, 'maybe-float'); // digits and a dot with no fractional digits yet
+t.addRule(/^(true|false)$/, 'bool');
+t.addRule(/^null$/, 'null');
+t.addRule(/^\{$/, 'begin-object');
+t.addRule(/^\}$/, 'end-object');
+t.addRule(/^\[$/, 'begin-array');
+t.addRule(/^\]$/, 'end-array');
+t.addRule(/^:$/, 'end-label');
+t.addRule(/^,$/, 'comma');
+t.addRule(/^\w+$/, "symbol");
+t.addRule(/^(\s)+$/, 'whitespace');
+
+var o = {
+    coucou: 'salut',
+    complicated: "haha 안녕,; :! {fdf} ' \' \" ", // non-ASCII text, punctuation, braces and quotes inside one string value
+    nombre: 8,
+    bool: false,
+    gn: null,
+    oo: {
+        a: [
+            'coucou',
+            888.3,
+            false
+        ]
+    }
+}
+
+var str = JSON.stringify(o); // the tokenizer input is the serialized JSON text, not the object
+console.log('parsing %s', str);
+t.write(str); // NOTE(review): t.end() is never called in this script; confirm the trailing token is still emitted