-
Notifications
You must be signed in to change notification settings - Fork 0
/
figment
executable file
·4545 lines (3648 loc) · 297 KB
/
figment
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
# Run perldoc on this file for documentation.
# For the benefit of HTML viewers:
# <div id='cover' style='position: absolute; left: 0; top: 0; width: 10000px; height: 10000px; background: white'></div>
# Unbuffer STDOUT so output from interactive commands appears immediately.
$|++;
# Global state for the self-modifying attribute system. NOTE(review): this
# header is duplicated verbatim inside the 'bootstrap::initialization'
# attribute below; the two copies must stay in sync when the file is edited.
my %data;                   # attribute name ('type::name') -> serialized value
my %transient;              # per-type options recorded by meta::configure (not serialized)
my %externalized_functions; # external command name -> attribute that implements it
my %datatypes;              # type namespace -> delegate sub that installs values of that type
my %locations; # Maps eval-numbers to attribute names
# Define a new attribute type ('form'). Registers $delegate as the handler for
# $namespace and installs two typeglobs:
#   meta::<namespace>::implementation  -- the raw delegate sub
#   meta::<namespace>                  -- chomps the value, records it in %data
#                                         under "<namespace>::<name>" (unless
#                                         the no_binding option is given), then
#                                         invokes the delegate (unless
#                                         no_delegate).
# Runs without 'use strict'; the *{"..."} symbol-table assignments are
# deliberate and central to how this script builds itself.
sub meta::define_form {
my ($namespace, $delegate) = @_;
$datatypes{$namespace} = $delegate;
*{"meta::${namespace}::implementation"} = $delegate;
*{"meta::$namespace"} = sub {
my ($name, $value, %options) = @_;
chomp $value;
$data{"${namespace}::$name"} = $value unless $options{no_binding};
&$delegate($name, $value) unless $options{no_delegate}}}
# String-eval $what, attributing it to the logical location $where (an
# attribute name such as "function::foo"). The eval('__FILE__') below consumes
# an eval-number N, which is recorded in %locations so errors can later be
# mapped back to attribute names. NOTE(review): this relies on perl assigning
# sequential "(eval N)" numbers; the mapping between the probe eval and the
# eval of $what is off by a fixed offset by design -- confirm against the
# consumers of %locations before changing.
# Any error message has its "(eval N)" token rewritten to $where and is warned,
# not rethrown; returns the eval's result (undef on failure).
sub meta::eval_in {
my ($what, $where) = @_;
# Obtain next eval-number and alias it to the designated location
@locations{eval('__FILE__') =~ /\(eval (\d+)\)/} = ($where);
my $result = eval $what;
$@ =~ s/\(eval \d+\)/$where/ if $@;
warn $@ if $@;
$result}
# The root 'meta' type: a meta attribute's value is Perl code that is
# evaluated immediately when the attribute is defined. All other types below
# are themselves defined via meta attributes.
meta::define_form 'meta', sub {
my ($name, $value) = @_;
meta::eval_in($value, "meta::$name")};
# meta::configure: records per-type options (e.g. extension => '.pl',
# inherit => 1) into %transient, keyed as $transient{option}{datatype}.
meta::meta('configure', <<'__69c2e727c124521d074fde21f8bbc4db');
# A function to configure transients. Transients can be used to store any number of
# different things, but one of the more common usages is type descriptors.
sub meta::configure {
my ($datatype, %options) = @_;
$transient{$_}{$datatype} = $options{$_} for keys %options;
}
__69c2e727c124521d074fde21f8bbc4db
# meta::externalize: publishes a function into the main:: namespace (both
# under its literal name and a shell-safe escaped variant) and records which
# attribute implements it in %externalized_functions.
meta::meta('externalize', <<'__aa44e27e0bbee6f0ca4de25d603a1fc7');
# Function externalization. Data types should call this method when defining a function
# that has an external interface.
sub meta::externalize {
my ($name, $attribute, $implementation) = @_;
my $escaped = $name;
$escaped =~ s/[^A-Za-z0-9:]/_/go;
$externalized_functions{$name} = $externalized_functions{$escaped} = $attribute;
*{"::$name"} = *{"::$escaped"} = $implementation || $attribute;
}
__aa44e27e0bbee6f0ca4de25d603a1fc7
# meta::functor::editable: factory for 'editable' attribute types. Each value
# of such a type is exposed as an external command that, depending on the
# first argument and the type's 'default' option, edits the attribute,
# imports/assigns a new value (from arguments or STDIN), runs a custom
# default sub, or just retrieves the stored value.
meta::meta('functor::editable', <<'__48246c608f363de66511400e00b26164');
# An editable type. This creates a type whose default action is to open an editor
# on whichever value is mentioned. This can be changed using different flags.
sub meta::functor::editable {
my ($typename, %options) = @_;
meta::configure $typename, %options;
meta::define_form $typename, sub {
my ($name, $value) = @_;
$options{on_bind} && &{$options{on_bind}}($name, $value);
meta::externalize $options{prefix} . $name, "${typename}::$name", sub {
my $attribute = "${typename}::$name";
my ($command, @new_value) = @_;
return &{$options{default}}(retrieve($attribute)) if ref $options{default} eq 'CODE' and not defined $command;
return edit($attribute) if $command eq 'edit' or $options{default} eq 'edit' and not defined $command;
return associate($attribute, @new_value ? join(' ', @new_value) : join('', <STDIN>)) if $command eq '=' or $command eq 'import' or $options{default} eq 'import' and not defined $command;
return retrieve($attribute)}}}
__48246c608f363de66511400e00b26164
# 'alias' type: each alias becomes an external command that tokenizes its
# stored text plus the invocation arguments and passes them to shell::run.
meta::meta('type::alias', <<'__889d26d2df385e9ff8e2da7de4e48374');
meta::configure 'alias', inherit => 0;
meta::define_form 'alias', sub {
my ($name, $value) = @_;
meta::externalize $name, "alias::$name", sub {
# Can't pre-tokenize because shell::tokenize doesn't exist until the library::
# namespace has been evaluated (which will be after alias::).
shell::run(shell::tokenize($value), shell::tokenize(@_));
};
};
__889d26d2df385e9ff8e2da7de4e48374
# 'bootstrap' type: inert data; the delegate is a no-op because bootstrap
# attributes are serialized into the file header and have already run by the
# time the data section is loaded.
meta::meta('type::bootstrap', <<'__51108ab2ddb8d966e927c8f62d9ef3e5');
# Bootstrap attributes don't get executed. The reason for this is that because
# they are serialized directly into the header of the file (and later duplicated
# as regular data attributes), they will have already been executed when the
# file is loaded.
meta::configure 'bootstrap', extension => '.pl', inherit => 1;
meta::define_form 'bootstrap', sub {};
__51108ab2ddb8d966e927c8f62d9ef3e5
# 'cache' and 'cached_dependency' types: inert stored data (they reuse the
# bootstrap no-op delegate, so their values are never executed on load).
meta::meta('type::cache', <<'__9267171f2eace476f64a1a670eaaf2c7');
meta::configure 'cache', inherit => 0;
meta::define_form 'cache', \&meta::bootstrap::implementation;
__9267171f2eace476f64a1a670eaaf2c7
meta::meta('type::cached_dependency', <<'__b9dc0b20c2d3af0deb3b835b20cac4a7');
meta::configure 'cached_dependency', inherit => 0, extension => '';
meta::define_form 'cached_dependency', \&meta::bootstrap::implementation;
__b9dc0b20c2d3af0deb3b835b20cac4a7
# 'configuration' type (.conf): the default action parses "key: value" lines
# into a flat key/value list, skipping comment ('#') and blank lines, with
# keys lowercased and internal whitespace mapped to underscores.
meta::meta('type::configuration', <<'__7f5ba514d47ac29a3c226d0e331d9da4');
meta::functor::editable 'configuration', inherit => 0, extension => '.conf', default => sub {
# Any lines starting with #, with or without leading whitespace, are treated as comments.
# Comments are not parsed in option text; that is, you could specify an option that contained
# a # and the # and following text would be considered part of that option.
my ($data) = @_;
my @options = grep /:\h+/o && ! /^\h*#/o && ! /^\h*$/o, split(/\v+/o, $data);
s/^\h+//o for @options;
my @key_values = map split(/\h*:\h+/o, $_, 2), @options;
$key_values[$_ << 1] and $key_values[$_ << 1] =~ s/\s/_/go for 0 .. @key_values >> 1;
$key_values[$_ << 1] and $key_values[$_ << 1] = lc $key_values[$_ << 1] for 0 .. @key_values >> 1;
@key_values;
};
__7f5ba514d47ac29a3c226d0e331d9da4
# 'data' type: extensionless, default action prints the value ('cat').
meta::meta('type::data', 'meta::functor::editable \'data\', extension => \'\', inherit => 0, default => \'cat\';');
# 'fig' type (.fig): editable, inherited by child objects.
meta::meta('type::fig', 'meta::functor::editable \'fig\', default => \'edit\', extension => \'.fig\', inherit => 1;');
# 'function' type (.pl): compiles the attribute body into a sub and
# externalizes it under the attribute's own name.
meta::meta('type::function', <<'__8ea626198861dc59dd7f303eecb5ff88');
meta::configure 'function', extension => '.pl', inherit => 1;
meta::define_form 'function', sub {
my ($name, $value) = @_;
meta::externalize $name, "function::$name", meta::eval_in("sub {\n$value\n}", "function::$name");
};
__8ea626198861dc59dd7f303eecb5ff88
# 'hook' type (.pl): compiled into the hook:: namespace but NOT externalized
# as a user-visible command.
meta::meta('type::hook', <<'__ff92aef328b6bdc6f87ddd0821f3e42f');
meta::configure 'hook', extension => '.pl', inherit => 0;
meta::define_form 'hook', sub {
my ($name, $value) = @_;
*{"hook::$name"} = meta::eval_in("sub {\n$value\n}", "hook::$name");
};
__ff92aef328b6bdc6f87ddd0821f3e42f
# 'inc' type (.pl): materializes the attribute as a real file under
# /tmp/<scriptname>-<pid>/<name> and adds that directory to @INC so the
# attribute can be require'd like a normal module.
# NOTE(review): open/print/close results are unchecked and the /tmp path is
# predictable (pid-based) -- acceptable here only because the whole script
# already trusts its local environment.
meta::meta('type::inc', <<'__78e0375b6725487cb1f0deca41e96bbe');
meta::configure 'inc', inherit => 1, extension => '.pl';
meta::define_form 'inc', sub {
use File::Path 'mkpath';
use File::Basename qw/basename dirname/;
my ($name, $value) = @_;
my $tmpdir = basename($0) . '-' . $$;
my $filename = "/tmp/$tmpdir/$name";
push @INC, "/tmp/$tmpdir" unless grep /^\/tmp\/$tmpdir$/, @INC;
mkpath(dirname($filename));
unless (-e $filename) {
open my $fh, '>', $filename;
print $fh $value;
close $fh;
}
};
__78e0375b6725487cb1f0deca41e96bbe
# 'indicator' type (.pl): compiled into the indicator:: namespace; per the
# attribute's own comment, each indicator's output is appended to the shell
# prompt.
meta::meta('type::indicator', <<'__feb54a2624e6983617685047c717427f');
# Shell indicator function. The output of each of these is automatically
# appended to the shell prompt.
meta::configure 'indicator', inherit => 1, extension => '.pl';
meta::define_form 'indicator', sub {
my ($name, $value) = @_;
*{"indicator::$name"} = meta::eval_in("sub {\n$value\n}", "indicator::$name");
};
__feb54a2624e6983617685047c717427f
# 'internal_function' type (.pl): compiled and installed under the bare
# attribute name (in the current package) without externalization.
meta::meta('type::internal_function', <<'__eff3cf31e2635f51c83836f116c99d2f');
meta::configure 'internal_function', extension => '.pl', inherit => 1;
meta::define_form 'internal_function', sub {
my ($name, $value) = @_;
*{$name} = meta::eval_in("sub {\n$value\n}", "internal_function::$name");
};
__eff3cf31e2635f51c83836f116c99d2f
# 'js' type (.js): editable Javascript attributes.
meta::meta('type::js', 'meta::functor::editable \'js\', extension => \'.js\', inherit => 1;');
# 'library' type (.pl): Perl code evaluated immediately at load time.
meta::meta('type::library', <<'__7622e8d65e03066668bade74715d65ad');
meta::configure 'library', extension => '.pl', inherit => 1;
meta::define_form 'library', sub {
my ($name, $value) = @_;
meta::eval_in($value, "library::$name");
};
__7622e8d65e03066668bade74715d65ad
# 'message_color' type: registers a named terminal color via terminal::color.
meta::meta('type::message_color', <<'__557a1b44979cbf77a7251fbdc4c5b82c');
meta::configure 'message_color', extension => '', inherit => 1;
meta::define_form 'message_color', sub {
my ($name, $value) = @_;
terminal::color($name, $value);
};
__557a1b44979cbf77a7251fbdc4c5b82c
# 'note' type (.sdoc): editable notes, not inherited.
meta::meta('type::note', 'meta::functor::editable \'note\', extension => \'.sdoc\', inherit => 0, default => \'edit\';');
# 'parent' type: inert data naming parent objects (used for inheritance).
meta::meta('type::parent', <<'__09d1d03379e4e0b262e06939f4e00464');
meta::define_form 'parent', \&meta::bootstrap::implementation;
meta::configure 'parent', extension => '', inherit => 1;
__09d1d03379e4e0b262e06939f4e00464
# 'retriever' type (.pl): compiled subs stored in $transient{retrievers},
# presumably consulted by retrieve() -- confirm against the retrieve
# implementation elsewhere in this file.
meta::meta('type::retriever', <<'__71a29050bf9f20f6c71afddff83addc9');
meta::configure 'retriever', extension => '.pl', inherit => 1;
meta::define_form 'retriever', sub {
my ($name, $value) = @_;
$transient{retrievers}{$name} = meta::eval_in("sub {\n$value\n}", "retriever::$name");
};
__71a29050bf9f20f6c71afddff83addc9
__placeholder__
meta::meta('type::sdoc', <<'__22cd7315641d38c9d536344e83c36bed');
# A meta-type for other types. So retrieve('js::main') will work if you have
# the attribute 'sdoc::js::main'. The filename will be main.js.sdoc.
meta::functor::editable 'sdoc', inherit => 1, extension => sub {
extension_for(attribute($_[0])) . '.sdoc';
};
__22cd7315641d38c9d536344e83c36bed
# 'state' type (.pl): inert saved snapshots (bootstrap semantics; created by
# the save-state function per the attribute's own comment).
meta::meta('type::state', <<'__84da7d5220471307f1f990c5057d3319');
# Allows temporary or long-term storage of states. Nothing particularly insightful
# is done about compression, so storing alternative states will cause a large
# increase in size. Also, states don't contain other states -- otherwise the size
# increase would be exponential.
# States are created with the save-state function.
meta::configure 'state', inherit => 0, extension => '.pl';
meta::define_form 'state', \&meta::bootstrap::implementation;
__84da7d5220471307f1f990c5057d3319
# 'template' type (.pl): compiled subs externalized under "template::<name>".
meta::meta('type::template', <<'__bc4b0c80b5efc716b19e99b832c22bf3');
meta::configure 'template', extension => '.pl', inherit => 1;
meta::define_form 'template', sub {
my ($name, $value) = @_;
meta::externalize "template::$name", "template::$name", meta::eval_in("sub {\n$value\n}", "template::$name");
};
__bc4b0c80b5efc716b19e99b832c22bf3
# 'vim_highlighter' type (.vim) and 'watch' type (.pl, prefixed 'watch::').
meta::meta('type::vim_highlighter', 'meta::functor::editable \'vim_highlighter\', extension => \'.vim\', inherit => 1, default => \'edit\';');
meta::meta('type::watch', 'meta::functor::editable \'watch\', prefix => \'watch::\', inherit => 1, extension => \'.pl\', default => \'cat\';');
# Editing shortcuts, defined via the 'alias' type above (each expands to an
# 'edit <attribute>' command plus any extra arguments).
meta::alias('e', 'edit sdoc::js::fig');
meta::alias('ep', 'edit sdoc::js::fig.parser');
meta::alias('er', 'edit sdoc::js::fig.require');
meta::alias('es', 'edit sdoc::js::fig.semantics');
meta::alias('estd', 'edit fig::std');
meta::alias('ev', 'edit vim_highlighter::figment');
# HTML wrapper serialized into the file so it renders as a web page when
# viewed in a browser (inert data; bootstrap attributes are never executed).
meta::bootstrap('html', <<'__f44dd03cb0c904b3a5f69fbda5f018d0');
<html>
<head>
<meta http-equiv='content-type' content='text/html; charset=UTF-8' />
<link rel='stylesheet' href='http://spencertipping.com/perl-objects/web/style.css'/>
<script src='http://ajax.googleapis.com/ajax/libs/jquery/1.5.2/jquery.min.js'></script>
<script src='http://spencertipping.com/caterwaul/caterwaul.all.min.js'></script>
<script src='http://spencertipping.com/montenegro/montenegro.client.js'></script>
<script src='http://spencertipping.com/perl-objects/web/attribute-parser.js'></script>
<script src='http://spencertipping.com/perl-objects/web/interface.js'></script>
</head>
<body></body>
</html>
__f44dd03cb0c904b3a5f69fbda5f018d0
# Verbatim copy of this file's executable header. When the object rewrites
# itself, this attribute is emitted as the new header, so its contents MUST
# remain byte-identical to the code at the top of the file. Do not edit either
# copy independently.
meta::bootstrap('initialization', <<'__7637e32ba308624d824eacc9337c2512');
#!/usr/bin/perl
# Run perldoc on this file for documentation.
# For the benefit of HTML viewers:
# <div id='cover' style='position: absolute; left: 0; top: 0; width: 10000px; height: 10000px; background: white'></div>
$|++;
my %data;
my %transient;
my %externalized_functions;
my %datatypes;
my %locations; # Maps eval-numbers to attribute names
sub meta::define_form {
my ($namespace, $delegate) = @_;
$datatypes{$namespace} = $delegate;
*{"meta::${namespace}::implementation"} = $delegate;
*{"meta::$namespace"} = sub {
my ($name, $value, %options) = @_;
chomp $value;
$data{"${namespace}::$name"} = $value unless $options{no_binding};
&$delegate($name, $value) unless $options{no_delegate}}}
sub meta::eval_in {
my ($what, $where) = @_;
# Obtain next eval-number and alias it to the designated location
@locations{eval('__FILE__') =~ /\(eval (\d+)\)/} = ($where);
my $result = eval $what;
$@ =~ s/\(eval \d+\)/$where/ if $@;
warn $@ if $@;
$result}
meta::define_form 'meta', sub {
my ($name, $value) = @_;
meta::eval_in($value, "meta::$name")};
__7637e32ba308624d824eacc9337c2512
# POD documentation serialized into the file (shown by 'perldoc <script>').
meta::bootstrap('perldoc', <<'__5793df44bdd2526bb461272924abfd4b');
=head1 Self-modifying Perl script
=head2 Original implementation by Spencer Tipping L<http://spencertipping.com>
The prototype for this script is licensed under the terms of the MIT source code license.
However, this script in particular may be under different licensing terms. To find out how
this script is licensed, please contact whoever sent it to you. Alternatively, you may
run it with the 'license' argument if they have specified a license that way.
You should not edit this file directly. For information about how it was constructed, go
to L<http://spencertipping.com/writing-self-modifying-perl>. For quick usage guidelines,
run this script with the 'usage' argument.
=cut
__5793df44bdd2526bb461272924abfd4b
# Cached parent-identification data (inert). Each line appears to map a
# parent object path/name to an md5 identifier -- presumably used to detect
# upstream changes; TODO(review): confirm the format against the code that
# reads this cache.
meta::cache('parent-identification', <<'__55c2638f4cd6f5a7f70d86793bf8d486');
./sdoc
/home/spencertipping/bin/configuration aa772900bb5b925cb84346bd72a4249d
/home/spencertipping/bin/node-base da62d84a9e81832f089520c172982c1a
/home/spencertipping/bin/object 99aeabc9ec7fe80b1b39f5e53dc7e49e
/home/spencertipping/bin/repository 05bc3036c343fdb8aec5b0be12a9b19e
/home/spencertipping/conjectures/perl-objects/sdoc a1e8480e579614c01dabeecf0f963bcc
configuration aa772900bb5b925cb84346bd72a4249d
development 3d94eaf9719db17882d02c1d1fe18718
notes a9e5975593ed5d90d943ad98405c71e5
object 99aeabc9ec7fe80b1b39f5e53dc7e49e
preprocessor 70dae4b46eb4e06798ec6f38d17d4c7b
repository 05bc3036c343fdb8aec5b0be12a9b19e
vim-highlighters 902333a0bd6ed90ff919fe8477cb4e69
__55c2638f4cd6f5a7f70d86793bf8d486
meta::cached_dependency('caterwaul.all.js', <<'__0e0e4ae99af615a1324d565eacd1919c');
// Caterwaul JS | Spencer Tipping
// Licensed under the terms of the MIT source code license
(function (f) {return f(f)}) (function (self, undefined) {
// Introduction.
// Caterwaul implements a very small Lisp in Javascript syntax. The syntax ends up looking much more like McCarthy's M-expressions than traditional S-expressions, due to the ease of embedding
// those in a JS-compatible grammar. Also, Javascript convention makes square-bracket calls such as qs[foo] relatively uncommon, so I'm using that as the macro syntax (though of course you can
// define macros with other forms as well).
// The most important thing Caterwaul does is provide a quotation operator. For example:
// | caterwaul.clone('std')(function () {
// return qs[x + 1];
// });
// This function returns a syntax tree representing the expression 'x + 1'. Caterwaul also includes macro-definition and quasiquoting (not quite like Lisp, though I imagine you could write a
// macro for that):
// | caterwaul.configure('std')(function () {
// caterwaul.macro(qs[let (_ = _) in _], function (variable, value, expression) {
// return qs[(function (variable) {return expression}).call(this, value)].replace({variable: variable, expression: expression, value: value});
// });
// // Macro usable in future caterwaul()ed functions
// });
// Or, more concisely (since macro definitions can be used inside other macro definitions when you define with rmacro):
// | var f = caterwaul.configure('std')(function () {
// caterwaul.rmacro(qs[let (_ = _) in _], fn[variable, value, expression]
// [qs[(fn[variable][expression]).call(this, value)].replace({variable: variable, expression: expression, value: value})]);
// });
// Note that 'caterwaul' inside a transformed function refers to the transforming function, not to the global Caterwaul function.
// See the 'Macroexpansion' section some distance below for more information about defining macros.
// Coding style.
// I like to code using syntactic minimalism, and since this project is a hobby instead of work I've run with that style completely. This has some advantages and some disadvantages. Advantages
// include (1) a very small gzipped/minified footprint (especially since these comments make up most of the file), (2) few lines of code, though they are very long, and (3) lots of semantic
// factoring that should make modification relatively simple. Disadvantages are (1) completely impenetrable logic (especially without the comments) and (2) possibly suboptimal performance in
// the small scale (depending on whether your JS interpreter is optimized for statements or expressions).
// There are a couple of things worth knowing about as you're reading through this code. One is that invariants are generally coded as such; for example, the 'own' property lookup is factored
// out of the 'has' function even though it would be trivial to write it inside. This is to indicate to Javascript that Object.prototype.hasOwnProperty is relatively invariant, and that saves
// some lookups as the code is running. Another is that I use the (function (variable) {return expression})(value) form to emulate let-bindings. (Reading the code with this in mind will make it
// much more obvious what's going on.)
// Utility methods.
// Gensym is used to support qs[]. When we quote syntax, what we really intend to do is grab a syntax tree representing something; this entails creating a let-binding with the already-evaluated
// tree. (Note: Don't go and modify these qs[]-generated trees; you only get one for each qs[].) The ultimate code ends up looking like this (see 'Environment-dependent compilation' some
// distance below):
// | (function (a_gensym) {
// var v1 = a_gensym.gensym_1;
// var v2 = a_gensym.gensym_2;
// ...
// return <your macroexpanded function>;
// }) ({gensym_1: v1, gensym_2: v2, ..., gensym_n: vn});
// A note about gensym uniqueness. Gensyms are astronomically unlikely to collide, but there are some compromises made to make sure of this. First, gensyms are not predictable; the first one is
// randomized. This means that if you do have a collision, it may be intermittent (and that is probably a non-feature). Second, and this is a good thing, you can load Caterwaul multiple times
// without worrying about gensyms colliding between them. Each instance of Caterwaul uses its own system time and random number to seed the gensym generation, and the system time remains stable
// while the random number gets incremented. It is very unlikely that any collisions would happen.
// Bind() is the usual 'bind this function to some value' function. The only difference is that it supports rebinding; that is, if you have a function you've already bound to X, you can call
// bind on that function and some new value Y and get the original function bound to Y. The bound function has two attributes, 'original' and 'binding', that let bind() achieve this rebinding.
// Map() is an array map function, fairly standard really. I include it because IE doesn't provide Array.prototype.map. hash() takes a string, splits it on whitespace, and returns an object
// that maps each element to true. It's useful for defining sets. extend() takes a constructor function and zero or more extension objects, merging each extension object into the constructor
// function's prototype. The constructor function is then returned. It's a shorthand for defining classes.
// Se() stands for 'side-effect', and its purpose is to take a value and a function, pass the value into the function, and return either whatever the function returned or the value you gave it.
// It's used to initialize things statefully; for example:
// | return se(function () {return 5}, function (f) {
// f.sourceCode = 'return 5';
// });
// The Caterwaul standard library gives you an equivalent but much more refined form of se() called /se[].
var qw = function (x) {return x.split(/\s+/)}, id = function (x) {return x}, se = function (x, f) {return f && f.call(x, x) || x},
gensym = (function (n, m) {return function () {return 'gensym_' + n.toString(36) + '_' + (++m).toString(36)}})(+new Date(), Math.random() * (1 << 30) >>> 0),
bind = function (f, t) {return f.binding === t ? f : f.original ? bind(f.original, t) : merge(function () {return f.apply(t, arguments)}, {original: f, binding: t})},
map = function (f, xs) {for (var i = 0, ys = [], l = xs.length; i < l; ++i) ys.push(f(xs[i], i)); return ys},
hash = function (s) {for (var i = 0, xs = qw(s), o = {}, l = xs.length; i < l; ++i) o[xs[i]] = true; return annotate_keys(o)},
merge = function (o) {for (var i = 1, l = arguments.length, _ = null; _ = arguments[i], i < l; ++i) if (_) for (var k in _) has(_, k) && (o[k] = _[k]); return o},
extend = function (f) {merge.apply(null, [f.prototype].concat(Array.prototype.slice.call(arguments, 1))); return f},
// Optimizations.
// The parser and lexer each assume valid input and do no validation. This is possible because any function passed in to caterwaul will already have been parsed by the Javascript interpreter;
// syntax errors would have caused an error there. This enables a bunch of optimization opportunities in the parser, ultimately making it not in any way recursive and requiring only three
// linear-time passes over the token stream. (An approximate figure; it actually does about 19 fractional passes, but not all nodes are reached.)
// Also, I'm not confident that all Javascript interpreters are smart about hash indexing. Particularly, suppose a hashtable has 10 entries, the longest of whose keys is 5 characters. If we
// throw a 2K string at it, it might very well hash that whole thing just to find that, surprise, the entry doesn't exist. That's a big performance hit if it happens very often. To prevent this
// kind of thing, I'm keeping track of the longest string in the hashtable by using the 'annotate_keys' function. 'has()' knows how to look up the maximum length of a hashtable to verify that
// the candidate is in it, resulting in the key lookup being only O(n) in the longest key (generally this ends up being nearly O(1), since I don't like to type long keys), and average-case O(1)
// regardless of the length of the candidate.
// The bad part is that you can't refer to an object called '_max_length' -- this will never be considered to be in the hash. I don't really have a problem with that, but it's worth being aware
// of. Also, on IE browsers various properties won't exist (among them toString, hasOwnProperty, etc.). These aren't special in Javascript so it isn't a problem, but it's still unfortunate.
annotate_keys = function (o) {var max = 0; for (var k in o) own.call(o, k) && (max = k.length > max ? k.length : max); o._max_length = max; return o},
has = function (o, p) {return p && ! (p.length > o._max_length) && p !== '_max_length' && own.call(o, p)}, own = Object.prototype.hasOwnProperty,
// Global management.
// Caterwaul creates a global symbol, caterwaul. Like jQuery, there's a mechanism to get the original one back if you don't want to replace it. You can call caterwaul.deglobalize() to return
// caterwaul and restore the global that was there when Caterwaul was loaded (might be useful in the unlikely event that someone else named their library Caterwaul). Note that deglobalize() is
// available only on the global caterwaul() function. It wouldn't make much sense for clones to inherit it.
_caterwaul = typeof caterwaul === 'undefined' ? undefined : caterwaul,
// Syntax data structures.
// There are two data structures used for syntax trees. At first, paren-groups are linked into doubly-linked lists, described below. These are then folded into immutable array-based specific
// nodes. At the end of folding there is only one child per paren-group.
// Doubly-linked paren-group lists.
// When the token stream is grouped into paren groups it has a hierarchical linked structure that conceptually has these pointers:
// | +--------+
// +------ | node | ------+
// | +-> | | <--+ |
// first | | +--------+ | | last
// | | parent parent | |
// V | | V
// +--------+ +--------+
// | node | --- r --> | node | --- r ---/
// /--- l --- | | <-- l --- | |
// +--------+ +--------+
// The primary operation performed on this tree, at least initially, is repeated folding. So we have a chain of linear nodes, and one by one certain nodes fold their siblings underneath them,
// breaking the children's links and linking instead to the siblings' neighbors. For example, if we fold node (3) as a binary operator:
// | (1) <-> (2) <-> (3) <-> (4) <-> (5) (1) <--> (3) <--> (5)
// / \ / \ / \ / \ / \ --> / \ / \ / \
// / \
// (2) (4) <- No link between children
// / \ / \ (see 'Fold nodes', below)
// Fold nodes.
// Once a node has been folded (e.g. (3) in the diagram above), none of its children will change and it will gain no more children. The fact that none of its children will change can be shown
// inductively: suppose you've decided to fold the '+' in 'x + y' (here x and y are arbitrary expressions). This means that x and y are comprised of higher-precedence operators. Since there is
// no second pass back to high-precedence operators, x and y will not change nor will they interact with one another. The fact that a folded node never gains more children arrives from the fact
// that it is folded only once; this is by virtue of folding by index instead of by tree structure. (Though a good tree traversal algorithm also wouldn't hit the same node twice -- it's just
// less obvious when the tree is changing.)
// Anyway, the important thing about fold nodes is that their children don't change. This means that an array is a completely reasonable data structure to use for the children; it certainly
// makes the structure simpler. It also means that the only new links that must be added to nodes as they are folded are links to new children (via the array), and links to the new siblings.
// Once we have the array-form of fold nodes, we can build a query interface similar to jQuery, but designed for syntactic traversal. This will make routine operations such as macro
// transformation and quasiquoting far simpler later on.
// Both grouping and fold nodes are represented by the same data structure. In the case of grouping, the 'first' pointer is encoded as [0] -- that is, the first array element. It doesn't
// contain pointers to siblings of [0]; these are still accessed by their 'l' and 'r' pointers. As the structure is folded, the number of children of each paren group should be reduced to just
// one. At this point the remaining element's 'l' and 'r' pointers will both be null, which means that it is in hierarchical form instead of linked form.
// After the tree has been fully generated and we have the root node, we have no further use for the parent pointers. This means that we can use subtree sharing to save memory. Once we're past
// the fold stage, push() should be used instead of append(). append() works in a bidirectionally-linked tree context (much like the HTML DOM), whereas push() works like it does for arrays
// (i.e. no parent pointer).
syntax_node_inspect = function (x) {return x ? x.inspect() : '(<>)'}, syntax_node_tostring = function (x) {return x ? x.serialize ? x.serialize() : x.toString() : ''},
// Syntax node functions.
// These functions are common to various pieces of syntax nodes. Not all of them will always make sense, but the prototypes of the constructors can be modified independently later on if it
// turns out to be an issue.
node_methods = {
// Mutability.
// These functions let you modify nodes in-place. They're used during syntax folding and shouldn't really be used after that (hence the underscores).
_replace: function (n) {return (n.l = this.l) && (this.l.r = n), (n.r = this.r) && (this.r.l = n), this}, _append_to: function (n) {return n && n._append(this), this},
_reparent: function (n) {return this.p && this.p[0] === this && (this.p[0] = n), this}, _fold_l: function (n) {return this._append(this.l && this.l._unlink(this))},
_append: function (n) {return (this[this.length++] = n) && (n.p = this), this}, _fold_r: function (n) {return this._append(this.r && this.r._unlink(this))},
_sibling: function (n) {return n.p = this.p, (this.r = n).l = this}, _fold_lr: function () {return this._fold_l()._fold_r()},
_wrap: function (n) {return n.p = this._replace(n).p, this._reparent(n), delete this.l, delete this.r, this._append_to(n)}, _fold_rr: function () {return this._fold_r()._fold_r()},
_unlink: function (n) {return this.l && (this.l.r = this.r), this.r && (this.r.l = this.l), delete this.l, delete this.r, this._reparent(n)},
// These methods are OK for use after the syntax folding stage is over (though because syntax nodes are shared it's generally dangerous to go modifying them):
pop: function () {return --this.length, this}, push: function (x) {return this[this.length++] = x, this},
// Identification.
// You can request that a syntax node identify itself, in which case it will give you a string identifier if it hasn't already. The identity is not determined until the first time it is
// requested, and after that it is stable.
id: function () {return this.id || (this.id = gensym())},
// Traversal functions.
// each() is the usual side-effecting shallow traversal that returns 'this'. map() distributes a function over a node's children and returns the array of results, also as usual. Two variants,
// reach and rmap, perform the process recursively. reach is non-consing; it returns the original as a reference. rmap, on the other hand, follows some rules to cons a new tree. If the
// function passed to rmap() returns the node verbatim then its children are traversed. If it returns a distinct node, however, then traversal doesn't descend into the children of the newly
// returned tree but rather continues as if the original node had been a leaf. For example:
// | parent Let's suppose that a function f() has these mappings:
// / \
// node1 node2 f(parent) = parent f(node1) = q
// / \ | f(node2) = node2
// c1 c2 c3
// In this example, f() would be called on parent, node1, node2, and c3 in that order. c1 and c2 are omitted because node1 was replaced by q -- and there is hardly any point in going through
// the replaced node's previous children. (Nor is there much point in forcibly iterating over the new node's children, since presumably they are already processed.) If a mapping function
// returns something falsy, it will have exactly the same effect as returning the node without modification.
// Using the old s() to do gensym-safe replacement requires that you invoke it only once, and this means that for complex macroexpansion you'll have a long array of values. This isn't ideal,
// so syntax trees provide a replace() function that handles replacement more gracefully:
// | qs[(foo(_foo), _before_bar + bar(_bar))].replace({_foo: qs[x], _before_bar: qs[3 + 5], _bar: qs[foo.bar]})
// There is a map() variant called nmap() (and a corresponding rnmap()) that lets you insert falsy nodes into the syntax tree. This is used by replace(), which lets you replace named nodes
// with falsy things if you want them to go away. (The exact behavior is that falsy nodes are not added to the syntax tree at all, rather than remaining in their original state.)
each: function (f) {for (var i = 0, l = this.length; i < l; ++i) f(this[i], i); return this},
map: function (f) {for (var n = new this.constructor(this), i = 0, l = this.length; i < l; ++i) n.push(f(this[i], i) || this[i]); return n},
nmap: function (f) {for (var n = new this.constructor(this), i = 0, l = this.length, r; i < l; ++i) (r = f(this[i], i)) && n.push(r); return n},
reach: function (f) {f(this); this.each(function (n) {n && n.reach(f)}); return this},
rmap: function (f) {var r = f(this); return ! r || r === this ? this. map(function (n) {return n && n. rmap(f)}) : r.data === undefined ? new this.constructor(r) : r},
rnmap: function (f) {var r = f(this); return r === this ? this.nmap(function (n) {return n && n.rnmap(f)}) : r && r.data === undefined ? new this.constructor(r) : r},
clone: function () {return this.rmap(function () {return false})},
collect: function (p) {var ns = []; this.reach(function (n) {p(n) && ns.push(n)}); return ns},
replace: function (rs) {return this.rnmap(function (n) {return own.call(rs, n.data) ? rs[n.data] : n})},
// Alteration.
// These functions let you make "changes" to a node by returning a modified copy.
repopulated_with: function (xs) {return new this.constructor(this.data, xs)},
change: function (i, x) {return se(new this.constructor(this.data, Array.prototype.slice.call(this)), function (n) {n[i] = x})},
compose_single: function (i, f) {return this.change(i, f(this[i]))},
// General-purpose traversal.
// This is a SAX-style traversal model, useful for analytical or scope-oriented tree traversal. You specify a callback function that is invoked in pre-post-order on the tree (you get events
// for entering and exiting each node, including leaves). Each time a node is entered, the callback is invoked with an object of the form {entering: node}, where 'node' is the syntax node
// being entered. Each time a node is left, the callback is invoked with an object of the form {exiting: node}. The return value of the function is not used. Any null nodes are not traversed,
// since they would fail any standard truthiness tests for 'entering' or 'exiting'.
// I used to have a method to perform scope-annotated traversal, but I removed it for two reasons. First, I had no use for it (and no tests, so I had no reason to believe that it worked).
// Second, Caterwaul is too low-level to need such a method. That would be more appropriate for an analysis extension.
traverse: function (f) {f({entering: this}); f({exiting: this.each(function (n) {n && n.traverse(f)})}); return this},
// Structural transformation.
// Having nested syntax trees can be troublesome. For example, suppose you're writing a macro that needs a comma-separated list of terms. It's a lot of work to dig through the comma nodes,
// each of which is binary. Javascript is better suited to using a single comma node with an arbitrary number of children. (This also helps with the syntax tree API -- we can use .map() and
// .each() much more effectively.) Any binary operator can be transformed this way, and that is exactly what the flatten() method does. (flatten() returns a new tree; it doesn't modify the
// original.)
// The tree flattening operation looks like this for a left-associative binary operator:
// | (+)
// / \ (+)
// (+) z -> / | \
// / \ x y z
// x y
// This flatten() method returns the nodes along the chain of associativity, always from left to right. It is shallow, since generally you only need a localized flat tree. That is, it doesn't
// descend into the nodes beyond the one specified by the flatten() call. It takes an optional parameter indicating the operator to flatten over; if the operator in the tree differs, then the
// original node is wrapped in a unary node of the specified operator. The transformation looks like this:
// | (,)
// (+) |
// / \ .flatten(',') -> (+)
// x y / \
// x y
// Because ',' is a binary operator, a ',' tree with just one operand will be serialized exactly as its lone operand would be. This means that plurality over a binary operator such as comma
// or semicolon degrades gracefully for the unary case (this sentence makes more sense in the context of macro definitions; see in particular 'let' and 'where' in std.bind).
// The unflatten() method performs the inverse transformation. It doesn't delete a converted unary operator in the tree case, but if called on a node with more than two children it will nest
// according to associativity.
flatten: function (d) {d = d || this.data; return d !== this.data ? this.as(d) : ! (has(parse_lr, d) && this.length) ? this : has(parse_associates_right, d) ?
se(new this.constructor(d), bind(function (n) {for (var i = this; i && i.data === d; i = i[1]) n.push(i[0]); n.push(i)}, this)) :
se(new this.constructor(d), bind(function (n) {for (var i = this, ns = []; i.data === d; i = i[0]) i[1] && ns.push(i[1]); ns.push(i);
for (i = ns.length - 1; i >= 0; --i) n.push(ns[i])}, this))},
unflatten: function () {var right = has(parse_associates_right, this.data); return this.length <= 2 ? this : se(new this.constructor(this.data), bind(function (n) {
if (right) for (var i = 0, l = this.length - 1; i < l; ++i) n = n.push(this[i]).push(i < l - 2 ? new this.constructor(this.data) : this[i])[1];
else for (var i = this.length - 1; i >= 1; --i) n = n.push(i > 1 ? new this.constructor(this.data) : this[0]).push(this[i])[0]}, this))},
// Wrapping.
// Sometimes you want your syntax tree to have a particular operator, and if it doesn't have that operator you want to wrap it in a node that does. Perhaps the most common case of this is
// when you have a possibly-plural node representing a variable or expression -- often the case when you're dealing with argument lists -- and you want to be able to assume that it's wrapped
// in a comma node. Calling node.as(',') will return the node if it's a comma, and will return a new comma node containing the original one if it isn't.
as: function (d) {return this.data === d ? this : new this.constructor(d).push(this)},
// Type detection and retrieval.
// These methods are used to detect the literal type of a node and to extract that value if it exists. You should use the as_x methods only once you know that the node does represent an x;
// otherwise you will get misleading results. (For example, calling as_boolean on a non-boolean will always return false.)
// Other methods are provided to tell you higher-level things about what this node does. For example, is_contextualized_invocation() tells you whether the node represents a call that can't be
// eta-reduced (if it were, then the 'this' binding would be lost).
is_string: function () {return /['"]/.test(this.data.charAt(0))}, as_escaped_string: function () {return this.data.substr(1, this.data.length - 2)},
is_number: function () {return /^-?(0x|\d|\.\d+)/.test(this.data)}, as_number: function () {return Number(this.data)},
is_boolean: function () {return this.data === 'true' || this.data === 'false'}, as_boolean: function () {return this.data === 'true'},
is_regexp: function () {return /^\/./.test(this.data)}, as_escaped_regexp: function () {return this.data.substring(1, this.data.lastIndexOf('/'))},
has_grouped_block: function () {return has(parse_r_until_block, this.data)}, is_block: function () {return has(parse_block, this.data)},
is_blockless_keyword: function () {return has(parse_r_optional, this.data)}, is_null_or_undefined: function () {return this.data === 'null' || this.data === 'undefined'},
is_constant: function () {return this.is_number() || this.is_string() || this.is_boolean() || this.is_regexp() || this.is_null_or_undefined()},
left_is_lvalue: function () {return /=$/.test(this.data) || /\+\+$/.test(this.data) || /--$/.test(this.data)},
is_empty: function () {return !this.length}, has_parameter_list: function () {return this.data === 'function' || this.data === 'catch'},
has_lvalue_list: function () {return this.data === 'var' || this.data === 'const'}, is_dereference: function () {return this.data === '.' || this.data === '[]'},
is_invocation: function () {return this.data === '()'}, is_contextualized_invocation: function () {return this.is_invocation() && this[0] && this[0].is_dereference()},
is_invisible: function () {return has(parse_invisible, this.data)}, is_binary_operator: function () {return has(parse_lr, this.data)},
is_prefix_unary_operator: function () {return has(parse_r, this.data)}, is_postfix_unary_operator: function () {return has(parse_l, this.data)},
is_unary_operator: function () {return this.is_prefix_unary_operator() || this.is_postfix_unary_operator()},
accepts: function (e) {return parse_accepts[this.data] && this.accepts[parse.data] === (e.data || e)},
// Value construction.
// Syntax nodes sometimes represent hard references to values instead of just syntax. (See 'References' for more information.) In order to compile a syntax tree in the right environment you
// need a mapping of symbols to these references, which is what the bindings() method returns. (It also collects references for all descendant nodes.) It takes an optional argument to
// populate, in case you already had a hash set aside for bindings -- though it always returns the hash.
// A bug in Caterwaul 0.5 and earlier failed to bind falsy values. This is no longer the case; nodes which bind values should indicate that they do so by setting a binds_a_value attribute
// (ref nodes do this on the prototype), indicating that their value should be read from the 'value' property. (This allows other uses of a 'value' property while making it unambiguous
// whether a particular node intends to bind something.)
bindings: function (hash) {var result = hash || {}; this.reach(function (n) {if (n.binds_a_value) result[n.data] = n.value}); return result},
// Matching.
// Syntax trees can use the Caterwaul match function to return a list of wildcards.
match: function (pattern) {return macro_try_match(pattern, this)},
// Inspection and syntactic serialization.
// Syntax nodes can be both inspected (producing a Lisp-like structural representation) and serialized (producing valid Javascript code). Each representation captures stray links via the 'r'
// pointer. In the serialized representation, it is shown as a comment /* -> */ containing the serialization of whatever is to the right. This has the property that it will break tests but
// won't necessarily break code (though if it happens in the field then it's certainly a bug).
// Block detection is required for multi-level if/else statements. Consider this code:
// | if (foo) for (...) {}
// else bif;
// A naive approach (the one I was using before version 0.6) would miss the fact that the 'for' was trailed by a block, and insert a spurious semicolon, which would break compilation:
// | if (foo) for (...) {}; // <- note!
// else bif;
// What we do instead is dig through the tree and find out whether the last thing in the 'if' case ends with a block. If so, then no semicolon is inserted; otherwise we insert one. This
// algorithm makes serialization technically O(n^2), but nobody nests if/else blocks to such an extent that it would matter.
ends_with_block: function () {var block_index = parse_r_until_block[this.data], block = this[block_index];
return this.data === '{' || has(parse_r_until_block, this.data) && (this.data !== 'function' || this.length === 3) && block && block.ends_with_block()},
// There's a hack here for single-statement if-else statements. (See 'Grab-until-block behavior' in the parsing code below.) Basically, for various reasons the syntax tree won't munch the
// semicolon and connect it to the expression, so we insert one automatically whenever the second node in an if, else, while, etc. isn't a block.
// Update for Caterawul 0.6.6: I had removed mandatory spacing for unary prefix operators, but now it's back. The reason is to help out the host Javascript lexer, which can misinterpret
// postfix increment/decrement: x + +y will be serialized as x++y, which is invalid Javascript. The fix is to introduce a space in front of the second plus: x+ +y, which is unambiguous.
toString: function () {return this.inspect()},
inspect: function () {return (this.l ? '(left) <- ' : '') + '(' + this.data + (this.length ? ' ' + map(syntax_node_inspect, this).join(' ') : '') + ')' +
(this.r ? ' -> ' + this.r.inspect() : '')},
serialize: function () {var op = this.data, right = this.r ? '/* -> ' + this.r.serialize() + ' */' : '', space = /\w/.test(op.charAt(op.length - 1)) ? ' ' : '',
s = has(parse_invisible, op) ? map(syntax_node_tostring, this).join(space) :
has(parse_invocation, op) ? map(syntax_node_tostring, [this[0], op.charAt(0), this[1], op.charAt(1)]).join(space) :
has(parse_ternary, op) ? map(syntax_node_tostring, [this[0], op, this[1], parse_group[op], this[2]]).join(space) :
has(parse_group, op) ? op + map(syntax_node_tostring, this).join(space) + parse_group[op] :
has(parse_lr, op) ? this.length ? map(syntax_node_tostring, this).join(space + op + space) : op :
has(parse_r, op) || has(parse_r_optional, op) ? op.replace(/^u/, ' ') + space + (this[0] ? this[0].serialize() : '') :
has(parse_r_until_block, op) ? has(parse_accepts, op) && this[1] && this[2] && parse_accepts[op] === this[2].data && ! this[1].ends_with_block() ?
op + space + map(syntax_node_tostring, [this[0], this[1], ';', this[2]]).join('') :
op + space + map(syntax_node_tostring, this).join('') :
has(parse_l, op) ? (this[0] ? this[0].serialize() : '') + space + op : op;
return right ? s + right : s}},
// References.
// You can drop references into code that you're compiling. This is basically variable closure, but a bit more fun. For example:
// | caterwaul.compile(qs[fn_[_ + 1]].replace({_: new caterwaul.ref(3)}))() // -> 4
// What actually happens is that caterwaul.compile runs through the code replacing refs with gensyms, and the function is evaluated in a scope where those gensyms are bound to the values they
// represent. This gives you the ability to use a ref even as an lvalue, since it's really just a variable. References are always leaves on the syntax tree, so the prototype has a length of 0.
ref = extend(function (value) {if (value instanceof this.constructor) {this.value = value.value; this.data = value.data}
else {this.value = value; this.data = gensym()}}, {length: 0, binds_a_value: true}, node_methods),
// Syntax node constructor.
// Here's where we combine all of the pieces above into a single function with a large prototype. Note that the 'data' property is converted from a variety of types; so far we support strings,
// numbers, and booleans. Any of these can be added as children. Also, I'm using an instanceof check rather than (.constructor ===) to allow array subclasses such as Caterwaul finite sequences
// to be used.
syntax_node = extend(function (data) {if (data instanceof this.constructor) this.data = data.data, this.length = 0;
else {this.data = data && data.toString(); this.length = 0;
// The outer loop's condition assigns _ = arguments[i] before testing i < l, so _ always holds the current argument when the body runs. The inner loop's condition does double duty:
// for an Array argument it walks the elements (it = _[j], stopping at lj), and for a non-array argument it runs exactly once (it = _, then ! j is false on the second pass).
// Primitive children (String/Number/Boolean) are promoted to syntax nodes; anything else is appended as-is.
for (var i = 1, l = arguments.length, _; _ = arguments[i], i < l; ++i)
for (var j = 0, lj = _.length, it, itc; _ instanceof Array ? (it = _[j], j < lj) : (it = _, ! j); ++j)
this._append((itc = it.constructor) === String || itc === Number || itc === Boolean ? new this.constructor(it) : it)}}, node_methods),
// Parsing.
// There are two distinct parts to parsing Javascript. One is parsing the irregular statement-mode expressions such as 'if (condition) {...}' and 'function f(x) {...}'; the other is parsing
// expression-mode stuff like arithmetic operators. In Rebase I tried to model everything as an expression, but that failed sometimes because it required that each operator have fixed arity. In
// particular this was infeasible for keywords such as 'break', 'continue', 'return', and some others (any of these can be nullary or unary). It also involved creating a bizarre hack for 'case
// x:' inside a switch block. This hack made the expression passed in to 'case' unavailable, as it would be buried in a ':' node.
// Caterwaul fixes these problems by using a proper context-free grammar. However, it's much looser than most grammars because it doesn't need to validate anything. Correspondingly, it can be
// much faster as well. Instead of guessing and backtracking as a recursive-descent parser would, it classifies many different branches into the same basic structure and fills in the blanks. One
// example of this is the () {} pair, which occurs in a bunch of different constructs, including function () {}, if () {}, for () {}, etc. In fact, any time a () group is followed by a {} group
// we can grab the token that precedes () (along with perhaps one more in the case of function f () {}), and group that under whichever keyword is responsible.
// Syntax folding.
// The first thing to happen is that parenthetical, square bracket, and braced groups are folded up. This happens in a single pass that is linear in the number of tokens, and other foldable
// tokens (including unary and binary operators) are indexed by associativity. The following pass runs through these indexes from high to low precedence and folds tokens into trees. By this
// point all of the parentheticals have been replaced by proper nodes (here I include ?: groups in parentheticals, since they behave the same way). Finally, high-level rules are applied to the
// remaining keywords, which are bound last. This forms a complete parse tree.
// Doing all of this efficiently requires a linked list rather than an array. This gets built during the initial paren grouping stage. Arrays are used for the indexes, which are left-to-right
// and are later processed in the order indicated by the operator associativity. That is, left-associative operators are processed 0 .. n and right associative are processed n .. 0. Keywords
// are categorized by behavior and folded after all of the other operators. Semicolons are folded last, from left to right.
// There are some corner cases due to Javascript's questionable heritage from C-style syntax. For example, most constructs take either syntax blocks or semicolon-delimited statements. Ideally,
// else, while, and catch are associated with their containing if, do, and try blocks, respectively. This can be done easily, as the syntax is folded right-to-left. Another corner case would
// come up if there were any binary operators with equal precedence and different associativity. Javascript doesn't have them however, and it wouldn't make much sense to; it would render
// expressions such as 'a op1 b op2 c' ambiguous if op1 and op2 shared precedence but each wanted to bind first. (I mention this because at first I was worried about it, but now I realize it
// isn't an issue.)
// Notationally (for easier processing later on), a distinction is made between invocation and grouping, and between dereferencing and array literals. Dereferencing and function invocation are
// placed into their own operators, where the left-hand side is the thing being invoked or dereferenced and the right-hand side is the paren-group or bracket-group that is responsible for the
// operation. Also, commas inside these groups are flattened into a single variadic (possibly nullary) comma node so that you don't have to worry about the tree structure. This is the case for
// all left-associative operators; right-associative operators preserve their hierarchical folding.
// Parse/lex shared logic.
// Lexing Javascript is not entirely straightforward, primarily because of regular expression literals. The first implementation of the lexer got things right 99% of the time by inferring the
// role of a / by its preceding token. The problem comes in when you have a case like this:
// | if (condition) /foo/.test(x)
// In this case, (condition) will be incorrectly inferred to be a regular expression (since the close-paren terminates an expression, usually), and /foo/ will be interpreted as division by foo.
// We mark the position before a token and then just increment the position. The token, then, can be retrieved by taking a substring from the mark to the position. This eliminates the need for
// intermediate concatenations. In a couple of cases I've gone ahead and done them anyway -- these are for operators, where we grab the longest contiguous substring that is defined. I'm not too
// worried about the O(n^2) complexity due to concatenation; they're bounded by four characters.
// OK, so why use charAt() instead of regular expressions? It's a matter of asymptotic performance. V8 implements great regular expressions (O(1) in the match length for the (.*)$ pattern), but
// the substring() method is O(n) in the number of characters returned. Firefox implements O(1) substring() but O(n) regular expression matching. Since there are O(n) tokens per document of n
// characters, any O(n) step makes lexing quadratic. So I have to use the only reliably constant-time method provided by strings, charAt() (or in this case, charCodeAt()).
// Of course, building strings via concatenation is also O(n^2), so I also avoid that for any strings that could be long. This is achieved by using a mark to indicate where the substring
// begins, and advancing i independently. The span between mark and i is the substring that will be selected, and since each substring both requires O(n) time and consumes n characters, the
// lexer as a whole is O(n). (Though perhaps with a large constant.)
// Precomputed table values.
// The lexer uses several character lookups, which I've optimized by using integer->boolean arrays. The idea is that instead of using string membership checking or a hash lookup, we use the
// character codes and index into a numerical array. This is guaranteed to be O(1) for any sensible implementation, and is probably the fastest JS way we can do this. For space efficiency,
// only the low 256 characters are indexed. High characters will trigger sparse arrays, which may degrade performance. (I'm aware that the arrays are power-of-two-sized and that there are
// enough of them, plus the right usage patterns, to cause cache line contention on most Pentium-class processors. If we are so lucky to have a Javascript JIT capable enough to have this
// problem, I think we'll be OK.)
// The lex_op table indicates which elements trigger regular expression mode. Elements that trigger this mode cause a following / to delimit a regular expression, whereas other elements would
// cause a following / to indicate division. By the way, the operator ! must be in the table even though it is never used. The reason is that it is a substring of !==; without it, !== would
// fail to parse. (See test/lex-neq-failure for examples.)
// Entries prefixed with 'u' (u++, u-, u~, u;, ...) are the unary/prefix variants of tokens that also have binary or postfix forms; keeping them as distinct keys lets the fold tables
// treat the two roles separately.
lex_op = hash('. new ++ -- u++ u-- u+ u- typeof u~ u! ! * / % + - << >> >>> < > <= >= instanceof in == != === !== & ^ | && || ? = += -= *= /= %= &= |= ^= <<= >>= >>>= : , ' +
'return throw case var const break continue void else u; ;'),
lex_table = function (s) {for (var i = 0, xs = [false]; i < 8; ++i) xs.push.apply(xs, xs); for (var i = 0, l = s.length; i < l; ++i) xs[s.charCodeAt(i)] = true; return xs},
// Character-class tables (index by charCodeAt) and single character codes used by the lexer's inner loop. lex_float/lex_decimal/lex_integer/lex_exp classify numeric literals;
// lex_ident covers identifier characters; the lex_slash/lex_star/lex_back/lex_x/lex_dot/lex_zero scalars are compared directly against character codes.
lex_float = lex_table('.0123456789'), lex_decimal = lex_table('0123456789'), lex_integer = lex_table('0123456789abcdefABCDEFx'), lex_exp = lex_table('eE'),
lex_space = lex_table(' \n\r\t'), lex_bracket = lex_table('()[]{}'), lex_opener = lex_table('([{'), lex_punct = lex_table('+-*/%&|^!~=<>?:;.,'),
lex_eol = lex_table('\n\r'), lex_regexp_suffix = lex_table('gims'), lex_quote = lex_table('\'"/'), lex_slash = '/'.charCodeAt(0),
lex_star = '*'.charCodeAt(0), lex_back = '\\'.charCodeAt(0), lex_x = 'x'.charCodeAt(0), lex_dot = '.'.charCodeAt(0),
lex_zero = '0'.charCodeAt(0), lex_postfix_unary = hash('++ --'), lex_ident = lex_table('$_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'),
// Parse data.
// The lexer and parser aren't entirely separate, nor can they be considering the complexity of Javascript's grammar. The lexer ends up grouping parens and identifying block constructs such
// as 'if', 'for', 'while', and 'with'. The parser then folds operators and ends by folding these block-level constructs.
// parse_reduce_order lists operator groups from highest to lowest binding precedence; indexes[] in parse() is built parallel to this array, and folding proceeds group by group.
parse_reduce_order = map(hash, ['function', '( [ . [] ()', 'new delete', 'u++ u-- ++ -- typeof u~ u! u+ u-', '* / %', '+ -', '<< >> >>>', '< > <= >= instanceof in', '== != === !==', '&',
'^', '|', '&&', '||', 'case', '?', '= += -= *= /= %= &= |= ^= <<= >>= >>>=', ':', ',', 'return throw break continue void', 'var const',
'if else try catch finally for switch with while do', ';']),
parse_associates_right = hash('= += -= *= /= %= &= ^= |= <<= >>= >>>= ~ ! new typeof u+ u- -- ++ u-- u++ ? if else function try catch finally for switch case with while do'),
// parse_inverse_order maps each operator back to the index of its precedence group; the loop condition assigns and tests in one expression, so every key of every group is visited.
parse_inverse_order = (function (xs) {for (var o = {}, i = 0, l = xs.length; i < l; ++i) for (var k in xs[i]) has(xs[i], k) && (o[k] = i); return annotate_keys(o)}) (parse_reduce_order),
// parse_index_forward[i] is true when group i should be folded left-to-right (i.e. the group contains no right-associative operator); the inner loop breaks as soon as that is decided.
parse_index_forward = (function (rs) {for (var xs = [], i = 0, l = rs.length, _ = null; _ = rs[i], xs[i] = true, i < l; ++i)
for (var k in _) if (has(_, k) && (xs[i] = xs[i] && ! has(parse_associates_right, k))) break; return xs}) (parse_reduce_order),
// Remaining tables classify operators by arity and folding behavior: parse_lr = binary, parse_r/parse_l = prefix/postfix unary, parse_r_optional = keywords with an optional operand,
// parse_r_until_block maps block keywords to the index of their block child, parse_accepts pairs keywords with their continuation keyword (if/else, do/while, ...), and parse_group maps
// each opening delimiter to its closer (with '?' closed by ':').
parse_lr = hash('[] . () * / % + - << >> >>> < > <= >= instanceof in == != === !== & ^ | && || = += -= *= /= %= &= |= ^= <<= >>= >>>= , : ;'),
parse_r_until_block = annotate_keys({'function':2, 'if':1, 'do':1, 'catch':1, 'try':1, 'for':1, 'while':1, 'with':1}),
parse_accepts = annotate_keys({'if':'else', 'do':'while', 'catch':'finally', 'try':'catch'}), parse_invocation = hash('[] ()'),
parse_r_optional = hash('return throw break continue else'), parse_also_expression = hash('function'), parse_r = hash('u+ u- u! u~ u++ u-- new typeof finally var const void delete'),
parse_block = hash('; {'), parse_invisible = hash('i;'), parse_l = hash('++ --'), parse_group = annotate_keys({'(':')', '[':']', '{':'}', '?':':'}),
parse_ambiguous_group = hash('[ ('), parse_ternary = hash('?'), parse_not_a_value = hash('function if for while catch'),
// Parse function.
// As mentioned earlier, the parser and lexer aren't distinct. The lexer does most of the heavy lifting; it matches parens and brackets, arranges tokens into a hierarchical linked list, and
// provides an index of those tokens by their fold order. It does all of this by streaming tokens into a micro-parser whose language is grouping and that knows about the oddities required to
// handle regular expression cases. In the same function, though as a distinct case, the operators are folded and the syntax is compiled into a coherent tree form.
// The input to the parse function can be anything whose toString() produces valid Javascript code.
parse = function (input) {
// Lex variables.
// s, obviously, is the string being lexed. mark indicates the position of the stream, while i is used for lookahead. The difference is later read into a token and pushed onto the result. c
// is a temporary value used to store the current character code. re is true iff a slash would begin a regular expression. esc is a flag indicating whether the next character in a string or
// regular expression literal is escaped. exp indicates whether we've seen the exponent marker in a number. close is used for parsing single and double quoted strings; it contains the
// character code of the closing quotation mark. t is the token to be processed.
// Parse variables.
// grouping_stack and gs_top are used for paren/brace/etc. matching. head and parent mark two locations in the linked syntax tree; when a new group is created, parent points to the opener
// (i.e. (, [, ?, or {), while head points to the most recently added child. (Hence the somewhat complex logic in push().) indexes[] determines reduction order, and contains references to the
// nodes in the order in which they should be folded. invocation_nodes is an index of the nodes that will later need to be flattened.
// The push() function manages the mechanics of adding a node to the initial linked structure. There are a few cases here; one is when we've just created a paren group and have no 'head'
// node; in this case we append the node as 'head'. Another case is when 'head' exists; in that case we update head to be the new node, which gets added as a sibling of the old head.
var s = input.toString(), mark = 0, c = 0, re = true, esc = false, dot = false, exp = false, close = 0, t = '', i = 0, l = s.length, cs = function (i) {return s.charCodeAt(i)},
grouping_stack = [], gs_top = null, head = null, parent = null, indexes = map(function () {return []}, parse_reduce_order), invocation_nodes = [], all_nodes = [],
new_node = function (n) {return all_nodes.push(n), n}, push = function (n) {return head ? head._sibling(head = n) : (head = n._append_to(parent)), new_node(n)};
// Main lex loop.
// This loop takes care of reading all of the tokens in the input stream. At the end, we'll have a linked node structure with paren groups. At the beginning, we set the mark to the current
// position (we'll be incrementing i as we read characters), munch whitespace, and reset flags.
while ((mark = i) < l) {
while (lex_space[c = cs(i)] && i < l) mark = ++i;
esc = exp = dot = t = false;
// Miscellaneous lexing.
// This includes bracket resetting (the top case, where an open-bracket of any sort triggers regexp mode) and comment removal. Both line and block comments are removed by comparing against
// lex_slash, which represents /, and lex_star, which represents *.
if (lex_bracket[c]) {t = !! ++i; re = lex_opener[c]}
else if (c === lex_slash && cs(i + 1) === lex_star && (i += 2)) {while (++i < l && cs(i) !== lex_slash || cs(i - 1) !== lex_star); t = ! ++i}
else if (c === lex_slash && cs(i + 1) === lex_slash) {while (++i < l && ! lex_eol[cs(i)]); t = false}
// Regexp and string literal lexing.
// These both take more or less the same form. The idea is that we have an opening delimiter, which can be ", ', or /; and we look for a closing delimiter that follows. It is syntactically
// illegal for a string to occur anywhere that a slash would indicate division (and it is also illegal to follow a string literal with extra characters), so reusing the regular expression
// logic for strings is not a problem. (This follows because we know ahead of time that the Javascript is valid.)
else if (lex_quote[c] && (close = c) && re && ! (re = ! (t = s.charAt(i)))) {while (++i < l && (c = cs(i)) !== close || esc) esc = ! esc && c === lex_back;
while (++i < l && lex_regexp_suffix[cs(i)]) ; t = true}
// Numeric literal lexing.
// This is far more complex than the above cases. Numbers have several different formats, each of which requires some custom logic. The reason we need to parse numbers so exactly is that it
// influences how the rest of the stream is lexed. One example is '0.5.toString()', which is perfectly valid Javascript. What must be output here, though, is '0.5', '.', 'toString', '(',
// ')'; so we have to keep track of the fact that we've seen one dot and stop lexing the number on the second.
// Another case is exponent-notation: 3.0e10. The hard part here is that it's legal to put a + or - on the exponent, which normally terminates a number. Luckily we can safely skip over any
// character that comes directly after an E or e (so long as we're really in exponent mode, which I'll get to momentarily), since there must be at least one digit after an exponent.
// The final case, which restricts the logic somewhat, is hexadecimal numbers. These also contain the characters 'e' and 'E', but we cannot safely skip over the following character, and any
// decimal point terminates the number (since '0x5.toString()' is also valid Javascript). The same follows for octal numbers; the leading zero indicates that there will be no decimal point,
// which changes the lex mode (for example, '0644.toString()' is valid).
// So, all this said, there are different logic branches here. One handles guaranteed integer cases such as hex/octal, and the other handles regular numbers. The first branch is triggered
// whenever a number starts with zero and is followed by 'x' or a digit (for conciseness I call 'x' a digit), and the second case is triggered when '.' is followed by a digit, or when a
// digit starts.
// A trivial change, using regular expressions, would reduce this logic significantly. I chose to write it out longhand because (1) it's more fun that way, and (2) the regular expression
// approach has theoretically quadratic time in the length of the numbers, whereas this approach keeps things linear. Whether or not that actually makes a difference I have no idea.
// Finally, in response to a recently discovered failure case, a period must be followed by a digit if it starts a number. The failure is the string '.end', which will be lexed as '.en',
// 'd' if it is assumed to be a floating-point number. (In fact, any method or property beginning with 'e' will cause this problem.)
else if (c === lex_zero && lex_integer[cs(i + 1)]) {while (++i < l && lex_integer[cs(i)]); re = ! (t = true)}
else if (lex_float[c] && (c !== lex_dot || lex_decimal[cs(i + 1)])) {while (++i < l && (lex_decimal[c = cs(i)] || (dot ^ (dot |= c === lex_dot)) || (exp ^ (exp |= lex_exp[c] && ++i))));
while (i < l && lex_decimal[cs(i)]) ++i; re = ! (t = true)}
// Operator lexing.
// The 're' flag is reused here. Some operators have both unary and binary modes, and as a heuristic (which happens to be accurate) we can assume that anytime we expect a regular
// expression, a unary operator is intended. The only exceptions are ++ and --, which are always unary but sometimes are prefix and other times are postfix. If re is true, then the prefix
// form is intended; otherwise, it is postfix. For this reason I've listed both '++' and 'u++' (same for --) in the operator tables; the lexer is actually doing more than its job here by
// identifying the variants of these operators.
// The only exception to the regular logic happens if the operator is postfix-unary. (e.g. ++, --.) If so, then the re flag must remain false, since expressions like 'x++ / 4' can be valid.
else if (lex_punct[c] && (t = re ? 'u' : '', re = true)) {while (i < l && lex_punct[cs(i)] && has(lex_op, t + s.charAt(i))) t += s.charAt(i++); re = ! has(lex_postfix_unary, t)}
// Identifier lexing.
// If nothing else matches, then the token is lexed as a regular identifier or Javascript keyword. The 're' flag is set depending on whether the keyword expects a value. The nuance here is
// that you could write 'x / 5', and it is obvious that the / means division. But if you wrote 'return / 5', the / would be a regexp delimiter because return is an operator, not a value. So
// at the very end, in addition to assigning t, we also set the re flag if the word turns out to be an operator.
else {while (++i < l && lex_ident[cs(i)]); re = has(lex_op, t = s.substring(mark, i))}
// Token unification.
// t will contain true, false, or a string. If false, no token was lexed; this happens when we read a comment, for example. If true, the substring method should be used. (It's a shorthand to
// avoid duplicated logic.) For reasons that are not entirely intuitive, the lexer sometimes produces the artifact 'u;'. This is never useful, so I have a case dedicated to removing it.
if (i === mark) throw new Error('Caterwaul lex error at "' + s.substr(mark, 40) + '" with leading context "' + s.substr(mark - 40, 40) + '" (probably a Caterwaul bug)');
if (t === false) continue;
t = t === true ? s.substring(mark, i) : t === 'u;' ? ';' : t;
// Grouping and operator indexing.
// Now that we have a token, we need to see whether it affects grouping status. There are a couple of possibilities. If it's an opener, then we create a new group; if it's a matching closer
// then we close the current group and pop out one layer. (We don't check for matching here. Any code provided to Caterwaul will already have been parsed by the host Javascript interpreter,
// so we know that it is valid.)
// All operator indexing is done uniformly, left-to-right. Note that the indexing isn't strictly by operator. It's by reduction order, which is arguably more important. That's what the
// parse_inverse_order table does: it maps operator names to parse_reduce_order subscripts. (e.g. 'new' -> 2.)
t === gs_top ? (grouping_stack.pop(), gs_top = grouping_stack[grouping_stack.length - 1], head = head ? head.p : parent, parent = null) :
(has(parse_group, t) ? (grouping_stack.push(gs_top = parse_group[t]), parent = push(new_node(new syntax_node(t))), head = null) : push(new_node(new syntax_node(t))),
has(parse_inverse_order, t) && indexes[parse_inverse_order[t]].push(head || parent));
// Regexp flag special cases.
// Normally a () group wraps an expression, so a following / would indicate division. The only exception to this is when we have a block construct; in this case, the next token appears in
// statement-mode, which means that it begins, not modifies, a value. We'll know that we have such a case if (1) the immediately-preceding token is a close-paren, and (2) a block-accepting
// syntactic form occurs to its left.
// With all this trouble over regular expressions, I had to wonder whether it was possible to do it more cleanly. I don't think it is, unfortunately. Even lexing the stream backwards fails
// to resolve the ambiguity:
// | for (var k in foo) /foo/g.test(k) && bar();
// In this case we won't know it's a regexp until we hit the 'for' keyword (or perhaps 'var', if we're being clever -- but a 'with' or 'if' would require complete lookahead). A perfectly
// valid alternative parse, minus the 'for' and 'var', is this:
// | ((k in foo) / (foo) / (g.test(k))) && bar();
// The only case where reverse-lexing is useful is when the regexp has no modifiers.
re |= t === ')' && head.l && has(parse_r_until_block, head.l.data)}
// Operator fold loop.
// This is the second major part of the parser. Now that we've completed the lex process, we can fold operators and syntax, and take care of some exception cases.
// First step: fold function literals, function calls, dots, and dereferences.
// I'm treating this differently from the generalized operator folding because of the syntactic inference required for call and dereference detection. Nothing has been folded at this point
// (with the exception of paren groups, which is appropriate), so if the node to the left of any ( or [ group is an operator, then the ( or [ is really a paren group or array literal. If, on
// the other hand, it is another value, then the group is a function call or a dereference. This folding goes left-to-right. The reason we also process dot operators is that they share the same
// precedence as calls and dereferences. Here's what a () or [] transform looks like:
// | quux <--> foo <--> ( <--> bar quux <--> () <--> bar
// \ / \ <-- This can be done by saying _.l.wrap(new node('()')).p.fold_r().
// bif <--> , <--> baz --> foo ( _.l.wrap() returns l again, .p gets the wrapping node, and fold_r adds a child to it.
// \
// bif <--> , <--> baz
// This is actually merged into the for loop below, even though it happens before other steps do (see 'Ambiguous parse groups').
// Second step: fold operators.
// Now we can go through the list of operators, folding each according to precedence and associativity. Highest to lowest precedence here, which is just going forwards through the indexes[]
// array. The parse_index_forward[] array indicates which indexes should be run left-to-right and which should go right-to-left.
for (var i = 0, l = indexes.length, forward, _; _ = indexes[i], forward = parse_index_forward[i], i < l; ++i)
for (var j = forward ? 0 : _.length - 1, lj = _.length, inc = forward ? 1 : -1, node, data; node = _[j], data = node && node.data, forward ? j < lj : j >= 0; j += inc)
// Binary node behavior.
// The most common behavior is binary binding. This is the usual case for operators such as '+' or ',' -- they grab one or both of their immediate siblings regardless of what they are.
// Operators in this class are considered to be 'fold_lr'; that is, they fold first their left sibling, then their right.
if (has(parse_lr, data)) node._fold_lr();
// Ambiguous parse groups.
// As mentioned above, we need to determine whether grouping constructs are invocations or real groups. This happens to take place before other operators are parsed (which is good -- that way
// it reflects the precedence of dereferencing and invocation). The only change we need to make is to discard the explicit parenthetical or square-bracket grouping for invocations or
// dereferences, respectively. It doesn't make much sense to have a doubly-nested structure, where we have a node for invocation and another for the group on the right-hand side of that
// invocation. Better is to modify the group in-place to represent an invocation.
// We can't solve this problem here, but we can solve it after the parse has finished. I'm pushing these invocation nodes onto an index for the end.