diff --git a/t/DEBUG.pm6 b/t/DEBUG.pm6 new file mode 100644 index 00000000..da3d4208 --- /dev/null +++ b/t/DEBUG.pm6 @@ -0,0 +1,15 @@ +my module DEBUG; +our constant autolexer is export = 1; +our constant symtab is export = 2; +our constant fixed_length is export = 4; +our constant fates is export = 8; +our constant longest_token_pattern_generation is export = 16; +our constant EXPR is export = 32; +our constant matchers is export = 64; +our constant trace_call is export = 128; +our constant cursors is export = 256; +our constant try_processing is export = 1024; +our constant mixins is export = 2048; +our constant callm_show_subnames is export = 16384; +our constant use_color is export = 32768; + diff --git a/t/JsonTinyG.pl b/t/JsonTinyG.pl new file mode 100644 index 00000000..66e1089c --- /dev/null +++ b/t/JsonTinyG.pl @@ -0,0 +1,48 @@ +grammar JSONGrammar { + rule TOP { [ | ] { say "parsed {$/.pos} chars" } } + rule object {'{' '}' } + rule pairlist { [ [ \, ]* ]? } + rule pair { ':' } + rule array {'[' [ [ \, ]* ]? ']' } + + proto token value {*} + token value:sym { + '-'? + [ 0 | <[1..9]> <[0..9]>* ] + [ \. <[0..9]>+ ]? + [ <[eE]> [\+|\-]? <[0..9]>+ ]? + } + token value:sym { } + token value:sym { } + token value:sym { } + token value:sym { } + token value:sym { } + token value:sym { } + + token string { + \" [ | \\ ]* \" + } + + token str { + [ + + + + + . + ]+ + # <-["\\\t\n]>+ + } + + token xdigit { + <[0..9 a..f A..F]> + } + + token str_escape { + <["\\/bfnrt]> | u + } +} + +JSONGrammar.parse($*IN.slurp) + +# vim: ft=perl6 diff --git a/t/STD.pm6 b/t/STD.pm6 new file mode 100644 index 00000000..aabae1f6 --- /dev/null +++ b/t/STD.pm6 @@ -0,0 +1,6033 @@ +# STD.pm +# +# Copyright 2007-2010, Larry Wall +# +# You may copy this software under the terms of the Artistic License, +# version 2.0 or later. + +grammar STD:ver<6.0.0.alpha>:auth; + +use DEBUG; + +our $ALL; + +=begin comment + + Contextuals used in STD + ======================= + # per parse + my $*ACTIONS; # class or object which defines reduce actions + my $*SETTINGNAME; # name of core setting + my $*TMP_PREFIX; # where to put tmp files + my $*ORIG; # the original program string + my @*ORIG; # same thing as individual chars + my @*MEMOS; # per-position info such as ws and line number + my $*HIGHWATER; # where we were last looking for things + my $*HIGHMESS; # current parse failure message + my $*HIGHEXPECT; # things we were looking for at the bleeding edge + my $*IN_PANIC; # don't panic recursively + + # symbol table management + our $ALL; # all the stashes, keyed by id + my $*CORE; # the CORE scope + my $*SETTING; # the SETTING scope + my $*GLOBAL; # the GLOBAL scope + my $*PROCESS; # the PROCESS scope + my $*UNIT; # the UNIT scope + my $*CURLEX; # current lexical scope info + my $*CURPKG; # current package scope + + my %*MYSTERY; # names we assume may be post-declared functions + + # tree attributes, marked as propagating up (u) down (d) or up-and-down (u/d) + my %*LANG; # (d) braided languages: MAIN, Q, Regex, etc + + my $*IN_DECL; # (d) a declarator is looking for a name to declare + my $*SCOPE = ""; # (d) which scope declarator we're under + my $*MULTINESS; # (d) which multi declarator we're under + my $*PKGDECL ::= ""; # (d) current package declarator + my $*NEWPKG; # (u/d) new package being declared + my $*NEWLEX; # (u/d) new lex info being declared + my $*DECLARAND; # (u/d) new object associated with declaration + + my $*GOAL ::= "(eof)"; # (d) which special terminator we're most wanting + my $*IN_REDUCE; # (d) attempting to parse an [op] construct + my $*IN_META; # (d) parsing a metaoperator like [..] + my $*QUASIMODO; # (d) don't carp about quasi variables + my $*LEFTSIGIL; # (u) sigil of LHS for item vs list assignment + my $*QSIGIL; # (d) sigil of current interpolation + + my $*INVOCANT_OK; # (d) parsing a list that allows an invocant + my $*INVOCANT_IS; # (u) invocant of args match + + my $*BORG; # (u/d) who to blame if we're missing a block + +=end comment + +=begin notes + + Some rules are named by syntactic category plus an additional symbol + specified in adverbial form, either in bare :name form or in :sym + form. (It does not matter which form you use for identifier symbols, + except that to specify a symbol "sym" you must use the :sym form + of adverb.) If you use the rule within the rule, it will parse the + symbol at that point. At the final reduction point of a rule, if $sym + has been set, that is used as the final symbol name for the rule. This + need not match the symbol specified as part the rule name; that is just + for disambiguating the name. However, if no $sym is set, the original + symbol will be used by default. + + Note that some of these rules are written strangely because we're + still bootstrapping via a preprocessor, gimme5. For instance, + blocks that contain nested braces are delimited by double braces + so that the preprocessor does not need to parse Perl 6 code. + + This grammar relies on transitive longest-token semantics, though + initially we made a feeble attempt to order rules so a procedural + interpretation of alternation could usually produce a correct parse. + (This is becoming less true over time.) + +=end notes + +method p6class () { ::STD::P6 } + +method TOP ($STOP = '') { + my $lang = self.cursor_fresh( self.p6class ); + + if $STOP { + my $*GOAL ::= $STOP; + $lang.unitstop($STOP).comp_unit; + } + else { + $lang.comp_unit; + } +} + +############## +# Precedence # +############## + +# The internal precedence levels are *not* part of the public interface. +# The current values are mere implementation; they may change at any time. +# Users should specify precedence only in relation to existing levels. + +constant %term = (:dba('term') , :prec); +constant %methodcall = (:dba('methodcall') , :prec, :assoc, :uassoc, :fiddly, :!pure); +constant %autoincrement = (:dba('autoincrement') , :prec, :assoc, :uassoc, :!pure); +constant %exponentiation = (:dba('exponentiation') , :prec, :assoc, :pure); +constant %symbolic_unary = (:dba('symbolic unary') , :prec, :assoc, :uassoc, :pure); +constant %multiplicative = (:dba('multiplicative') , :prec, :assoc, :pure); +constant %additive = (:dba('additive') , :prec, :assoc, :pure); +constant %replication = (:dba('replication') , :prec, :assoc, :pure); +constant %concatenation = (:dba('concatenation') , :prec, :assoc, :pure); +constant %junctive_and = (:dba('junctive and') , :prec, :assoc, :pure); +constant %junctive_or = (:dba('junctive or') , :prec, :assoc, :pure); +constant %named_unary = (:dba('named unary') , :prec, :assoc, :uassoc, :pure); +constant %structural = (:dba('structural infix'), :prec, :assoc, :diffy); +constant %chaining = (:dba('chaining') , :prec, :assoc, :diffy, :iffy, :pure); +constant %tight_and = (:dba('tight and') , :prec, :assoc); +constant %tight_or = (:dba('tight or') , :prec, :assoc); +constant %conditional = (:dba('conditional') , :prec, :assoc, :fiddly); +constant %item_assignment = (:dba('item assignment') , :prec, :assoc, :!pure); +constant %list_assignment = (:dba('list assignment') , :prec, :assoc, :sub, :fiddly, :!pure); +constant %loose_unary = (:dba('loose unary') , :prec, :assoc, :uassoc, :pure); +constant %comma = (:dba('comma') , :prec, :assoc, :nextterm, :fiddly, :pure); +constant %list_infix = (:dba('list infix') , :prec, :assoc, :pure); +constant %list_prefix = (:dba('list prefix') , :prec, :assoc, :uassoc); +constant %loose_and = (:dba('loose and') , :prec, :assoc); +constant %loose_or = (:dba('loose or') , :prec, :assoc); +constant %sequencer = (:dba('sequencer') , :prec, :assoc, :nextterm, :fiddly); +constant %LOOSEST = (:dba('LOOSEST') , :prec); +constant %terminator = (:dba('terminator') , :prec, :assoc); + +# "epsilon" tighter than terminator +#constant $LOOSEST = %LOOSEST; +constant $LOOSEST = "a=!"; # XXX preceding line is busted +constant $item_assignment_prec = 'i='; +constant $methodcall_prec = 'y='; + +############## +# Categories # +############## + +# Categories are designed to be easily extensible in derived grammars +# by merely adding more rules in the same category. The rules within +# a given category start with the category name followed by a differentiating +# adverbial qualifier to serve (along with the category) as the longer name. + +# The endsym context, if specified, says what to implicitly check for in each +# rule right after the initial . Normally this is used to make sure +# there's appropriate whitespace. # Note that endsym isn't called if +# isn't called. + +my $*endargs = -1; + +proto token category {*} + +token category:category { } + +token category:sigil { } +proto token sigil {*} + +token category:twigil { } +proto token twigil {*} + +token category:special_variable { } +proto token special_variable {*} + +token category:comment { } +proto token comment {*} + +token category:version { } +proto token version {*} + +token category:module_name { } +proto token module_name {*} + +token category:value { } +proto token value {*} + +token category:term { } +proto token term {*} + +token category:strtonum { } +proto token strtonum {*} + +token category:quote { } +proto token quote {*} + +token category:prefix { } +proto token prefix {*} + +token category:infix { } +proto token infix {*} + +token category:postfix { } +proto token postfix {*} + +token category:dotty { } +proto token dotty {*} + +token category:circumfix { } +proto token circumfix {*} + +token category:postcircumfix { } +proto token postcircumfix {*} # unary as far as EXPR knows... + +token category:quote_mod { } +proto token quote_mod {*} + +token category:trait_mod { } +proto token trait_mod {*} + +token category:type_declarator { } +proto token type_declarator {*} + +token category:scope_declarator { } +proto token scope_declarator {*} + +token category:package_declarator { } +proto token package_declarator {*} + +token category:multi_declarator { } +proto token multi_declarator {*} + +token category:routine_declarator { } +proto token routine_declarator {*} + +token category:regex_declarator { } +proto token regex_declarator {*} + +token category:statement_prefix { } +proto rule statement_prefix {*} + +token category:statement_control { } +proto rule statement_control {*} + +token category:statement_mod_cond { } +proto rule statement_mod_cond {*} + +token category:statement_mod_loop { } +proto rule statement_mod_loop {*} + +token category:infix_prefix_meta_operator { } +proto token infix_prefix_meta_operator {*} + +token category:infix_postfix_meta_operator { } +proto token infix_postfix_meta_operator {*} + +token category:infix_circumfix_meta_operator { } +proto token infix_circumfix_meta_operator {*} + +token category:postfix_prefix_meta_operator { } +proto token postfix_prefix_meta_operator {*} + +token category:prefix_postfix_meta_operator { } +proto token prefix_postfix_meta_operator {*} + +token category:prefix_circumfix_meta_operator { } +proto token prefix_circumfix_meta_operator {*} + +token category:terminator { } +proto token terminator {*} + +token unspacey { <.unsp>? } +token begid { } +token endid { > } +token spacey { > } +token nofun { } + +# Note, don't reduce on a bare sigil unless you don't want a twigil or +# you otherwise don't care what the longest token is. + +token sigil:sym<$> { } +token sigil:sym<@> { } +token sigil:sym<%> { } +token sigil:sym<&> { } + +token twigil:sym<.> { } +token twigil:sym { } +token twigil:sym<^> { } +token twigil:sym<:> { } +token twigil:sym<*> { } +token twigil:sym { } +token twigil:sym<=> { } +token twigil:sym<~> { } + +# overridden in subgrammars +token stopper { } + +# hopefully we can include these tokens in any outer LTM matcher +regex stdstopper { + :temp $*STUB = return self if @*MEMOS[self.pos] :exists; + :dba('standard stopper') + [ + | + | + | > + | $ # unlikely, check last (normal LTM behavior) + ] + { @*MEMOS[$¢.pos] ||= 1; } +} + +token longname { + {} [ > ]* +} + +token name { + [ + | * + | + + ] +} + +token morename { + :my $*QSIGIL ::= ''; + '::' + [ + || > + [ + | + | :dba('indirect name') '(' ~ ')' + ] + || <.panic: "Name component may not be null"> + ]? +} + +############################## +# Quote primitives # +############################## + +# assumes whitespace is eaten already + +method peek_delimiters { + my $pos = self.pos; + my $startpos = $pos; + my $char = substr($*ORIG,$pos++,1); + if $char ~~ /^\s$/ { + self.panic("Whitespace character is not allowed as delimiter"); # "can't happen" + } + elsif $char ~~ /^\w$/ { + self.panic("Alphanumeric character is not allowed as delimiter"); + } + elsif %STD::close2open{$char} { + self.panic("Use of a closing delimiter for an opener is reserved"); + } + elsif $char eq ':' { + self.panic("Colons may not be used to delimit quoting constructs"); + } + + my $rightbrack = %STD::open2close{$char}; + if not defined $rightbrack { + return $char, $char; + } + while substr($*ORIG,$pos,1) eq $char { + $pos++; + } + my $len = $pos - $startpos; + my $start = $char x $len; + my $stop = $rightbrack x $len; + return $start, $stop; +} + +role startstop[$start,$stop] { + token starter { $start } + token stopper { $stop } +} # end role + +role stop[$stop] { + token starter { } + token stopper { $stop } +} # end role + +role unitstop[$stop] { + token unitstopper { $stop } +} # end role + +token unitstopper { $ } + +method balanced ($start,$stop) { self.mixin( ::startstop[$start,$stop] ); } +method unbalanced ($stop) { self.mixin( ::stop[$stop] ); } +method unitstop ($stop) { self.mixin( ::unitstop[$stop] ); } + +method truly ($bool,$opt) { + return self if $bool; + self.sorry("Can't negate $opt adverb"); + self; +} + +token charname { + [ + | + | .*? + ] || <.sorry: "Unrecognized character name"> .*? +} + +token charnames { \s* [<.ws>] ** [','\s*] } + +token charspec { + [ + | :dba('character name') '[' ~ ']' + | \d+ + | <[ ?..Z \\.._ ]> + | <.sorry: "Unrecognized \\c character"> . + ] +} + +proto token backslash {*} +proto token escape {*} +token starter { } +token escape:none { } + +# and this is what makes nibbler polymorphic... +method nibble ($lang) { + self.cursor_fresh($lang).nibbler; +} + +# note: polymorphic over many quote languages, we hope +token nibbler { + :my $text = ''; + :my $from = self.pos; + :my $to = $from; + :my @nibbles = (); + :my $multiline = 0; + { $.from = self.pos; } + [ > + [ + || + {{ + push @nibbles, $¢.makestr(TEXT => $text, _from => $from, _pos => $to ) if $from != $to; + + my $n = $[*-1]; + my @n = @$n; + + push @nibbles, $; + push @nibbles, @n; + push @nibbles, $; + + $text = ''; + $to = $from = $¢.pos; + }} + || {{ + push @nibbles, $¢.makestr(TEXT => $text, _from => $from, _pos => $to ) if $from != $to; + push @nibbles, $[*-1]; + $text = ''; + $to = $from = $¢.pos; + }} + || . + {{ + my $ch = substr($*ORIG, $¢.pos-1, 1); + $text ~= $ch; + $to = $¢.pos; + if $ch ~~ "\n" { + $multiline++; + } + }} + ] + ]* + {{ + push @nibbles, $¢.makestr(TEXT => $text, _from => $from, _pos => $to ) if $from != $to or !@nibbles; + $ = \@nibbles; + $.pos = $¢.pos; + $ :delete; + $ :delete; + $ :delete; + $ :delete; + $*LAST_NIBBLE = $¢; + $*LAST_NIBBLE_MULTILINE = $¢ if $multiline; + }} +} + +token babble ($l) { + :my $lang = $l; + :my $start; + :my $stop; + + <.ws> + [ <.ws> + { + my $kv = $[*-1]; + $lang = ($lang.tweak($kv., $kv.) + or $lang.sorry("Unrecognized adverb :" ~ $kv. ~ '(' ~ $kv. ~ ')')); + } + ]* + + { + ($start,$stop) = $¢.peek_delimiters(); + $lang = $start ne $stop ?? $lang.balanced($start,$stop) + !! $lang.unbalanced($stop); + $ = [$lang,$start,$stop]; + } +} + +our @herestub_queue; + +class Herestub { + has Str $.delim; + has $.orignode; + has $.lang; +} # end class + +role herestop { + token stopper { ^^ {} $=(\h*?) $*DELIM \h* <.unv>?? $$ \v? } +} # end role + +# XXX be sure to temporize @herestub_queue on reentry to new line of heredocs + +method heredoc () { + my $*CTX ::= self.callm if $*DEBUG +& DEBUG::trace_call; + my $here = self; + while my $herestub = shift @herestub_queue { + my $*DELIM = $herestub.delim; + my $lang = $herestub.lang.mixin( ::herestop ); + my $doc; + if ($doc) = $here.nibble($lang) { + $here = $doc.trim_heredoc(); + $herestub.orignode = $doc; + } + else { + self.panic("Ending delimiter $*DELIM not found"); + } + } + return self.cursor($here.pos); # return to initial type +} + +token quibble ($l) { + :my ($lang, $start, $stop); + + { my $B = $; ($lang,$start,$stop) = @$B; } + + $start [ $stop || <.panic: "Couldn't find terminator $stop"> ] + + {{ + if $lang<_herelang> { + push @herestub_queue, + ::Herestub.new( + delim => $[0], + orignode => $¢, + lang => $lang<_herelang>, + ); + } + }} +} + +token quotepair { + :my $key; + :my $value; + + ':' + :dba('colon pair (restricted)') + [ + | '!' [ <.sorry: "Argument not allowed on negated pair"> ]? + { $key = $.Str; $value = 0; } + | + { $key = $.Str; } + [ + || <.unsp>? { $value = $; } + || { $value = 1; } + ] + | $=(\d+) $=(<[a..z]>+) [ <.sorry: "2nd argument not allowed on pair"> ]? + { $key = $.Str; $value = $.Str; } + ] + { $ = $key; $ = $value; } +} + +token quote:sym<' '> { :dba('single quotes') "'" ~ "'" ).tweak(:q).unbalanced("'"))> } +token quote:sym<" "> { :dba('double quotes') '"' ~ '"' ).tweak(:qq).unbalanced('"'))> } + +token circumfix:sym<« »> { :dba('shell-quote words') '«' ~ '»' ).tweak(:qq).tweak(:ww).balanced('«','»'))> } +token circumfix:sym«<< >>» { :dba('shell-quote words') '<<' ~ '>>' ).tweak(:qq).tweak(:ww).balanced('<<','>>'))> } +token circumfix:sym«< >» { :dba('quote words') '<' ~ '>' + [ + [ ' > <.obs('', '$' ~ '*IN.lines')> ]? # XXX fake out gimme5 + [ ' > <.obs('<>', "lines() to read input,\n or ('') to represent the null string,\n or () to represent Nil")> ]? + ).tweak(:q).tweak(:w).balanced('<','>'))> + ] +} + +################## +# Lexer routines # +################## + +token ws { + :temp $*STUB = return self if @*MEMOS[self.pos] :exists; + :my $startpos = self.pos; + :my $*HIGHEXPECT = {}; + + :dba('whitespace') + [ + | \h+ { @*MEMOS[$¢.pos] = $startpos; } # common case + | ::: + { @*MEMOS[$startpos]:delete; } + <.sorry: "Whitespace is required between alphanumeric tokens"> # must \s+ between words + ] + || + [ + | <.unsp> + | <.vws> <.heredoc> + | <.unv> + | $ { $¢.moreinput } + ]* + + {{ + if ($¢.pos == $startpos) { + @*MEMOS[$¢.pos]:delete; + } + else { + @*MEMOS[$¢.pos] = $startpos; + @*MEMOS[$¢.pos] = @*MEMOS[$startpos] + if @*MEMOS[$startpos] :exists; + } + }} +} + +token unsp { + \\ + :dba('unspace') + [ + | <.vws> + | <.unv> + | $ { $¢.moreinput } + ]* +} + +token vws { + :dba('vertical whitespace') + [ + [ + | \v + | '#DEBUG -1' { say "DEBUG"; $*DEBUG = -1; } \V* \v + | '<<<<<<<' :: >>>>>>' > <.sorry: 'Found a version control conflict marker'> \V* \v + | '=======' :: .*? \v '>>>>>>>' \V* \v # ignore second half + ] + ]+ +} + +# We provide two mechanisms here: +# 1) define $*moreinput, or +# 2) override moreinput method +method moreinput () { + $*moreinput.() if $*moreinput; + self; +} + +token unv { + :dba('horizontal whitespace') + [ + | \h+ + | ^^ <.pod_comment> + | \h* + ] +} + +token comment:sym<#`(...)> { + '#`' :: [ || <.panic: "Opening bracket is required for #` comment"> ] + <.quibble($¢.cursor_fresh( %*LANG ))> +} + +token comment:sym<#(...)> { + '#' + <.suppose + ))> + * \h* [ '#' | $$ ] > # extra stuff on line after closer? + > + <.worry: "Embedded comment seems to be missing backtick"> +} + +token comment:sym<#=(...)> { + '#=' :: + ))> +} + +token comment:sym<#=> { + '#=' :: $ = [\N*] +} + +token comment:sym<#> { + '#' {} \N* +} + +token ident { + <.alpha> \w* +} + +token apostrophe { + <[ ' \- ]> +} + +token identifier { + <.ident> [ <.apostrophe> <.ident> ]* +} + +# XXX We need to parse the pod eventually to support $= variables. + +token pod_comment { + ^^ \h* '=' <.unsp>? + [ + | 'begin' \h+ :: + [ + || .*? "\n" [ :r \h* '=' <.unsp>? 'end' \h+ $ » \N* ] + || .Str eq 'END'}> .* + || { my $id = $.Str; self.panic("=begin $id without matching =end $id"); } + ] + | 'begin' » :: \h* [ $$ || '#' || <.sorry: "Unrecognized token after =begin"> \N* ] + [ .*? "\n" \h* '=' <.unsp>? 'end' » \N* || { self.panic("=begin without matching =end"); } ] + + | 'for' » :: \h* [ || $$ || '#' || <.sorry: "Unrecognized token after =for"> \N* ] + [.*? ^^ \h* $$ || .*] + | :: + [ <.panic: "Obsolescent pod format, please use =begin/=end instead"> ]? + [||\s||<.sorry: "Illegal pod directive">] + \N* + ] +} + +# suppress fancy end-of-line checking +token embeddedblock { + # encapsulate braided languages + :temp %*LANG; + :my $*SIGNUM; + :my $*GOAL ::= '}'; + :temp $*CURLEX; + + :dba('embedded block') + + <.newlex> + <.finishlex> + '{' :: [ :lang(%*LANG
) ] + [ '}' || <.panic: "Unable to parse statement list; couldn't find right brace"> ] +} + +token binints { [<.ws><.ws>] ** ',' } + +token binint { + <[ 0..1 ]>+ [ _ <[ 0..1 ]>+ ]* +} + +token octints { [<.ws><.ws>] ** ',' } + +token octint { + <[ 0..7 ]>+ [ _ <[ 0..7 ]>+ ]* +} + +token hexints { [<.ws><.ws>] ** ',' } + +token hexint { + <[ 0..9 a..f A..F ]>+ [ _ <[ 0..9 a..f A..F ]>+ ]* +} + +token decints { [<.ws><.ws>] ** ',' } + +token decint { + \d+ [ _ \d+ ]* +} + +token integer { + [ + | 0 [ b '_'? + | o '_'? + | x '_'? + | d '_'? + | + .Str ~ " if you mean that") }> + ] + | + ] + > <.sorry: "Decimal point must be followed by digit">]? > + [ <.sorry: "Only isolated underscores are allowed inside numbers"> ]? +} + +token radint { + [ + | + | + and + not defined $ + }> + ] +} + +token escale { + <[Ee]> <[+\-]>? +} + +# careful to distinguish from both integer and 42.method +token dec_number { + :dba('decimal number') + [ + | $ = [ '.' ] ? + | $ = [ '.' ] ? + | $ = [ ] + ] + [ <.sorry: "Number contains two decimal points (missing 'v' for version number?)"> ['.'\d+]+ ]? + [ <.sorry: "Only isolated underscores are allowed inside numbers"> ]? +} + +token alnumint { + [ <[ 0..9 a..z A..Z ]>+ [ _ <[ 0..9 a..z A..Z ]>+ ]* ] +} + +token rad_number { + ':' $ = [\d+] <.unsp>? # XXX optional dot here? + {} # don't recurse in lexer + :dba('number in radix notation') + [ + || '<' + [ + | $ = [ '.' ] + | $ = [ '.' ] + | $ = [ ] + ] + [ + '*' + [ '**' || <.sorry: "Base is missing ** exponent part"> ] + ]? + '>' +# { make radcalc($, $, $, $) } + || + || + || <.panic: "Malformed radix number"> + ] +} + +token terminator:sym<)> + { } + +token terminator:sym<]> + { ']' } + +token terminator:sym<}> + { '}' } + +# XXX should eventually be derived from current Unicode tables. +constant %open2close = ( +"\x0028" => "\x0029", +"\x003C" => "\x003E", +"\x005B" => "\x005D", +"\x007B" => "\x007D", +"\x00AB" => "\x00BB", +"\x0F3A" => "\x0F3B", +"\x0F3C" => "\x0F3D", +"\x169B" => "\x169C", +"\x2018" => "\x2019", +"\x201A" => "\x2019", +"\x201B" => "\x2019", +"\x201C" => "\x201D", +"\x201E" => "\x201D", +"\x201F" => "\x201D", +"\x2039" => "\x203A", +"\x2045" => "\x2046", +"\x207D" => "\x207E", +"\x208D" => "\x208E", +"\x2208" => "\x220B", +"\x2209" => "\x220C", +"\x220A" => "\x220D", +"\x2215" => "\x29F5", +"\x223C" => "\x223D", +"\x2243" => "\x22CD", +"\x2252" => "\x2253", +"\x2254" => "\x2255", +"\x2264" => "\x2265", +"\x2266" => "\x2267", +"\x2268" => "\x2269", +"\x226A" => "\x226B", +"\x226E" => "\x226F", +"\x2270" => "\x2271", +"\x2272" => "\x2273", +"\x2274" => "\x2275", +"\x2276" => "\x2277", +"\x2278" => "\x2279", +"\x227A" => "\x227B", +"\x227C" => "\x227D", +"\x227E" => "\x227F", +"\x2280" => "\x2281", +"\x2282" => "\x2283", +"\x2284" => "\x2285", +"\x2286" => "\x2287", +"\x2288" => "\x2289", +"\x228A" => "\x228B", +"\x228F" => "\x2290", +"\x2291" => "\x2292", +"\x2298" => "\x29B8", +"\x22A2" => "\x22A3", +"\x22A6" => "\x2ADE", +"\x22A8" => "\x2AE4", +"\x22A9" => "\x2AE3", +"\x22AB" => "\x2AE5", +"\x22B0" => "\x22B1", +"\x22B2" => "\x22B3", +"\x22B4" => "\x22B5", +"\x22B6" => "\x22B7", +"\x22C9" => "\x22CA", +"\x22CB" => "\x22CC", +"\x22D0" => "\x22D1", +"\x22D6" => "\x22D7", +"\x22D8" => "\x22D9", +"\x22DA" => "\x22DB", +"\x22DC" => "\x22DD", +"\x22DE" => "\x22DF", +"\x22E0" => "\x22E1", +"\x22E2" => "\x22E3", +"\x22E4" => "\x22E5", +"\x22E6" => "\x22E7", +"\x22E8" => "\x22E9", +"\x22EA" => "\x22EB", +"\x22EC" => "\x22ED", +"\x22F0" => "\x22F1", +"\x22F2" => "\x22FA", +"\x22F3" => "\x22FB", +"\x22F4" => "\x22FC", +"\x22F6" => "\x22FD", +"\x22F7" => "\x22FE", +"\x2308" => "\x2309", +"\x230A" => "\x230B", +"\x2329" => "\x232A", +"\x23B4" => "\x23B5", +"\x2768" => "\x2769", +"\x276A" => "\x276B", +"\x276C" => "\x276D", +"\x276E" => "\x276F", +"\x2770" => "\x2771", +"\x2772" => "\x2773", +"\x2774" => "\x2775", +"\x27C3" => "\x27C4", +"\x27C5" => "\x27C6", +"\x27D5" => "\x27D6", +"\x27DD" => "\x27DE", +"\x27E2" => "\x27E3", +"\x27E4" => "\x27E5", +"\x27E6" => "\x27E7", +"\x27E8" => "\x27E9", +"\x27EA" => "\x27EB", +"\x2983" => "\x2984", +"\x2985" => "\x2986", +"\x2987" => "\x2988", +"\x2989" => "\x298A", +"\x298B" => "\x298C", +"\x298D" => "\x298E", +"\x298F" => "\x2990", +"\x2991" => "\x2992", +"\x2993" => "\x2994", +"\x2995" => "\x2996", +"\x2997" => "\x2998", +"\x29C0" => "\x29C1", +"\x29C4" => "\x29C5", +"\x29CF" => "\x29D0", +"\x29D1" => "\x29D2", +"\x29D4" => "\x29D5", +"\x29D8" => "\x29D9", +"\x29DA" => "\x29DB", +"\x29F8" => "\x29F9", +"\x29FC" => "\x29FD", +"\x2A2B" => "\x2A2C", +"\x2A2D" => "\x2A2E", +"\x2A34" => "\x2A35", +"\x2A3C" => "\x2A3D", +"\x2A64" => "\x2A65", +"\x2A79" => "\x2A7A", +"\x2A7D" => "\x2A7E", +"\x2A7F" => "\x2A80", +"\x2A81" => "\x2A82", +"\x2A83" => "\x2A84", +"\x2A8B" => "\x2A8C", +"\x2A91" => "\x2A92", +"\x2A93" => "\x2A94", +"\x2A95" => "\x2A96", +"\x2A97" => "\x2A98", +"\x2A99" => "\x2A9A", +"\x2A9B" => "\x2A9C", +"\x2AA1" => "\x2AA2", +"\x2AA6" => "\x2AA7", +"\x2AA8" => "\x2AA9", +"\x2AAA" => "\x2AAB", +"\x2AAC" => "\x2AAD", +"\x2AAF" => "\x2AB0", +"\x2AB3" => "\x2AB4", +"\x2ABB" => "\x2ABC", +"\x2ABD" => "\x2ABE", +"\x2ABF" => "\x2AC0", +"\x2AC1" => "\x2AC2", +"\x2AC3" => "\x2AC4", +"\x2AC5" => "\x2AC6", +"\x2ACD" => "\x2ACE", +"\x2ACF" => "\x2AD0", +"\x2AD1" => "\x2AD2", +"\x2AD3" => "\x2AD4", +"\x2AD5" => "\x2AD6", +"\x2AEC" => "\x2AED", +"\x2AF7" => "\x2AF8", +"\x2AF9" => "\x2AFA", +"\x2E02" => "\x2E03", +"\x2E04" => "\x2E05", +"\x2E09" => "\x2E0A", +"\x2E0C" => "\x2E0D", +"\x2E1C" => "\x2E1D", +"\x2E20" => "\x2E21", +"\x3008" => "\x3009", +"\x300A" => "\x300B", +"\x300C" => "\x300D", +"\x300E" => "\x300F", +"\x3010" => "\x3011", +"\x3014" => "\x3015", +"\x3016" => "\x3017", +"\x3018" => "\x3019", +"\x301A" => "\x301B", +"\x301D" => "\x301E", +"\xFD3E" => "\xFD3F", +"\xFE17" => "\xFE18", +"\xFE35" => "\xFE36", +"\xFE37" => "\xFE38", +"\xFE39" => "\xFE3A", +"\xFE3B" => "\xFE3C", +"\xFE3D" => "\xFE3E", +"\xFE3F" => "\xFE40", +"\xFE41" => "\xFE42", +"\xFE43" => "\xFE44", +"\xFE47" => "\xFE48", +"\xFE59" => "\xFE5A", +"\xFE5B" => "\xFE5C", +"\xFE5D" => "\xFE5E", +"\xFF08" => "\xFF09", +"\xFF1C" => "\xFF1E", +"\xFF3B" => "\xFF3D", +"\xFF5B" => "\xFF5D", +"\xFF5F" => "\xFF60", +"\xFF62" => "\xFF63", +); + +constant %close2open = invert %open2close; + +token opener { + <[ +\x0028 \x003C \x005B \x007B \x00AB \x0F3A \x0F3C \x169B \x2018 \x201A \x201B +\x201C \x201E \x201F \x2039 \x2045 \x207D \x208D \x2208 \x2209 \x220A \x2215 +\x223C \x2243 \x2252 \x2254 \x2264 \x2266 \x2268 \x226A \x226E \x2270 \x2272 +\x2274 \x2276 \x2278 \x227A \x227C \x227E \x2280 \x2282 \x2284 \x2286 \x2288 +\x228A \x228F \x2291 \x2298 \x22A2 \x22A6 \x22A8 \x22A9 \x22AB \x22B0 \x22B2 +\x22B4 \x22B6 \x22C9 \x22CB \x22D0 \x22D6 \x22D8 \x22DA \x22DC \x22DE \x22E0 +\x22E2 \x22E4 \x22E6 \x22E8 \x22EA \x22EC \x22F0 \x22F2 \x22F3 \x22F4 \x22F6 +\x22F7 \x2308 \x230A \x2329 \x23B4 \x2768 \x276A \x276C \x276E \x2770 \x2772 +\x2774 \x27C3 \x27C5 \x27D5 \x27DD \x27E2 \x27E4 \x27E6 \x27E8 \x27EA \x2983 +\x2985 \x2987 \x2989 \x298B \x298D \x298F \x2991 \x2993 \x2995 \x2997 \x29C0 +\x29C4 \x29CF \x29D1 \x29D4 \x29D8 \x29DA \x29F8 \x29FC \x2A2B \x2A2D \x2A34 +\x2A3C \x2A64 \x2A79 \x2A7D \x2A7F \x2A81 \x2A83 \x2A8B \x2A91 \x2A93 \x2A95 +\x2A97 \x2A99 \x2A9B \x2AA1 \x2AA6 \x2AA8 \x2AAA \x2AAC \x2AAF \x2AB3 \x2ABB +\x2ABD \x2ABF \x2AC1 \x2AC3 \x2AC5 \x2ACD \x2ACF \x2AD1 \x2AD3 \x2AD5 \x2AEC +\x2AF7 \x2AF9 \x2E02 \x2E04 \x2E09 \x2E0C \x2E1C \x2E20 \x3008 \x300A \x300C +\x300E \x3010 \x3014 \x3016 \x3018 \x301A \x301D \xFD3E \xFE17 \xFE35 \xFE37 +\xFE39 \xFE3B \xFE3D \xFE3F \xFE41 \xFE43 \xFE47 \xFE59 \xFE5B \xFE5D \xFF08 +\xFF1C \xFF3B \xFF5B \xFF5F \xFF62 + ]> +} + +grammar P6 is STD { + + ################### + # Top-level rules # + ################### + + # Note: we only check for the stopper. We don't check for ^ because + # we might be embedded in something else. + rule comp_unit { + :my $*begin_compunit = 1; + :my $*endargs = -1; + :my %*LANG; + :my $*PKGDECL ::= ""; + :my $*IN_DECL = ''; + :my $*DECLARAND; + :my $*OFTYPE; + :my $*NEWPKG; + :my $*NEWLEX; + :my $*QSIGIL ::= ''; + :my $*IN_META = ''; + :my $*QUASIMODO; + :my $*SCOPE = ""; + :my $*LEFTSIGIL; + :my $*PRECLIM; + :my %*MYSTERY = (); + :my $*INVOCANT_OK; + :my $*INVOCANT_IS; + :my $*CURLEX; + :my $*MULTINESS = ''; + :my $*SIGNUM = 0; + :my $*MONKEY_TYPING = False; + :my %*WORRIES; + :my @*WORRIES; + :my $*FATALS = 0; + :my $*IN_SUPPOSE = False; + + :my $*CURPKG; + {{ + + %*LANG
= ::STD::P6 ; + %*LANG = ::STD::Q ; + %*LANG = ::STD::Quasi ; + %*LANG = ::STD::Regex ; + %*LANG = ::STD::P5 ; + %*LANG = ::STD::P5::Regex ; + + @*WORRIES = (); + self.load_setting($*SETTINGNAME); + my $oid = $*SETTING.id; + my $id = 'MY:file<' ~ $*FILE ~ '>'; + $*CURLEX = Stash.new( + 'OUTER::' => [$oid], + '!file' => $*FILE, '!line' => 0, + '!id' => [$id], + ); + $ALL.{$id} = $*CURLEX; + $*UNIT = $*CURLEX; + $ALL. = $*UNIT; + self.finishlex; + # $¢ = self.cursor_fresh($*CURLEX<$?LANGNAME>); + }} + <.unitstart> + + [ || <.panic: "Confused"> ] + # "CHECK" time... + {{ + $¢.explain_mystery(); + $¢. = $*CURLEX; + if @*WORRIES { + note "Potential difficulties:\n " ~ join( "\n ", @*WORRIES) ~ "\n"; + } + die "Check failed\n" if $*FATALS; + }} + } + + # Note: because of the possibility of placeholders we can't determine arity of + # the block syntactically, so this must be determined via semantic analysis. + # Also, pblocks used in an if/unless statement do not treat $_ as a placeholder, + # while most other blocks treat $_ as equivalent to $^x. Therefore the first + # possible place to check arity is not here but in the rule that calls this + # rule. (Could also be done in a later pass.) + + token pblock () { + :temp $*CURLEX; + :dba('parameterized block') + [ | '{' > || + {{ + if $*BORG and $*BORG. { + if $*BORG. { + my $m = "Function '" ~ $*BORG. ~ "' needs parens to avoid gobbling block" ~ $*BORG..locmess; + $*BORG..panic($m ~ "\nMissing block (apparently gobbled by '" ~ $*BORG. ~ "')"); + } + else { + my $m = "Expression needs parens to avoid gobbling block" ~ $*BORG..locmess; + $*BORG..panic($m ~ "\nMissing block (apparently gobbled by expression)"); + } + } + elsif %*MYSTERY { + $¢.panic("Missing block (apparently gobbled by undeclared routine?)"); + } + else { + $¢.panic("Missing block"); + } + }} + ] + [ + | + <.newlex(1)> + + + <.getsig> + | + <.newlex(1)> + + <.getsig> + ] + } + + # this is a hook for subclasses + token unitstart { } + token lambda { '->' | '<->' } + + # Look for an expression followed by a required lambda. + token xblock { + :my $*GOAL ::= '{'; + :my $*BORG = {}; + + { $*BORG. //= $.cursor(self.pos) } + <.ws> + + } + + token block () { + :temp $*CURLEX; + :dba('scoped block') + [ || <.panic: "Missing block"> ] + <.newlex> + + <.checkyada> + } + + token blockoid { + # encapsulate braided languages + :temp %*LANG; + :my $*SIGNUM; + + <.finishlex> + [ + | '{YOU_ARE_HERE}' <.you_are_here> + | :dba('block') '{' ~ '}' :: <.curlycheck> + | <.panic: 'Missing block'> + | <.panic: "Malformed block"> + ] + } + + token curlycheck { + [ + || # (usual case without comments) + { @*MEMOS[$¢.pos] = 2; } + || > + || <.unv> $$ + { @*MEMOS[$¢.pos] = 2; } + || <.unsp>? { @*MEMOS[$¢.pos] = 1; } + ] + } + + token regex_block { + # encapsulate braided languages + :temp %*LANG; + :temp %*RX; + + :my $lang = %*LANG; + :my $*GOAL ::= '}'; + + [ <.ws> + { + my $kv = $[*-1]; + $lang = ($lang.tweak($kv., $kv.) + or $lang.panic("Unrecognized adverb :" ~ $kv. ~ '(' ~ $kv. ~ ')')); + } + ]* + + [ + | '{*}' { $¢. = 1 } + | [ + '{' + + [ '}' || <.panic: "Unable to parse regex; couldn't find right brace"> ] + ] + ] + + <.curlycheck> + } + + # statement semantics + rule statementlist { + :my $*INVOCANT_OK = 0; + :temp $*MONKEY_TYPING; + :dba('statement list') + + [ + | $ + | > + | [ ]* + { self.mark_sinks($) } + ] + } + + # embedded semis, context-dependent semantics + rule semilist { + :my $*INVOCANT_OK = 0; + :dba('semicolon list') + [ + | > + | [ ]* + ] + } + + + token label { + :my $label; + ':' <.ws> + + [ .Str) }> + <.sorry("Illegal redeclaration of '$label'")> + ]? + + # add label as a pseudo constant + {{ $¢.add_constant($label,self.label_id); }} + + } + + token statement { + :my $*endargs = -1; + :my $*QSIGIL ::= 0; + > + + + # this could either be a statement that follows a declaration + # or a statement that is within the block of a code declaration + .bless($¢); }> + + [ + |