diff --git a/src/NQP/Actions.pm b/src/NQP/Actions.pm index 0159111..8a1bda9 100644 --- a/src/NQP/Actions.pm +++ b/src/NQP/Actions.pm @@ -47,11 +47,26 @@ sub colonpair_str($ast) { } method comp_unit($/) { - my $past := $.ast; - my $BLOCK := @BLOCK.shift; - $BLOCK.push($past); - $BLOCK.node($/); - make $BLOCK; + my $mainline := $.ast; + my $unit := @BLOCK.shift; + + # We force a return here, because we have other + # :load/:init blocks to execute that we don't want + # to include as part of the mainline. + $unit.push( + PAST::Op.new( :pirop, $mainline ) + ); + + # If this code is loaded via load_bytecode, we want the unit mainline + # to be executed after all other loadinits have taken place. + $unit.push( + PAST::Block.new( + :pirflags(':load'), :lexical(0), :namespace(''), + PAST::Op.new( :pasttype, PAST::Val.new( :value($unit) ) ) + ) + ); + $unit.node($/); + make $unit; } method statementlist($/) { @@ -60,9 +75,7 @@ method statementlist($/) { for $ { my $ast := $_.ast; $ast := $ast if pir::defined($ast); - if $ast.isa(PAST::Block) && !$ast.blocktype { - $ast := block_immediate($ast); - } + if $ast { $ast := block_immediate($ast); } $past.push( $ast ); } } @@ -104,6 +117,7 @@ method blockoid($/) { my $BLOCK := @BLOCK.shift; $BLOCK.push($past); $BLOCK.node($/); + $BLOCK.closure(1); make $BLOCK; } @@ -280,8 +294,7 @@ method term:sym($/) { make $.ast; } method fatarrow($/) { my $past := $.ast; - my $name := ?$ ?? $.ast !! $.Str; - $past.named( $name ); + $past.named( $.Str ); make $past; } @@ -367,6 +380,10 @@ method scope_declarator:sym($/) { make $.ast; } method scope_declarator:sym($/) { make $.ast; } method scoped($/) { + make $.ast; +} + +method declarator($/) { make $ ?? $.ast !! $.ast; @@ -615,6 +632,10 @@ method arglist($/) { if $past[$i].name eq '&prefix:<|>' { $past[$i] := $past[$i][0]; $past[$i].flat(1); + if $past[$i].isa(PAST::Val) + && pir::substr($past[$i].name, 0, 1) eq '%' { + $past[$i].named(1); + } } $i++; } @@ -649,9 +670,11 @@ method circumfix:sym($/) { make $.ast; } method circumfix:sym<« »>($/) { make $.ast; } method circumfix:sym<{ }>($/) { - make +$ > 0 - ?? $.ast - !! vivitype('%'); + my $past := +$ > 0 + ?? $.ast + !! vivitype('%'); + $past := 1; + make $past; } method circumfix:sym($/) { @@ -735,6 +758,7 @@ method quote_escape:sym<{ }>($/) { :pirop('set S*'), block_immediate($.ast), :node($/) ); } +method quote_escape:sym($/) { make "\c[27]"; } ## Operators @@ -761,6 +785,23 @@ method prefix:sym($/) { ); } +sub control($/, $id) { + make PAST::Op.new( + :node($/), + :pasttype('inline'), + :inline( + '.include "except_types.pasm"', + ' %r = new "Exception"', + ' %r["type"] = ' ~ $id, + ' throw %r' + ) + ) +} + +method term:sym($/) { control($/, '.CONTROL_LOOP_NEXT') } +method term:sym($/) { control($/, '.CONTROL_LOOP_LAST') } +method term:sym($/) { control($/, '.CONTROL_LOOP_REDO') } + method infix:sym<~~>($/) { make PAST::Op.new( :pasttype, :name, :node($/) ); } @@ -773,29 +814,71 @@ class NQP::RegexActions is Regex::P6Regex::Actions { make PAST::Regex.new( $past, :pasttype('pastnode') ); } - method metachar:sym<{ }>($/) { make $.ast; } + method metachar:sym($/) { + my $past; + my $name := $ ?? +$ !! ~$; + if $ { + if $ { + $/.CURSOR.panic('"$var = " syntax not yet supported in regexes'); + } + $past := $[0].ast; + if $past.pasttype eq 'quant' && $past[0].pasttype eq 'subrule' { + Regex::P6Regex::Actions::subrule_alias($past[0], $name); + } + elsif $past.pasttype eq 'subrule' { Regex::P6Regex::Actions::subrule_alias($past, $name); } + else { + $past := PAST::Regex.new( $past, :name($name), :pasttype('subcapture'), :node($/) ); + } + } + else { + if $ { + my @MODIFIERS := Q:PIR { + %r = get_hll_global ['Regex';'P6Regex';'Actions'], '@MODIFIERS' + }; + my $subtype := @MODIFIERS[0] ?? 'interp_literal_i' !! 'interp_literal'; + $past := PAST::Regex.new( $.ast, :pasttype('pastnode'), + :subtype($subtype), :node($/) ); + } else { + $past := PAST::Regex.new( '!BACKREF', $name, :pasttype('subrule'), + :subtype('method'), :node($/) ); + } + } + make $past; + } + + method assertion:sym($/) { + make PAST::Regex.new( $.ast, :pasttype('pastnode'), + :subtype('interp_regex'), :node($/) ); + } + - method assertion:sym<{ }>($/) { make $.ast; } + method metachar:sym<{ }>($/) { + make PAST::Regex.new(:node($/), :pasttype('pastnode'), $.ast); + } + + method assertion:sym<{ }>($/) { + make PAST::Regex.new( :node($/), :pasttype('pastnode'), :subtype('interp_regex'), + $.ast ); + } method codeblock($/) { my $block := $.ast; $block.blocktype('immediate'); - my $past := - PAST::Regex.new( - PAST::Stmts.new( - PAST::Op.new( - PAST::Var.new( :name('$/') ), - PAST::Op.new( - PAST::Var.new( :name('$¢') ), - :name('MATCH'), - :pasttype('callmethod') - ), - :pasttype('bind') - ), - $block + make bindmatch($block); + } + + sub bindmatch($past) { + PAST::Stmts.new( + PAST::Op.new( + PAST::Var.new( :name('$/') ), + PAST::Op.new( + PAST::Var.new( :name('$¢') ), + :name('MATCH'), + :pasttype('callmethod') ), - :pasttype('pastnode') - ); - make $past; + :pasttype('bind') + ), + $past, + ); } } diff --git a/src/NQP/Grammar.pm b/src/NQP/Grammar.pm index aa2f77c..c14a51c 100644 --- a/src/NQP/Grammar.pm +++ b/src/NQP/Grammar.pm @@ -31,9 +31,10 @@ token ENDSTMT { token ws { || || - [ \s+ + [ \v+ | '#' \N* | ^^ <.pod_comment> + | \h+ ]* } @@ -51,10 +52,10 @@ token pod_comment { ^^ \h* '=' [ | 'begin' \h+ 'END' >> - [ .*? \n '=' 'end' \h+ 'END' » \N* || .* ] + [ .*? \n \h* '=' 'end' \h+ 'END' » \N* || .* ] | 'begin' \h+ [ - || .*? \n '=' 'end' \h+ $ » \N* + || .*? \n \h* '=' 'end' \h+ $ » \N* || <.panic: '=begin without matching =end'> ] | 'begin' » \h* @@ -63,10 +64,14 @@ token pod_comment { || .*? \n \h* '=' 'end' » \N* || <.panic: '=begin without matching =end'> ] + | + .*? ^^ ]? + | \n ]> | - [ - <.panic: 'Obsolete pod format, please use =begin/=end instead'> ]? - [ || \s || <.panic: 'Illegal pod directive'> ] + [ \s || <.panic: 'Illegal pod directive'> ] \N* ] } @@ -224,9 +229,7 @@ token term:sym { } token term:sym { } token fatarrow { - | \h* '=>' <.ws> - | $= \h* '=>' <.ws> - | $= \h* '=>' <.ws> + \h* '=>' <.ws> } token colonpair { @@ -268,6 +271,13 @@ token scope_declarator:sym { } token scope_declarator:sym { } rule scoped($*SCOPE) { + | + | + # eventually +} + +token typename { } + +token declarator { | | } @@ -297,6 +307,7 @@ rule method_def { token signature { [ [<.ws><.ws>] ** ',' ]? } token parameter { + [ <.ws> ]* # [ | $=['*'] | [ | ] $=['?'|'!'|] @@ -390,9 +401,9 @@ token number { proto token quote { <...> } token quote:sym { } token quote:sym { } -token quote:sym { 'q' <.ws> } -token quote:sym { 'qq' <.ws> } -token quote:sym { 'Q' <.ws> } +token quote:sym { 'q' >> <.ws> } +token quote:sym { 'qq' >> <.ws> } +token quote:sym { 'Q' >> <.ws> } token quote:sym { 'Q:PIR' <.ws> } token quote:sym { '/' @@ -404,6 +415,7 @@ token quote:sym { token quote_escape:sym<$> { } token quote_escape:sym<{ }> { } +token quote_escape:sym { \\ e } token circumfix:sym<( )> { '(' <.ws> ? ')' } token circumfix:sym<[ ]> { '[' <.ws> ? ']' } @@ -523,6 +535,9 @@ token infix:sym<,> { ')> } token prefix:sym { \s ')> } token prefix:sym { \s } +token term:sym { } +token term:sym { } +token term:sym { } method smartmatch($/) { # swap rhs into invocant position @@ -535,6 +550,26 @@ grammar NQP::Regex is Regex::P6Regex::Grammar { ':' <.ws> ';' } + token metachar:sym<$> { + + } + + token metachar:sym { + [ + | '$<' $=[<-[>]>+] '>' + | '$' $=[\d+] + | \w> + | <.panic: "Use of hash variable in patterns is reserved"> + ] + + [ <.ws> '=' <.ws> ]? + } + + token assertion:sym { + | \w> + | <.panic: "Use of hash variable in patterns is reserved"> + } + token metachar:sym<{ }> { } diff --git a/src/PAST/Compiler-Regex.pir b/src/PAST/Compiler-Regex.pir index 489e5e2..8d7cced 100644 --- a/src/PAST/Compiler-Regex.pir +++ b/src/PAST/Compiler-Regex.pir @@ -744,31 +744,121 @@ second child of this node. =item 'pastnode'(PAST::Regex node) +Evaluates the supplied PAST node and does various things with the result, based on subtype. + +Subtype can be any of: + +=over 4 + +=item zerowidth + +Only test for truthiness and fail or not. No interpolation. + +=item interp_regex + +String values should be compiled into regexes and then interpolated. + +=item interp_literal + +String values should be treated as literals. + +=item interp_literal_i + +String values should be treated as literals and matched case-insensitively. + +=item + +Don't interpolate anything, just execute the PAST code + +=back + =cut -.sub 'pastnode' :method :multi(_, ['PAST';'Regex']) +.sub 'pastnode' :method :multi(_, ['PAST'; 'Regex']) .param pmc node - .local pmc cur, pos, fail, ops - (cur, pos, fail) = self.'!rxregs'('cur pos fail') + .local pmc cur, pos, fail, ops, eos, off, tgt + (cur, pos, eos, off, tgt, fail) = self.'!rxregs'('cur pos eos off tgt fail') ops = self.'post_new'('Ops', 'node'=>node, 'result'=>cur) + + .local pmc zerowidth, negate, testop, subtype + subtype = node.'subtype'() + ops.'push_pirop'('inline', subtype, negate, 'inline'=>' # rx pastnode subtype=%1 negate=%2') .local pmc cpast, cpost cpast = node[0] cpost = self.'as_post'(cpast, 'rtype'=>'P') - + self.'!cursorop'(ops, '!cursor_pos', 0, pos) ops.'push'(cpost) - .local pmc subtype, negate, testop - subtype = node.'subtype'() - if subtype != 'zerowidth' goto done + # If this is just a zerowidth assertion, we don't actually interpolate anything. Just evaluate + # and fail or not. + if subtype == 'zerowidth' goto zerowidth_test + + # Retain backwards compatibility with old pastnode semantics + unless subtype goto done + + .local string prefix + prefix = self.'unique'('pastnode_') + .local pmc precompiled_label, done_label, loop_label, iterator_reg, label_reg + $S0 = concat prefix, '_precompiled' + precompiled_label = self.'post_new'('Label', 'result'=>$S0) + $S0 = concat prefix, '_done' + done_label = self.'post_new'('Label', 'result'=>$S0) + $S0 = concat prefix, '_loop' + loop_label = self.'post_new'('Label', 'result'=>$S0) + iterator_reg = self.'uniquereg'("P") + label_reg = self.'uniquereg'("I") + + $S10 = subtype + $S10 = concat '"', $S10 + $S10 = concat $S10, '"' + self.'!cursorop'(ops, '!process_pastnode_results_for_interpolation', 1, '$P10', cpost, $S10) + + ops.'push_pirop'('iter', iterator_reg, '$P10') + ops.'push_pirop'('set_addr', label_reg, loop_label) + ops.'push'(loop_label) + ops.'push_pirop'('unless', iterator_reg, fail) + ops.'push_pirop'('shift', '$P10', iterator_reg) + self.'!cursorop'(ops, '!mark_push', 0, 0, pos, label_reg) + + # Check if it's already a compiled Regex, and call it as a method if so + ops.'push_pirop'('isa', '$I10', '$P10', "['Sub']") + ops.'push_pirop'('if', '$I10', precompiled_label) + + # Otherwise, treat it as a literal + ops.'push_pirop'('set', '$S10', '$P10') + ops.'push_pirop'('length', '$I10', '$S10') + ops.'push_pirop'('add', '$I11', pos, '$I10') + ops.'push_pirop'('gt', '$I11', eos, fail) + ops.'push_pirop'('sub', '$I11', pos, off) + ops.'push_pirop'('substr', '$S11', tgt, '$I11', '$I10') + ne subtype, 'interp_literal_i', dont_downcase + ops.'push_pirop'('downcase', '$S10', '$S10') + ops.'push_pirop'('downcase', '$S11', '$S11') + dont_downcase: + ops.'push_pirop'('ne', '$S11', '$S10', fail) + ops.'push_pirop'('add', pos, '$I10') + ops.'push_pirop'('goto', done_label) + + ops.'push'(precompiled_label) + ops.'push_pirop'('callmethod', '$P10', cur, 'result'=>'$P10') + ops.'push_pirop'('unless', '$P10', fail) + self.'!cursorop'(ops, '!mark_push', 0, 0, CURSOR_FAIL, 0, '$P10') + ops.'push_pirop'('callmethod', '"pos"', '$P10', 'result'=>pos) + + ops.'push'(done_label) + + goto done + + zerowidth_test: negate = node.'negate'() testop = self.'??!!'(negate, 'if', 'unless') ops.'push_pirop'(testop, cpost, fail) done: .return (ops) -.end +.end =item pass(PAST::Regex node) diff --git a/src/Regex/Cursor.pir b/src/Regex/Cursor.pir index 9f220d1..b393598 100644 --- a/src/Regex/Cursor.pir +++ b/src/Regex/Cursor.pir @@ -38,6 +38,34 @@ grammars. =over 4 +=item new_match() + +A method that creates an empty Match object, by default of type +C. This method can be overridden for generating HLL-specific +Match objects. + +=cut + +.sub 'new_match' :method + .local pmc match + match = new ['Regex';'Match'] + .return (match) +.end + +=item new_array() + +A method that creates an empty array object, by default of type +C. This method can be overridden for generating HLL-specific +arrays for usage within Match objects. + +=cut + +.sub 'new_array' :method + .local pmc arr + arr = new ['ResizablePMCArray'] + .return (arr) +.end + =item MATCH() Return this cursor's current Match object, generating a new one @@ -55,7 +83,7 @@ for the Cursor if one hasn't been created yet. # First, create a Match object and bind it match_make: - match = new ['Regex';'Match'] + match = self.'new_match'() setattribute self, '$!match', match setattribute match, '$!cursor', self .local pmc target, from, to @@ -78,7 +106,7 @@ for the Cursor if one hasn't been created yet. .local pmc arr .local int keyint subname = shift caparray_it - arr = new ['ResizablePMCArray'] + arr = self.'new_array'() caphash[subname] = arr keyint = is_cclass .CCLASS_NUMERIC, subname, 0 if keyint goto caparray_int @@ -275,7 +303,7 @@ provided, then the new cursor has the same type as lang. parrotclass = getattribute $P0, 'parrotclass' cur = new parrotclass - .local pmc from, pos, target, debug + .local pmc from, target, debug from = getattribute self, '$!pos' setattribute cur, '$!from', from @@ -671,6 +699,82 @@ Match the backreference given by C. .return (cur) .end +=item !process_pastnode_results_for_interpolation + +Used by the pastnode PAST::Regex type to prepare the results of the evaluation for interpolation. + +Takes two arguments: + +=over 4 + +=item The node results + +=item The subtype of the PAST::Regex node, which is one of: + +=over 4 + +=item interp_regex + +String values should be compiled into regexes and then interpolated. + +=item interp_literal + +String values should be treated as literals. + +=item interp_literal_i + +String values should be treated as literals and matched case-insensitively. + +=back + +=back + +Returns a RPA containing the elements to be interpolated + +=cut + +.sub '!process_pastnode_results_for_interpolation' :method + .param pmc node + .param string subtype + + .local pmc it, result, compiler, context + .local string codestr + + result = new ['ResizablePMCArray'] + $S0 = typeof node + if $S0 == 'ResizablePMCArray' goto array + $P1 = node + it = box 0 + goto not_array + array: + it = iter node + loop: + unless it, loop_done + $P1 = shift it + not_array: + if subtype != 'interp_regex' goto literal + # Don't need to compile it if it's already a Sub + $I0 = isa $P1, ['Sub'] + if $I0 goto literal + codestr = $P1 + $P1 = split '/', codestr + codestr = join '\\/', $P1 + codestr = concat '/', codestr + codestr = concat codestr, '/' + compiler = compreg 'NQP-rx' + $P2 = getinterp + context = $P2['context';0] + $P2 = compiler.'compile'(codestr, 'outer_ctx'=>context) + $P1 = $P2[0] + $P2 = getattribute context, 'current_sub' + $P1.'set_outer'($P2) + $P1 = $P1() + literal: + push result, $P1 + goto loop + loop_done: + .return (result) +.end =back diff --git a/t/nqp/49-regex-interpolation.t b/t/nqp/49-regex-interpolation.t new file mode 100644 index 0000000..63c9aab --- /dev/null +++ b/t/nqp/49-regex-interpolation.t @@ -0,0 +1,49 @@ +#! nqp + +plan(28); + +my $b := "b+"; +my @foo := [ "b+", "c+" ]; + +ok("ab+d" ~~ /a $b d/, 'plain scalar interpolates as literal 1'); +ok(!("abbbbbd" ~~ /a $b d/), 'plain scalar interpolates as literal 2'); +ok("ab+d" ~~ /a @foo d/, 'plain array interpolates as alternations of literals 1'); +ok("ac+d" ~~ /a @foo d/, 'plain array interpolates as alternations of literals 2'); +ok(!("abbbbbd" ~~ /a @foo d/), 'plain array interpolates as alternations of literals 3'); +ok(!("acccccd" ~~ /a @foo d/), 'plain array interpolates as alternations of literals 4'); + +ok(!("ab+d" ~~ /a <$b> d/), 'scalar assertion interpolates as regex 1'); +ok("abbbbbd" ~~ /a <$b> d/, 'scalar assertion interpolates as regex 2'); +ok(!("ab+d" ~~ /a <@foo> d/), 'array assertion interpolates as alternations of regexen 1'); +ok(!("ac+d" ~~ /a <@foo> d/), 'array assertion interpolates as alternations of regexen 2'); +ok("abbbbbd" ~~ /a <@foo> d/, 'array assertion interpolates as alternations of regexen 3'); +ok("acccccd" ~~ /a <@foo> d/, 'array assertion interpolates as alternations of regexen 4'); + +ok(!("ab+d" ~~ /a <{ "b+" }> d/), 'code assersion interpolates as regex 1'); +ok("abbbbd" ~~ /a <{ "b+" }> d/, 'code assersion interpolates as regex 2'); + +ok("abbbbd" ~~ /a <{ ["b+", "c+"] }> d/, 'code assertion that returns array interpolates as alternations of regexen 1'); +ok("accccd" ~~ /a <{ ["b+", "c+"] }> d/, 'code assertion that returns array interpolates as alternations of regexen 2'); + +my $r := /b+/; + +ok(!("ab+d" ~~ /a $r d/), 'plain scalar containing precompiled regex 1'); +ok("abbbd" ~~ /a $r d/, 'plain scalar containing precompiled regex 2'); + +my @r := [ /b+/, "c+" ]; + +ok("abbbbd" ~~ /a @r d/, 'plain array containing mix of precompiled and literal 1'); +ok("ac+d" ~~ /a @r d/, 'plain array containing mix of precompiled and literal 1'); + +my $xyz := 'xyz'; + +ok("axyzxyzd" ~~ /a $xyz+ d/, 'Quantified plain scalar 1'); +ok("ab+b+b+d" ~~ /a $b+ d/, 'Quantified plain scalar 2'); +ok("abbbc+bbbd" ~~ /a @r+ d/, 'Quantified plain array'); +ok("abbbcccbbcd" ~~ /a <{ [ "b+", /c+/ ] }>+ d/, 'Quantified code assertion'); + +ok("ad" ~~ /a { "bc" } d/, "Plain closure doesn't interpolate 1"); +ok(!("abcd" ~~ /a { "bc" } d/), "Plain closure doesn't interpolate 2"); + +ok("ad" ~~ /a d/, 'Zero-width assertions still work 1'); +ok(!("ad" ~~ /a d/), 'Zero-width assertions still work 2');