From cd7f1531f02876182a2853867b34ac9cf4406d97 Mon Sep 17 00:00:00 2001 From: Stefan O'Rear Date: Fri, 13 Aug 2010 02:58:22 -0700 Subject: [PATCH] Hook up LTM engine to regex processing --- CgOp.pm | 6 ++++ CodeGen.pm | 8 +++++ Cursor.cs | 25 ++++++++++++- Niecza/Actions.pm | 20 +++++++++++ RxOp.pm | 92 +++++++++++++++++++++++++++++++++++++++++++++++ SAFE.setting | 19 ++++++++++ test.pl | 15 +++++++- test2.pl | 37 ------------------- 8 files changed, 183 insertions(+), 39 deletions(-) diff --git a/CgOp.pm b/CgOp.pm index ae55870b..59b66d35 100644 --- a/CgOp.pm +++ b/CgOp.pm @@ -566,6 +566,12 @@ use warnings; zyg => \@args); } + sub rawnewarr { + my ($name, @args) = @_; + CgOp::Primitive->new(op => [ 'clr_new_arr', $name, scalar @args ], + zyg => \@args); + } + # must only be called during to_cgop phase! sub protosub { my ($body, @extra) = @_; diff --git a/CodeGen.pm b/CodeGen.pm index fa403140..482b7070 100644 --- a/CodeGen.pm +++ b/CodeGen.pm @@ -63,6 +63,8 @@ use 5.010; SetCaps => [m => 'Cursor'], SimpleWS => [m => 'Cursor'], Bind => [m => 'Cursor'] }, + 'Lexer' => + { Run => [m => 'Int32[]'] }, 'System.IO.File.ReadAllText' => [m => 'System.String'], @@ -417,6 +419,12 @@ use 5.010; $self->_push($class, "new $class(" . join(", ", @args) . ")"); } + sub clr_new_arr { + my ($self, $class, $nitems) = @_; + my @args = reverse map { ($self->_popn(1))[0] } 1 .. $nitems; + $self->_push($class . "[]", "new $class []{" . join(", ", @args) . "}"); + } + sub clr_string { my ($self, $text) = @_; $self->_push('System.String', qm($text)); diff --git a/Cursor.cs b/Cursor.cs index 20914f6a..2943e9a6 100644 --- a/Cursor.cs +++ b/Cursor.cs @@ -326,6 +326,28 @@ public class LADSequence : LAD { } } +public class LADImp : LAD { + public override void ToNFA(NFA pad, int from, int to) { + int knot = pad.AddNode(); + pad.nodes[knot].final = true; + pad.AddEdge(from, knot, null); + } + + public override void Dump(int indent) { + Console.WriteLine(new string(' ', indent) + "imp"); + } +} + +public class LADNull : LAD { + public override void ToNFA(NFA pad, int from, int to) { + pad.AddEdge(from, to, null); + } + + public override void Dump(int indent) { + Console.WriteLine(new string(' ', indent) + "null"); + } +} + // These objects get put in hash tables, so don't change nstates[] after // that happens public class LexerState { @@ -403,7 +425,8 @@ public class Lexer { public NFA pad = new NFA(); public string tag; - public static bool LtmTrace = true; + public static bool LtmTrace = + Environment.GetEnvironmentVariable("NIECZA_LTM_TRACE") != null; public Lexer(string tag, LAD[] alts) { this.alts = alts; diff --git a/Niecza/Actions.pm b/Niecza/Actions.pm index 1a093032..0b285301 100644 --- a/Niecza/Actions.pm +++ b/Niecza/Actions.pm @@ -355,6 +355,23 @@ sub quant_atom_list { my ($cl, $M) = @_; [ map { $_->{_ast} } @{ $M->{quantified_atom} } ]); } +my %LISTrx_types = ( + '&' => 'RxOp::Conj', + '|' => 'RxOp::Alt', + '&&' => 'RxOp::SeqConj', + '||' => 'RxOp::SeqAlt', +); +sub LISTrx { my ($cl, $M) = @_; + $M->{_ast} = $LISTrx_types{$M->{delims}[0]{sym}}->new(zyg => + [ map { $_->{_ast} } @{ $M->{list} } ]); +} + +sub regex_infix {} +sub regex_infix__S_Vert {} +sub regex_infix__S_VertVert {} +sub regex_infix__S_Amp {} +sub regex_infix__S_AmpAmp {} + sub metachar {} sub metachar__S_sigwhite { my ($cl, $M) = @_; $M->{_ast} = $::RX{s} ? RxOp::Sigspace->new : RxOp::Sequence->new; @@ -663,6 +680,9 @@ my %loose2tight = ( 'orelse' => '//', 'and' => '&&', 'or' => '||', ); sub LIST { my ($cl, $M) = @_; + if ($M->isa('STD::Regex')) { + goto &LISTrx; + } # STD guarantees that all elements of delims have the same sym # the last item may have an ast of undef due to nulltermish my $op = $M->{delims}[0]{sym}; diff --git a/RxOp.pm b/RxOp.pm index 432fcdf6..f90b583f 100644 --- a/RxOp.pm +++ b/RxOp.pm @@ -79,6 +79,11 @@ use CgOp; ]); } + sub lad { + my ($self) = @_; + CgOp::rawnew('LADStr', CgOp::clr_string($self->text)); + } + __PACKAGE__->meta->make_immutable; no Moose; } @@ -106,6 +111,13 @@ use CgOp; $self->_close_k($cn, $cont)]); } + sub lad { + my ($self) = @_; + $self->minimal ? CgOp::rawnew('LADImp') : + CgOp::rawnew('LAD' . ucfirst($qf{$self->type}), + $self->zyg->[0]->lad); + } + __PACKAGE__->meta->make_immutable; no Moose; } @@ -127,6 +139,37 @@ use CgOp; $cn, $cont; } + sub lad { + my ($self) = @_; + my @z = map { $_->lad } @{ $self->zyg }; + while (@z >= 2) { + my $x = pop @z; + $z[-1] = CgOp::rawnew('LADSequence', $z[-1], $x); + } + $z[0] // CgOp::rawnew('LADNull'); + } + + + __PACKAGE__->meta->make_immutable; + no Moose; +} + +{ + package RxOp::ConfineLang; + use Moose; + extends 'RxOp'; + + # TODO once :lang is implemented, this will be a bit more complicated + sub op { + my ($self, $cn, $cont) = @_; + $self->zyg->[0]->op($cn, $cont); + } + + sub lad { + my ($self) = @_; + $self->zyg->[0]->lad; + } + __PACKAGE__->meta->make_immutable; no Moose; } @@ -150,6 +193,11 @@ use CgOp; $self->_close_k($cn, $cont)]); } + sub lad { + my ($self) = @_; + $self->zyg->[0]->lad; + } + __PACKAGE__->meta->make_immutable; no Moose; } @@ -183,6 +231,11 @@ use CgOp; $self->_close_k($cn, $cont)]); } + sub lad { + my ($self) = @_; + $self->zyg->[0]->lad; + } + __PACKAGE__->meta->make_immutable; no Moose; } @@ -205,6 +258,11 @@ use CgOp; $self->_close_k($cn, $cont)]); } + sub lad { + my ($self) = @_; + CgOp::rawnew('LADMethod', CgOp::clr_string($self->name)); + } + __PACKAGE__->meta->make_immutable; no Moose; } @@ -225,6 +283,40 @@ use CgOp; $self->_close_k($cn, $cont)]); } + sub lad { + my ($self) = @_; + CgOp::rawnew('LADImp'); + } + + __PACKAGE__->meta->make_immutable; + no Moose; +} + +{ + package RxOp::Alt; + use Moose; + extends 'RxOp'; + + sub op { + my ($self, $cn, $cont) = @_; + my $icn = Niecza::Actions->gensym; + $icn, Op::CallSub->new( + invocant => Op::Lexical->new(name => '&_rxalt'), + positionals => [ + Op::Lexical->new(name => $icn), + Op::CgOp->new(op => CgOp::wrap(CgOp::rawnewarr('LAD', + map { $_->lad } @{ $self->zyg }))), #XXX + $self->_close_k($cn, $cont), + map { $self->_close_op($_) } @{ $self->zyg } + ]); + } + + sub lad { + my ($self) = @_; + CgOp::rawnew('LADAny', CgOp::rawnewarr('LAD', + map { $_->lad } @{ $self->zyg })); + } + __PACKAGE__->meta->make_immutable; no Moose; } diff --git a/SAFE.setting b/SAFE.setting index 8be22886..88bc84fd 100644 --- a/SAFE.setting +++ b/SAFE.setting @@ -933,6 +933,25 @@ sub _rxcut($C, $f, $k) { @l && $k(@l.shift); } +sub _rxalt($C, $lad, $k, *@alts) { + sub lbody($ix) { @alts[$ix]($C, $k) } + + Q:CgOp { + (letn csr (unbox Cursor (@ (l $C))) + lexer (rawnew Lexer (clr_string "") + (unwrap 'LAD[]' (@ (l $lad)))) + fates (rawcall (l lexer) Run (getfield backing (l csr)) + (getfield pos (l csr))) + i (int 0) + nfate (getfield Length (l fates)) + (whileloop 0 0 (< (l i) (l nfate)) (prog + (sink (subcall (@ (l &lbody)) + (box Num (cast Double (getindex (l i) (l fates)))))) + (l i (+ (l i) (int 1))))) + (null Variable)) + }; +} + sub _rxcall(@list, $k) { $k(@list.shift) while @list; } diff --git a/test.pl b/test.pl index bc099e97..c69049d7 100644 --- a/test.pl +++ b/test.pl @@ -2,7 +2,7 @@ use Test; -plan 276; +plan 280; ok 1, "one is true"; ok 2, "two is also true"; @@ -721,3 +721,16 @@ ok !("abab" ~~ / ab <.ws> ab /), "ws does not match nothing"; ok ("ab ab" ~~ rule { ab ab }), "rule gives space"; } + +{ + sub meow(*@x) { + is @x[0], 'a', "can index [0] slurpies"; + is @x[1], 'b', "can index [1] slurpies"; + } + + meow('a', 'b'); + + # doing a more reasonable test will probably require embedded blocks + ok "foobarx" ~~ / [ foo | foobar ]: x /, "LTM picks longest even if second"; + ok "foobarx" ~~ / [ foobar | foo ]: x /, "LTM picks longest even if first"; +} diff --git a/test2.pl b/test2.pl index 3b100c60..a82920e2 100644 --- a/test2.pl +++ b/test2.pl @@ -1,41 +1,4 @@ # vim: ft=perl6 use Test; -sub _rxcut($C, $f, $k) { - my @l := gather $f($C, &take); - @l && $k(@l.shift); -} - -PRE-INIT { - Cursor.HOW.add-method("orig", anon method orig { Q:CgOp { - (box Str (getfield backing (unbox Cursor (@ (l self))))) } }); - Cursor.HOW.add-method("ws", anon method ws() { - gather - Q:CgOp { - (letn rt (rawcall (unbox Cursor (@ (l self))) SimpleWS) - [ternary - (!= (l rt) (null Cursor)) - (subcall (@ (l &take)) (box (@ (l self)) (l rt))) - (null Variable)]) - }; - }); -} - -ok ("aab" ~~ /a* ab/), "a*ab backtracks"; -ok !("aab" ~~ /a*: ab/), "a*: ab doesn't"; -ok ("aab" ~~ /a*! ab/), "a*! ab backtracks"; -ok !("aab" ~~ /:r a* ab/), "ratcheting a* ab does not"; -ok !("aab" ~~ /:r a*: ab/), "ratcheting a*: ab does not"; -ok ("aab" ~~ /:r a*! ab/), "ratcheting a*! ab does"; -ok !("aab" ~~ token { a* ab }), "a* ab in a token does not"; - -ok ("ab ab" ~~ / ab <.ws> ab /), "ws matches a space"; -ok (q:to/end/ ~~ / ab <.ws> ab /), "ws matches a newline"; -ab -ab -end -ok ("ab ab" ~~ / ab <.ws> ab /), "ws matches several spaces"; -ok !("abab" ~~ / ab <.ws> ab /), "ws does not match nothing"; -ok ("ab ab" ~~ rule { ab ab }), "rule gives space"; - done-testing;