Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Commit

Permalink
Add support for enumerated character list matching.
Browse files Browse the repository at this point in the history
  • Loading branch information
pmichaud committed Oct 9, 2009
1 parent 7e3fa98 commit bcdd212
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 3 deletions.
29 changes: 29 additions & 0 deletions src/PAST/Compiler-Regex.pir
Expand Up @@ -463,6 +463,35 @@ Handle a concatenation of regexes.
.end


=item enumcharlist(PAST::Regex node)

Generate POST for matching a character from an enumerated
character list.

=cut

.sub 'enumcharlist' :method :multi(_, ['PAST';'Regex'])
    .param pmc node

    # Look up the regex registers/labels by name.  Judging by how they are
    # used below: pos/eos are compared to detect end of input, off is
    # subtracted from pos to index into tgt, and fail is the branch target
    # taken when the match does not succeed.
    .local pmc cur, tgt, pos, off, eos, fail, ops
    (cur, tgt, pos, off, eos, fail) = self.'!rxregs'('cur tgt pos off eos fail')
    ops = self.'post_new'('Ops', 'node'=>node, 'result'=>cur)

    # The node's first child is the string of acceptable characters.
    # Escape it once and use the escaped form everywhere it is written into
    # the generated code.  The raw list must not be interpolated into the
    # comment line below: a list containing a newline would end the comment
    # early and leave the rest of the list in the instruction stream.
    .local string charlist, quoted
    charlist = node[0]
    quoted = self.'escape'(charlist)

    ops.'push_pirop'('inline', quoted, 'inline'=>' # rx enumcharlist %0')
    # Fail if there is no character left to examine (pos >= eos).
    ops.'push_pirop'('ge', pos, eos, fail)
    # Extract the single character at (pos - off) from the target.
    ops.'push_pirop'('sub', '$I10', pos, off)
    ops.'push_pirop'('substr', '$S10', tgt, '$I10', 1)
    # Fail unless that character occurs somewhere in the list
    # (index yields a negative value when it is not found).
    ops.'push_pirop'('index', '$I11', quoted, '$S10')
    ops.'push_pirop'('lt', '$I11', 0, fail)
    # Matched: consume exactly one character.
    ops.'push_pirop'('inc', pos)
    .return (ops)
.end


=item literal(PAST::Regex node)

Generate POST for matching a literal string provided as the
Expand Down
39 changes: 39 additions & 0 deletions src/Regex/P6Regex/Actions.pm
Expand Up @@ -126,6 +126,10 @@ method metachar:sym<bs>($/) {
make $<backslash>.ast;
}

# Action for the angle-bracket assertion metachar: the result is simply
# whatever AST the nested <assertion> produced.
method metachar:sym<assert>($/) {
    my $past := $<assertion>.ast;
    make $past;
}

method backslash:sym<w>($/) {
my $subtype := ~$<sym> eq 'n' ?? 'nl' !! ~$<sym>;
my $past := PAST::Regex.new( :pasttype('charclass'), :subtype($subtype) );
Expand All @@ -136,3 +140,38 @@ method backslash:sym<misc>($/) {
my $past := PAST::Regex.new( ~$/ , :pasttype('literal') );
make $past;
}

# Action for a bracketed character-class assertion.
# Only the AST of the first <cclass_elem> capture is used.
# NOTE(review): any further elements are ignored here -- confirm whether
# combined classes are meant to be supported yet.
method assertion:sym<[>($/) {
    my $past := $<cclass_elem>[0].ast;
    make $past;
}

# Build a PAST::Regex 'enumcharlist' node for one character-class element.
#
# Every <charspec> capture contributes characters to a single string:
#   * when its second sub-capture is present it is treated as a range and
#     every character from the first endpoint through the second (by code
#     point, inclusive) is appended;
#   * otherwise the first sub-capture is appended as-is.
# A range whose first endpoint sorts after its second contributes nothing.
# NOTE(review): $<sign> is not consulted here, so negation is presumably
# handled elsewhere or not yet implemented -- confirm.
method cclass_elem($/) {
    my $chars := '';
    for $<charspec> {
        if $_[1] {
            my $lo := $_[0];
            my $hi := $_[1][0];
            # Expand the range in PIR; the loop tests at the bottom so an
            # empty (reversed) range falls straight through.
            my $span := Q:PIR {
                $P0 = find_lex '$lo'
                $S0 = $P0
                $I0 = ord $S0
                $P1 = find_lex '$hi'
                $S1 = $P1
                $I1 = ord $S1
                $S2 = ''
                goto cclass_range_test
              cclass_range_next:
                $S0 = chr $I0
                $S2 .= $S0
                inc $I0
              cclass_range_test:
                if $I0 <= $I1 goto cclass_range_next
                %r = box $S2
            };
            $chars := $chars ~ $span;
        }
        else { $chars := $chars ~ $_[0]; }
    }
    my $past := PAST::Regex.new( $chars, :pasttype('enumcharlist') );
    make $past;
}
6 changes: 4 additions & 2 deletions src/Regex/P6Regex/Grammar.pm
Expand Up @@ -59,6 +59,7 @@ grammar Regex::P6Regex::Grammar is PCT::Grammar;
# Angle-bracket assertion metachar: a literal '<', one <assertion>, and the
# closing '>'.  If the '>' is missing, <.panic> is invoked with an
# explanatory message instead of letting the token fail.  The trailing {*}
# is the hook that triggers the action method of the same name.
token metachar:sym<assert> {
'<' <assertion>
[ '>' || <.panic: "regex assertion not terminated by angle bracket"> ]
{*}
}

# proto token backslash { <...> }
Expand Down Expand Up @@ -88,8 +89,8 @@ grammar Regex::P6Regex::Grammar is PCT::Grammar;
{*}
}

token assertion:sym<[> { <?before '['> <cclass_elem>+ }
token assertion:sym<+> { <?before '+'|'-'> <cclass_elem>+ }
# Character-class assertions.  Each token only looks ahead at its leading
# character with <?before ...> (zero-width), then parses one or more
# <cclass_elem>; the leading '[' / '+' / '-' is presumably consumed by
# <cclass_elem> itself.  {*} triggers the matching action method.
token assertion:sym<[> { <?before '['> <cclass_elem>+ {*} }
token assertion:sym<+> { <?before '+'|'-'> <cclass_elem>+ {*} }

token cclass_elem {
$<sign>=['+'|'-'|<?>]
Expand All @@ -101,5 +102,6 @@ grammar Regex::P6Regex::Grammar is PCT::Grammar;
']'
| $<name>=[\w+]
]
{*}
}

2 changes: 1 addition & 1 deletion t/p6regex/01-regex.t
Expand Up @@ -75,7 +75,7 @@ Description of the test.
push test_files, 'rx_metachars'
push test_files, 'rx_quantifiers'
# push test_files, 'rx_backtrack'
# push test_files, 'rx_charclass'
push test_files, 'rx_charclass'
# push test_files, 'rx_subrules'
# push test_files, 'rx_lookarounds'
# push test_files, 'rx_captures'
Expand Down
82 changes: 82 additions & 0 deletions t/p6regex/rx_charclass
@@ -0,0 +1,82 @@
## Enumerated character lists
<[c]> abcdef y character class
# todo :pugs<feature>
<[ z ]> abc def n character class ignores ws
# todo :pugs<feature>
<[dcb]>**{3} abcdef y repeated character class
^<[a]> abcdef y anchored character class
<-[e]> abcdef y negated character class
^<[a]>? abcdef y anchored optional character class
<-[e]>? abcdef y negated optional character class
<-[dcb]>**{3} abcdef n repeated negated character class
^<-[e]> abcdef y anchored negated character class
^<-[a]> abcdef n anchored negated character class
<[b..d]> abcdef y character range
# todo :pugs<feature>
<[b .. d]> c y character range ignores ws
<[b..d]> abxxef y character range
<[b..d]> axcxef y character range
<[b..d]> axxdef y character range
<[b..d]> axxxef n character range
<-[b..d]> abcdef y negated character range
# todo :pugs<feature>
<- [b..d]> abcdef y negated allows ws
<-[b..d]> bbccdd n negated character range
# todo :pge<reversed character range>
<-[d..b]> bbccdd /parse error/ illegal character range
<[-]> ab-def /parse error/ unescaped hyphen
<[\-]> ab-def y escaped hyphen
<[\-]> abcdef n escaped hyphen
<-[\-]> ---x-- y negated escaped hyphen
<-[\-]> ------ n negated escaped hyphen
<[\-+]> ab-def y escaped hyphen in range
<[\-+]> ab+def y escaped hyphen in range
<[\-+]> abcdef n escaped hyphen in range
<[+\-]> ab-def y escaped hyphen in range
<[+\-]> ab+def y escaped hyphen in range
<[+\-]> abcdef n escaped hyphen in range
<-[\-+]> ---x-- y negated escaped hyphen in range
<-[\-+]> ------ n negated escaped hyphen in range
<-[+\-]> ---x-- y negated escaped hyphen in range
<-[+\-]> ------ n negated escaped hyphen in range
<["\\]> \\ y escaped backslash
<[\]]> ] y escaped close bracket
<[\]> \\]] /parse error/ unescaped backslash (or no closing brace)
^\><[<]> >< y lt character class
^<[>]>\< >< y gt character class
# todo :pugs<feature>
^<[><]>**{2} >< y gt, lt character class
# todo :pugs<feature>
^<[<>]>**{2} >< y lt, gt character class
^<-[><]> >< n not gt, lt character class
^<-[<>]> >< n not lt, gt character class
'... --- ...' ... --- ... y literal match (\')
'... --- ...' ...---... n literal match (\')
# todo :pugs<feature>
'ab\'>cd' ab'>cd y literal match with quote
'ab\\yz' ab\x5cyz y literal match with backslash
'ab"cd' ab"cd y literal match with quote
# todo :pugs<feature>
'ab\\yz' ab\x5cyz y literal match with backslash
# todo :pugs<feature> :pge<feature>
"... --- ..." ... --- ... y literal match (\")
# todo :pugs<feature> :pge<feature>
"... --- ..." ...---... n literal match (\")
# todo :pugs<feature> :pge<feature>
"ab<\">cd" ab<">cd y literal match with quote
# todo :pugs<feature> :pge<feature>
"ab<'>cd" ab<'>cd y literal match with quote
# todo :pugs<feature> :pge<feature>
"ab\\cd" ab\x5ccd y literal match with backslash
# todo :pugs<feature> :pge<feature>
(ab)x"$0" abxab y literal match with interpolation
# todo :pugs<feature> :pge<feature>
(ab)"x$0" abxab y literal match with interpolation
'?' ab<? y literal match with question mark
'<' ab<? y literal match with lt
'<?' ab<? y literal match with lt and question mark
'<?' ab<x? n non-matching literal match
<[A..Z0..9]> abcdef n two enumerated ranges
<[A..Z0..9]> abcDef y two enumerated ranges

## vim: noexpandtab tabstop=4 shiftwidth=4

0 comments on commit bcdd212

Please sign in to comment.