Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Commit

Permalink
Add some more built-in regexes (char classes, word boundaries, etc.).
Browse files Browse the repository at this point in the history
  • Loading branch information
pmichaud committed Oct 12, 2009
1 parent 7155d02 commit 90293e5
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 2 deletions.
121 changes: 119 additions & 2 deletions src/Regex/Cursor-builtins.pir
Expand Up @@ -11,7 +11,83 @@ Regex::Cursor-builtins - builtin regexes for Cursor objects

.namespace ['Regex';'Cursor']

.sub 'ident' :method
    # Builtin <ident>: match an identifier starting at the current position.
    # The first character must be '_' or alphabetic; the match then extends
    # over the following run of word-class characters.
    # Returns the cursor obtained from '!cursor_start'; '!cursor_pass' is
    # called on it only when an identifier was found.
    .local pmc cur
    .local int pos, eos
    .local string tgt
    (cur, pos, tgt) = self.'!cursor_start'()
    eos = length tgt
    # '_' is accepted as a leading character in addition to alphabetics.
    $S0 = substr tgt, pos, 1
    if $S0 == '_' goto ident_1
    $I0 = is_cclass .CCLASS_ALPHABETIC, tgt, pos
    unless $I0 goto fail
  ident_1:
    # Move pos to the first character that is not in the word class.
    pos = find_not_cclass .CCLASS_WORD, tgt, pos, eos
    cur.'!cursor_pass'(pos, 'ident')
  fail:
    # On the failure path the cursor is returned without '!cursor_pass'.
    .return (cur)
.end

.sub 'wb' :method
    # Builtin <wb>: zero-width word-boundary assertion.
    # Passes (without moving the position) at the start of the target, at
    # the end of the target, or where exactly one of the characters on
    # either side of the position is a word-class character.
    .local pmc cursor
    .local string target
    .local int here, len, prev, before_is_word, after_is_word
    (cursor, here, target) = self.'!cursor_start'()
    if here == 0 goto at_boundary
    len = length target
    if here == len goto at_boundary
    # Compare the word-ness of the character at the position with that of
    # the character just before it.
    after_is_word = is_cclass .CCLASS_WORD, target, here
    prev = here - 1
    before_is_word = is_cclass .CCLASS_WORD, target, prev
    if before_is_word != after_is_word goto at_boundary
    # Same class on both sides: not a boundary.
    .return (cursor)
  at_boundary:
    cursor.'!cursor_pass'(here, 'wb')
    .return (cursor)
.end

.sub 'ww' :method
    # Builtin <ww>: zero-width "within a word" assertion.
    # Passes (without moving the position) only when the position is
    # strictly inside the target and both the preceding character and the
    # character at the position are word-class characters.
    .local pmc cursor
    .local string target
    .local int here, len, prev, is_word
    (cursor, here, target) = self.'!cursor_start'()
    # There is no character on one side at either end of the target.
    if here == 0 goto done
    len = length target
    if here == len goto done
    prev = here - 1
    is_word = is_cclass .CCLASS_WORD, target, prev
    unless is_word goto done
    is_word = is_cclass .CCLASS_WORD, target, here
    unless is_word goto done
    cursor.'!cursor_pass'(here, 'ww')
  done:
    .return (cursor)
.end

.sub 'ws' :method
    # Builtin <ws>: whitespace rule.
    # - At or beyond the end of the target: passes at the current position.
    # - Between two word-class characters: fails.
    # - Otherwise: skips the run of whitespace-class characters (possibly
    #   empty) and passes at the position just after it.
    .local pmc cursor
    .local string target
    .local int here, len, prev, is_word
    (cursor, here, target) = self.'!cursor_start'()
    len = length target
    if here >= len goto succeed
    # At the very start there is no preceding character to inspect.
    if here == 0 goto skip_spaces
    prev = here - 1
    is_word = is_cclass .CCLASS_WORD, target, prev
    unless is_word goto skip_spaces
    is_word = is_cclass .CCLASS_WORD, target, here
    unless is_word goto skip_spaces
    # Word characters on both sides: no whitespace can match here.
    .return (cursor)
  skip_spaces:
    here = find_not_cclass .CCLASS_WHITESPACE, target, here, len
  succeed:
    cursor.'!cursor_pass'(here, 'ws')
    .return (cursor)
.end

.sub '!cclass' :anon
.param pmc self
.param string name
.param int cclass
.local pmc cur
Expand Down Expand Up @@ -42,9 +118,50 @@ Regex::Cursor-builtins - builtin regexes for Cursor objects
.return (cur)
.end

# Builtin <upper>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'upper', and the .CCLASS_UPPERCASE character class.
.sub 'upper' :method
.tailcall '!cclass'(self, 'upper', .CCLASS_UPPERCASE)
.end

# Builtin <lower>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'lower', and the .CCLASS_LOWERCASE character class.
.sub 'lower' :method
.tailcall '!cclass'(self, 'lower', .CCLASS_LOWERCASE)
.end

# Builtin <digit>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'digit', and the .CCLASS_NUMERIC character class.
# '!cclass' is an :anon sub taking the cursor as an explicit first
# parameter, so it is invoked as a plain sub rather than as a method.
.sub 'digit' :method
    .tailcall '!cclass'(self, 'digit', .CCLASS_NUMERIC)
.end

# Builtin <xdigit>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'xdigit', and the .CCLASS_HEXADECIMAL character class.
.sub 'xdigit' :method
.tailcall '!cclass'(self, 'xdigit', .CCLASS_HEXADECIMAL)
.end

# Builtin <print>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'print', and the .CCLASS_PRINTING character class.
.sub 'print' :method
.tailcall '!cclass'(self, 'print', .CCLASS_PRINTING)
.end

# Builtin <graph>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'graph', and the .CCLASS_GRAPHICAL character class.
.sub 'graph' :method
.tailcall '!cclass'(self, 'graph', .CCLASS_GRAPHICAL)
.end

# Builtin <cntrl>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'cntrl', and the .CCLASS_CONTROL character class.
.sub 'cntrl' :method
.tailcall '!cclass'(self, 'cntrl', .CCLASS_CONTROL)
.end

# Builtin <punct>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'punct', and the .CCLASS_PUNCTUATION character class.
.sub 'punct' :method
.tailcall '!cclass'(self, 'punct', .CCLASS_PUNCTUATION)
.end

# Builtin <alnum>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'alnum', and the .CCLASS_ALPHANUMERIC character class.
.sub 'alnum' :method
.tailcall '!cclass'(self, 'alnum', .CCLASS_ALPHANUMERIC)
.end

# Builtin <space>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'space', and the .CCLASS_WHITESPACE character class.
.sub 'space' :method
.tailcall '!cclass'(self, 'space', .CCLASS_WHITESPACE)
.end

# Builtin <blank>: tail-calls the shared '!cclass' helper with this cursor,
# the rule name 'blank', and the .CCLASS_BLANK character class.
.sub 'blank' :method
.tailcall '!cclass'(self, 'blank', .CCLASS_BLANK)
.end


=head1 AUTHORS

Expand Down
67 changes: 67 additions & 0 deletions t/p6regex/rx_subrules
@@ -0,0 +1,67 @@
## builtin subrules

<ident> 2+3 ab2 /mob<ident>: <ab2 @ 4>/ capturing builtin <ident>
<name> ab::cd::x3::42 /mob<name>: <ab::cd::x3 @ 0>/ capturing builtin <name>

<.ident> 2+3 ab2 y non-capturing builtin <.ident>
<.name> ab::cd::x3::42 y non-capturing builtin <.name>

<?wb>def abc\ndef\n-==\nghi y word boundary \W\w
abc<?wb> abc\ndef\n-==\nghi y word boundary \w\W
<?wb>abc abc\ndef\n-==\nghi y BOS word boundary
ghi<?wb> abc\ndef\n-==\nghi y EOS word boundary
a<?wb> abc\ndef\n-==\nghi n \w\w word boundary
\-<?wb> abc\ndef\n-==\nghi n \W\W word boundary
<!wb>def abc\ndef\n-==\nghi n nonword boundary \W\w
abc<!wb> abc\ndef\n-==\nghi n nonword boundary \w\W
<!wb>abc abc\ndef\n-==\nghi n BOS nonword boundary
ghi<!wb> abc\ndef\n-==\nghi n EOS nonword boundary
a<!wb> abc\ndef\n-==\nghi y \w\w nonword boundary
\-<!wb> abc\ndef\n-==\nghi y \W\W nonword boundary

<upper> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<upper>: <A @ 45>/ <upper>
<+upper> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <A @ 45>/ <+upper>
<+upper>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <ABCDEFGHIJ @ 45>/ <+upper>+
<lower> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<lower>: <a @ 55>/ <lower>
<+lower> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <a @ 55>/ <+lower>
<+lower>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <abcdefghij @ 55>/ <+lower>+
<alpha> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<alpha>: <_ @ 31>/ <alpha>
<+alpha> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <_ @ 31>/ <+alpha>
<+alpha>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <_ @ 31>/ <+alpha>+
<digit> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<digit>: <0 @ 35>/ <digit>
<+digit> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <0 @ 35>/ <+digit>
<+digit>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <0123456789 @ 35>/ <+digit>+
<xdigit> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<xdigit>: <0 @ 35>/ <xdigit>
<+xdigit> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <0 @ 35>/ <+xdigit>
<+xdigit>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <0123456789ABCDEF @ 35>/ <+xdigit>+
<space> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<space>: <\t @ 0>/ <space>
<+space> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <\t @ 0>/ <+space>
<+space>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <\t\n\r @ 0>/ <+space>+
<blank> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<blank>: <\t @ 0>/ <blank>
<+blank> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <\t @ 0>/ <+blank>
<+blank>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <\t @ 0>/ <+blank>+
<cntrl> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<cntrl>: <\t @ 0>/ <cntrl>
<+cntrl> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <\t @ 0>/ <+cntrl>
<+cntrl>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <\t\n\r @ 0>/ <+cntrl>+
<punct> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<punct>: <! @ 4>/ <punct>
<+punct> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <! @ 4>/ <+punct>
<+punct>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <!"#$%&/ <+punct>+
<alnum> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob<alnum>: <0 @ 35>/ <alnum>
<+alnum> \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <0 @ 35>/ <+alnum>
<+alnum>+ \t\n\r !"#$%&'()*+,-./:;<=>?@[\]^`_{|}0123456789ABCDEFGHIJabcdefghij /mob: <0123456789ABCDEFGHIJabcdefghij @ 35>/ <+alnum>+
<+alnum+[_]> ident_1 y union of character classes
<+[ab]+[\-]>+ aaa-bbb y enumerated character classes
<+ [ a b ]+[\-]>+ aaa-bbb y whitespace is ignored within square brackets and after the initial +
<+[ab]+[\-]>+ -ab- y enumerated character classes variant
<+[ab]+[\-]>+ ---- y enumerated character classes variant
<+[ab]+[\-]>+ - y enumerated character classes variant
<-[ab]+[cd]>+ ccdd y enumerated character classes variant
^<-[ab]+[cd]>+$ caad n enumerated character classes variant
<- [ a b ]+[cd]>+ ccdd y whitespace is ignored within square brackets and after the initial -
^<-upper>dent ident_1 y inverted character class
^<-upper>dent Ident_1 n inverted character class
<+alpha-[Jj]>+ abc y character class with no j
<+ alpha - [ Jj ]> abc y character class with no j with ws
^<+alpha-[Jj]>+$ aJc n character class with no j fail

## vim: noexpandtab tabstop=4 shiftwidth=4

0 comments on commit 90293e5

Please sign in to comment.