Permalink
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
528 lines (481 sloc) 15.4 KB
package rure
// #cgo LDFLAGS: -lrure
//
// #include <stdio.h>
// #include <stdlib.h>
// #include "rure.h"
//
// /*
// * rure_iter_collect exhausts the given iterator over the given haystack,
// * and reports all successive non-overlapping match locations in *matches.
// * *num_matches is set to the number of matches found.
// *
// * *matches contains `*num_matches * 2` offsets, where `2 * i` and
// * `2 * i + 1` represent the start and end byte offsets of match `i`.
// */
// void rure_iter_collect(rure_iter *it,
// const uint8_t *haystack, size_t length,
// size_t **matches, size_t *num_matches)
// {
// rure_match m = {0};
// size_t len = 0;
// size_t cap = 64;
// *matches = malloc(cap * sizeof(size_t));
// if (NULL == *matches) {
// fprintf(stderr, "rure_iter_collect: out of memory, aborting\n");
// abort();
// }
// while (rure_iter_next(it, haystack, length, &m)) {
// if ((len * 2 + 1) >= cap) {
// cap *= 2;
// *matches = realloc(*matches, cap * sizeof(size_t));
// if (NULL == *matches) {
// fprintf(
// stderr, "rure_iter_collect: out of memory, aborting\n");
// abort();
// }
// }
// (*matches)[len * 2 + 0] = m.start;
// (*matches)[len * 2 + 1] = m.end;
// len++;
// }
// *num_matches = len;
// }
import "C"
import (
"fmt"
"reflect"
"runtime"
"unsafe"
)
// Flags for modifying regex behavior. All flags can be modified in the
// expression itself using standard syntax. e.g., `(?i)` enables case
// insensitivity and `(?-i)` disables it.
const (
// FlagCaseI is the case insensitive (i) flag.
FlagCaseI = C.RURE_FLAG_CASEI
// FlagMulti is the multi-line matching (m) flag.
// (^ and $ match new line boundaries.)
FlagMulti = C.RURE_FLAG_MULTI
// FlagDotNL is the any character (s) flag. (. matches new line.)
FlagDotNL = C.RURE_FLAG_DOTNL
// FlagSwapGreed is the greedy swap (U) flag.
// (e.g., + is ungreedy and +? is greedy.)
FlagSwapGreed = C.RURE_FLAG_SWAP_GREED
// FlagSpace is the ignore whitespace (x) flag.
FlagSpace = C.RURE_FLAG_SPACE
// FlagUnicode is the Unicode (u) flag.
FlagUnicode = C.RURE_FLAG_UNICODE
// FlagDefault is used when calling MustCompile or Compile.
FlagDefault = FlagUnicode
)
// Regex is a compiled regular expression.
//
// It can be used safely from multiple goroutines simultaneously.
type Regex struct {
pattern string
p *C.rure
}
// Options represents non-flag compile time options.
//
// For example, calling SetSizeLimit will place an upper bound on how big
// the compiled regular expression can be.
type Options struct {
p *C.rure_options
}
// Captures represents start and end locations for every matching capture group
// in a regular expression match.
//
// It is not safe to use from multiple goroutines simultaneously.
type Captures struct {
re *Regex
p *C.rure_captures
ok bool
}
// Iter is an iterator over successive non-overlapping matches in a haystack.
//
// It is not safe to use from multiple goroutines simultaneously.
type Iter struct {
re *Regex
p *C.rure_iter
haystack []byte
match C.rure_match
}
// Error is an error that caused compilation of a regular expression to fail.
//
// Most errors are syntax errors, but an error can be returned if the compiled
// regular expression would be too big.
type Error struct {
p *C.rure_error
}
// MustCompile is like Compile, but if there was a problem compiling the
// pattern, then it will panic.
func MustCompile(pattern string) *Regex {
re, err := Compile(pattern)
if err != nil {
panic(fmt.Sprintf("regex.MustCompile failed on %s: %s", pattern, err))
}
return re
}
// Compile is like CompileOptions, but uses default flags (Unicode enabled)
// and default size limits.
//
// If there was a problem compiling the pattern, then no Regex is returned and
// a non-nil error is returned.
func Compile(pattern string) (*Regex, error) {
return CompileOptions(pattern, FlagDefault, nil)
}
// CompileOptions compiles a pattern (in UTF-8) to a regular expression
// suitable for searching text.
//
// Flags is a bitfield of the Flag constants in this package. A value of `0`
// disables all flags.
//
// Options is a set non-flag configuration settings for the compiled regular
// expression. When set to nil, default settings are used.
//
// If there was a problem compiling the pattern (including if it is not valid
// UTF-8), then an error is returned.
//
// N.B. When disabling Unicode support (either via an explicit flag here or in
// the pattern), one can use escape sequences to match bytes that aren't valid
// UTF-8. For example, use `\\xFF` instead of `\xFF`.
func CompileOptions(
pattern string,
flags uint32,
options *Options,
) (*Regex, error) {
re := &Regex{pattern: pattern}
runtime.SetFinalizer(re, func(re *Regex) {
if re.p != nil {
C.rure_free(re.p)
re.p = nil
}
})
var optp *C.rure_options
if options != nil {
optp = options.p
}
err := newError()
re.p = C.rure_compile(
asUint8Ptr(noCopyBytes(pattern)),
C.size_t(len(pattern)),
FlagDefault,
optp,
err.p,
)
if re.p == nil {
return nil, err
}
return re, nil
}
func (re *Regex) String() string {
return re.pattern
}
// IsMatch returns true if text matches re.
func (re *Regex) IsMatch(text string) bool {
return re.IsMatchBytesAt(noCopyBytes(text), 0)
}
// IsMatchBytes returns true if text matches re.
func (re *Regex) IsMatchBytes(text []byte) bool {
return re.IsMatchBytesAt(text, 0)
}
// IsMatchAt returns true if text matches re starting at index i.
func (re *Regex) IsMatchAt(text string, i int) bool {
return re.IsMatchBytesAt(noCopyBytes(text), i)
}
// IsMatchBytesAt returns true if text matches re starting at index i.
func (re *Regex) IsMatchBytesAt(text []byte, i int) bool {
return bool(C.rure_is_match(
re.p, asUint8Ptr(text), C.size_t(len(text)), C.size_t(i)))
}
// ShortestMatch returns the end location of a match in text if it exists. This
// may return an end location that occurs before the end of the proper
// leftmost-first match.
//
// If no match exists, false is returned.
//
// For example, matching `a+` against `aaaaa` will return `1` (while `Find`
// will report `5` as the end).
func (re *Regex) ShortestMatch(text string) (end int, ok bool) {
return re.ShortestMatchBytes(noCopyBytes(text))
}
// ShortestMatchBytes returns the end location of a match in text if it exists.
// This may return an end location that occurs before the end of the proper
// leftmost-first match.
//
// If no match exists, false is returned.
//
// For example, matching `a+` against `aaaaa` will return `1` (while `Find`
// will report `5` as the end).
func (re *Regex) ShortestMatchBytes(text []byte) (end int, ok bool) {
var cend C.size_t
ok = bool(C.rure_shortest_match(
re.p, asUint8Ptr(text), C.size_t(len(text)), 0, &cend))
end = int(cend)
return
}
// Find returns the start and end location of the leftmost-first match in text
// if it exists.
//
// If no match exists, false is returned.
func (re *Regex) Find(text string) (start, end int, ok bool) {
return re.FindBytes(noCopyBytes(text))
}
// FindBytes returns the start and end location of the leftmost-first match in
// text if it exists.
//
// If no match exists, false is returned.
func (re *Regex) FindBytes(text []byte) (start, end int, ok bool) {
match := C.rure_match{}
ok = bool(C.rure_find(
re.p, asUint8Ptr(text), C.size_t(len(text)), 0, &match))
if ok {
start, end = int(match.start), int(match.end)
}
return
}
// FindAll returns all successive non-overlapping matches of re in text.
//
// The slice returned contains a pair of start and end offsets for each match
// found. The start and end offset for match i is indexed by i*2 and i*2+1,
// respectively.
//
// This may be faster than using Iter since the slice of matches is built in
// C code.
func (re *Regex) FindAll(text string) []int {
return re.FindAllBytes(noCopyBytes(text))
}
// FindAllBytes returns all successive non-overlapping matches of re in text.
//
// The slice returned contains a pair of start and end offsets for each match
// found. The start and end offset for match i is indexed by i*2 and i*2+1,
// respectively.
//
// This may be faster than using Iter since the slice of matches is built in
// C code.
func (re *Regex) FindAllBytes(text []byte) []int {
it := C.rure_iter_new(re.p)
defer C.rure_iter_free(it)
haystack := asUint8Ptr(text)
len := C.size_t(len(text))
nmatches := C.size_t(0)
matches := (*C.size_t)(nil)
defer func() {
if matches != nil {
C.free(unsafe.Pointer(matches))
}
}()
C.rure_iter_collect(it, haystack, len, &matches, &nmatches)
if nmatches == 0 {
return nil
}
// Copy the matches from C memory to Go memory.
matchesInts := make([]int, 2*nmatches)
p := uintptr(unsafe.Pointer(matches))
stride := unsafe.Sizeof(C.size_t(0))
for i := uintptr(0); i < uintptr(nmatches); i++ {
base := i * 2
start := unsafe.Pointer(p + (base * stride))
end := unsafe.Pointer(p + ((base + 1) * stride))
matchesInts[base] = int(*(*C.size_t)(start))
matchesInts[base+1] = int(*(*C.size_t)(end))
}
return matchesInts
}
// NewCaptures allocates room for storing the start and end offset of each
// capturing group in re.
//
// Captures may be reused in subsequent calls. When it is reused, its internal
// state is reset.
//
// Captures may not be used from multiple threads simultaneously.
func (re *Regex) NewCaptures() *Captures {
caps := &Captures{re: re, p: C.rure_captures_new(re.p)}
runtime.SetFinalizer(caps, func(caps *Captures) {
if caps.p != nil {
C.rure_captures_free(caps.p)
caps.p = nil
}
})
return caps
}
// Captures populates caps with the start and end locations of all matching
// capturing groups in re for text.
//
// If no match is found, then false is returned.
//
// A caps value can be built with re.NewCaptures().
//
// caps must not be nil.
func (re *Regex) Captures(caps *Captures, text string) bool {
return re.CapturesBytes(caps, noCopyBytes(text))
}
// CapturesBytes populates caps with the start and end locations of all
// matching capturing groups in re for text.
//
// If no match is found, then false is returned.
//
// A caps value can be built with re.NewCaptures().
//
// caps must not be nil.
func (re *Regex) CapturesBytes(caps *Captures, text []byte) bool {
caps.ok = bool(C.rure_find_captures(
re.p, asUint8Ptr(text), C.size_t(len(text)), 0, caps.p))
return caps.ok
}
// Iter returns an iterator over successive non-overlapping matches of re
// in text.
//
// Next must be called on the iterator before accessing match information.
func (re *Regex) Iter(text string) *Iter {
return re.IterBytes(noCopyBytes(text))
}
// IterBytes returns an iterator over successive non-overlapping matches of re
// in text.
//
// Next must be called on the iterator before accessing match information.
func (re *Regex) IterBytes(text []byte) *Iter {
return newIter(re, text)
}
// NewOptions returns a fresh options value for configuring non-flag options
// of a regex.
//
// Options can be passed to multiple calls to CompileOptions.
//
// Options is not safe to mutate from multiple goroutines simultaneously, but
// it may be used in calls to CompileOptions from multiple goroutines
// simultaneously.
func NewOptions() *Options {
opts := &Options{C.rure_options_new()}
runtime.SetFinalizer(opts, func(opts *Options) {
if opts.p != nil {
C.rure_options_free(opts.p)
opts.p = nil
}
})
return opts
}
// SetSizeLimit sets the approximate size limit (in bytes) of the compiled
// regular expression.
//
// If a pattern would result in a compiled program large than this size, then
// compilation will return an error.
func (opts *Options) SetSizeLimit(limit int) {
C.rure_options_size_limit(opts.p, C.size_t(limit))
}
// SetDFASizeLimit sets the approximate size limit (in bytes) of the DFA's
// cache size. It is a per-thread limit, so that a Regex used in multiple
// threads simultaneously may use up to this many bytes per-thread of usage.
//
// 0 is a legal value.
func (opts *Options) SetDFASizeLimit(limit int) {
C.rure_options_dfa_size_limit(opts.p, C.size_t(limit))
}
// IsMatch returns true if caps corresponds to a match in a regular expression.
func (caps *Captures) IsMatch() bool {
return caps.ok
}
// Group returns the start and end offsets for the capturing group indexed by
// i. Capturing groups are indexed by the appearance of their opening
// parenthesis in the pattern.
//
// If the capturing group was not part of the match, then this returns false.
//
// Note that capture group 0 always corresponds to the full match of the
// regular expression and is always unnamed.
func (caps *Captures) Group(i int) (start, end int, ok bool) {
match := C.rure_match{}
ok = bool(C.rure_captures_at(caps.p, C.size_t(i), &match))
if ok {
start, end = int(match.start), int(match.end)
}
return
}
// GroupName is like Group, but uses the name of a capturing group instead of
// its index. Named capture groups look like (?P<foo>re) in the pattern.
//
// If no such named capture group exists or if it wasn't part of the match
// of the regular expression, GroupName returns false.
func (caps *Captures) GroupName(name string) (start, end int, ok bool) {
i := C.rure_capture_name_index(caps.re.p, C.CString(name))
if i == -1 {
return
}
return caps.Group(int(i))
}
// Len returns the number of capturing groups.
//
// Once caps is created, this never changes.
func (caps *Captures) Len() int {
return int(C.rure_captures_len(caps.p))
}
func newIter(re *Regex, haystack []byte) *Iter {
it := &Iter{
re: re,
haystack: haystack,
p: C.rure_iter_new(re.p),
}
runtime.SetFinalizer(it, func(it *Iter) {
if it.p != nil {
C.rure_iter_free(it.p)
}
})
return it
}
// Next advances the iterator. If it finds a match, it returns true, and
// otherwise returns false. Once it returns false, it will always return false.
//
// This must be called before a call to Match.
//
// If caps is nil, then capture information is not extracted. If caps is not
// nil, then the start and end offsets of each matching capturing group are
// stored in caps.
func (it *Iter) Next(caps *Captures) bool {
haystack := asUint8Ptr(it.haystack)
len := C.size_t(len(it.haystack))
if caps == nil {
return bool(C.rure_iter_next(it.p, haystack, len, &it.match))
}
caps.ok = bool(C.rure_iter_next_captures(it.p, haystack, len, caps.p))
C.rure_captures_at(caps.p, 0, &it.match)
return caps.ok
}
// Match returns the start and end offsets of the current match in the
// iteraotr.
func (it *Iter) Match() (start, end int) {
return int(it.match.start), int(it.match.end)
}
func newError() *Error {
err := &Error{C.rure_error_new()}
runtime.SetFinalizer(err, func(err *Error) {
if err.p != nil {
C.rure_error_free(err.p)
err.p = nil
}
})
return err
}
func (err *Error) Error() string {
return C.GoString(C.rure_error_message(err.p))
}
// Converts a string to a []byte without allocating.
//
// This is very dangerous and must be handled with care. In particular, the
// input given must be kept alive for the duration of the return value.
func noCopyBytes(s string) []byte {
sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
bh := reflect.SliceHeader{
Data: sh.Data,
Len: sh.Len,
Cap: sh.Len,
}
return *(*[]byte)(unsafe.Pointer(&bh))
}
// Converts a byte slice to a *C.uint8_t.
//
// This works even for empty slices.
func asUint8Ptr(bs []byte) *C.uint8_t {
bh := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
return (*C.uint8_t)(unsafe.Pointer(bh.Data))
}