Permalink
Cannot retrieve contributors at this time
Join GitHub today
GitHub is home to over 28 million developers working together to host and review code, manage projects, and build software together.
Sign up
Fetching contributors…
| package rure | |
| // #cgo LDFLAGS: -lrure | |
| // | |
| // #include <stdio.h> | |
| // #include <stdlib.h> | |
| // #include "rure.h" | |
| // | |
| // /* | |
| // * rure_iter_collect exhausts the given iterator over the given haystack, | |
| // * and reports all successive non-overlapping match locations in *matches. | |
| // * *num_matches is set to the number of matches found. | |
| // * | |
| // * *matches contains `*num_matches * 2` offsets, where `2 * i` and | |
| // * `2 * i + 1` represent the start and end byte offsets of match `i`. | |
| // */ | |
| // void rure_iter_collect(rure_iter *it, | |
| // const uint8_t *haystack, size_t length, | |
| // size_t **matches, size_t *num_matches) | |
| // { | |
| // rure_match m = {0}; | |
| // size_t len = 0; | |
| // size_t cap = 64; | |
| // *matches = malloc(cap * sizeof(size_t)); | |
| // if (NULL == *matches) { | |
| // fprintf(stderr, "rure_iter_collect: out of memory, aborting\n"); | |
| // abort(); | |
| // } | |
| // while (rure_iter_next(it, haystack, length, &m)) { | |
| // if ((len * 2 + 1) >= cap) { | |
| // cap *= 2; | |
| // *matches = realloc(*matches, cap * sizeof(size_t)); | |
| // if (NULL == *matches) { | |
| // fprintf( | |
| // stderr, "rure_iter_collect: out of memory, aborting\n"); | |
| // abort(); | |
| // } | |
| // } | |
| // (*matches)[len * 2 + 0] = m.start; | |
| // (*matches)[len * 2 + 1] = m.end; | |
| // len++; | |
| // } | |
| // *num_matches = len; | |
| // } | |
| import "C" | |
| import ( | |
| "fmt" | |
| "reflect" | |
| "runtime" | |
| "unsafe" | |
| ) | |
| // Flags for modifying regex behavior. All flags can be modified in the | |
| // expression itself using standard syntax. e.g., `(?i)` enables case | |
| // insensitivity and `(?-i)` disables it. | |
| const ( | |
| // FlagCaseI is the case insensitive (i) flag. | |
| FlagCaseI = C.RURE_FLAG_CASEI | |
| // FlagMulti is the multi-line matching (m) flag. | |
| // (^ and $ match new line boundaries.) | |
| FlagMulti = C.RURE_FLAG_MULTI | |
| // FlagDotNL is the any character (s) flag. (. matches new line.) | |
| FlagDotNL = C.RURE_FLAG_DOTNL | |
| // FlagSwapGreed is the greedy swap (U) flag. | |
| // (e.g., + is ungreedy and +? is greedy.) | |
| FlagSwapGreed = C.RURE_FLAG_SWAP_GREED | |
| // FlagSpace is the ignore whitespace (x) flag. | |
| FlagSpace = C.RURE_FLAG_SPACE | |
| // FlagUnicode is the Unicode (u) flag. | |
| FlagUnicode = C.RURE_FLAG_UNICODE | |
| // FlagDefault is used when calling MustCompile or Compile. | |
| FlagDefault = FlagUnicode | |
| ) | |
| // Regex is a compiled regular expression. | |
| // | |
| // It can be used safely from multiple goroutines simultaneously. | |
| type Regex struct { | |
| pattern string | |
| p *C.rure | |
| } | |
| // Options represents non-flag compile time options. | |
| // | |
| // For example, calling SetSizeLimit will place an upper bound on how big | |
| // the compiled regular expression can be. | |
| type Options struct { | |
| p *C.rure_options | |
| } | |
| // Captures represents start and end locations for every matching capture group | |
| // in a regular expression match. | |
| // | |
| // It is not safe to use from multiple goroutines simultaneously. | |
| type Captures struct { | |
| re *Regex | |
| p *C.rure_captures | |
| ok bool | |
| } | |
| // Iter is an iterator over successive non-overlapping matches in a haystack. | |
| // | |
| // It is not safe to use from multiple goroutines simultaneously. | |
| type Iter struct { | |
| re *Regex | |
| p *C.rure_iter | |
| haystack []byte | |
| match C.rure_match | |
| } | |
| // Error is an error that caused compilation of a regular expression to fail. | |
| // | |
| // Most errors are syntax errors, but an error can be returned if the compiled | |
| // regular expression would be too big. | |
| type Error struct { | |
| p *C.rure_error | |
| } | |
| // MustCompile is like Compile, but if there was a problem compiling the | |
| // pattern, then it will panic. | |
| func MustCompile(pattern string) *Regex { | |
| re, err := Compile(pattern) | |
| if err != nil { | |
| panic(fmt.Sprintf("regex.MustCompile failed on %s: %s", pattern, err)) | |
| } | |
| return re | |
| } | |
| // Compile is like CompileOptions, but uses default flags (Unicode enabled) | |
| // and default size limits. | |
| // | |
| // If there was a problem compiling the pattern, then no Regex is returned and | |
| // a non-nil error is returned. | |
| func Compile(pattern string) (*Regex, error) { | |
| return CompileOptions(pattern, FlagDefault, nil) | |
| } | |
| // CompileOptions compiles a pattern (in UTF-8) to a regular expression | |
| // suitable for searching text. | |
| // | |
| // Flags is a bitfield of the Flag constants in this package. A value of `0` | |
| // disables all flags. | |
| // | |
| // Options is a set non-flag configuration settings for the compiled regular | |
| // expression. When set to nil, default settings are used. | |
| // | |
| // If there was a problem compiling the pattern (including if it is not valid | |
| // UTF-8), then an error is returned. | |
| // | |
| // N.B. When disabling Unicode support (either via an explicit flag here or in | |
| // the pattern), one can use escape sequences to match bytes that aren't valid | |
| // UTF-8. For example, use `\\xFF` instead of `\xFF`. | |
| func CompileOptions( | |
| pattern string, | |
| flags uint32, | |
| options *Options, | |
| ) (*Regex, error) { | |
| re := &Regex{pattern: pattern} | |
| runtime.SetFinalizer(re, func(re *Regex) { | |
| if re.p != nil { | |
| C.rure_free(re.p) | |
| re.p = nil | |
| } | |
| }) | |
| var optp *C.rure_options | |
| if options != nil { | |
| optp = options.p | |
| } | |
| err := newError() | |
| re.p = C.rure_compile( | |
| asUint8Ptr(noCopyBytes(pattern)), | |
| C.size_t(len(pattern)), | |
| FlagDefault, | |
| optp, | |
| err.p, | |
| ) | |
| if re.p == nil { | |
| return nil, err | |
| } | |
| return re, nil | |
| } | |
| func (re *Regex) String() string { | |
| return re.pattern | |
| } | |
| // IsMatch returns true if text matches re. | |
| func (re *Regex) IsMatch(text string) bool { | |
| return re.IsMatchBytesAt(noCopyBytes(text), 0) | |
| } | |
| // IsMatchBytes returns true if text matches re. | |
| func (re *Regex) IsMatchBytes(text []byte) bool { | |
| return re.IsMatchBytesAt(text, 0) | |
| } | |
| // IsMatchAt returns true if text matches re starting at index i. | |
| func (re *Regex) IsMatchAt(text string, i int) bool { | |
| return re.IsMatchBytesAt(noCopyBytes(text), i) | |
| } | |
| // IsMatchBytesAt returns true if text matches re starting at index i. | |
| func (re *Regex) IsMatchBytesAt(text []byte, i int) bool { | |
| return bool(C.rure_is_match( | |
| re.p, asUint8Ptr(text), C.size_t(len(text)), C.size_t(i))) | |
| } | |
| // ShortestMatch returns the end location of a match in text if it exists. This | |
| // may return an end location that occurs before the end of the proper | |
| // leftmost-first match. | |
| // | |
| // If no match exists, false is returned. | |
| // | |
| // For example, matching `a+` against `aaaaa` will return `1` (while `Find` | |
| // will report `5` as the end). | |
| func (re *Regex) ShortestMatch(text string) (end int, ok bool) { | |
| return re.ShortestMatchBytes(noCopyBytes(text)) | |
| } | |
| // ShortestMatchBytes returns the end location of a match in text if it exists. | |
| // This may return an end location that occurs before the end of the proper | |
| // leftmost-first match. | |
| // | |
| // If no match exists, false is returned. | |
| // | |
| // For example, matching `a+` against `aaaaa` will return `1` (while `Find` | |
| // will report `5` as the end). | |
| func (re *Regex) ShortestMatchBytes(text []byte) (end int, ok bool) { | |
| var cend C.size_t | |
| ok = bool(C.rure_shortest_match( | |
| re.p, asUint8Ptr(text), C.size_t(len(text)), 0, &cend)) | |
| end = int(cend) | |
| return | |
| } | |
| // Find returns the start and end location of the leftmost-first match in text | |
| // if it exists. | |
| // | |
| // If no match exists, false is returned. | |
| func (re *Regex) Find(text string) (start, end int, ok bool) { | |
| return re.FindBytes(noCopyBytes(text)) | |
| } | |
| // FindBytes returns the start and end location of the leftmost-first match in | |
| // text if it exists. | |
| // | |
| // If no match exists, false is returned. | |
| func (re *Regex) FindBytes(text []byte) (start, end int, ok bool) { | |
| match := C.rure_match{} | |
| ok = bool(C.rure_find( | |
| re.p, asUint8Ptr(text), C.size_t(len(text)), 0, &match)) | |
| if ok { | |
| start, end = int(match.start), int(match.end) | |
| } | |
| return | |
| } | |
| // FindAll returns all successive non-overlapping matches of re in text. | |
| // | |
| // The slice returned contains a pair of start and end offsets for each match | |
| // found. The start and end offset for match i is indexed by i*2 and i*2+1, | |
| // respectively. | |
| // | |
| // This may be faster than using Iter since the slice of matches is built in | |
| // C code. | |
| func (re *Regex) FindAll(text string) []int { | |
| return re.FindAllBytes(noCopyBytes(text)) | |
| } | |
| // FindAllBytes returns all successive non-overlapping matches of re in text. | |
| // | |
| // The slice returned contains a pair of start and end offsets for each match | |
| // found. The start and end offset for match i is indexed by i*2 and i*2+1, | |
| // respectively. | |
| // | |
| // This may be faster than using Iter since the slice of matches is built in | |
| // C code. | |
| func (re *Regex) FindAllBytes(text []byte) []int { | |
| it := C.rure_iter_new(re.p) | |
| defer C.rure_iter_free(it) | |
| haystack := asUint8Ptr(text) | |
| len := C.size_t(len(text)) | |
| nmatches := C.size_t(0) | |
| matches := (*C.size_t)(nil) | |
| defer func() { | |
| if matches != nil { | |
| C.free(unsafe.Pointer(matches)) | |
| } | |
| }() | |
| C.rure_iter_collect(it, haystack, len, &matches, &nmatches) | |
| if nmatches == 0 { | |
| return nil | |
| } | |
| // Copy the matches from C memory to Go memory. | |
| matchesInts := make([]int, 2*nmatches) | |
| p := uintptr(unsafe.Pointer(matches)) | |
| stride := unsafe.Sizeof(C.size_t(0)) | |
| for i := uintptr(0); i < uintptr(nmatches); i++ { | |
| base := i * 2 | |
| start := unsafe.Pointer(p + (base * stride)) | |
| end := unsafe.Pointer(p + ((base + 1) * stride)) | |
| matchesInts[base] = int(*(*C.size_t)(start)) | |
| matchesInts[base+1] = int(*(*C.size_t)(end)) | |
| } | |
| return matchesInts | |
| } | |
| // NewCaptures allocates room for storing the start and end offset of each | |
| // capturing group in re. | |
| // | |
| // Captures may be reused in subsequent calls. When it is reused, its internal | |
| // state is reset. | |
| // | |
| // Captures may not be used from multiple threads simultaneously. | |
| func (re *Regex) NewCaptures() *Captures { | |
| caps := &Captures{re: re, p: C.rure_captures_new(re.p)} | |
| runtime.SetFinalizer(caps, func(caps *Captures) { | |
| if caps.p != nil { | |
| C.rure_captures_free(caps.p) | |
| caps.p = nil | |
| } | |
| }) | |
| return caps | |
| } | |
| // Captures populates caps with the start and end locations of all matching | |
| // capturing groups in re for text. | |
| // | |
| // If no match is found, then false is returned. | |
| // | |
| // A caps value can be built with re.NewCaptures(). | |
| // | |
| // caps must not be nil. | |
| func (re *Regex) Captures(caps *Captures, text string) bool { | |
| return re.CapturesBytes(caps, noCopyBytes(text)) | |
| } | |
| // CapturesBytes populates caps with the start and end locations of all | |
| // matching capturing groups in re for text. | |
| // | |
| // If no match is found, then false is returned. | |
| // | |
| // A caps value can be built with re.NewCaptures(). | |
| // | |
| // caps must not be nil. | |
| func (re *Regex) CapturesBytes(caps *Captures, text []byte) bool { | |
| caps.ok = bool(C.rure_find_captures( | |
| re.p, asUint8Ptr(text), C.size_t(len(text)), 0, caps.p)) | |
| return caps.ok | |
| } | |
| // Iter returns an iterator over successive non-overlapping matches of re | |
| // in text. | |
| // | |
| // Next must be called on the iterator before accessing match information. | |
| func (re *Regex) Iter(text string) *Iter { | |
| return re.IterBytes(noCopyBytes(text)) | |
| } | |
| // IterBytes returns an iterator over successive non-overlapping matches of re | |
| // in text. | |
| // | |
| // Next must be called on the iterator before accessing match information. | |
| func (re *Regex) IterBytes(text []byte) *Iter { | |
| return newIter(re, text) | |
| } | |
| // NewOptions returns a fresh options value for configuring non-flag options | |
| // of a regex. | |
| // | |
| // Options can be passed to multiple calls to CompileOptions. | |
| // | |
| // Options is not safe to mutate from multiple goroutines simultaneously, but | |
| // it may be used in calls to CompileOptions from multiple goroutines | |
| // simultaneously. | |
| func NewOptions() *Options { | |
| opts := &Options{C.rure_options_new()} | |
| runtime.SetFinalizer(opts, func(opts *Options) { | |
| if opts.p != nil { | |
| C.rure_options_free(opts.p) | |
| opts.p = nil | |
| } | |
| }) | |
| return opts | |
| } | |
| // SetSizeLimit sets the approximate size limit (in bytes) of the compiled | |
| // regular expression. | |
| // | |
| // If a pattern would result in a compiled program large than this size, then | |
| // compilation will return an error. | |
| func (opts *Options) SetSizeLimit(limit int) { | |
| C.rure_options_size_limit(opts.p, C.size_t(limit)) | |
| } | |
| // SetDFASizeLimit sets the approximate size limit (in bytes) of the DFA's | |
| // cache size. It is a per-thread limit, so that a Regex used in multiple | |
| // threads simultaneously may use up to this many bytes per-thread of usage. | |
| // | |
| // 0 is a legal value. | |
| func (opts *Options) SetDFASizeLimit(limit int) { | |
| C.rure_options_dfa_size_limit(opts.p, C.size_t(limit)) | |
| } | |
| // IsMatch returns true if caps corresponds to a match in a regular expression. | |
| func (caps *Captures) IsMatch() bool { | |
| return caps.ok | |
| } | |
| // Group returns the start and end offsets for the capturing group indexed by | |
| // i. Capturing groups are indexed by the appearance of their opening | |
| // parenthesis in the pattern. | |
| // | |
| // If the capturing group was not part of the match, then this returns false. | |
| // | |
| // Note that capture group 0 always corresponds to the full match of the | |
| // regular expression and is always unnamed. | |
| func (caps *Captures) Group(i int) (start, end int, ok bool) { | |
| match := C.rure_match{} | |
| ok = bool(C.rure_captures_at(caps.p, C.size_t(i), &match)) | |
| if ok { | |
| start, end = int(match.start), int(match.end) | |
| } | |
| return | |
| } | |
| // GroupName is like Group, but uses the name of a capturing group instead of | |
| // its index. Named capture groups look like (?P<foo>re) in the pattern. | |
| // | |
| // If no such named capture group exists or if it wasn't part of the match | |
| // of the regular expression, GroupName returns false. | |
| func (caps *Captures) GroupName(name string) (start, end int, ok bool) { | |
| i := C.rure_capture_name_index(caps.re.p, C.CString(name)) | |
| if i == -1 { | |
| return | |
| } | |
| return caps.Group(int(i)) | |
| } | |
| // Len returns the number of capturing groups. | |
| // | |
| // Once caps is created, this never changes. | |
| func (caps *Captures) Len() int { | |
| return int(C.rure_captures_len(caps.p)) | |
| } | |
| func newIter(re *Regex, haystack []byte) *Iter { | |
| it := &Iter{ | |
| re: re, | |
| haystack: haystack, | |
| p: C.rure_iter_new(re.p), | |
| } | |
| runtime.SetFinalizer(it, func(it *Iter) { | |
| if it.p != nil { | |
| C.rure_iter_free(it.p) | |
| } | |
| }) | |
| return it | |
| } | |
| // Next advances the iterator. If it finds a match, it returns true, and | |
| // otherwise returns false. Once it returns false, it will always return false. | |
| // | |
| // This must be called before a call to Match. | |
| // | |
| // If caps is nil, then capture information is not extracted. If caps is not | |
| // nil, then the start and end offsets of each matching capturing group are | |
| // stored in caps. | |
| func (it *Iter) Next(caps *Captures) bool { | |
| haystack := asUint8Ptr(it.haystack) | |
| len := C.size_t(len(it.haystack)) | |
| if caps == nil { | |
| return bool(C.rure_iter_next(it.p, haystack, len, &it.match)) | |
| } | |
| caps.ok = bool(C.rure_iter_next_captures(it.p, haystack, len, caps.p)) | |
| C.rure_captures_at(caps.p, 0, &it.match) | |
| return caps.ok | |
| } | |
| // Match returns the start and end offsets of the current match in the | |
| // iteraotr. | |
| func (it *Iter) Match() (start, end int) { | |
| return int(it.match.start), int(it.match.end) | |
| } | |
| func newError() *Error { | |
| err := &Error{C.rure_error_new()} | |
| runtime.SetFinalizer(err, func(err *Error) { | |
| if err.p != nil { | |
| C.rure_error_free(err.p) | |
| err.p = nil | |
| } | |
| }) | |
| return err | |
| } | |
| func (err *Error) Error() string { | |
| return C.GoString(C.rure_error_message(err.p)) | |
| } | |
| // Converts a string to a []byte without allocating. | |
| // | |
| // This is very dangerous and must be handled with care. In particular, the | |
| // input given must be kept alive for the duration of the return value. | |
| func noCopyBytes(s string) []byte { | |
| sh := (*reflect.StringHeader)(unsafe.Pointer(&s)) | |
| bh := reflect.SliceHeader{ | |
| Data: sh.Data, | |
| Len: sh.Len, | |
| Cap: sh.Len, | |
| } | |
| return *(*[]byte)(unsafe.Pointer(&bh)) | |
| } | |
| // Converts a byte slice to a *C.uint8_t. | |
| // | |
| // This works even for empty slices. | |
| func asUint8Ptr(bs []byte) *C.uint8_t { | |
| bh := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) | |
| return (*C.uint8_t)(unsafe.Pointer(bh.Data)) | |
| } |