Skip to content

Commit

Permalink
auto merge of #13700 : BurntSushi/rust/regexp, r=alexcrichton
Browse files Browse the repository at this point in the history
Implements [RFC 7](https://github.com/rust-lang/rfcs/blob/master/active/0007-regexps.md) and will hopefully resolve #3591. The crate is marked as experimental. It includes a syntax extension for compiling regexps to native Rust code.

Embeds and passes the `basic`, `nullsubexpr` and `repetition` tests from [Glenn Fowler's (slightly modified by Russ Cox for leftmost-first semantics) testregex test suite](http://www2.research.att.com/~astopen/testregex/testregex.html). I've also hand-written a plethora of other tests that exercise Unicode support, the parser, public API, etc. Also includes a `regex-dna` benchmark for the shootout.

I know the addition looks huge at first, but consider these things:

1. More than half the number of lines is dedicated to Unicode character classes.
2. Of the ~4,500 lines remaining, 1,225 of them are comments.
3. Another ~800 are tests.
4. That leaves 2500 lines for the meat. The parser is ~850 of them. The public API, compiler, dynamic VM and code generator (for `regexp!`) make up the rest.
  • Loading branch information
bors committed Apr 25, 2014
2 parents 2bb2341 + 7269bc7 commit eea4909
Show file tree
Hide file tree
Showing 24 changed files with 11,108 additions and 6 deletions.
6 changes: 4 additions & 2 deletions mk/crates.mk
Expand Up @@ -51,8 +51,8 @@

TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
uuid serialize sync getopts collections num test time rand \
workcache url log
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat
workcache url log regex
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
TOOLS := compiletest rustdoc rustc

Expand Down Expand Up @@ -84,6 +84,8 @@ DEPS_rand := std
DEPS_url := std collections
DEPS_workcache := std serialize collections log
DEPS_log := std sync
DEPS_regex := std collections
DEPS_regex_macros = syntax std regex

TOOL_DEPS_compiletest := test green rustuv getopts
TOOL_DEPS_rustdoc := rustdoc native
Expand Down
5 changes: 1 addition & 4 deletions mk/main.mk
Expand Up @@ -311,8 +311,6 @@ HSREQ$(1)_H_$(3) = $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3))
else
HSREQ$(1)_H_$(3) = \
$$(HBIN$(1)_H_$(3))/rustc$$(X_$(3)) \
$$(HLIB$(1)_H_$(3))/stamp.rustc \
$$(foreach dep,$$(RUST_DEPS_rustc),$$(HLIB$(1)_H_$(3))/stamp.$$(dep)) \
$$(MKFILE_DEPS)
endif

Expand All @@ -334,8 +332,7 @@ SREQ$(1)_T_$(2)_H_$(3) = \
CSREQ$(1)_T_$(2)_H_$(3) = \
$$(TSREQ$(1)_T_$(2)_H_$(3)) \
$$(HBIN$(1)_H_$(3))/rustdoc$$(X_$(3)) \
$$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep)) \
$$(foreach dep,$$(HOST_CRATES),$$(HLIB$(1)_H_$(3))/stamp.$$(dep))
$$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep))

ifeq ($(1),0)
# Don't run the stage0 compiler under valgrind - that ship has sailed
Expand Down
1 change: 1 addition & 0 deletions src/README.md
Expand Up @@ -19,6 +19,7 @@ Source layout:
| `libfourcc/` | Data format identifier library |
| `libgetopts/` | Get command-line-options library |
| `libglob/` | Unix glob patterns library |
| `libregex/` | Regular expressions |
| `libsemver/` | Rust's semantic versioning library |
| `libserialize/` | Encode-Decode types library |
| `libsync/` | Concurrency mechanisms and primitives |
Expand Down
1 change: 1 addition & 0 deletions src/doc/index.md
Expand Up @@ -41,6 +41,7 @@ li {list-style-type: none; }
* [The `native` 1:1 threading runtime](native/index.html)
* [The `num` arbitrary precision numerics library](num/index.html)
* [The `rand` library for random numbers and distributions](rand/index.html)
* [The `regex` library for regular expressions](regex/index.html)
* [The `rustc` compiler](rustc/index.html)
* [The `rustuv` M:N I/O library](rustuv/index.html)
* [The `semver` version collation library](semver/index.html)
Expand Down
109 changes: 109 additions & 0 deletions src/etc/regex-match-tests.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
import datetime
import os.path as path


def print_tests(tests):
    """Emit all parsed tests to stdout as newline-separated `mat!` lines.

    A single print call is used so that an empty test list still produces
    exactly one (blank) output line, matching the original behavior.
    """
    rendered = map(test_tostr, tests)
    print('\n'.join(rendered))


def _unescape(s):
    """Expand C-style backslash escapes in a raw test field.

    Python 2 str supports .decode('string_escape'); Python 3 str does not,
    so fall back to codecs.decode with the 'unicode_escape' codec, which
    handles the same \\n-style escapes used by the AT&T test data.
    """
    try:
        return s.decode('string_escape')        # Python 2 path
    except AttributeError:
        import codecs
        return codecs.decode(s, 'unicode_escape')  # Python 3 path


def read_tests(f):
    """Parse an AT&T POSIX regex '.dat' file into test tuples.

    Returns a list of (name, pattern, text, groups) tuples, where `groups`
    is a list of (start, end) match offsets with None for a group that must
    not match.  Only lines with 4-5 tab-separated fields whose option field
    contains 'E' (POSIX extended syntax) and is not commented out are kept;
    tests that expect a regex *parse error* are skipped.
    """
    basename, _ = path.splitext(path.basename(f))
    tests = []
    # `with` guarantees the handle is closed (the original leaked it);
    # list(...) keeps len() working on Python 3, where filter() is lazy.
    with open(f) as fp:
        for lineno, line in enumerate(fp, 1):
            fields = list(filter(None, map(str.strip, line.split('\t'))))
            if not (4 <= len(fields) <= 5) \
               or 'E' not in fields[0] or fields[0][0] == '#':
                continue

            opts, pat, text, sgroups = fields[0:4]
            groups = []  # groups as integer ranges
            if sgroups == 'NOMATCH':
                groups = [None]
            elif ',' in sgroups:
                # e.g. "(0,3)(1,2)" -> [(0, 3), (1, 2)]; "(?,?)" -> None.
                noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
                for g in noparen:
                    s, e = map(str.strip, g.split(','))
                    if s == '?' and e == '?':
                        groups.append(None)
                    else:
                        groups.append((int(s), int(e)))
            else:
                # This skips tests that should result in an error.
                # There aren't many, so I think we can just capture those
                # manually. Possibly fix this in future.
                continue

            if pat == 'SAME':
                # Reuse the previous test's pattern; the data format
                # guarantees a preceding concrete pattern exists.
                pat = tests[-1][1]
            if '$' in opts:
                # '$' option: pattern/text contain C-style escape sequences.
                pat = _unescape(pat)
                text = _unescape(text)
            if 'i' in opts:
                pat = '(?i)%s' % pat

            name = '%s_%d' % (basename, lineno)
            tests.append((name, pat, text, groups))
    return tests


def test_tostr(t):
lineno, pat, text, groups = t
options = map(group_tostr, groups)
return 'mat!(match_%s, r"%s", r"%s", %s)' \
% (lineno, pat, '' if text == "NULL" else text, ', '.join(options))


def group_tostr(g):
if g is None:
return 'None'
else:
return 'Some((%d, %d))' % (g[0], g[1])


if __name__ == '__main__':
    # CLI driver: read one or more AT&T '.dat' files and print a generated
    # Rust test file (header + one `mat!` invocation per test) to stdout.
    parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    aa = parser.add_argument
    # NOTE(review): the in-tree directory is presumably 'src/libregex/testdata'
    # — 'libregexp' in this help text looks stale; confirm before copying.
    aa('files', nargs='+',
       help='A list of dat AT&T POSIX test files. See src/libregexp/testdata')
    args = parser.parse_args()

    # NOTE(review): `tests` is accumulated here but never used afterwards —
    # the loop at the bottom re-parses each file.  Presumably an up-front
    # validation pass (a bad file aborts before any output), or leftover code.
    tests = []
    for f in args.files:
        tests += read_tests(f)

    # License/provenance header for the generated Rust source; {date} is
    # filled in via str.format below.
    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-tidy-linelength
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests'
// on {date}.
'''
    print(tpl.format(date=str(datetime.datetime.now())))

    # Emit the tests grouped per input file, each group preceded by a
    # provenance comment and followed by a blank line.
    for f in args.files:
        print('// Tests from %s' % path.basename(f))
        print_tests(read_tests(f))
        print('')
183 changes: 183 additions & 0 deletions src/etc/regex-unicode-tables.py
@@ -0,0 +1,183 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2

BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
# Each specific general category maps to the broader categories containing it.
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # BUG FIX: 'No' (Other_Number) belongs to 'N' per Table 12; it was
    # mapped to ['No'], which silently dropped those codepoints from \p{N}.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}


def as_4byte_uni(n):
    """Format codepoint n as an 8-hex-digit '\\Uxxxxxxxx' escape."""
    return '\\U%08x' % n


def expand_cat(c):
    """Return category c together with every broader category containing it."""
    return expanded_categories.get(c, []) + [c]


def is_valid_unicode(n):
    """True iff n is a Unicode scalar value (excludes surrogates U+D800-DFFF)."""
    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF


def read_cats(f):
    """Read an open UnicodeData.txt file into {category: [codepoints]}.

    Each codepoint is recorded under its specific general category and under
    every broader category it expands to (via expand_cat).
    """
    assigned = defaultdict(list)
    for row in csv.reader(f, delimiter=';'):
        # row[0] is the codepoint in hex; row[2] is its general category.
        # Renamed from `hex`: don't shadow the builtin.
        codepoint = int(row[0], 16)
        if not is_valid_unicode(codepoint):
            continue
        for cat in expand_cat(row[2]):
            assigned[cat].append(codepoint)
    return assigned


def read_scripts(f):
    """Read an open Scripts.txt file into {script name: [codepoints]}.

    Data lines look like '0041..005A ; Latin # ...' (a range) or
    '00AA ; Latin # ...' (a single codepoint).
    """
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # list(...) so this also works on Python 3, where map() is lazy
        # and not subscriptable.
        hexes, name = list(map(str.strip, line.split(';')))[:2]
        # Drop the trailing '# ...' comment; partition() tolerates lines
        # without one (str.index would raise ValueError there).
        name = name.partition('#')[0].strip()
        if '..' not in hexes:
            cp = int(hexes, 16)
            if is_valid_unicode(cp):
                assigned[name].append(cp)
        else:
            first, last = [int(h, 16) for h in hexes.split('..')]
            # range() (not Py2-only xrange): identical results, and the
            # largest Scripts.txt blocks are only tens of thousands of
            # codepoints, so materializing the list on Py2 is cheap.
            for cp in range(first, last + 1):
                if is_valid_unicode(cp):
                    assigned[name].append(cp)
    return assigned


def group(letters):
    """Collapse a collection of codepoints into sorted (start, end) ranges.

    Duplicates are dropped, and runs of consecutive values are merged into
    inclusive ranges, e.g. [1, 2, 3, 7] -> [(1, 3), (7, 7)].  Returns []
    for empty input (the original raised IndexError via list.pop(0)).
    """
    distinct = sorted(set(letters))
    if not distinct:
        return []
    grouped = []
    # sorted(set(...)) guarantees strictly increasing values, so each value
    # either extends the current run by one or starts a new run.
    start = end = distinct[0]
    for cp in distinct[1:]:
        if cp == end + 1:
            end = cp
        else:
            grouped.append((start, end))
            start = end = cp
    grouped.append((start, end))
    return grouped


def ranges_to_rust(rs):
    # Render (start, end) codepoint pairs as Rust char-tuple literals,
    # '\Uxxxxxxxx' escaped, comma+newline separated so they stack inside the
    # generated static arrays.
    # NOTE(review): the spacing after '\n' in the join separator should match
    # the generated file's indentation — confirm against the original source;
    # whitespace inside this literal may have been collapsed in transit.
    rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
    return ',\n '.join(rs)


def groups_to_rust(groups):
    # Render a {name: ranges} mapping as Rust `("name", &[ranges]),` tuple
    # literals, sorted by name so the generated file is deterministic.
    rust_groups = []
    for group_name in sorted(groups):
        rust_groups.append('("%s", &[\n %s\n ]),'
                           % (group_name, ranges_to_rust(groups[group_name])))
    return '\n'.join(rust_groups)


if __name__ == '__main__':
    # CLI driver: fetch (or read locally) UnicodeData.txt and Scripts.txt,
    # then print a generated Rust source file of character-class tables.
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    # NOTE(review): this entry point is Python-2-only as written: it uses
    # urllib2, and the `perld + cats['Nd'][:]` arithmetic below relies on
    # range() returning a list.
    if args.local:
        cats = read_cats(open(DATA))
        scripts = read_scripts(open(SCRIPTS))
    else:
        cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
        scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))

    # Get Rust code for all Unicode general categories and scripts.
    # NOTE(review): on a key collision the script entry wins
    # (dict(cats, **scripts)); category and script names presumably never
    # collide — confirm.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    # \d: ASCII digits plus the Unicode decimal-digit category (Nd).
    perld = range(ord('0'), ord('9') + 1)
    dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))

    # \s: ASCII whitespace (tab, LF, FF, CR, space) plus separators (Z).
    perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
    sgroups = ranges_to_rust(group(perls + cats['Z'][:]))

    # \w: underscore and ASCII alphanumerics plus Unicode letters (L).
    low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L'][:]))

    # Generated-file template; doubled braces escape str.format, and the
    # {groups}/{dgroups}/... placeholders are filled below.
    # NOTE(review): the generated-by path says 'regexp-unicode-tables' while
    # this script is 'regex-unicode-tables.py' — presumably stale.
    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.
use parse::{{Class, NamedClasses}};
pub static UNICODE_CLASSES: NamedClasses = &[
{groups}
];
pub static PERLD: Class = &[
{dgroups}
];
pub static PERLS: Class = &[
{sgroups}
];
pub static PERLW: Class = &[
{wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))

0 comments on commit eea4909

Please sign in to comment.