Skip to content

Commit c9a5fb2

Browse files
committed
Merging changes
2 parents f758c04 + 431ad54 commit c9a5fb2

File tree

5 files changed

+85865
-86073
lines changed

5 files changed

+85865
-86073
lines changed

itest/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotatorITest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,18 @@ public void testPipeline() {
3838
new int[]{0, 1, 3, 5, 7, 8},
3939
new int[]{1, 3, 5, 7, 8, 9});
4040

41+
// test that it does something reasonable with spaces
42+
testOne("我在 加州 工作 ",
43+
new String[]{"我", "在", "加州", "工作"},
44+
new int[]{0, 1, 3, 6},
45+
new int[]{1, 2, 5, 8});
46+
47+
// test that it does something reasonable with NBSP
48+
testOne("我在 加州 工作 ",
49+
new String[]{"我", "在", "加州", "工作"},
50+
new int[]{0, 1, 3, 6},
51+
new int[]{1, 2, 5, 8});
52+
4153
// All of the tools should now produce () instead of -LRB- -RRB-
4254
testOne("你马上回来(北京)吗?",
4355
new String[]{"你", "马上", "回来", "(", "北京", ")", "吗", "?"},

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,9 @@ public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> r
689689
if (segmented.length() == 0) {
690690
return Collections.emptyList();
691691
} else {
692+
// \\p{Zs} would match Unicode space separators that \\s misses,
693+
// but hopefully the upstream segmentation handled
694+
// unusual whitespace such as NBSP already
692695
return Arrays.asList(segmented.split("\\s"));
693696
}
694697
}
Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
/* \u3000 is ideographic space; \u205F is medium mathematical space */
2-
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000]
2+
/* \u2063 is an invisible separator */
3+
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000]
34
SPACES = {SPACE}+
45
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
56
SPACENL = ({SPACE}|{NEWLINE})
67
SPACENLS = {SPACENL}+
78

89
/* These next ones are useful to get a fixed-length trailing context. */
9-
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
10-
NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
10+
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
11+
NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
1112

1213
FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
1314
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}

0 commit comments

Comments
 (0)