stanfordnlp
diff --git a/itest/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotatorITest.java
Lines changed: 12 additions & 0 deletions
diff --git a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
Lines changed: 3 additions & 0 deletions
diff --git a/src/edu/stanford/nlp/process/LexCommon.tokens b/src/edu/stanford/nlp/process/LexCommon.tokens
Lines changed: 4 additions & 3 deletions
@@ -38,6 +38,18 @@ public void testPipeline() {
         new int[]{0, 1, 3, 5, 7, 8},
         new int[]{1, 3, 5, 7, 8, 9});
 
+    // test that it does something reasonable with spaces
+    testOne("我在 加州 工作 ",
+            new String[]{"我", "在", "加州", "工作"},
+            new int[]{0, 1, 3, 6},
+            new int[]{1, 2, 5, 8});
+
+    // test that it does something reasonable with NBSP
+    testOne("我在 加州 工作 ",
+            new String[]{"我", "在", "加州", "工作"},
+            new int[]{0, 1, 3, 6},
+            new int[]{1, 2, 5, 8});
+
     // All of the tools should now produce () instead of -LRB- -RRB-
     testOne("你马上回来(北京)吗？",
             new String[]{"你", "马上", "回来", "(", "北京", ")", "吗", "？"},
 
@@ -689,6 +689,9 @@ public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> r
     if (segmented.length() == 0) {
       return Collections.emptyList();
     } else {
+      // \\p{Zs} would catch more whitespace options than \\s,
+      // but hopefully the upstream segmentation handled
+      // unusual whitespace such as NBSP already
       return Arrays.asList(segmented.split("\\s"));
     }
   }
 
@@ -1,13 +1,14 @@
 /* \u3000 is ideographic space; \u205F is medium math space */
-SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000]
+/* \u2063 is an invisible separator */
+SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000]
 SPACES = {SPACE}+
 NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
 SPACENL = ({SPACE}|{NEWLINE})
 SPACENLS = {SPACENL}+
 
 /* These next ones are useful to get a fixed length trailing context. */
-SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
-NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
+SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
+NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
 
 FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
 FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
Original file line number	Diff line number	Diff line change
`@@ -689,6 +689,9 @@ public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> r`
`689`	`689`	`if (segmented.length() == 0) {`
`690`	`690`	`return Collections.emptyList();`
`691`	`691`	`} else {`
	`692`	`+ // \\p{Zs} would catch more whitespace options than \\s,`
	`693`	`+ // but hopefully the upstream segmentation handled`
	`694`	`+ // unusual whitespace such as NBSP already`
`692`	`695`	`return Arrays.asList(segmented.split("\\s"));`
`693`	`696`	`}`
`694`	`697`	`}`