Commit 2d88d17

Merge remote-tracking branch 'refs/remotes/origin/dev' into dev
2 parents: d445264 + e0f6185

File tree: 21 files changed, +82830 −74615 lines

itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ public void testPostProcessor() {
   }

   /**
-   * You probably can't tell in your editor, but the input has 4 characters for
+   * You probably can't tell in your editor, but the input has 4 characters for
    * <pre>für</pre>
    * and the output has 3
    */
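The changed Javadoc line looks identical in this view because the difference is not visible in plain text; the surrounding comment explains that the test input spells "für" as "u" plus the combining diaeresis U+0308 (four UTF-16 chars) while the output uses the precomposed "ü" (three chars). A minimal standalone sketch of that distinction, for illustration only (not part of the commit):

// Illustration only (not part of the commit): the two spellings of "für"
// that the test comment refers to.
public class UmlautLengthDemo {
  public static void main(String[] args) {
    String decomposed = "fu\u0308r";   // 'u' followed by combining diaeresis U+0308
    String precomposed = "f\u00FCr";   // precomposed 'ü' (U+00FC)
    System.out.println(decomposed.length());            // 4
    System.out.println(precomposed.length());           // 3
    System.out.println(decomposed.equals(precomposed)); // false, despite rendering alike
  }
}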

itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ public void testSimpleSentence() throws IOException {
       "5\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_\n" +
       "6\t.\t.\tPUNCT\t.\t_\t5\tpunct\t5:punct\t_\n" +
       "\n" +
-      "1\tBetter\tbetter\tADJ\tJJR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
+      "1\tBetter\tgood\tADJ\tJJR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
       "2\tthan\tthan\tADP\tIN\t_\t3\tcase\t3:case\t_\n" +
       "3\tXML\txml\tNOUN\tNN\tNumber=Sing\t1\tobl\t1:obl:than\t_\n" +
       "4\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n\n";

src/edu/stanford/nlp/international/french/process/FrenchLexer.flex

Lines changed: 8 additions & 5 deletions
@@ -286,11 +286,9 @@ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
 SPAMP = &amp;
 SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
 SPLET = &[aeiouAEIOU](acute|grave|uml);
-/* \u3000 is ideographic space */
-SPACE = [ \t\u00A0\u2000-\u200A\u3000]
-SPACES = {SPACE}+
-NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
-SPACENL = ({SPACE}|{NEWLINE})
+
+%include ../../../process/LexCommon.tokens
+
 SENTEND = {SPACENL}({SPACENL}|([A-Z]|{SGML}))
 HYPHEN = [-_\u058A\u2010\u2011]
 HYPHENS = \-+

@@ -452,6 +450,11 @@ MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\
     return getNext();
   }
 }
+
+/* TODO: not using LexCommon.productions because there are no PerLine settings */
+/* we might want to add those settings to the other tokenizers anyway */
+{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
+
 {ORDINAL}/{SPACE} { return getNext(); }
 {SPAMP} { return getNormalizedAmpNext(); }
 {SPPUNC} |

src/edu/stanford/nlp/international/french/process/FrenchLexer.java

Lines changed: 10243 additions & 6247 deletions
Large diff not rendered (FrenchLexer.java is regenerated from FrenchLexer.flex).

src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java

Lines changed: 0 additions & 61 deletions
@@ -45,64 +45,6 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
     token.setValue(token.word()+"-"+token.sentIndex());
   }

-  /**
-   * Some people write umlauts as two characters instead of just one
-   *<br>
-   * German CoreNLP doesn't handle the two character versions correctly,
-   * so here we condense it into the one character version
-   */
-  public static void condenseUmlauts(CoreLabel token) {
-    String value = token.value();
-    String updatedValue = condenseUmlauts(value);
-    if (updatedValue != null) {
-      token.setValue(updatedValue);
-    }
-
-    String word = token.word();
-    String updatedWord = condenseUmlauts(word);
-    if (updatedWord != null) {
-      token.setWord(updatedWord);
-    }
-  }
-
-  public static String condenseUmlauts(String value) {
-    StringBuilder ns = null;
-    for (int i = 0; i < value.length(); ++i) {
-      final char cur = value.charAt(i);
-      if ((int) cur == 776) {
-        // this is the umlaut character
-        if (ns == null) {
-          ns = new StringBuilder(value.length());
-          ns.append(value.substring(0, i));
-        }
-        final char prev = ns.length() == 0 ? ' ' : ns.charAt(ns.length() - 1);
-        if (prev == 'a') {
-          ns.setCharAt(ns.length() - 1, 'ä');
-        } else if (prev == 'A') {
-          ns.setCharAt(ns.length() - 1, 'Ä');
-        } else if (prev == 'o') {
-          ns.setCharAt(ns.length() - 1, 'ö');
-        } else if (prev == 'O') {
-          ns.setCharAt(ns.length() - 1, 'Ö');
-        } else if (prev == 'u') {
-          ns.setCharAt(ns.length() - 1, 'ü');
-        } else if (prev == 'U') {
-          ns.setCharAt(ns.length() - 1, 'Ü');
-        } else {
-          ns.append(cur);
-        }
-      } else {
-        if (ns != null) {
-          ns.append(cur);
-        }
-      }
-    }
-    if (ns != null) {
-      return ns.toString();
-    }
-    return null;
-  }
-
   @Override
   public List<CoreLabel> process(List<CoreLabel> tokens) {
     List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();

@@ -134,9 +76,6 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
       }
     }

-    for (CoreLabel label : processedTokens) {
-      condenseUmlauts(label);
-    }
     return processedTokens;
   }

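For context: the deleted condenseUmlauts helper folded a vowel followed by the combining diaeresis (char 776, i.e. U+0308) into the corresponding precomposed umlaut. The JDK's NFC normalization performs the same folding for these pairs; the sketch below is shown for comparison only and is not what the commit substitutes (the calls are simply removed here):

import java.text.Normalizer;

// Comparison only, not part of the commit: NFC normalization performs the same
// vowel + U+0308 folding that the removed helper did by hand.
public class NfcDemo {
  public static void main(String[] args) {
    String decomposed = "Gru\u0308\u00DFe";  // "Grüße" spelled with a combining diaeresis
    String condensed = Normalizer.normalize(decomposed, Normalizer.Form.NFC);
    System.out.println(condensed);                                          // Grüße (precomposed ü)
    System.out.println(condensed.length() + " vs " + decomposed.length());  // 5 vs 6
  }
}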
src/edu/stanford/nlp/pipeline/JSONOutputter.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
188188
l3.set("characterOffsetEnd", token.endPosition());
189189
if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
190190
token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
191-
l3.set("codepointOffsetBegin", token.beginPosition());
192-
l3.set("codepointOffsetEnd", token.endPosition());
191+
l3.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
192+
l3.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
193193
}
194194
l3.set("pos", token.tag());
195195
l3.set("ner", token.ner());
@@ -216,6 +216,11 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
216216
l2.set("originalText", token.originalText());
217217
l2.set("characterOffsetBegin", token.beginPosition());
218218
l2.set("characterOffsetEnd", token.endPosition());
219+
if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
220+
token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
221+
l2.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
222+
l2.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
223+
}
219224
}));
220225
}
221226
}
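The fix matters because "characterOffset*" counts UTF-16 code units while "codepointOffset*" counts Unicode code points, and the two diverge as soon as the text contains supplementary characters such as emoji; reusing beginPosition()/endPosition() would have emitted character offsets under the codepoint keys. A small illustration of the divergence (plain JDK, not project code):

// Illustration only: character (UTF-16) offsets vs. codepoint offsets.
public class OffsetDemo {
  public static void main(String[] args) {
    String text = "I \uD83D\uDE00 NLP";                        // "I 😀 NLP"; the emoji is one code point, two chars
    System.out.println(text.length());                          // 8 UTF-16 code units
    System.out.println(text.codePointCount(0, text.length()));  // 7 code points
    int charOffset = text.indexOf("NLP");
    System.out.println(charOffset);                              // 5 (character offset of "NLP")
    System.out.println(text.codePointCount(0, charOffset));      // 4 (codepoint offset of "NLP")
  }
}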

src/edu/stanford/nlp/pipeline/UDFeatureAnnotator.java

Lines changed: 1 addition & 6 deletions
@@ -9,7 +9,6 @@
 import edu.stanford.nlp.trees.ud.UniversalDependenciesFeatureAnnotator;
 import edu.stanford.nlp.util.CoreMap;

-import java.io.IOException;
 import java.util.Collections;
 import java.util.Set;

@@ -24,11 +23,7 @@ public class UDFeatureAnnotator extends SentenceAnnotator {


   public UDFeatureAnnotator() {
-    try {
-      this.featureAnnotator = new UniversalDependenciesFeatureAnnotator();
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
+    this.featureAnnotator = new UniversalDependenciesFeatureAnnotator();
   }

   @Override

src/edu/stanford/nlp/process/LexCommon.tokens (new file; path inferred from the %include above)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+/* \u3000 is ideographic space; \u205F is medium math space */
+SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
+SPACES = {SPACE}+
+NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
+SPACENL = ({SPACE}|{NEWLINE})
+
+FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
+FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
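For a sense of what the new FILENAME macro accepts, here is a rough java.util.regex rendering (illustration only; the extension alternation is abbreviated, the class name is hypothetical, and JFlex applies the rule with a trailing-context check rather than a full-string match):

import java.util.regex.Pattern;

// Rough rendering of the FILENAME macro above, with an abbreviated extension list.
public class FilenamePatternDemo {
  public static void main(String[] args) {
    String ext = "(?:pdf|txt|java|html|png|tar|gz)";  // abbreviated stand-in for FILENAME_EXT
    Pattern filename = Pattern.compile(
        "[\\p{Alpha}\\p{Digit}]+(?:[-~.!_/#][\\p{Alpha}\\p{Digit}]+)*\\." + ext);
    System.out.println(filename.matcher("annual-report_2023.pdf").matches()); // true
    System.out.println(filename.matcher("src/main.java").matches());          // true
    System.out.println(filename.matcher(".pdf").matches());                   // false: needs a stem
  }
}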

src/edu/stanford/nlp/process/Morpha.flex

Lines changed: 41 additions & 0 deletions
@@ -463,6 +463,17 @@ G = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_]
 GM = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_-]
 SKIP = [ \t\r\n\u2028\u2029\u000B\u000C\u0085]

+/* adjectives such as tame which become tamer, tamest */
+E_ADJS = "able"|"absolute"|"abstruse"|"acute"|"ample"|"austere"|"bare"|"base"|"blithe"|"blonde"|"blue"|"brave"|"brittle"|"brusque"|"capable"|"chaste"|"choice"|"close"|"coarse"|"complete"|"concise"|"crude"|"cute"|"demure"|"dense"|"dire"|"divine"|"doggone"|"eerie"|"extreme"|"false"|"feeble"|"fickle"|"fierce"|"fine"|"free"|"game"|"gauche"|"gentle"|"gladsome"|"grave"|"grewsome"|"gruesome"|"hale"|"handsome"|"hoarse"|"huge"|"humane"|"humble"|"idle"|"immense"|"inane"|"insane"|"intense"|"irate"|"kittle"|"lame"|"large"|"late"|"lithe"|"little"|"loose"|"mature"|"mere"|"mickle"|"minute"|"mute"|"naive"|"naïve"|"negative"|"nice"|"nimble"|"noble"|"nude"|"obscene"|"obscure"|"obtuse"|"opaque"|"pale"|"polite"|"positive"|"possible"|"precise"|"private"|"pure"|"purple"|"rare"|"rathe"|"remote"|"resolute"|"rife"|"ripe"|"rude"|"safe"|"sage"|"sane"|"savage"|"scarce"|"secure"|"sensible"|"serene"|"severe"|"simple"|"sincere"|"sore"|"spare"|"sparse"|"spruce"|"square"|"stable"|"stale"|"strange"|"suave"|"sublime"|"subtile"|"subtle"|"supple"|"supreme"|"sure"|"svelte"|"tame"|"tense"|"terse"|"trite"|"true"|"unique"|"unripe"|"unsafe"|"unstable"|"untrue"|"unwise"|"urbane"|"vague"|"vile"|"white"|"wholesome"|"wide"|"winsome"|"wise"|"yare"
+
+/* adjectives such as hot which become hotter, hottest */
+XX_ADJS = "awfull"|"badd"|"bigg"|"bumm"|"carefull"|"cheerfull"|"cruell"|"dimm"|"dolefull"|"drabb"|"dunn"|"fatt"|"fearfull"|"fitt"|"flatt"|"flipp"|"fruitfull"|"full"|"gladd"|"glibb"|"glumm"|"gracefull"|"gratefull"|"grimm"|"grumm"|"hipp"|"hott"|"joyfull"|"levell"|"madd"|"mournfull"|"painfull"|"peacefull"|"pitifull"|"primm"|"redd"|"rumm"|"sadd"|"slimm"|"smugg"|"snugg"|"squatt"|"tann"|"thankfull"|"thinn"|"tranquill"|"trimm"|"wann"|"wett"|"woefull"|"wonderfull"
+
+/* adjectives such as gooey which become gooier, gooiest */
+EY_ADJS = "cag"|"cak"|"clay"|"cliqu"|"crep"|"dic"|"dop"|"glu"|"goo"|"grip"|"hok"|"hom"|"hors"|"jok"|"lak"|"mop"|"shal"
+
+COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
+
 %%

 /* can and will not always modal so can be inflected */

@@ -2074,6 +2085,36 @@ SKIP = [ \t\r\n\u2028\u2029\u000B\u000C\u0085]
 <scan>"us"/_P { return(stem(2,"we","")); }
 <scan>"I"/_P { return(proper_name_stem()); }
 <scan>"an"/_[AD] { return(stem(1, "", "n")); }
+<scan>"those"/_DT { return(stem(3, "at", "")); }
+<scan>"these"/_DT { return(stem(3, "is", "")); }
+<scan>"dat"/_DT { return(stem(3, "that", "")); }
+
+<scan>"worse"/_JJR { return(stem(5, "bad", "")); }
+<scan>"worst"/_JJS { return(stem(5, "bad", "")); }
+<scan>"worse"/_RBR { return(stem(5, "badly", "")); }
+<scan>"worst"/_RBS { return(stem(5, "badly", "")); }
+<scan>"better"/_JJR { return(stem(6, "good", "")); }
+<scan>"best"/_JJS { return(stem(4, "good", "")); }
+<scan>"better"/_RBR { return(stem(6, "well", "")); }
+<scan>"best"/_RBS { return(stem(4, "well", "")); }
+
+/* further_JJR discussion stays further in GUM */
+<scan>"further"/_JJR { return(stem(0, "", "")); }
+/* further_RBR extend becomes far */
+<scan>"f"[au]"rther"/_RBR { return(stem(6, "ar", "")); }
+<scan>"f"[au]"rthest"/_RBS { return(stem(7, "ar", "")); }
+
+<scan>{E_ADJS}r/_{COMP_SUP} { return(stem(1, "", "")); }
+<scan>{E_ADJS}st/_{COMP_SUP} { return(stem(2, "", "")); }
+<scan>{XX_ADJS}er/_{COMP_SUP} { return(stem(3, "", "")); }
+<scan>{XX_ADJS}est/_{COMP_SUP} { return(stem(4, "", "")); }
+<scan>{EY_ADJS}ier/_{COMP_SUP} { return(stem(3, "ey", "")); }
+<scan>{EY_ADJS}iest/_{COMP_SUP} { return(stem(4, "ey", "")); }
+<scan>{G}+ier/_{COMP_SUP} { return(stem(3, "y", "")); }
+<scan>{G}+iest/_{COMP_SUP} { return(stem(4, "y", "")); }
+<scan>{G}+er/_{COMP_SUP} { return(stem(2, "", "")); }
+<scan>{G}+est/_{COMP_SUP} { return(stem(3, "", "")); }
+
 <scan>{G}+/_NN[^P] { yybegin(noun); yypushback(yylength()); return(next()); }
 <scan>{G}+/_NNP { return(proper_name_stem()); }
 <scan>{G}+/_V { yybegin(verb); yypushback(yylength()); return(next()); }
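These rules lemmatize comparatives and superlatives when the trailing POS tag is JJR/JJS/RBR/RBS, which is why the CoNLLUOutputterITest expectation above now lemmatizes "Better" to "good". A hedged sketch of the expected behaviour, assuming a Morpha regenerated from this grammar and the existing Morphology.lemma(word, tag) entry point:

import edu.stanford.nlp.process.Morphology;

// Hedged sketch: expected lemmas under the new rules, via the Morphology wrapper
// around the generated Morpha lexer.
public class ComparativeLemmaDemo {
  public static void main(String[] args) {
    Morphology morphology = new Morphology();
    System.out.println(morphology.lemma("better", "JJR"));   // good
    System.out.println(morphology.lemma("best", "RBS"));     // well
    System.out.println(morphology.lemma("worse", "RBR"));    // badly
    System.out.println(morphology.lemma("tamer", "JJR"));    // tame   (E_ADJS rule)
    System.out.println(morphology.lemma("hottest", "JJS"));  // hot    (XX_ADJS rule)
    System.out.println(morphology.lemma("gooier", "JJR"));   // gooey  (EY_ADJS rule)
    System.out.println(morphology.lemma("happier", "JJR"));  // happy  ({G}+ier rule)
  }
}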
