Commit 2d88d17

Merge remote-tracking branch 'refs/remotes/origin/dev' into dev
2 parents: d445264 + e0f6185

File tree: 21 files changed, +82830 −74615 lines

itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ public void testPostProcessor() {
   }

   /**
-   * You probably can't tell in your editor, but the input has 4 characters for
+   * You probably can't tell in your editor, but the input has 4 characters for
    * <pre>für</pre>
    * and the output has 3
    */
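The changed Javadoc line looks identical in this view because the difference is not visible in plain text; the surrounding comment explains that the test input spells "für" as "u" plus the combining diaeresis U+0308 (four UTF-16 chars) while the output uses the precomposed "ü" (three chars). A minimal standalone sketch of that distinction, for illustration only (not part of the commit):

// Illustration only (not part of the commit): the two spellings of "für"
// that the test comment refers to.
public class UmlautLengthDemo {
  public static void main(String[] args) {
    String decomposed = "fu\u0308r";   // 'u' followed by combining diaeresis U+0308
    String precomposed = "f\u00FCr";   // precomposed 'ü' (U+00FC)
    System.out.println(decomposed.length());            // 4
    System.out.println(precomposed.length());           // 3
    System.out.println(decomposed.equals(precomposed)); // false, despite rendering alike
  }
}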

itest/src/edu/stanford/nlp/pipeline/CoNLLUOutputterITest.java

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ public void testSimpleSentence() throws IOException {
       "5\tneat\tneat\tADJ\tJJ\tDegree=Pos\t0\troot\t0:root\t_\n" +
       "6\t.\t.\tPUNCT\t.\t_\t5\tpunct\t5:punct\t_\n" +
       "\n" +
-      "1\tBetter\tbetter\tADJ\tJJR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
+      "1\tBetter\tgood\tADJ\tJJR\tDegree=Cmp\t0\troot\t0:root\t_\n" +
       "2\tthan\tthan\tADP\tIN\t_\t3\tcase\t3:case\t_\n" +
       "3\tXML\txml\tNOUN\tNN\tNumber=Sing\t1\tobl\t1:obl:than\t_\n" +
       "4\t.\t.\tPUNCT\t.\t_\t1\tpunct\t1:punct\t_\n\n";

src/edu/stanford/nlp/international/french/process/FrenchLexer.flex

Lines changed: 8 additions & 5 deletions
@@ -286,11 +286,9 @@ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
 SPAMP = &amp;
 SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
 SPLET = &[aeiouAEIOU](acute|grave|uml);
-/* \u3000 is ideographic space */
-SPACE = [ \t\u00A0\u2000-\u200A\u3000]
-SPACES = {SPACE}+
-NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
-SPACENL = ({SPACE}|{NEWLINE})
+
+%include ../../../process/LexCommon.tokens
+
 SENTEND = {SPACENL}({SPACENL}|([A-Z]|{SGML}))
 HYPHEN = [-_\u058A\u2010\u2011]
 HYPHENS = \-+

@@ -452,6 +450,11 @@ MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\
     return getNext();
   }
 }
+
+/* TODO: not using LexCommon.productions because there are no PerLine settings */
+/* we might want to add those settings to the other tokenizers anyway */
+{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
+
 {ORDINAL}/{SPACE} { return getNext(); }
 {SPAMP} { return getNormalizedAmpNext(); }
 {SPPUNC} |

src/edu/stanford/nlp/international/french/process/FrenchLexer.java

Lines changed: 10243 additions & 6247 deletions
Large diff not rendered (FrenchLexer.java is regenerated from FrenchLexer.flex).

src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java

Lines changed: 0 additions & 61 deletions
@@ -45,64 +45,6 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
     token.setValue(token.word()+"-"+token.sentIndex());
   }

-  /**
-   * Some people write umlauts as two characters instead of just one
-   *<br>
-   * German CoreNLP doesn't handle the two character versions correctly,
-   * so here we condense it into the one character version
-   */
-  public static void condenseUmlauts(CoreLabel token) {
-    String value = token.value();
-    String updatedValue = condenseUmlauts(value);
-    if (updatedValue != null) {
-      token.setValue(updatedValue);
-    }
-
-    String word = token.word();
-    String updatedWord = condenseUmlauts(word);
-    if (updatedWord != null) {
-      token.setWord(updatedWord);
-    }
-  }
-
-  public static String condenseUmlauts(String value) {
-    StringBuilder ns = null;
-    for (int i = 0; i < value.length(); ++i) {
-      final char cur = value.charAt(i);
-      if ((int) cur == 776) {
-        // this is the umlaut character
-        if (ns == null) {
-          ns = new StringBuilder(value.length());
-          ns.append(value.substring(0, i));
-        }
-        final char prev = ns.length() == 0 ? ' ' : ns.charAt(ns.length() - 1);
-        if (prev == 'a') {
-          ns.setCharAt(ns.length() - 1, 'ä');
-        } else if (prev == 'A') {
-          ns.setCharAt(ns.length() - 1, 'Ä');
-        } else if (prev == 'o') {
-          ns.setCharAt(ns.length() - 1, 'ö');
-        } else if (prev == 'O') {
-          ns.setCharAt(ns.length() - 1, 'Ö');
-        } else if (prev == 'u') {
-          ns.setCharAt(ns.length() - 1, 'ü');
-        } else if (prev == 'U') {
-          ns.setCharAt(ns.length() - 1, 'Ü');
-        } else {
-          ns.append(cur);
-        }
-      } else {
-        if (ns != null) {
-          ns.append(cur);
-        }
-      }
-    }
-    if (ns != null) {
-      return ns.toString();
-    }
-    return null;
-  }
-
   @Override
   public List<CoreLabel> process(List<CoreLabel> tokens) {
     List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();

@@ -134,9 +76,6 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
       }
     }

-    for (CoreLabel label : processedTokens) {
-      condenseUmlauts(label);
-    }
     return processedTokens;
   }

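For context: the deleted condenseUmlauts helper folded a vowel followed by the combining diaeresis (char 776, i.e. U+0308) into the corresponding precomposed umlaut. The JDK's NFC normalization performs the same folding for these pairs; the sketch below is shown for comparison only and is not what the commit substitutes (the calls are simply removed here):

import java.text.Normalizer;

// Comparison only, not part of the commit: NFC normalization performs the same
// vowel + U+0308 folding that the removed helper did by hand.
public class NfcDemo {
  public static void main(String[] args) {
    String decomposed = "Gru\u0308\u00DFe";  // "Grüße" spelled with a combining diaeresis
    String condensed = Normalizer.normalize(decomposed, Normalizer.Form.NFC);
    System.out.println(condensed);                                          // Grüße (precomposed ü)
    System.out.println(condensed.length() + " vs " + decomposed.length());  // 5 vs 6
  }
}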
src/edu/stanford/nlp/pipeline/JSONOutputter.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
188188
l3.set("characterOffsetEnd", token.endPosition());
189189
if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
190190
token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
191-
l3.set("codepointOffsetBegin", token.beginPosition());
192-
l3.set("codepointOffsetEnd", token.endPosition());
191+
l3.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
192+
l3.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
193193
}
194194
l3.set("pos", token.tag());
195195
l3.set("ner", token.ner());
@@ -216,6 +216,11 @@ public void print(Annotation doc, OutputStream target, Options options) throws I
216216
l2.set("originalText", token.originalText());
217217
l2.set("characterOffsetBegin", token.beginPosition());
218218
l2.set("characterOffsetEnd", token.endPosition());
219+
if (token.containsKey(CoreAnnotations.CodepointOffsetBeginAnnotation.class) &&
220+
token.containsKey(CoreAnnotations.CodepointOffsetEndAnnotation.class)) {
221+
l2.set("codepointOffsetBegin", token.get(CoreAnnotations.CodepointOffsetBeginAnnotation.class));
222+
l2.set("codepointOffsetEnd", token.get(CoreAnnotations.CodepointOffsetEndAnnotation.class));
223+
}
219224
}));
220225
}
221226
}
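The fix matters because "characterOffset*" counts UTF-16 code units while "codepointOffset*" counts Unicode code points, and the two diverge as soon as the text contains supplementary characters such as emoji; reusing beginPosition()/endPosition() would have emitted character offsets under the codepoint keys. A small illustration of the divergence (plain JDK, not project code):

// Illustration only: character (UTF-16) offsets vs. codepoint offsets.
public class OffsetDemo {
  public static void main(String[] args) {
    String text = "I \uD83D\uDE00 NLP";                        // "I 😀 NLP"; the emoji is one code point, two chars
    System.out.println(text.length());                          // 8 UTF-16 code units
    System.out.println(text.codePointCount(0, text.length()));  // 7 code points
    int charOffset = text.indexOf("NLP");
    System.out.println(charOffset);                              // 5 (character offset of "NLP")
    System.out.println(text.codePointCount(0, charOffset));      // 4 (codepoint offset of "NLP")
  }
}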

src/edu/stanford/nlp/pipeline/UDFeatureAnnotator.java

Lines changed: 1 addition & 6 deletions
@@ -9,7 +9,6 @@
 import edu.stanford.nlp.trees.ud.UniversalDependenciesFeatureAnnotator;
 import edu.stanford.nlp.util.CoreMap;

-import java.io.IOException;
 import java.util.Collections;
 import java.util.Set;

@@ -24,11 +23,7 @@ public class UDFeatureAnnotator extends SentenceAnnotator {


   public UDFeatureAnnotator() {
-    try {
-      this.featureAnnotator = new UniversalDependenciesFeatureAnnotator();
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
+    this.featureAnnotator = new UniversalDependenciesFeatureAnnotator();
   }

   @Override

src/edu/stanford/nlp/process/LexCommon.tokens (new file; path inferred from the %include above)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+/* \u3000 is ideographic space; \u205F is medium math space */
+SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
+SPACES = {SPACE}+
+NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
+SPACENL = ({SPACE}|{NEWLINE})
+
+FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
+FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
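For a sense of what the new FILENAME macro accepts, here is a rough java.util.regex rendering (illustration only; the extension alternation is abbreviated, the class name is hypothetical, and JFlex applies the rule with a trailing-context check rather than a full-string match):

import java.util.regex.Pattern;

// Rough rendering of the FILENAME macro above, with an abbreviated extension list.
public class FilenamePatternDemo {
  public static void main(String[] args) {
    String ext = "(?:pdf|txt|java|html|png|tar|gz)";  // abbreviated stand-in for FILENAME_EXT
    Pattern filename = Pattern.compile(
        "[\\p{Alpha}\\p{Digit}]+(?:[-~.!_/#][\\p{Alpha}\\p{Digit}]+)*\\." + ext);
    System.out.println(filename.matcher("annual-report_2023.pdf").matches()); // true
    System.out.println(filename.matcher("src/main.java").matches());          // true
    System.out.println(filename.matcher(".pdf").matches());                   // false: needs a stem
  }
}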

src/edu/stanford/nlp/process/Morpha.flex

Lines changed: 41 additions & 0 deletions
@@ -463,6 +463,17 @@ G = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_]
 GM = [^ \t\r\n\u2028\u2029\u000B\u000C\u0085_-]
 SKIP = [ \t\r\n\u2028\u2029\u000B\u000C\u0085]

+/* adjectives such as tame which become tamer, tamest */
+E_ADJS = "able"|"absolute"|"abstruse"|"acute"|"ample"|"austere"|"bare"|"base"|"blithe"|"blonde"|"blue"|"brave"|"brittle"|"brusque"|"capable"|"chaste"|"choice"|"close"|"coarse"|"complete"|"concise"|"crude"|"cute"|"demure"|"dense"|"dire"|"divine"|"doggone"|"eerie"|"extreme"|"false"|"feeble"|"fickle"|"fierce"|"fine"|"free"|"game"|"gauche"|"gentle"|"gladsome"|"grave"|"grewsome"|"gruesome"|"hale"|"handsome"|"hoarse"|"huge"|"humane"|"humble"|"idle"|"immense"|"inane"|"insane"|"intense"|"irate"|"kittle"|"lame"|"large"|"late"|"lithe"|"little"|"loose"|"mature"|"mere"|"mickle"|"minute"|"mute"|"naive"|"naïve"|"negative"|"nice"|"nimble"|"noble"|"nude"|"obscene"|"obscure"|"obtuse"|"opaque"|"pale"|"polite"|"positive"|"possible"|"precise"|"private"|"pure"|"purple"|"rare"|"rathe"|"remote"|"resolute"|"rife"|"ripe"|"rude"|"safe"|"sage"|"sane"|"savage"|"scarce"|"secure"|"sensible"|"serene"|"severe"|"simple"|"sincere"|"sore"|"spare"|"sparse"|"spruce"|"square"|"stable"|"stale"|"strange"|"suave"|"sublime"|"subtile"|"subtle"|"supple"|"supreme"|"sure"|"svelte"|"tame"|"tense"|"terse"|"trite"|"true"|"unique"|"unripe"|"unsafe"|"unstable"|"untrue"|"unwise"|"urbane"|"vague"|"vile"|"white"|"wholesome"|"wide"|"winsome"|"wise"|"yare"
+
+/* adjectives such as hot which become hotter, hottest */
+XX_ADJS = "awfull"|"badd"|"bigg"|"bumm"|"carefull"|"cheerfull"|"cruell"|"dimm"|"dolefull"|"drabb"|"dunn"|"fatt"|"fearfull"|"fitt"|"flatt"|"flipp"|"fruitfull"|"full"|"gladd"|"glibb"|"glumm"|"gracefull"|"gratefull"|"grimm"|"grumm"|"hipp"|"hott"|"joyfull"|"levell"|"madd"|"mournfull"|"painfull"|"peacefull"|"pitifull"|"primm"|"redd"|"rumm"|"sadd"|"slimm"|"smugg"|"snugg"|"squatt"|"tann"|"thankfull"|"thinn"|"tranquill"|"trimm"|"wann"|"wett"|"woefull"|"wonderfull"
+
+/* adjectives such as gooey which become gooier, gooiest */
+EY_ADJS = "cag"|"cak"|"clay"|"cliqu"|"crep"|"dic"|"dop"|"glu"|"goo"|"grip"|"hok"|"hom"|"hors"|"jok"|"lak"|"mop"|"shal"
+
+COMP_SUP = "JJR"|"JJS"|"RBR"|"RBS"
+
 %%

 /* can and will not always modal so can be inflected */

@@ -2074,6 +2085,36 @@ SKIP = [ \t\r\n\u2028\u2029\u000B\u000C\u0085]
 <scan>"us"/_P { return(stem(2,"we","")); }
 <scan>"I"/_P { return(proper_name_stem()); }
 <scan>"an"/_[AD] { return(stem(1, "", "n")); }
+<scan>"those"/_DT { return(stem(3, "at", "")); }
+<scan>"these"/_DT { return(stem(3, "is", "")); }
+<scan>"dat"/_DT { return(stem(3, "that", "")); }
+
+<scan>"worse"/_JJR { return(stem(5, "bad", "")); }
+<scan>"worst"/_JJS { return(stem(5, "bad", "")); }
+<scan>"worse"/_RBR { return(stem(5, "badly", "")); }
+<scan>"worst"/_RBS { return(stem(5, "badly", "")); }
+<scan>"better"/_JJR { return(stem(6, "good", "")); }
+<scan>"best"/_JJS { return(stem(4, "good", "")); }
+<scan>"better"/_RBR { return(stem(6, "well", "")); }
+<scan>"best"/_RBS { return(stem(4, "well", "")); }
+
+/* further_JJR discussion stays further in GUM */
+<scan>"further"/_JJR { return(stem(0, "", "")); }
+/* further_RBR extend becomes far */
+<scan>"f"[au]"rther"/_RBR { return(stem(6, "ar", "")); }
+<scan>"f"[au]"rthest"/_RBS { return(stem(7, "ar", "")); }
+
+<scan>{E_ADJS}r/_{COMP_SUP} { return(stem(1, "", "")); }
+<scan>{E_ADJS}st/_{COMP_SUP} { return(stem(2, "", "")); }
+<scan>{XX_ADJS}er/_{COMP_SUP} { return(stem(3, "", "")); }
+<scan>{XX_ADJS}est/_{COMP_SUP} { return(stem(4, "", "")); }
+<scan>{EY_ADJS}ier/_{COMP_SUP} { return(stem(3, "ey", "")); }
+<scan>{EY_ADJS}iest/_{COMP_SUP} { return(stem(4, "ey", "")); }
+<scan>{G}+ier/_{COMP_SUP} { return(stem(3, "y", "")); }
+<scan>{G}+iest/_{COMP_SUP} { return(stem(4, "y", "")); }
+<scan>{G}+er/_{COMP_SUP} { return(stem(2, "", "")); }
+<scan>{G}+est/_{COMP_SUP} { return(stem(3, "", "")); }
+
 <scan>{G}+/_NN[^P] { yybegin(noun); yypushback(yylength()); return(next()); }
 <scan>{G}+/_NNP { return(proper_name_stem()); }
 <scan>{G}+/_V { yybegin(verb); yypushback(yylength()); return(next()); }
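These rules lemmatize comparatives and superlatives when the trailing POS tag is JJR/JJS/RBR/RBS, which is why the CoNLLUOutputterITest expectation above now lemmatizes "Better" to "good". A hedged sketch of the expected behaviour, assuming a Morpha regenerated from this grammar and the existing Morphology.lemma(word, tag) entry point:

import edu.stanford.nlp.process.Morphology;

// Hedged sketch: expected lemmas under the new rules, via the Morphology wrapper
// around the generated Morpha lexer.
public class ComparativeLemmaDemo {
  public static void main(String[] args) {
    Morphology morphology = new Morphology();
    System.out.println(morphology.lemma("better", "JJR"));   // good
    System.out.println(morphology.lemma("best", "RBS"));     // well
    System.out.println(morphology.lemma("worse", "RBR"));    // badly
    System.out.println(morphology.lemma("tamer", "JJR"));    // tame   (E_ADJS rule)
    System.out.println(morphology.lemma("hottest", "JJS"));  // hot    (XX_ADJS rule)
    System.out.println(morphology.lemma("gooier", "JJR"));   // gooey  (EY_ADJS rule)
    System.out.println(morphology.lemma("happier", "JJR"));  // happy  ({G}+ier rule)
  }
}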
