Skip to content

Commit d445264

Browse files
committed
Make debug output more complete; log if still using fixJFlex4SpaceAfterTokenBug
1 parent 5439371 commit d445264

File tree

2 files changed

+86993
-86926
lines changed

2 files changed

+86993
-86926
lines changed

src/edu/stanford/nlp/process/PTBLexer.flex

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,7 @@ import edu.stanford.nlp.util.logging.Redwood;
514514
while (yylength() > 0) {
515515
char last = yycharat(yylength()-1);
516516
if (last == ' ' || last == '\t' || (last >= '\n' && last <= '\r' || last == '\u0085')) {
517+
if (DEBUG) { logger.info("fixJFlex4SpaceAfterTokenBug still needed for " + yytext() + "!"); }
517518
yypushback(1);
518519
} else {
519520
break;
@@ -534,12 +535,16 @@ import edu.stanford.nlp.util.logging.Redwood;
534535
s = yytext(); // return the word WITH the final period
535536
yypushback(1); // (reduplication:) also return a period for next time
536537
}
537-
return getNext(s, yytext());
538+
String txt = yytext();
539+
if (DEBUG) { logger.info("Used {ABBREV2} to recognize " + txt + " as " + s); }
540+
return getNext(s, txt);
538541
}
539542

540543
private Object processAbbrev3() {
541544
fixJFlex4SpaceAfterTokenBug();
542-
return getNext();
545+
String txt = yytext();
546+
if (DEBUG) { logger.info("Used {ABBREV3} to recognize " + txt); }
547+
return getNext(txt, txt);
543548
}
544549

545550
/** Assuming we're at the end of a sentence (an uppercase word follows), we usually push back the period so that it is returned next time as the end-of-sentence token. */
@@ -552,7 +557,9 @@ import edu.stanford.nlp.util.logging.Redwood;
552557
s = yytext();
553558
yypushback(1); // return a period for next time
554559
}
555-
return getNext(s, yytext());
560+
String txt = yytext();
561+
if (DEBUG) { logger.info("Used {ABBREV1} to recognize " + txt + " as " + s); }
562+
return getNext(s, txt);
556563
}
557564

558565
%}
@@ -1041,23 +1048,45 @@ RM/{NUM} { String txt = yytext();
10411048
if (DEBUG) { logger.info("Used {SWEARING} to recognize " + txt + " as " + normTok); }
10421049
return getNext(normTok, txt);
10431050
}
1044-
{BANGWORDS} { return getNext(); }
1045-
<YyNotTokenizePerLine>{BANGMAGAZINES}/{SPACENL}magazine { return getNext(); }
1046-
<YyTokenizePerLine>{BANGMAGAZINES}/{SPACE}magazine { return getNext(); }
1047-
{THING3} { if (escapeForwardSlashAsterisk) {
1048-
breakByHyphensSlashes(yytext());
1049-
return getNext(LexerUtils.escapeChar(yytext(), '/'), yytext());
1051+
{BANGWORDS} { String txt = yytext();
1052+
if (DEBUG) { logger.info("Used {BANGWORDS} to recognize "+ txt); }
1053+
return getNext(txt, txt);
1054+
}
1055+
<YyNotTokenizePerLine>{BANGMAGAZINES}/{SPACENL}magazine {
1056+
String txt = yytext();
1057+
if (DEBUG) { logger.info("Used {BANGMAGAZINES} to recognize "+ txt); }
1058+
return getNext(txt, txt);
1059+
}
1060+
<YyTokenizePerLine>{BANGMAGAZINES}/{SPACE}magazine {
1061+
String txt = yytext();
1062+
if (DEBUG) { logger.info("Used {BANGMAGAZINES} to recognize "+ txt); }
1063+
return getNext(txt, txt);
1064+
}
1065+
{THING3} { breakByHyphensSlashes(yytext());
1066+
if (escapeForwardSlashAsterisk) {
1067+
String txt = yytext();
1068+
String normTok = LexerUtils.escapeChar(txt, '/');
1069+
if (DEBUG) { logger.info("Used {THING3} to recognize " + txt + " as " + normTok); }
1070+
return getNext(normTok, txt);
10501071
} else {
1051-
breakByHyphensSlashes(yytext());
1052-
return getNext();
1072+
String txt = yytext();
1073+
if (DEBUG) { logger.info("Used {THING3} to recognize " + txt); }
1074+
return getNext(txt, txt);
10531075
}
10541076
}
1055-
{DOLSIGN} { return getNext(); }
1056-
{DOLSIGN2} { if (normalizeCurrency) {
1057-
return getNext(LexerUtils.normalizeCurrency(yytext()), yytext());
1077+
{DOLSIGN} { String txt = yytext();
1078+
if (DEBUG) { logger.info("Used {DOLSIGN} to recognize " + txt); }
1079+
return getNext(txt, txt);
1080+
}
1081+
{DOLSIGN2} { String txt = yytext();
1082+
String normTok;
1083+
if (normalizeCurrency) {
1084+
normTok = LexerUtils.normalizeCurrency(txt);
10581085
} else {
1059-
return getNext(LexerUtils.minimallyNormalizeCurrency(yytext()), yytext());
1086+
normTok = LexerUtils.minimallyNormalizeCurrency(txt);
10601087
}
1088+
if (DEBUG) { logger.info("Used {DOLSIGN2} to recognize " + txt + " as " + normTok); }
1089+
return getNext(normTok, txt);
10611090
}
10621091
/* Any acronym can be treated as sentence-final iff it is followed by a word from this list (pronouns, determiners, prepositions, etc.). "U.S." is the single biggest source of errors. Character classes make this rule case sensitive! (This is needed!!) A one-letter acronym candidate like "Z." or "I." in this context usually isn't an acronym, and so we return the letter and push back the period for next time. We can't have "To" in the list, as it often directly follows an acronym in headlines: "U.S. To Ask ...." */
10631092
<YyNotTokenizePerLine>{ABBREV2}/({SPACENLS})([A]|[A]bout|[A]ccording|[A]dditionally|[A]fter|[A]ll|[A]lso|[A]lthough|[A]n|[A]nother|[A]s|[A]t|[B]efore|[B]oth|[B]ut|[B]y|[D]id|[D]uring|[E]ach|[E]arlier|[F]ollowing|[F]or|[F]rom|[H]e|[H]er|[H]ere|[H]is|[H]ow|[H]owever|[I]f|[I]n|[I]t|[I]ts|[L]ast|[L]ater|[M]any|[M]ore|[M]ost|[M]rs?\.|[M]s\.|[N]ow|[O]n|[O]nce|[O]ne|[O]ther|[O]ur|[S]he|[S]ince|[S]o|[S]ome|[S]uch|[T]hat|[T]he|[T]heir|[T]hen|[T]here|[T]hese|[T]hey|[T]his|[T]wo|[U]nder|[U]pon|[W]e|[W]hen|[W]hile|[W]hat|[W]ho|[W]hy|[Y]et|[Y]ou|{SGML1})({SPACENL}|[?!]) {

0 commit comments

Comments
 (0)