@@ -514,6 +514,7 @@ import edu.stanford.nlp.util.logging.Redwood;
514
514
while (yylength() > 0 ) {
515
515
char last = yycharat(yylength()- 1 );
516
516
if (last == ' ' || last == ' \t ' || (last >= ' \n ' && last <= ' \r ' || last == ' \u 0085' )) {
517
+ if (DEBUG ) { logger. info(" fixJFlex4SpaceAfterTokenBug still needed for " + yytext() + " !" ); }
517
518
yypushback(1 );
518
519
} else {
519
520
break ;
@@ -534,12 +535,16 @@ import edu.stanford.nlp.util.logging.Redwood;
534
535
s = yytext(); // return the word WITH the final period
535
536
yypushback(1 ); // (reduplication:) also return a period for next time
536
537
}
537
- return getNext(s, yytext());
538
+ String txt = yytext();
539
+ if (DEBUG ) { logger. info(" Used {ABBREV2} to recognize " + txt + " as " + s); }
540
+ return getNext(s, txt);
538
541
}
539
542
540
543
private Object processAbbrev3() { // Action helper for an {ABBREV3} match (per the debug message below).
541
544
fixJFlex4SpaceAfterTokenBug(); // Runs before yytext() is read, so txt reflects any change that call makes to the match.
542
- return getNext();
545
+ String txt = yytext(); // Read the matched text once; it is reused for the log and for getNext.
546
+ if (DEBUG ) { logger. info(" Used {ABBREV3} to recognize " + txt); }
547
+ return getNext(txt, txt); // Same string passed for both arguments; presumably (token, original text) - confirm against getNext.
543
548
}
544
549
545
550
/* * Assuming we're at an end of sentence (uppercase following), we usually put back a period to become end-of-sentence. */
@@ -552,7 +557,9 @@ import edu.stanford.nlp.util.logging.Redwood;
552
557
s = yytext();
553
558
yypushback(1 ); // return a period for next time
554
559
}
555
- return getNext(s, yytext());
560
+ String txt = yytext();
561
+ if (DEBUG ) { logger. info(" Used {ABBREV1} to recognize " + txt + " as " + s); }
562
+ return getNext(s, txt);
556
563
}
557
564
558
565
%}
@@ -1041,23 +1048,45 @@ RM/{NUM} { String txt = yytext();
1041
1048
if (DEBUG ) { logger. info(" Used {SWEARING} to recognize " + txt + " as " + normTok); }
1042
1049
return getNext(normTok, txt);
1043
1050
}
1044
- {BANGWORDS} { return getNext(); }
1045
- <YyNotTokenizePerLine> {BANGMAGAZINES} / {SPACENL} magazine { return getNext(); }
1046
- <YyTokenizePerLine> {BANGMAGAZINES} / {SPACE} magazine { return getNext(); }
1047
- {THING3} { if (escapeForwardSlashAsterisk) {
1048
- breakByHyphensSlashes(yytext());
1049
- return getNext(LexerUtils . escapeChar(yytext(), ' /' ), yytext());
1051
+ {BANGWORDS} { String txt = yytext(); // Rule action for {BANGWORDS}: read the matched text once for logging and return.
1052
+ if (DEBUG ) { logger. info(" Used {BANGWORDS} to recognize " + txt); }
1053
+ return getNext(txt, txt); // Matched text is passed unchanged for both arguments.
1054
+ }
1055
+ <YyNotTokenizePerLine> {BANGMAGAZINES} / {SPACENL} magazine { // Active in state YyNotTokenizePerLine; the part after '/' is trailing context ({SPACENL} then "magazine").
1056
+ String txt = yytext(); // Read the matched text once for logging and return.
1057
+ if (DEBUG ) { logger. info(" Used {BANGMAGAZINES} to recognize " + txt); }
1058
+ return getNext(txt, txt); // Matched text is passed unchanged for both arguments.
1059
+ }
1060
+ <YyTokenizePerLine> {BANGMAGAZINES} / {SPACE} magazine { // Active in state YyTokenizePerLine; trailing context uses {SPACE} here instead of {SPACENL}.
1061
+ String txt = yytext(); // Read the matched text once for logging and return.
1062
+ if (DEBUG ) { logger. info(" Used {BANGMAGAZINES} to recognize " + txt); }
1063
+ return getNext(txt, txt); // Matched text is passed unchanged for both arguments.
1064
+ }
1065
+ {THING3} { breakByHyphensSlashes(yytext()); // Called before the branch, so it runs on both paths (the removed code called it inside each branch).
1066
+ if (escapeForwardSlashAsterisk) {
1067
+ String txt = yytext(); // Read after breakByHyphensSlashes, matching the order of the removed code.
1068
+ String normTok = LexerUtils . escapeChar(txt, ' /' ); // Escaped form of the match, used as the first getNext argument.
1069
+ if (DEBUG ) { logger. info(" Used {THING3} to recognize " + txt + " as " + normTok); }
1070
+ return getNext(normTok, txt);
1050
1071
} else {
1051
- breakByHyphensSlashes(yytext());
1052
- return getNext();
1072
+ String txt = yytext(); // No escaping on this path: the matched text is used as-is.
1073
+ if (DEBUG ) { logger. info(" Used {THING3} to recognize " + txt); }
1074
+ return getNext(txt, txt);
1053
1075
}
1054
1076
}
1055
- {DOLSIGN} { return getNext(); }
1056
- {DOLSIGN2} { if (normalizeCurrency) {
1057
- return getNext(LexerUtils . normalizeCurrency(yytext()), yytext());
1077
+ {DOLSIGN} { String txt = yytext(); // Rule action for {DOLSIGN}: read the matched text once for logging and return.
1078
+ if (DEBUG ) { logger. info(" Used {DOLSIGN} to recognize " + txt); }
1079
+ return getNext(txt, txt); // Matched text is passed unchanged for both arguments.
1080
+ }
1081
+ {DOLSIGN2} { String txt = yytext(); // Rule action for {DOLSIGN2}: read the matched text once.
1082
+ String normTok; // Assigned in exactly one of the two branches below.
1083
+ if (normalizeCurrency) {
1084
+ normTok = LexerUtils . normalizeCurrency(txt); // Used when the normalizeCurrency flag is set.
1058
1085
} else {
1059
- return getNext( LexerUtils . minimallyNormalizeCurrency(yytext()), yytext() );
1086
+ normTok = LexerUtils . minimallyNormalizeCurrency(txt ); // Used when the flag is not set.
1060
1087
}
1088
+ if (DEBUG ) { logger. info(" Used {DOLSIGN2} to recognize " + txt + " as " + normTok); }
1089
+ return getNext(normTok, txt); // Single exit for both branches: normalized form first, matched text second.
1061
1090
}
1062
1091
/* Any acronym can be treated as sentence final iff followed by this list of words (pronouns, determiners, and prepositions, etc.). "U.S." is the single big source of errors. Character classes make this rule case sensitive! (This is needed!!). A one letter acronym candidate like "Z." or "I." in this context usually isn't, and so we return the letter and push back the period for next time. We can't have "To" in list, as the two often appear adjacent in headlines: "U.S. To Ask ...." */
1063
1092
<YyNotTokenizePerLine> {ABBREV2} /( {SPACENLS} )( [ A] | [ A] bout| [ A] ccording| [ A] dditionally| [ A] fter| [ A] ll| [ A] lso| [ A] lthough| [ A] n| [ A] nother| [ A] s| [ A] t| [ B] efore| [ B] oth| [ B] ut| [ B] y| [ D] id| [ D] uring| [ E] ach| [ E] arlier| [ F] ollowing| [ F] or| [ F] rom| [ H] e| [ H] er| [ H] ere| [ H] is| [ H] ow| [ H] owever| [ I] f| [ I] n| [ I] t| [ I] ts| [ L] ast| [ L] ater| [ M] any| [ M] ore| [ M] ost| [ M] rs? \. | [ M] s\. | [ N] ow| [ O] n| [ O] nce| [ O] ne| [ O] ther| [ O] ur| [ S] he| [ S] ince| [ S] o| [ S] ome| [ S] uch| [ T] hat| [ T] he| [ T] heir| [ T] hen| [ T] here| [ T] hese| [ T] hey| [ T] his| [ T] wo| [ U] nder| [ U] pon| [ W] e| [ W] hen| [ W] hile| [ W] hat| [ W] ho| [ W] hy| [ Y] et| [ Y] ou| {SGML1} )( {SPACENL} | [ ?!] ) {
0 commit comments