Skip to content

Commit e23a3cc

Browse files
committed
Some tokenizer clean-up; very minor enhancements
- Add a few file extensions. - Improve APOWORD for smart quotes. - Recognize 's also when a non-Latin letter follows (previously only when a non-alphabetic character followed). - Add debug logging lines to quite a few other rules (but not yet all).
1 parent 40fee82 commit e23a3cc

File tree

3 files changed

+94347
-87658
lines changed

3 files changed

+94347
-87658
lines changed
Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
/* \u3000 is ideographic space; \u205F is medium math space */
2-
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
2+
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000]
33
SPACES = {SPACE}+
44
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
55
SPACENL = ({SPACE}|{NEWLINE})
6+
SPACENLS = {SPACENL}+
67

7-
FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
8+
/* These next ones are useful to get a fixed length trailing context. */
9+
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
10+
/* Exact complement of SPACENL_ONE_CHAR: keep both character lists identical (including \u205F, medium math space). */
NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
11+
12+
FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
813
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}

src/edu/stanford/nlp/process/PTBLexer.flex

Lines changed: 159 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -477,7 +477,7 @@ import edu.stanford.nlp.util.logging.Redwood;
477477
}
478478

479479
private Object getNext() {
480-
final String txt = yytext();
480+
String txt = yytext();
481481
return getNext(txt, txt);
482482
}
483483

@@ -589,10 +589,6 @@ SPLET = &[aeiouAEIOU](acute|grave|uml);
589589

590590
%include LexCommon.tokens
591591

592-
SPACENLS = {SPACENL}+
593-
/* These next ones are useful to get a fixed length trailing context. */
594-
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
595-
NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
596592
SENTEND1 = {SPACENL}({SPACENL}|[:uppercase:]|{SGML1})
597593
SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
598594
DIGIT = [:digit:]|[\u07C0-\u07C9]
@@ -672,7 +668,7 @@ SREDAUX = n{APOSETCETERA}t
672668
/* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
673669
/* Rest are for French borrowings. n allows n'ts in "don'ts" */
674670
/* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
675-
APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|Dunkin{APOS}|somethin{APOS}|ol{APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[1-9]0s|[1-9]0{APOS}s|{APOS}till?|[:letter:][:letter:]*[aeiouyAEIOUY]{APOSETCETERA}[aeioulA-Z][:letter:]*|{APOS}cause|cont'd\.?|nor'easter|c'mon|e'er|s'mores|ev'ry|li'l|nat'l|ass't|'twixt|O{APOSETCETERA}o
671+
/* Leading-apostrophe forms ('em, 'cause, 'til, 'twixt) use {APOS} so a curly/smart apostrophe is accepted too. */
APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|(Dunkin|somethin|ol){APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[1-9]0s|[1-9]0{APOS}s|{APOS}till?|[:letter:][:letter:]*[aeiouyAEIOUY]{APOSETCETERA}[aeioulA-Z][:letter:]*|{APOS}cause|cont{APOSETCETERA}d\.?|nor{APOSETCETERA}easter|c{APOSETCETERA}mon|e{APOSETCETERA}er|s{APOSETCETERA}mores|ev{APOSETCETERA}ry|li{APOSETCETERA}l|nat{APOSETCETERA}l|ass{APOSETCETERA}t|{APOS}twixt|O{APOSETCETERA}o
676672
APOWORD2 = y{APOS}
677673
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
678674
FULLURL = (ftp|svn|svn\+ssh|http|https|mailto):\/\/[^ \t\n\f\r<>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+[^ \t\n\f\r<>|.!?¡¿,·;:&`\"\'\*\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
@@ -963,13 +959,13 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
963959
if (DEBUG) { logger.info("Used {TWITTER} to recognize " + tok); }
964960
return getNext(tok, tok);
965961
}
966-
{REDAUX}/[^\p{Alpha}'’] { String tok = yytext();
962+
{REDAUX}/[^\p{Latin}'’] { String tok = yytext();
967963
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
968964
if (DEBUG) { logger.info("Used {REDAUX} to recognize " + tok + " as " + norm +
969965
"; probablyLeft=" + false); }
970966
return getNext(norm, tok);
971967
}
972-
{SREDAUX}/[^\p{Alpha}'’] { String tok = yytext();
968+
{SREDAUX}/[^\p{Latin}'’] { String tok = yytext();
973969
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
974970
if (DEBUG) { logger.info("Used {SREDAUX} to recognize " + tok + " as " + norm +
975971
"; probablyLeft=" + false); }
@@ -1073,7 +1069,7 @@ RM/{NUM} { String txt = yytext();
10731069
}
10741070
{DOLSIGN} { String txt = yytext();
10751071
if (DEBUG) { logger.info("Used {DOLSIGN} to recognize " + txt); }
1076-
return getNext(txt, txt);
1072+
return getNext(txt, txt);
10771073
}
10781074
{DOLSIGN2} { String txt = yytext();
10791075
String normTok;
@@ -1100,26 +1096,49 @@ RM/{NUM} { String txt = yytext();
11001096
<YyTokenizePerLine>{ABBREV3}/{SPACENL}?[:digit:] {
11011097
return processAbbrev3();
11021098
}
1103-
<YyNotTokenizePerLine>{ABBREVSN}/{SPACENL}+(Africa|Korea|Cal) { return getNext(); }
1104-
<YyTokenizePerLine>{ABBREVSN}/{SPACE}+(Africa|Korea|Cal) { return getNext(); }
1099+
<YyNotTokenizePerLine>{ABBREVSN}/{SPACENL}+(Africa|Korea|Cal) {
1100+
String txt = yytext();
1101+
if (DEBUG) { logger.info("Used {N/S Place} to recognize " + txt); }
1102+
return getNext(txt, txt);
1103+
}
1104+
<YyTokenizePerLine>{ABBREVSN}/{SPACE}+(Africa|Korea|Cal) {
1105+
String txt = yytext();
1106+
if (DEBUG) { logger.info("Used {N/S Place} (2) to recognize " + txt); }
1107+
return getNext(txt, txt);
1108+
}
11051109
/* Special case to get pty. ltd. or pty limited. Also added "Co." since someone complained, but usually a comma after it. */
1106-
(pty|pte|pvt|co)\./{SPACE}(ltd|lim|llc) { return getNext(); }
1110+
(pty|pte|pvt|co)\./{SPACE}(ltd|lim|llc) {
1111+
String txt = yytext();
1112+
if (DEBUG) { logger.info("Used {pty ltd} to recognize " + txt); }
1113+
return getNext(txt, txt); }
11071114
/* Special case to get op. cit. or loc. cit. */
1108-
(op|loc)\./{SPACE}cit\. { return getNext(); }
1115+
(op|loc)\./{SPACE}cit\. {
1116+
String txt = yytext();
1117+
if (DEBUG) { logger.info("Used {op/loc cit} to recognize " + txt); }
1118+
return getNext(txt, txt); }
11091119
<YyNotTokenizePerLine>{ABBREV1}/{SENTEND1} {
11101120
return processAbbrev1();
11111121
}
11121122
<YyTokenizePerLine>{ABBREV1}/{SENTEND2} {
11131123
return processAbbrev1();
11141124
}
1115-
<YyNotTokenizePerLine>{ABBREV1}s?/[^][^] { return getNext(); }
1116-
<YyTokenizePerLine>{ABBREV1}s?/[^\r\n][^\r\n] { return getNext(); }
1117-
{ABBREV1}s? { // this one should only match if we're basically at the end of file
1125+
<YyNotTokenizePerLine>{ABBREV1}s?/[^][^] {
1126+
String txt = yytext();
1127+
if (DEBUG) { logger.info("Used {ABBREV1 pl} to recognize " + txt); }
1128+
return getNext(txt, txt);
1129+
}
1130+
<YyTokenizePerLine>{ABBREV1}s?/[^\r\n][^\r\n] {
1131+
String txt = yytext();
1132+
if (DEBUG) { logger.info("Used {ABBREV1 pl} (2) to recognize " + txt); }
1133+
return getNext(txt, txt);
1134+
}
1135+
{ABBREV1}s? {
1136+
// this one should only match if we're basically at the end of file
11181137
// since the last one matches two things, even newlines (if not tokenize per line)
11191138
return processAbbrev1();
11201139
}
11211140
{ABBREV2}s? { String tok = yytext();
1122-
if (DEBUG) { logger.info("Used {ABBREV2} to recognize " + tok); }
1141+
if (DEBUG) { logger.info("Used {ABBREV2 pl} to recognize " + tok); }
11231142
return getNext(tok, tok);
11241143
}
11251144
/* Last millennium (in the WSJ) "Alex." is generally an abbreviation for Alex. Brown, brokers! Recognize just this case. */
@@ -1140,20 +1159,44 @@ RM/{NUM} { String txt = yytext();
11401159
if (DEBUG) { logger.info("Used {ABBREV4} to recognize " + tok); }
11411160
return getNext(tok, tok);
11421161
}
1143-
{TBSPEC2}/{SPACENL} { return getNext(); }
1144-
{ISO8601DATETIME} { return getNext(); }
1162+
{TBSPEC2}/{SPACENL} {
1163+
String txt = yytext();
1164+
if (DEBUG) { logger.info("Used {TBSPEC2} to recognize " + txt); }
1165+
return getNext(txt, txt);
1166+
}
1167+
{ISO8601DATETIME} {
1168+
String txt = yytext();
1169+
if (DEBUG) { logger.info("Used {ISO8601DATETIME} to recognize " + txt); }
1170+
return getNext(txt, txt);
1171+
}
11451172
//{ISO8601DATE} { return getNext(); }
1146-
{DEGREES} { return getNext(); }
1173+
{DEGREES} {
1174+
String txt = yytext();
1175+
if (DEBUG) { logger.info("Used {DEGREES} to recognize " + txt); }
1176+
return getNext(txt, txt);
1177+
}
11471178
/* Ideally would factor this out for use in other tokenizers,
11481179
* but the other tokenizers don't have TokenizerPerLine options */
1149-
<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()]) { return getNext(); }
1150-
<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
1180+
<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()]) {
1181+
String txt = yytext();
1182+
if (DEBUG) { logger.info("Used {FILENAME} to recognize " + txt); }
1183+
return getNext(txt, txt);
1184+
}
1185+
<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()]) {
1186+
String txt = yytext();
1187+
if (DEBUG) { logger.info("Used {FILENAME} (2) to recognize " + txt); }
1188+
return getNext(txt, txt);
1189+
}
11511190
{WORD}\./{INSENTP} { String origTok = yytext();
11521191
String norm = LexerUtils.removeSoftHyphens(origTok);
11531192
if (DEBUG) { logger.info("Used {WORD} (3) to recognize " + origTok + " as " + norm); }
11541193
return getNext(norm, origTok);
11551194
}
1156-
{SSN} { return getNext(); }
1195+
{SSN} {
1196+
String txt = yytext();
1197+
if (DEBUG) { logger.info("Used {SSN} to recognize " + txt); }
1198+
return getNext(txt, txt);
1199+
}
11571200
{PHONE} { String txt = yytext();
11581201
String norm = txt;
11591202
if (normalizeSpace) {
@@ -1184,48 +1227,81 @@ RM/{NUM} { String txt = yytext();
11841227
{ASIANSMILEY} { String txt = yytext();
11851228
String origText = txt;
11861229
txt = LexerUtils.pennNormalizeParens(txt, normalizeParentheses);
1230+
if (DEBUG) { logger.info("Used {ASIANSMILEY} to recognize " + origText + " as " + txt); }
11871231
return getNext(txt, origText);
11881232
}
11891233
{EMOJI} { String txt = yytext();
11901234
if (DEBUG) { logger.info("Used {EMOJI} to recognize " + txt); }
11911235
return getNext(txt, txt);
11921236
}
1193-
{LESSTHAN} { return getNext("<", yytext()); }
1194-
{GREATERTHAN} { return getNext(">", yytext()); }
1195-
\{ { if (normalizeOtherBrackets) {
1196-
return getNext(openbrace, yytext()); }
1237+
{LESSTHAN} {
1238+
String txt = yytext();
1239+
if (DEBUG) { logger.info("Used {LESSTHAN} to recognize " + txt + " as <"); }
1240+
return getNext("<", yytext());
1241+
}
1242+
{GREATERTHAN} {
1243+
String txt = yytext();
1244+
if (DEBUG) { logger.info("Used {GREATERTHAN} to recognize " + txt + " as >"); }
1245+
return getNext(">", yytext());
1246+
}
1247+
\{ {
1248+
String txt = yytext();
1249+
if (normalizeOtherBrackets) {
1250+
if (DEBUG) { logger.info("Used {{} to recognize " + txt + " as " + openbrace); }
1251+
return getNext(openbrace, txt); }
11971252
else {
1198-
return getNext();
1253+
if (DEBUG) { logger.info("Used {{} to recognize " + txt); }
1254+
return getNext(txt, txt);
11991255
}
12001256
}
1201-
\} { if (normalizeOtherBrackets) {
1202-
return getNext(closebrace, yytext()); }
1257+
\} {
1258+
String txt = yytext();
1259+
if (normalizeOtherBrackets) {
1260+
if (DEBUG) { logger.info("Used {}} to recognize " + txt + " as " + closebrace); }
1261+
return getNext(closebrace, txt); }
12031262
else {
1204-
return getNext();
1263+
if (DEBUG) { logger.info("Used {}} to recognize " + txt); }
1264+
return getNext(txt, txt);
12051265
}
12061266
}
1207-
\[ { if (normalizeOtherBrackets) {
1208-
return getNext("-LSB-", yytext()); }
1267+
\[ {
1268+
String txt = yytext();
1269+
if (normalizeOtherBrackets) {
1270+
if (DEBUG) { logger.info("Used {[} to recognize " + txt + " as " + "-LSB-"); }
1271+
return getNext("-LSB-", txt); }
12091272
else {
1210-
return getNext();
1273+
if (DEBUG) { logger.info("Used {[} to recognize " + txt); }
1274+
return getNext(txt, txt);
12111275
}
12121276
}
1213-
\] { if (normalizeOtherBrackets) {
1214-
return getNext("-RSB-", yytext()); }
1277+
\] {
1278+
String txt = yytext();
1279+
if (normalizeOtherBrackets) {
1280+
if (DEBUG) { logger.info("Used {]} to recognize " + txt + " as " + "-RSB-"); }
1281+
return getNext("-RSB-", txt); }
12151282
else {
1216-
return getNext();
1283+
if (DEBUG) { logger.info("Used {]} to recognize " + txt); }
1284+
return getNext(txt, txt);
12171285
}
12181286
}
1219-
\( { if (normalizeParentheses) {
1220-
return getNext(openparen, yytext()); }
1287+
\( {
1288+
String txt = yytext();
1289+
if (normalizeParentheses) {
1290+
if (DEBUG) { logger.info("Used {(} to recognize " + txt + " as " + openparen); }
1291+
return getNext(openparen, txt); }
12211292
else {
1222-
return getNext();
1293+
if (DEBUG) { logger.info("Used {(} to recognize " + txt); }
1294+
return getNext(txt, txt);
12231295
}
12241296
}
1225-
\) { if (normalizeParentheses) {
1226-
return getNext(closeparen, yytext()); }
1297+
\) {
1298+
String txt = yytext();
1299+
if (normalizeParentheses) {
1300+
if (DEBUG) { logger.info("Used {)} to recognize " + txt + " as " + closeparen); }
1301+
return getNext(closeparen, txt); }
12271302
else {
1228-
return getNext();
1303+
if (DEBUG) { logger.info("Used {)} to recognize " + txt); }
1304+
return getNext(txt, txt);
12291305
}
12301306
}
12311307
{HYPHENS} { final String origTxt = yytext();
@@ -1270,17 +1346,42 @@ RM/{NUM} { String txt = yytext();
12701346
if (DEBUG) { logger.info("Used {LDOTS5} to recognize " + tok + " as " + norm); }
12711347
return getNext(norm, tok);
12721348
}
1273-
{FNMARKS} { return getNext(); }
1274-
{ASTS} { if (escapeForwardSlashAsterisk) {
1275-
return getNext(LexerUtils.escapeChar(yytext(), '*'), yytext()); }
1349+
{FNMARKS} {
1350+
String txt = yytext();
1351+
if (DEBUG) { logger.info("Used {FNMARKS} to recognize " + txt); }
1352+
return getNext(txt, txt);
1353+
}
1354+
{ASTS} {
1355+
String txt = yytext();
1356+
if (escapeForwardSlashAsterisk) {
1357+
String normTok = LexerUtils.escapeChar(yytext(), '*');
1358+
if (DEBUG) { logger.info("Used {ASTS} to recognize " + txt + " as " + normTok); }
1359+
return getNext(normTok, yytext()); }
12761360
else {
1277-
return getNext();
1361+
if (DEBUG) { logger.info("Used {ASTS} to recognize " + txt); }
1362+
return getNext(txt, txt);
12781363
}
12791364
}
1280-
{INSENTP} { return getNext(); }
1281-
[?!]+|[\u2047\u2048] { return getNext(); }
1282-
[.¡¿\u037E\u0589\u061F\u06D4\u0700-\u0702\u07FA\u3002] { return getNext(); }
1283-
=+ { return getNext(); }
1365+
{INSENTP} {
1366+
String txt = yytext();
1367+
if (DEBUG) { logger.info("Used {INSENTP} to recognize " + txt); }
1368+
return getNext(txt, txt);
1369+
}
1370+
[?!]+|[\u2047\u2048] {
1371+
String txt = yytext();
1372+
if (DEBUG) { logger.info("Used {[?!]+]} to recognize " + txt); }
1373+
return getNext(txt, txt);
1374+
}
1375+
[.¡¿\u037E\u0589\u061F\u06D4\u0700-\u0702\u07FA\u3002] {
1376+
String txt = yytext();
1377+
if (DEBUG) { logger.info("Used {sent end punct} to recognize " + txt); }
1378+
return getNext(txt, txt);
1379+
}
1380+
=+ {
1381+
String txt = yytext();
1382+
if (DEBUG) { logger.info("Used {=} to recognize " + txt); }
1383+
return getNext(txt, txt);
1384+
}
12841385
\/ { if (escapeForwardSlashAsterisk) {
12851386
return getNext(LexerUtils.escapeChar(yytext(), '/'), yytext()); }
12861387
else {
@@ -1392,7 +1493,11 @@ RM/{NUM} { String txt = yytext();
13921493
}
13931494

13941495
{FAKEDUCKFEET} { return getNext(); }
1395-
{MISCSYMBOL} { return getNext(); }
1496+
{MISCSYMBOL} {
1497+
String tok = yytext();
1498+
if (DEBUG) { logger.info("Used {MISCSYMBOL} to recognize " + tok); }
1499+
return getNext(tok, tok);
1500+
}
13961501
{CP1252_MISC_SYMBOL} { String tok = yytext();
13971502
String norm = LexerUtils.processCp1252misc(tok);
13981503
if (DEBUG) { logger.info("Used {CP1252_MISC_SYMBOL} to recognize " + tok + " as " + norm); }
@@ -1453,9 +1558,9 @@ RM/{NUM} { String txt = yytext();
14531558
<<EOF>> { if (invertible) {
14541559
// prevWordAfter.append(yytext());
14551560
String str = prevWordAfter.toString();
1456-
if (DEBUG) { logger.info("At end of text making after: |" + str + "|"); }
1561+
// if (DEBUG) { logger.info("At end of text making after: |" + str + "|"); }
14571562
prevWord.set(CoreAnnotations.AfterAnnotation.class, str);
1458-
if (DEBUG) { logger.info("prevWord is |" + prevWord.get(CoreAnnotations.TextAnnotation.class) + "|, its after is " +
1563+
if (DEBUG) { logger.info("At end of text, prevWord is |" + prevWord.get(CoreAnnotations.TextAnnotation.class) + "|, its after set to " +
14591564
"|" + prevWord.get(CoreAnnotations.AfterAnnotation.class) + "|"); }
14601565
prevWordAfter.setLength(0);
14611566
}

0 commit comments

Comments
 (0)