@@ -477,7 +477,7 @@ import edu.stanford.nlp.util.logging.Redwood;
477
477
}
478
478
479
479
private Object getNext() {
480
- final String txt = yytext();
480
+ String txt = yytext();
481
481
return getNext(txt, txt);
482
482
}
483
483
@@ -589,10 +589,6 @@ SPLET = &[aeiouAEIOU](acute|grave|uml);
589
589
590
590
%include LexCommon.tokens
591
591
592
- SPACENLS = {SPACENL} +
593
- /* These next ones are useful to get a fixed length trailing context. */
594
- SPACENL_ONE_CHAR = [ \t \u00A0\u2000 - \u200A\u202F\u3000 \r\n \u2028\u2029\u000B\u000C\u0085 ]
595
- NOT_SPACENL_ONE_CHAR = [^ \t \u00A0\u2000 - \u200A\u202F\u3000 \r\n \u2028\u2029\u000B\u000C\u0085 ]
596
592
SENTEND1 = {SPACENL} ( {SPACENL} |[:uppercase:]| {SGML1} )
597
593
SENTEND2 = {SPACE} ( {SPACE} |[:uppercase:]| {SGML2} )
598
594
DIGIT = [:digit:]| [ \u07C0 - \u07C9 ]
@@ -672,7 +668,7 @@ SREDAUX = n{APOSETCETERA}t
672
668
/* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
673
669
/* Rest are for French borrowings. n allows n'ts in "don'ts" */
674
670
/* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
675
- APOWORD = {APOS} n{APOS} ?| [ lLdDjJ] {APOS} | Dunkin{APOS} | somethin{APOS} | ol{APOS} | {APOS} em| diff{APOSETCETERA} rent| [ A- HJ- XZn] {APOSETCETERA} [:letter:]{2}[:letter:]*| {APOS} [ 1- 9] 0s| [ 1- 9] 0{APOS} s| {APOS} till?|[:letter:][:letter:]* [ aeiouyAEIOUY] {APOSETCETERA} [ aeioulA- Z] [:letter:]*| {APOS} cause| cont' d\. ?| nor' easter| c' mon| e' er| s' mores| ev' ry| li' l| nat' l| ass' t| 'twixt| O{APOSETCETERA} o
671
+ APOWORD = {APOS} n{APOS} ?| [ lLdDjJ] {APOS} |( Dunkin| somethin| ol) {APOS} | {APOS} em| diff{APOSETCETERA} rent| [ A- HJ- XZn] {APOSETCETERA} [:letter:]{2}[:letter:]*| {APOS} [ 1- 9] 0s| [ 1- 9] 0{APOS} s| {APOS} till?|[:letter:][:letter:]* [ aeiouyAEIOUY] {APOSETCETERA} [ aeioulA- Z] [:letter:]*| {APOS} cause| cont{APOSETCETERA} d\. ?| nor{APOSETCETERA} easter| c{APOSETCETERA} mon| e{APOSETCETERA} er| s{APOSETCETERA} mores| ev{APOSETCETERA} ry| li{APOSETCETERA} l| nat{APOSETCETERA} l| ass{APOSETCETERA} t| 'twixt| O{APOSETCETERA} o
676
672
APOWORD2 = y{APOS}
677
673
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
678
674
FULLURL = ( ftp| svn| svn\+ ssh| http| https| mailto) :\/\/ [^ \t\n\f\r <>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + [^ \t\n\f\r <>|.!?¡¿,·;:&`\"\'\* \p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
@@ -963,13 +959,13 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
963
959
if (DEBUG ) { logger. info(" Used {TWITTER} to recognize " + tok); }
964
960
return getNext(tok, tok);
965
961
}
966
- {REDAUX} / [^\p{Alpha } '’] { String tok = yytext();
962
+ {REDAUX} / [^\p{Latin } '’] { String tok = yytext();
967
963
String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
968
964
if (DEBUG ) { logger. info(" Used {REDAUX} to recognize " + tok + " as " + norm +
969
965
" ; probablyLeft=" + false ); }
970
966
return getNext(norm, tok);
971
967
}
972
- {SREDAUX} / [^\p{Alpha } '’] { String tok = yytext();
968
+ {SREDAUX} / [^\p{Latin } '’] { String tok = yytext();
973
969
String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
974
970
if (DEBUG ) { logger. info(" Used {SREDAUX} to recognize " + tok + " as " + norm +
975
971
" ; probablyLeft=" + false ); }
@@ -1073,7 +1069,7 @@ RM/{NUM} { String txt = yytext();
1073
1069
}
1074
1070
{DOLSIGN} { String txt = yytext();
1075
1071
if (DEBUG ) { logger. info(" Used {DOLSIGN} to recognize " + txt); }
1076
- return getNext(txt, txt);
1072
+ return getNext(txt, txt);
1077
1073
}
1078
1074
{DOLSIGN2} { String txt = yytext();
1079
1075
String normTok;
@@ -1100,26 +1096,49 @@ RM/{NUM} { String txt = yytext();
1100
1096
<YyTokenizePerLine> {ABBREV3} / {SPACENL} ?[:digit:] {
1101
1097
return processAbbrev3();
1102
1098
}
1103
- <YyNotTokenizePerLine> {ABBREVSN} / {SPACENL} +( Africa| Korea| Cal) { return getNext(); }
1104
- <YyTokenizePerLine> {ABBREVSN} / {SPACE} +( Africa| Korea| Cal) { return getNext(); }
1099
+ <YyNotTokenizePerLine> {ABBREVSN} / {SPACENL} +( Africa| Korea| Cal) {
1100
+ String txt = yytext(); // text matched by {ABBREVSN} in state YyNotTokenizePerLine; the part after '/' ({SPACENL}+ then Africa, Korea or Cal) is a lookahead condition
1101
+ if (DEBUG ) { logger. info(" Used {N/S Place} to recognize " + txt); }
1102
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1103
+ }
1104
+ <YyTokenizePerLine> {ABBREVSN} / {SPACE} +( Africa| Korea| Cal) {
1105
+ String txt = yytext(); // text matched by {ABBREVSN} in state YyTokenizePerLine; the lookahead uses {SPACE} rather than {SPACENL}
1106
+ if (DEBUG ) { logger. info(" Used {N/S Place} (2) to recognize " + txt); }
1107
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1108
+ }
1105
1109
/* Special case to get pty. ltd. or pty limited. Also added "Co." since someone complained, but usually a comma after it. */
1106
- ( pty| pte| pvt| co) \. / {SPACE} ( ltd| lim| llc) { return getNext(); }
1110
+ ( pty| pte| pvt| co) \. / {SPACE} ( ltd| lim| llc) {
1111
+ String txt = yytext(); // text matched before the '/': pty, pte, pvt or co plus the period; {SPACE} and ltd, lim or llc are only a lookahead condition
1112
+ if (DEBUG ) { logger. info(" Used {pty ltd} to recognize " + txt); }
1113
+ return getNext(txt, txt); } // same string for both arguments: this rule applies no normalization
1107
1114
/* Special case to get op. cit. or loc. cit. */
1108
- ( op| loc) \. / {SPACE} cit\. { return getNext(); }
1115
+ ( op| loc) \. / {SPACE} cit\. {
1116
+ String txt = yytext(); // text matched before the '/': op or loc plus the period; {SPACE} and "cit." are only a lookahead condition
1117
+ if (DEBUG ) { logger. info(" Used {op/loc cit} to recognize " + txt); }
1118
+ return getNext(txt, txt); } // same string for both arguments: this rule applies no normalization
1109
1119
<YyNotTokenizePerLine> {ABBREV1} / {SENTEND1} {
1110
1120
return processAbbrev1();
1111
1121
}
1112
1122
<YyTokenizePerLine> {ABBREV1} / {SENTEND2} {
1113
1123
return processAbbrev1();
1114
1124
}
1115
- <YyNotTokenizePerLine> {ABBREV1} s?/ [^][^] { return getNext(); }
1116
- <YyTokenizePerLine> {ABBREV1} s?/ [^\r\n][^\r\n] { return getNext(); }
1117
- {ABBREV1} s? { // this one should only match if we're basically at the end of file
1125
+ <YyNotTokenizePerLine> {ABBREV1} s?/ [^][^] {
1126
+ String txt = yytext(); // {ABBREV1} with an optional trailing s, in state YyNotTokenizePerLine; requires two further characters of any kind as lookahead
1127
+ if (DEBUG ) { logger. info(" Used {ABBREV1 pl} to recognize " + txt); }
1128
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1129
+ }
1130
+ <YyTokenizePerLine> {ABBREV1} s?/ [^\r\n][^\r\n] {
1131
+ String txt = yytext(); // {ABBREV1} with an optional trailing s, in state YyTokenizePerLine; requires two further characters that are neither \r nor \n as lookahead
1132
+ if (DEBUG ) { logger. info(" Used {ABBREV1 pl} (2) to recognize " + txt); }
1133
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1134
+ }
1135
+ {ABBREV1} s? {
1136
+ // this one should only match if we're basically at the end of file
1118
1137
// since the last one matches two things, even newlines (if not tokenize per line)
1119
1138
return processAbbrev1();
1120
1139
}
1121
1140
{ABBREV2} s? { String tok = yytext();
1122
- if (DEBUG ) { logger. info(" Used {ABBREV2} to recognize " + tok); }
1141
+ if (DEBUG ) { logger. info(" Used {ABBREV2 pl } to recognize " + tok); }
1123
1142
return getNext(tok, tok);
1124
1143
}
1125
1144
/* Last millennium (in the WSJ) "Alex." is generally an abbreviation for Alex. Brown, brokers! Recognize just this case. */
@@ -1140,20 +1159,44 @@ RM/{NUM} { String txt = yytext();
1140
1159
if (DEBUG ) { logger. info(" Used {ABBREV4} to recognize " + tok); }
1141
1160
return getNext(tok, tok);
1142
1161
}
1143
- {TBSPEC2} / {SPACENL} { return getNext(); }
1144
- {ISO8601DATETIME} { return getNext(); }
1162
+ {TBSPEC2} / {SPACENL} {
1163
+ String txt = yytext(); // text matched by {TBSPEC2}; the {SPACENL} after '/' is only a lookahead condition
1164
+ if (DEBUG ) { logger. info(" Used {TBSPEC2} to recognize " + txt); }
1165
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1166
+ }
1167
+ {ISO8601DATETIME} {
1168
+ String txt = yytext(); // text matched by the {ISO8601DATETIME} macro
1169
+ if (DEBUG ) { logger. info(" Used {ISO8601DATETIME} to recognize " + txt); }
1170
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1171
+ }
1145
1172
// {ISO8601DATE} { return getNext(); }
1146
- {DEGREES} { return getNext(); }
1173
+ {DEGREES} {
1174
+ String txt = yytext(); // text matched by the {DEGREES} macro
1175
+ if (DEBUG ) { logger. info(" Used {DEGREES} to recognize " + txt); }
1176
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1177
+ }
1147
1178
/* Ideally would factor this out for use in other tokenizers,
1148
1179
* but the other tokenizers don't have TokenizerPerLine options */
1149
- <YyNotTokenizePerLine> {FILENAME} /( {SPACENL} | [ .?!,\" '<()] ) { return getNext(); }
1150
- <YyTokenizePerLine> {FILENAME} /( {SPACE} | [ .?!,\" '<()] ) { return getNext(); }
1180
+ <YyNotTokenizePerLine> {FILENAME} /( {SPACENL} | [ .?!,\" '<()] ) {
1181
+ String txt = yytext(); // text matched by {FILENAME} in state YyNotTokenizePerLine; the lookahead is {SPACENL} or one character from the listed punctuation class
1182
+ if (DEBUG ) { logger. info(" Used {FILENAME} to recognize " + txt); }
1183
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1184
+ }
1185
+ <YyTokenizePerLine> {FILENAME} /( {SPACE} | [ .?!,\" '<()] ) {
1186
+ String txt = yytext(); // text matched by {FILENAME} in state YyTokenizePerLine; the lookahead uses {SPACE} rather than {SPACENL}
1187
+ if (DEBUG ) { logger. info(" Used {FILENAME} (2) to recognize " + txt); }
1188
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1189
+ }
1151
1190
{WORD} \. / {INSENTP} { String origTok = yytext();
1152
1191
String norm = LexerUtils . removeSoftHyphens(origTok);
1153
1192
if (DEBUG ) { logger. info(" Used {WORD} (3) to recognize " + origTok + " as " + norm); }
1154
1193
return getNext(norm, origTok);
1155
1194
}
1156
- {SSN} { return getNext(); }
1195
+ {SSN} {
1196
+ String txt = yytext(); // text matched by the {SSN} macro
1197
+ if (DEBUG ) { logger. info(" Used {SSN} to recognize " + txt); }
1198
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1199
+ }
1157
1200
{PHONE} { String txt = yytext();
1158
1201
String norm = txt;
1159
1202
if (normalizeSpace) {
@@ -1184,48 +1227,81 @@ RM/{NUM} { String txt = yytext();
1184
1227
{ASIANSMILEY} { String txt = yytext();
1185
1228
String origText = txt;
1186
1229
txt = LexerUtils . pennNormalizeParens(txt, normalizeParentheses);
1230
+ if (DEBUG ) { logger. info(" Used {ASIANSMILEY} to recognize " + origText + " as " + txt); }
1187
1231
return getNext(txt, origText);
1188
1232
}
1189
1233
{EMOJI} { String txt = yytext();
1190
1234
if (DEBUG ) { logger. info(" Used {EMOJI} to recognize " + txt); }
1191
1235
return getNext(txt, txt);
1192
1236
}
1193
- {LESSTHAN} { return getNext(" <" , yytext()); }
1194
- {GREATERTHAN} { return getNext(" >" , yytext()); }
1195
- \{ { if (normalizeOtherBrackets) {
1196
- return getNext(openbrace, yytext()); }
1237
+ {LESSTHAN} {
1238
+ String txt = yytext(); // text matched by {LESSTHAN}; captured once and used for both the trace and the returned token
1239
+ if (DEBUG ) { logger. info(" Used {LESSTHAN} to recognize " + txt + " as <" ); }
1240
+ return getNext(" <" , txt); // first argument is the fixed replacement text, second is the matched text (txt reused instead of a second yytext() call)
1241
+ }
1242
+ {GREATERTHAN} {
1243
+ String txt = yytext(); // text matched by {GREATERTHAN}; captured once and used for both the trace and the returned token
1244
+ if (DEBUG ) { logger. info(" Used {GREATERTHAN} to recognize " + txt + " as >" ); }
1245
+ return getNext(" >" , txt); // first argument is the fixed replacement text, second is the matched text (txt reused instead of a second yytext() call)
1246
+ }
1247
+ \{ {
1248
+ String txt = yytext();
1249
+ if (normalizeOtherBrackets) {
1250
+ if (DEBUG ) { logger. info(" Used {{} to recognize " + txt + " as " + openbrace); }
1251
+ return getNext(openbrace, txt); }
1197
1252
else {
1198
- return getNext();
1253
+ if (DEBUG ) { logger. info(" Used {{} to recognize " + txt); }
1254
+ return getNext(txt, txt);
1199
1255
}
1200
1256
}
1201
- \} { if (normalizeOtherBrackets) {
1202
- return getNext(closebrace, yytext()); }
1257
+ \} {
1258
+ String txt = yytext();
1259
+ if (normalizeOtherBrackets) {
1260
+ if (DEBUG ) { logger. info(" Used {}} to recognize " + txt + " as " + closebrace); }
1261
+ return getNext(closebrace, txt); }
1203
1262
else {
1204
- return getNext();
1263
+ if (DEBUG ) { logger. info(" Used {}} to recognize " + txt); }
1264
+ return getNext(txt, txt);
1205
1265
}
1206
1266
}
1207
- \[ { if (normalizeOtherBrackets) {
1208
- return getNext(" -LSB-" , yytext()); }
1267
+ \[ {
1268
+ String txt = yytext();
1269
+ if (normalizeOtherBrackets) {
1270
+ if (DEBUG ) { logger. info(" Used {[} to recognize " + txt + " as " + " -LSB-" ); }
1271
+ return getNext(" -LSB-" , txt); }
1209
1272
else {
1210
- return getNext();
1273
+ if (DEBUG ) { logger. info(" Used {[} to recognize " + txt); }
1274
+ return getNext(txt, txt);
1211
1275
}
1212
1276
}
1213
- \] { if (normalizeOtherBrackets) {
1214
- return getNext(" -RSB-" , yytext()); }
1277
+ \] {
1278
+ String txt = yytext();
1279
+ if (normalizeOtherBrackets) {
1280
+ if (DEBUG ) { logger. info(" Used {]} to recognize " + txt + " as " + " -RSB-" ); }
1281
+ return getNext(" -RSB-" , txt); }
1215
1282
else {
1216
- return getNext();
1283
+ if (DEBUG ) { logger. info(" Used {]} to recognize " + txt); }
1284
+ return getNext(txt, txt);
1217
1285
}
1218
1286
}
1219
- \( { if (normalizeParentheses) {
1220
- return getNext(openparen, yytext()); }
1287
+ \( {
1288
+ String txt = yytext();
1289
+ if (normalizeParentheses) {
1290
+ if (DEBUG ) { logger. info(" Used {(} to recognize " + txt + " as " + openparen); }
1291
+ return getNext(openparen, txt); }
1221
1292
else {
1222
- return getNext();
1293
+ if (DEBUG ) { logger. info(" Used {(} to recognize " + txt); }
1294
+ return getNext(txt, txt);
1223
1295
}
1224
1296
}
1225
- \) { if (normalizeParentheses) {
1226
- return getNext(closeparen, yytext()); }
1297
+ \) {
1298
+ String txt = yytext();
1299
+ if (normalizeParentheses) {
1300
+ if (DEBUG ) { logger. info(" Used {)} to recognize " + txt + " as " + closeparen); }
1301
+ return getNext(closeparen, txt); }
1227
1302
else {
1228
- return getNext();
1303
+ if (DEBUG ) { logger. info(" Used {)} to recognize " + txt); }
1304
+ return getNext(txt, txt);
1229
1305
}
1230
1306
}
1231
1307
{HYPHENS} { final String origTxt = yytext();
@@ -1270,17 +1346,42 @@ RM/{NUM} { String txt = yytext();
1270
1346
if (DEBUG ) { logger. info(" Used {LDOTS5} to recognize " + tok + " as " + norm); }
1271
1347
return getNext(norm, tok);
1272
1348
}
1273
- {FNMARKS} { return getNext(); }
1274
- {ASTS} { if (escapeForwardSlashAsterisk) {
1275
- return getNext(LexerUtils . escapeChar(yytext(), ' *' ), yytext()); }
1349
+ {FNMARKS} {
1350
+ String txt = yytext(); // text matched by the {FNMARKS} macro
1351
+ if (DEBUG ) { logger. info(" Used {FNMARKS} to recognize " + txt); }
1352
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1353
+ }
1354
+ {ASTS} {
1355
+ String txt = yytext(); // text matched by {ASTS}; captured once and reused in both branches below
1356
+ if (escapeForwardSlashAsterisk) {
1357
+ String normTok = LexerUtils . escapeChar(txt, ' *' );
1358
+ if (DEBUG ) { logger. info(" Used {ASTS} to recognize " + txt + " as " + normTok); }
1359
+ return getNext(normTok, txt); } // escaped form from LexerUtils.escapeChar first, matched text second
1276
1360
else {
1277
- return getNext();
1361
+ if (DEBUG ) { logger. info(" Used {ASTS} to recognize " + txt); }
1362
+ return getNext(txt, txt); // escaping disabled: matched text passed unchanged for both arguments
1278
1363
}
1279
1364
}
1280
- {INSENTP} { return getNext(); }
1281
- [ ?!] +| [ \u2047\u2048 ] { return getNext(); }
1282
- [ .¡¿\u037E\u0589\u061F\u06D4\u0700 - \u0702\u07FA\u3002 ] { return getNext(); }
1283
- =+ { return getNext(); }
1365
+ {INSENTP} {
1366
+ String txt = yytext(); // text matched by the {INSENTP} macro
1367
+ if (DEBUG ) { logger. info(" Used {INSENTP} to recognize " + txt); }
1368
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1369
+ }
1370
+ [ ?!] +| [ \u2047\u2048 ] {
1371
+ String txt = yytext(); // a run of the characters in the first class, or a single U+2047 / U+2048
1372
+ if (DEBUG ) { logger. info(" Used {[?!]+} to recognize " + txt); }
1373
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1374
+ }
1375
+ [ .¡¿\u037E\u0589\u061F\u06D4\u0700 - \u0702\u07FA\u3002 ] {
1376
+ String txt = yytext(); // exactly one character from the class on the rule line
1377
+ if (DEBUG ) { logger. info(" Used {sent end punct} to recognize " + txt); }
1378
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1379
+ }
1380
+ =+ {
1381
+ String txt = yytext(); // a run of one or more '=' characters
1382
+ if (DEBUG ) { logger. info(" Used {=} to recognize " + txt); }
1383
+ return getNext(txt, txt); // same string for both arguments: this rule applies no normalization
1384
+ }
1284
1385
\/ { if (escapeForwardSlashAsterisk) {
1285
1386
return getNext(LexerUtils . escapeChar(yytext(), ' /' ), yytext()); }
1286
1387
else {
@@ -1392,7 +1493,11 @@ RM/{NUM} { String txt = yytext();
1392
1493
}
1393
1494
1394
1495
{FAKEDUCKFEET} { return getNext(); }
1395
- {MISCSYMBOL} { return getNext(); }
1496
+ {MISCSYMBOL} {
1497
+ String tok = yytext(); // text matched by the {MISCSYMBOL} macro
1498
+ if (DEBUG ) { logger. info(" Used {MISCSYMBOL} to recognize " + tok); }
1499
+ return getNext(tok, tok); // same string for both arguments: this rule applies no normalization
1500
+ }
1396
1501
{CP1252_MISC_SYMBOL} { String tok = yytext();
1397
1502
String norm = LexerUtils . processCp1252misc(tok);
1398
1503
if (DEBUG ) { logger. info(" Used {CP1252_MISC_SYMBOL} to recognize " + tok + " as " + norm); }
@@ -1453,9 +1558,9 @@ RM/{NUM} { String txt = yytext();
1453
1558
<<EOF>> { if (invertible) {
1454
1559
// prevWordAfter.append(yytext());
1455
1560
String str = prevWordAfter. toString();
1456
- if (DEBUG ) { logger. info(" At end of text making after: |" + str + " |" ); }
1561
+ // if (DEBUG) { logger.info("At end of text making after: |" + str + "|"); }
1457
1562
prevWord. set(CoreAnnotations . AfterAnnotation . class, str);
1458
- if (DEBUG ) { logger. info(" prevWord is |" + prevWord. get(CoreAnnotations . TextAnnotation . class) + " |, its after is " +
1563
+ if (DEBUG ) { logger. info(" At end of text, prevWord is |" + prevWord. get(CoreAnnotations . TextAnnotation . class) + " |, its after set to " +
1459
1564
" |" + prevWord. get(CoreAnnotations . AfterAnnotation . class) + " |" ); }
1460
1565
prevWordAfter. setLength(0 );
1461
1566
}
0 commit comments