Skip to content

Commit 489a3ca

Browse files
committed
Precompile regular expressions in Lang.loadFromResource(String, Languages)
- Precompile regular expressions in PhoneticEngine.encode(String, LanguageSet)
- Precompile regular expressions in org.apache.commons.codec.language.bm.Rule.parse*(*)
1 parent 812d020 commit 489a3ca

File tree

5 files changed

+25
-9
lines changed

5 files changed

+25
-9
lines changed

src/changes/changes.xml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,11 @@ The <action> type attribute can be add,update,fix,remove.
4848
<action type="fix" dev="ggregory" due-to="Gary Gregory">Remove -nouses directive from maven-bundle-plugin. OSGi package imports now state 'uses' definitions for package imports, this doesn't affect JPMS (from org.apache.commons:commons-parent:80).</action>
4949
<action type="fix" dev="ggregory" due-to="Gary Gregory">Refactor DigestUtils.updateDigest(MessageDigest, File) to use NIO.</action>
5050
<action type="fix" dev="ggregory" due-to="Gary Gregory" issue="CODEC-328" >Clarify Javadoc for org.apache.commons.codec.digest.UnixCrypt.crypt(byte[],String).</action>
51-
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile and resuse x3 regular expression in DaitchMokotoffSoundex.Rule.</action>
52-
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile and resuse regular expressions in DaitchMokotoffSoundex.parseRules(Scanner, String, Map, Map).</action>
51+
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in DaitchMokotoffSoundex.Rule.</action>
52+
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in DaitchMokotoffSoundex.parseRules(Scanner, String, Map, Map).</action>
53+
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in Lang.loadFromResource(String, Languages).</action>
54+
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in PhoneticEngine.encode(String, LanguageSet).</action>
55+
<action type="fix" dev="ggregory" due-to="Gary Gregory">Precompile regular expressions in org.apache.commons.codec.language.bm.Rule.parse*(*).</action>
5356
<!-- ADD -->
5457
<action type="add" dev="ggregory" due-to="Gary Gregory">Add HmacUtils.hmac(Path).</action>
5558
<action type="add" dev="ggregory" due-to="Gary Gregory">Add HmacUtils.hmacHex(Path).</action>

src/main/java/org/apache/commons/codec/language/bm/Lang.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ public boolean matches(final String txt) {
103103

104104
private static final String LANGUAGE_RULES_RN = "/org/apache/commons/codec/language/bm/%s_lang.txt";
105105

106+
private static final Pattern PLUS = Pattern.compile("\\+");
107+
106108
static {
107109
for (final NameType s : NameType.values()) {
108110
LANGS.put(s, loadFromResource(String.format(LANGUAGE_RULES_RN, s.getName()), Languages.getInstance(s)));
@@ -163,15 +165,15 @@ public static Lang loadFromResource(final String languageRulesResourceName, fina
163165
}
164166

165167
// split it up
166-
final String[] parts = line.split("\\s+");
168+
final String[] parts = ResourceConstants.SPACES.split(line);
167169

168170
if (parts.length != 3) {
169171
throw new IllegalArgumentException("Malformed line '" + rawLine +
170172
"' in language resource '" + languageRulesResourceName + "'");
171173
}
172174

173175
final Pattern pattern = Pattern.compile(parts[0]);
174-
final String[] langs = parts[1].split("\\+");
176+
final String[] langs = PLUS.split(parts[1]);
175177
final boolean accept = parts[2].equals("true");
176178

177179
rules.add(new LangRule(pattern, new HashSet<>(Arrays.asList(langs)), accept));

src/main/java/org/apache/commons/codec/language/bm/PhoneticEngine.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import java.util.Objects;
3030
import java.util.Set;
3131
import java.util.TreeMap;
32+
import java.util.regex.Pattern;
3233
import java.util.stream.Collectors;
3334

3435
import org.apache.commons.codec.language.bm.Languages.LanguageSet;
@@ -227,6 +228,8 @@ public boolean isFound() {
227228

228229
private static final Map<NameType, Set<String>> NAME_PREFIXES = new EnumMap<>(NameType.class);
229230

231+
private static final Pattern QUOTE = Pattern.compile("'");
232+
230233
static {
231234
NAME_PREFIXES.put(NameType.ASHKENAZI,
232235
Collections.unmodifiableSet(
@@ -401,14 +404,14 @@ public String encode(String input, final Languages.LanguageSet languageSet) {
401404
}
402405
}
403406

404-
final List<String> words = Arrays.asList(input.split("\\s+"));
407+
final List<String> words = Arrays.asList(ResourceConstants.SPACES.split(input));
405408
final List<String> words2 = new ArrayList<>();
406409

407410
// special-case handling of word prefixes based upon the name type
408411
switch (this.nameType) {
409412
case SEPHARDIC:
410413
words.forEach(aWord -> {
411-
final String[] parts = aWord.split("'", -1);
414+
final String[] parts = QUOTE.split(aWord, -1);
412415
words2.add(parts[parts.length - 1]);
413416
});
414417
words2.removeAll(NAME_PREFIXES.get(this.nameType));

src/main/java/org/apache/commons/codec/language/bm/ResourceConstants.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.commons.codec.language.bm;
1919

20+
import java.util.regex.Pattern;
21+
2022
import org.apache.commons.codec.CharEncoding;
2123

2224
/**
@@ -32,5 +34,7 @@ final class ResourceConstants {
3234
static final String ENCODING = CharEncoding.UTF_8;
3335
static final String EXT_CMT_END = "*/";
3436
static final String EXT_CMT_START = "/*";
37+
static final Pattern SPACES = Pattern.compile("\\s+");
38+
3539

3640
}

src/main/java/org/apache/commons/codec/language/bm/Rule.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,10 @@ public interface RPattern {
300300

301301
private static final int HASH_INCLUDE_LENGTH = HASH_INCLUDE.length();
302302

303+
private static final Pattern AROUND_PLUS = Pattern.compile("[+]");
304+
305+
private static final Pattern AROUND_PIPE = Pattern.compile("[|]");
306+
303307
private static final Map<NameType, Map<RuleType, Map<String, Map<String, List<Rule>>>>> RULES =
304308
new EnumMap<>(NameType.class);
305309

@@ -452,7 +456,7 @@ private static Phoneme parsePhoneme(final String ph) {
452456
}
453457
final String before = ph.substring(0, open);
454458
final String in = ph.substring(open + 1, ph.length() - 1);
455-
final Set<String> langs = new HashSet<>(Arrays.asList(in.split("[+]")));
459+
final Set<String> langs = new HashSet<>(Arrays.asList(AROUND_PLUS.split(in)));
456460

457461
return new Phoneme(before, Languages.LanguageSet.from(langs));
458462
}
@@ -467,7 +471,7 @@ private static PhonemeExpr parsePhonemeExpr(final String ph) {
467471

468472
final List<Phoneme> phs = new ArrayList<>();
469473
final String body = ph.substring(1, ph.length() - 1);
470-
for (final String part : body.split("[|]")) {
474+
for (final String part : AROUND_PIPE.split(body)) {
471475
phs.add(parsePhoneme(part));
472476
}
473477
if (body.startsWith("|") || body.endsWith("|")) {
@@ -521,7 +525,7 @@ private static Map<String, List<Rule>> parseRules(final Scanner scanner, final S
521525
}
522526
} else {
523527
// rule
524-
final String[] parts = line.split("\\s+");
528+
final String[] parts = ResourceConstants.SPACES.split(line);
525529
if (parts.length != 4) {
526530
throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
527531
" parts: " + rawLine + " in " + location);

0 commit comments

Comments
 (0)