Skip to content
This repository
Browse code

move freetts and secondstring into local repo in lib dir; remove Earl …

…as a title since it is also a name; ignore soundex codes in names
  • Loading branch information...
commit 3a6dfee35880bce28214f7477bba4bd8aa30ff01 1 parent e72d5e0
Dallan Quass authored
3  .gitignore
@@ -5,3 +5,6 @@ target
5 5 .idea
6 6 *.iml
7 7 nbproject
  8 +# these files must be downloaded separately
  9 +givenname_similar_names.csv
  10 +surname_similar_names.csv
2  pom.xml
@@ -16,7 +16,7 @@
16 16
17 17 <properties>
18 18 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
19   - <skipTests>true</skipTests>
  19 + <skipTests>false</skipTests>
20 20 </properties>
21 21
22 22 <build>
0  score/external/freetts.jar → ...tts/1.2.2-threadsafe/freetts-1.2.2-threadsafe.jar
File renamed without changes
0  score/external/secondstring.jar → ...s/secondstring/20101021/secondstring-20101021.jar
File renamed without changes
34 score/pom.xml
@@ -35,6 +35,36 @@
35 35 </plugins>
36 36 </build>
37 37
  38 + <repositories>
  39 + <repository>
  40 + <id>sonatype-snapshots</id>
  41 + <name>Sonatype Snapshots Repository</name>
  42 + <url>http://oss.sonatype.org/content/repositories/snapshots/</url>
  43 + <snapshots>
  44 + <enabled>true</enabled>
  45 + </snapshots>
  46 + </repository>
  47 +
  48 + <repository>
  49 + <id>sonatype-releases</id>
  50 + <name>Sonatype Releases Repository</name>
  51 + <url>http://oss.sonatype.org/content/repositories/releases/</url>
  52 + </repository>
  53 +
  54 + <repository>
  55 + <id>lib</id>
  56 + <name>lib</name>
  57 + <releases>
  58 + <enabled>true</enabled>
  59 + <checksumPolicy>ignore</checksumPolicy>
  60 + </releases>
  61 + <snapshots>
  62 + <enabled>false</enabled>
  63 + </snapshots>
  64 + <url>file://${project.basedir}/lib</url>
  65 + </repository>
  66 + </repositories>
  67 +
38 68 <dependencies>
39 69 <dependency>
40 70 <groupId>org.folg.names</groupId>
@@ -52,9 +82,7 @@
52 82 <version>1.4</version>
53 83 </dependency>
54 84 <dependency>
55   - <!-- need to use a customized (thread-safe) version of com.sun.speech.freetts.lexicon.LetterToSoundImpl.java
56   - mvn install:install-file -Dfile=external/freetts.jar -DgroupId=com.sun.speech -DartifactId=freetts -Dversion=1.2.2-threadsafe -Dpackaging=jar
57   - -->
  85 + <!-- need to use a customized (thread-safe) version of com.sun.speech.freetts.lexicon.LetterToSoundImpl.java -->
58 86 <groupId>com.sun.speech</groupId>
59 87 <artifactId>freetts</artifactId>
60 88 <version>1.2.2-threadsafe</version>
9 search/src/main/java/org/folg/names/search/Normalizer.java
@@ -158,10 +158,15 @@ else if (c == '\'' && i == name.length()-2 && Character.toLowerCase(name.charAt(
158 158 break;
159 159 }
160 160 else if (c >= 'A' && c <= 'Z') {
161   - buf.append(Character.toLowerCase(c));
  161 + // skip ANNN names because they're soundex codes
  162 + if (!(buf.length() == 0 && i < name.length()-3 && Character.isDigit(name.charAt(i+1)) && Character.isDigit(name.charAt(i+2)) && Character.isDigit(name.charAt(i+3)))) {
  163 + buf.append(Character.toLowerCase(c));
  164 + }
162 165 }
163 166 else if (c >= 'a' && c <= 'z') {
164   - buf.append(c);
  167 + if (!(buf.length() == 0 && i < name.length()-3 && Character.isDigit(name.charAt(i+1)) && Character.isDigit(name.charAt(i+2)) && Character.isDigit(name.charAt(i+3)))) {
  168 + buf.append(c);
  169 + }
165 170 }
166 171 else if (allowWildcards && (c == '?' || c == '*')) {
167 172 buf.append(c);
4 search/src/main/java/org/folg/names/search/Searcher.java
@@ -170,11 +170,11 @@ private Searcher(final boolean isSurname) {
170 170
171 171 // if not reading from database, read from file
172 172 if (dataSource == null) {
173   - similarNamesReader = new InputStreamReader(getClass().getClassLoader().getResourceAsStream(prefix + "_similar_names.csv"), "UTF8");
  173 + similarNamesReader = new InputStreamReader(this.getClass().getClassLoader().getResourceAsStream(prefix + "_similar_names.csv"), "UTF8");
174 174 readSimilarNames(similarNamesReader);
175 175 }
176 176
177   - codeMapReader = new InputStreamReader(getClass().getClassLoader().getResourceAsStream(prefix + "SoundexMap.txt"));
  177 + codeMapReader = new InputStreamReader(this.getClass().getClassLoader().getResourceAsStream(prefix + "SoundexMap.txt"));
178 178 if (codeMapReader != null) {
179 179 // call after readSimilarNames (if we're reading the whole file into memory)
180 180 readCodeMap(codeMapReader); // also populates commonNames
1  search/src/main/resources/name-normalizer.properties
@@ -329,7 +329,6 @@ queen,\
329 329 prince,\
330 330 major,\
331 331 baron,\
332   -earl,\
333 332 duke,\
334 333 count,\
335 334 president,\
1  search/src/test/java/org/folg/names/search/NormalizerTest.java
@@ -28,5 +28,6 @@ public void testNormalizer() throws Exception {
28 28 assertEquals("mcdonald", Utils.join(normalizer.normalize("Mc Donald", true)));
29 29 assertEquals("olson", Utils.join(normalizer.normalize("Olsdatter", true)));
30 30 assertEquals("alberte", Utils.join(normalizer.normalize("Alberte{1}", false)));
  31 + assertEquals("dallan", Utils.join(normalizer.normalize("Dallan D123", false)));
31 32 }
32 33 }
26 search/src/test/java/org/folg/names/search/SearcherTest.java
@@ -25,23 +25,27 @@ public SearcherTest(String name) {
25 25 super(name);
26 26 }
27 27
28   - public void testSearcher() throws Exception {
  28 + public void testGetGivennameIndexTokens() {
29 29 Searcher searcher = Searcher.getGivennameInstance();
30 30 assertEquals("", Utils.join(searcher.getAdditionalIndexTokens("dallan")));
31 31 assertEquals("D450", Utils.join(searcher.getAdditionalIndexTokens("dalan")));
32   - assertEquals("D450 dalana daleen dalen dalena dalene dalin dalla dallas dallen dallin " +
33   - "dallon dalma dalon dalson dalvin dalyn dalynn daylan daylene daylon delain " +
34   - "delaina delaine delan delana delane delaney delania delanie delano delany " +
35   - "delaon delaun delayne delean delena delene deleno deleon deliana delin delino " +
36   - "delion dellene dellon delon delona delone delyn delynn dilan dillan dillen " +
37   - "dillian dillin dillion dillon dolan dolen dolena doolin dulane dulaney dulany " +
38   - "dulin dylan dyllan dylon talan tallon",
  32 + }
  33 +
  34 + public void testGetGivennameSearchTokens() throws Exception {
  35 + Searcher searcher = Searcher.getGivennameInstance();
  36 + assertEquals("D450 dalen dalin dalla dallas dallen dallin dallon dalma dalon dalvin dalyn dalynn daylan daylon delain delaine delan delane delanie delaun delean deleon delin delion dellene dellon delon delone delyn delynn dilan dillan dillen dillian dillin dillion dillon dolan dolen doolin dulane dulin dylan dyllan dylon talan tallon",
39 37 Utils.join(new TreeSet<String>(searcher.getAdditionalSearchTokens("dallan"))));
40   - searcher = Searcher.getSurnameInstance();
  38 + }
  39 +
  40 + public void testGetSurnameIndexTokens() {
  41 + Searcher searcher = Searcher.getSurnameInstance();
41 42 assertEquals("", Utils.join(searcher.getAdditionalIndexTokens("quass")));
42 43 assertEquals("Q200", Utils.join(searcher.getAdditionalIndexTokens("quas")));
43   - assertEquals("Q200 cass casse catts kass kasse quaas quack quash quasie quast " +
44   - "quates quatsie quatsy quessy quijas quish",
  44 + }
  45 +
  46 + public void testGetSurnameSearchTokens() {
  47 + Searcher searcher = Searcher.getSurnameInstance();
  48 + assertEquals("Q200 kass kasse quaas quash quasie quast",
45 49 Utils.join(new TreeSet<String>(searcher.getAdditionalSearchTokens("quass"))));
46 50 }
47 51 }

0 comments on commit 3a6dfee

Please sign in to comment.
Something went wrong with that request. Please try again.