forked from jprante/elasticsearch-langdetect
-
Notifications
You must be signed in to change notification settings - Fork 2
/
DetectLanguageTest.java
221 lines (207 loc) · 10.1 KB
/
DetectLanguageTest.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
package org.xbib.elasticsearch.index.mapper.langdetect;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.common.io.Streams;
import org.junit.Assert;
import org.junit.Test;
import org.xbib.elasticsearch.common.langdetect.LangdetectService;
import org.xbib.elasticsearch.common.langdetect.Language;
import org.xbib.elasticsearch.common.langdetect.LanguageDetectionException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
/**
 * Tests for the language-detection service: spot checks on single-language files and
 * accuracy sweeps over multi-language datasets (UDHR and WordPress translations).
 */
public class DetectLanguageTest extends Assert {

    private static final Logger logger = LogManager.getLogger();

    @Test
    public void testEnglish() throws IOException {
        testLanguage("english.txt", "en");
    }

    @Test
    public void testChinese() throws IOException {
        testLanguage("chinese.txt", "zh-cn");
    }

    @Test
    public void testJapanese() throws IOException {
        testLanguage("japanese.txt", "ja");
    }

    @Test
    public void testKorean() throws IOException {
        testLanguage("korean.txt", "ko");
    }

    /**
     * Test classification accuracies on translations of the Universal Declaration of Human Rights (UDHR).
     *
     * The translations were obtained from http://unicode.org/udhr/. Some minimal processing was done to create the
     * udhr.tsv resource file: matched the dataset's language code with the one returned by the library, and removed
     * each file's English intro and redundant whitespace.
     */
    @Test
    public void testUdhrAccuracies() throws IOException {
        testSubstringAccuracies(
            "udhr.tsv",
            new double[][] {
                { 5, 100, 0.26, 0.65 },
                { 10, 100, 0.46, 0.82 },
                { 20, 100, 0.73, 0.94 },
                { 50, 100, 0.85, 0.98 },
                { 100, 100, 0.94, 0.99 },
                { 300, 100, 1.00, 1.00 },
                { 0, 1, 1.00, 1.00 }
            }
        );
    }

    /**
     * Test classification accuracies on WordPress interface translations.
     *
     * The translations were obtained from https://translate.wordpress.org/projects/wp/4.6.x. Some minimal processing
     * was done to create the wp-translations.tsv resource file: matched the dataset's language code with the one
     * returned by the library, unescaped HTML entities, and dropped variable placeholders, HTML tags, and redundant
     * whitespace. To speed up testing, the resource file contains only the 50 longest translated phrases for each
     * language, excluding URL translations and word lists.
     */
    @Test
    public void testWordPressTranslationsAccuracies() throws IOException {
        testSubstringAccuracies(
            "wp-translations.tsv",
            new double[][] {
                { 5, 10, 0.25, 0.60 },
                { 10, 10, 0.44, 0.76 },
                { 20, 10, 0.65, 0.88 },
                { 0, 1, 0.80, 0.98 }
            }
        );
    }

    /**
     * Test classification accuracies on substrings of texts from a single dataset.
     *
     * For each text and substring length, this test generates a sample of substrings (drawn uniformly with
     * replacement from the set of possible substrings of the given length), runs the language identification code,
     * measures the per-language accuracy (percentage of substrings classified correctly), and fails if the minimum or
     * mean accuracy for the length is below a predetermined threshold.
     *
     * @param datasetPath dataset resource path (see {@link #readMultiLanguageDataset(String)})
     * @param allTrialParams a matrix specifying each trial's parameters. Each row in the matrix must have four items:
     *                       substring length and sample size, which are passed to
     *                       {@link #generateSubstringSample(String, int, int)}, and a per-language accuracy threshold
     *                       and mean accuracy threshold, which are used to determine whether the trial passes or fails
     */
    private void testSubstringAccuracies(String datasetPath, double[][] allTrialParams) throws IOException {
        LangdetectService service = new LangdetectService();
        Map<String, List<String>> languageToFullTexts = readMultiLanguageDataset(datasetPath);
        // Sort the languages to make the log output prettier.
        List<String> languages = new ArrayList<>(languageToFullTexts.keySet());
        Collections.sort(languages);
        for (double[] trialParams : allTrialParams) {
            int substringLength = (int) trialParams[0];
            int sampleSize = (int) trialParams[1];
            double minAccuracyThreshold = trialParams[2];
            double meanAccuracyThreshold = trialParams[3];
            double sumAccuracies = 0;
            double minAccuracy = Double.POSITIVE_INFINITY;
            for (String language : languages) {
                // Count correct classifications across all sampled substrings of all texts in this language.
                int numCorrect = 0;
                List<String> fullTexts = languageToFullTexts.get(language);
                for (String text : fullTexts) {
                    for (String substring : generateSubstringSample(text, substringLength, sampleSize)) {
                        if (Objects.equals(getTopLanguageCode(service, substring), language)) {
                            numCorrect++;
                        }
                    }
                }
                // Each text contributes exactly sampleSize trials, so this is the per-language accuracy.
                double accuracy = (double) numCorrect / (fullTexts.size() * sampleSize);
                sumAccuracies += accuracy;
                minAccuracy = Math.min(minAccuracy, accuracy);
                logger.debug("Substring length: {} Language: {} Accuracy: {}", substringLength, language, accuracy);
            }
            double meanAccuracy = sumAccuracies / languages.size();
            logger.info("* Substring length: {} Accuracy: min={} mean={}", substringLength, minAccuracy, meanAccuracy);
            // Messages make threshold failures diagnosable without re-running with logging enabled.
            assertTrue("Minimum accuracy " + minAccuracy + " below threshold " + minAccuracyThreshold
                    + " for substring length " + substringLength, minAccuracy >= minAccuracyThreshold);
            assertTrue("Mean accuracy " + meanAccuracy + " below threshold " + meanAccuracyThreshold
                    + " for substring length " + substringLength, meanAccuracy >= meanAccuracyThreshold);
        }
    }

    /**
     * Test that the contents of the file at the provided path are correctly detected as being in language lang.
     *
     * @param path resource path of a UTF-8 text file in a single language
     * @param lang expected language code
     */
    private void testLanguage(String path, String lang) throws IOException {
        try (Reader reader = new InputStreamReader(getClass().getResourceAsStream(path), StandardCharsets.UTF_8)) {
            // JUnit convention: expected value first, actual second (was reversed, which made
            // failure messages report the expected and actual languages swapped).
            assertEquals(lang, getTopLanguageCode(new LangdetectService(), Streams.copyToString(reader)));
        }
    }

    /**
     * Read and parse a multi-language dataset from the given path.
     *
     * Lines whose language code is not in {@link LangdetectService#DEFAULT_LANGUAGES} are skipped.
     *
     * @param path resource path, where the file is in tab-separated format with two columns: language code and text
     * @return a mapping from each language code found in the file to the texts of this language
     */
    private Map<String, List<String>> readMultiLanguageDataset(String path) throws IOException {
        // TODO: investigate why some languages are commented out
        Set<String> supportedLanguages = new HashSet<>(Arrays.asList(LangdetectService.DEFAULT_LANGUAGES));
        Map<String, List<String>> languageToFullTexts = new HashMap<>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(getClass().getResourceAsStream(path),
                                                                          StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] splitLine = line.split("\t");
                String language = splitLine[0];
                if (!supportedLanguages.contains(language)) {
                    continue;
                }
                // computeIfAbsent replaces the containsKey/put/get dance with a single lookup.
                languageToFullTexts.computeIfAbsent(language, key -> new ArrayList<>()).add(splitLine[1]);
            }
        }
        return languageToFullTexts;
    }

    /**
     * Return the text's language as detected by the given service object (may be null if no languages are returned).
     */
    private String getTopLanguageCode(LangdetectService service, String text) throws LanguageDetectionException {
        List<Language> languages = service.detectAll(text);
        return languages.isEmpty() ? null : languages.get(0).getLanguage();
    }

    /**
     * Generate a random sample of substrings from the given text.
     *
     * Sampling is performed uniformly with replacement from the set of substrings of the provided text, ignoring
     * whitespace-only substrings. The random seed is set to a deterministic function of the method's parameters, so
     * repeated calls to this method with the same parameters will return the same sample.
     *
     * @param text the text from which the substring sample is drawn
     * @param substringLength length of each generated substring (set to zero to return a singleton list with the
     *                        text -- sampleSize must be 1 in this case)
     * @param sampleSize number of substrings to include in the sample
     * @return the sample (a list of strings)
     * @throws IllegalArgumentException if substringLength is zero with sampleSize != 1, or if the text is too short
     */
    private List<String> generateSubstringSample(String text, int substringLength, int sampleSize) {
        if (substringLength == 0) {
            // Enforce the documented precondition: a zero substring length with any other sample size
            // would loop forever below, because every empty substring is whitespace-only and never added.
            if (sampleSize != 1) {
                throw new IllegalArgumentException("sampleSize must be 1 when substringLength is zero.");
            }
            return Collections.singletonList(text);
        }
        if (substringLength > text.trim().length()) {
            throw new IllegalArgumentException("Provided text is too short.");
        }
        // Deterministic seed so repeated runs produce identical samples (and thus stable accuracies).
        Random rnd = new Random(Objects.hash(text, substringLength, sampleSize));
        List<String> sample = new ArrayList<>(sampleSize);
        while (sample.size() < sampleSize) {
            int startIndex = rnd.nextInt(text.length() - substringLength + 1);
            String substring = text.substring(startIndex, startIndex + substringLength);
            // Skip whitespace-only draws; the length guard above guarantees at least one valid window exists.
            if (!substring.trim().isEmpty()) {
                sample.add(substring);
            }
        }
        return sample;
    }
}