Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix for TIKA-1943 contributed by Mark Duske #106

Merged
merged 3 commits into from Apr 23, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -0,0 +1,175 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tika.language.translate;

import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.translate.Translator;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
* An implementation of a REST client for the YANDEX <a href="https://tech.yandex.com/translate/">Translate API</a>.
* You can sign up for free access online on the <a href="https://tech.yandex.com/key/form.xml?service=trnsl">API Key form</a>
* and set your Application's User Key in the <code>translator.yandex.properties</code> file.
*/
public class YandexTranslator implements Translator {

/**
* Yandex Translate API service end-point URL
*/
private static final String YANDEX_TRANSLATE_URL_BASE = "https://translate.yandex.net/api/v1.5/tr.json/translate";

/**
* Default USer-Key, a real User-Key must be provided before the Lingo24 can successfully request translations
*/
private static final String DEFAULT_KEY = "dummy-key";

/**
* Identifies the client of the request, used for authentication
*/
private String apiKey;

/**
* The Yandex Translate API can handle text in <b>plain</b> and/or <b>html</b> format, the default
* format is <b>plain</b>
*/
private String format = "plain";

public YandexTranslator() {
Properties config = new Properties();
try {
config.load(YandexTranslator.class
.getResourceAsStream(
"translator.yandex.properties"));
this.apiKey = config.getProperty("translator.api-key");
this.format = config.getProperty("translator.text.format");
} catch (Exception e) {
e.printStackTrace();
}
}

@Override
public String translate(String text, String sourceLanguage,
String targetLanguage) throws TikaException, IOException {
if (!this.isAvailable()) {
return text;
}

WebClient client = WebClient.create(YANDEX_TRANSLATE_URL_BASE);

String langCode;

if (sourceLanguage == null) {
//Translate Service will identify source language
langCode = targetLanguage;
} else {
//Source language is well known
langCode = sourceLanguage + '-' + targetLanguage;
}

//TODO Add support for text over 10k characters
Response response = client.accept(MediaType.APPLICATION_JSON)
.query("key", this.apiKey).query("lang", langCode)
.query("text", text).get();
BufferedReader reader = new BufferedReader(new InputStreamReader(
(InputStream) response.getEntity(), UTF_8));
String line = null;
StringBuffer responseText = new StringBuffer();
while ((line = reader.readLine()) != null) {
responseText.append(line);
}

try {
ObjectMapper mapper = new ObjectMapper();
JsonNode jsonResp = mapper.readTree(responseText.toString());

if (!jsonResp.findValuesAsText("code").isEmpty()) {
String code = jsonResp.findValuesAsText("code").get(0);
if (code.equals("200")) {
return jsonResp.findValue("text").get(0).asText();
} else {
throw new TikaException(jsonResp.findValue("message").get(0).asText());
}
} else {
throw new TikaException("Return message not recognized: " + responseText.toString().substring(0, Math.min(responseText.length(), 100)));
}
} catch (JsonParseException e) {
throw new TikaException("Error requesting translation from '" + sourceLanguage + "' to '" + targetLanguage + "', JSON response from Lingo24 is not well formatted: " + responseText.toString());
}
}


/**
* Get the API Key in use for client authentication
* @return API Key
*/
public String getApiKey() {
return apiKey;
}

/**
* Set the API Key for client authentication
* @param apiKey API Key
*/
public void setApiKey(String apiKey) {
this.apiKey = apiKey;
}

/**
* Retrieve the current text format setting.
* The Yandex Translate API can handle text in <b>plain</b> and/or <b>html</b> format, the default
* format is <b>plain</b>
* @return
*/
public String getFormat() {
return format;
}

/**
* Set the text format to use (plain/html)
* @param format Text format setting, either plain or html
*/
public void setFormat(String format) {
this.format = format;
}

@Override
public String translate(String text, String targetLanguage)
throws TikaException, IOException {
return this.translate(text, null, targetLanguage);
}

@Override
public boolean isAvailable() {
return this.apiKey!=null && !this.apiKey.equals(DEFAULT_KEY);
}

}
@@ -0,0 +1,9 @@
# To use the YANDEX translate service, you <em>must</em> set your API-Key
# as described in Translate API, https://tech.yandex.com/translate/
# If you do not want translation please set the value to "dummy-key".

translator.api-key=dummy-key

#The Yandex Translate API can handle text in plain and/or html format, the default format is plain.
#Set this value to html to enable HTML format support
translator.text.format=plain
@@ -0,0 +1,102 @@

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.tika.language.translate;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;

import org.junit.Before;
import org.junit.FixMethodOrder;
import org.junit.Test;
import org.junit.runners.MethodSorters;

/**
* Generic Tests to ensure that the RSS library behaves as expected
* @author mark
*
*/
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
public class YandexTranslatorTest {

private YandexTranslator translator;

@Before
public void setUp() throws Exception {
this.translator = new YandexTranslator();
}

@Test
public void test1EN_ES_Translation() {
String inputText = "Hello World!!!";
String expectedText = "Hola Mundo!!!";

try {
String transText = translator.translate(inputText, "en", "es");

assertNotNull("Text not translated", transText);

assertEquals("Result: [" + transText
+ "]: not equal to expected: [" + expectedText + "]",
expectedText, transText);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}

@Test
public void test2UNDEFINED_DE_ES_Translation() {
String inputText = "Guten Tag!!!";
String expectedText = "Buen Día!!!";

try {
String transText = translator.translate(inputText, "es");

assertNotNull("Text not translated", transText);

assertEquals("Result: [" + transText
+ "]: not equal to expected: [" + expectedText + "]",
expectedText, transText);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}

@Test
public void test3UNDEFINED_IT_EN_Translation() {
String inputText = "Buona Sera!!!";
String expectedText = "Good Evening!!!";

try {
String transText = translator.translate(inputText, "en");

assertNotNull("Text not translated", transText);

assertEquals("Result: [" + transText
+ "]: not equal to expected: [" + expectedText + "]",
expectedText, transText);
} catch (Exception e) {
e.printStackTrace();
fail(e.getMessage());
}
}

}