Skip to content

Commit

Permalink
Implement Lucene EstonianAnalyzer, Stemmer (elastic#49149)
Browse files Browse the repository at this point in the history
This PR adds a new analyzer and stemmer for the Estonian language.

Closes elastic#48895
  • Loading branch information
gpaimla authored and jimczi committed Nov 22, 2019
1 parent a7f33e0 commit c3ca133
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 1 deletion.
49 changes: 49 additions & 0 deletions docs/reference/analysis/analyzers/lang-analyzer.asciidoc
Expand Up @@ -15,6 +15,7 @@ following types are supported:
<<danish-analyzer,`danish`>>,
<<dutch-analyzer,`dutch`>>,
<<english-analyzer,`english`>>,
<<estonian-analyzer,`estonian`>>,
<<finnish-analyzer,`finnish`>>,
<<french-analyzer,`french`>>,
<<galician-analyzer,`galician`>>,
Expand Down Expand Up @@ -669,6 +670,54 @@ PUT /english_example
// TEST[s/"english_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/]

<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
be excluded from stemming.

[[estonian-analyzer]]
===== `estonian` analyzer

The `estonian` analyzer could be reimplemented as a `custom` analyzer as follows:

[source,console]
----------------------------------------------------
PUT /estonian_example
{
"settings": {
"analysis": {
"filter": {
"estonian_stop": {
"type": "stop",
"stopwords": "_estonian_" <1>
},
"estonian_keywords": {
"type": "keyword_marker",
"keywords": ["näide"] <2>
},
"estonian_stemmer": {
"type": "stemmer",
"language": "estonian"
}
},
"analyzer": {
"rebuilt_estonian": {
"tokenizer": "standard",
"filter": [
"lowercase",
"estonian_stop",
"estonian_keywords",
"estonian_stemmer"
]
}
}
}
}
}
----------------------------------------------------
// TEST[s/"estonian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: estonian_example, first: estonian, second: rebuilt_estonian}\nendyaml\n/]

<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
Expand Down
Expand Up @@ -70,7 +70,7 @@ PUT /my_index
Elasticsearch provides the following predefined list of languages:

`_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_estonian_`, `_finnish_`,
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
`_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
`_portuguese_`, `_romanian_`, `_russian_`, `_sorani_`, `_spanish_`,
Expand Down
Expand Up @@ -56,6 +56,7 @@
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
Expand Down Expand Up @@ -192,6 +193,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
analyzers.put("danish", DanishAnalyzerProvider::new);
analyzers.put("dutch", DutchAnalyzerProvider::new);
analyzers.put("english", EnglishAnalyzerProvider::new);
analyzers.put("estonian", EstonianAnalyzerProvider::new);
analyzers.put("finnish", FinnishAnalyzerProvider::new);
analyzers.put("french", FrenchAnalyzerProvider::new);
analyzers.put("galician", GalicianAnalyzerProvider::new);
Expand Down Expand Up @@ -344,6 +346,7 @@ public List<PreBuiltAnalyzerProviderFactory> getPreBuiltAnalyzerProviderFactorie
analyzers.add(new PreBuiltAnalyzerProviderFactory("danish", CachingStrategy.LUCENE, DanishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("dutch", CachingStrategy.LUCENE, DutchAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("english", CachingStrategy.LUCENE, EnglishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("estonian", CachingStrategy.LUCENE, EstonianAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("finnish", CachingStrategy.LUCENE, FinnishAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("french", CachingStrategy.LUCENE, FrenchAnalyzer::new));
analyzers.add(new PreBuiltAnalyzerProviderFactory("galician", CachingStrategy.LUCENE, GalicianAnalyzer::new));
Expand Down
@@ -0,0 +1,45 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

/**
 * Analyzer provider that constructs a single Lucene {@link EstonianAnalyzer}
 * at creation time and hands the same instance back from {@link #get()}.
 */
public class EstonianAnalyzerProvider extends AbstractIndexAnalyzerProvider<EstonianAnalyzer> {

    private final EstonianAnalyzer analyzer;

    /**
     * Creates the provider and eagerly builds the underlying analyzer.
     *
     * @param indexSettings settings of the index, forwarded to the superclass
     * @param env           environment passed to {@code Analysis.parseStopWords}
     * @param name          name of this analyzer, forwarded to the superclass
     * @param settings      analyzer settings used to resolve the stop words and the stem exclusion set
     */
    EstonianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        // Stop words are resolved first, with Lucene's default Estonian stop set passed as the default.
        final CharArraySet stopWords = Analysis.parseStopWords(env, settings, EstonianAnalyzer.getDefaultStopSet());
        // Stem exclusions are resolved second, with an empty set passed as the default.
        final CharArraySet stemExclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);

        final EstonianAnalyzer built = new EstonianAnalyzer(stopWords, stemExclusions);
        built.setVersion(version);
        this.analyzer = built;
    }

    /**
     * @return the analyzer instance built in the constructor
     */
    @Override
    public EstonianAnalyzer get() {
        return analyzer;
    }
}
Expand Up @@ -65,6 +65,7 @@
import org.tartarus.snowball.ext.DanishStemmer;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.EnglishStemmer;
import org.tartarus.snowball.ext.EstonianStemmer;
import org.tartarus.snowball.ext.FinnishStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import org.tartarus.snowball.ext.German2Stemmer;
Expand Down Expand Up @@ -142,6 +143,9 @@ public TokenStream create(TokenStream tokenStream) {
} else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) {
return new EnglishPossessiveFilter(tokenStream);

} else if ("estonian".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new EstonianStemmer());

// Finnish stemmers
} else if ("finnish".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new FinnishStemmer());
Expand Down
Expand Up @@ -466,6 +466,35 @@
- length: { tokens: 1 }
- match: { tokens.0.token: book }

---
# REST test for the `estonian` analyzer.
# Creates index `test` with a custom analyzer `my_analyzer` of type `estonian`,
# then analyzes the text `teadaolevalt` twice and expects the single token
# `teadaole` both times.
"estonian":
- do:
indices.create:
index: test
body:
settings:
analysis:
analyzer:
my_analyzer:
type: estonian

# First pass: request the analyzer by the name `estonian`, with no index given.
- do:
indices.analyze:
body:
text: teadaolevalt
analyzer: estonian
- length: { tokens: 1 }
- match: { tokens.0.token: teadaole }

# Second pass: same text through `my_analyzer` on index `test`.
- do:
indices.analyze:
index: test
body:
text: teadaolevalt
analyzer: my_analyzer
- length: { tokens: 1 }
- match: { tokens.0.token: teadaole }

---
"finnish":
- do:
Expand Down
Expand Up @@ -33,6 +33,7 @@
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.et.EstonianAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
Expand Down Expand Up @@ -124,6 +125,7 @@ public static CharArraySet parseStemExclusion(Settings settings, CharArraySet de
entry("_danish_", DanishAnalyzer.getDefaultStopSet()),
entry("_dutch_", DutchAnalyzer.getDefaultStopSet()),
entry("_english_", EnglishAnalyzer.getDefaultStopSet()),
entry("_estonian_", EstonianAnalyzer.getDefaultStopSet()),
entry("_finnish_", FinnishAnalyzer.getDefaultStopSet()),
entry("_french_", FrenchAnalyzer.getDefaultStopSet()),
entry("_galician_", GalicianAnalyzer.getDefaultStopSet()),
Expand Down

0 comments on commit c3ca133

Please sign in to comment.