Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support parsing base-form parts of a compound word #2

Merged
merged 2 commits into from
Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/main/java/fi/evident/raudikko/Analysis.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.jetbrains.annotations.Nullable;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
Expand Down Expand Up @@ -64,6 +65,7 @@ public final class Analysis implements Cloneable {
private boolean malagaVapaaJalkiosa = false;
private boolean possibleGeographicalName = false;
private @Nullable String requireFollowingVerb;
private @Nullable List<String> baseFormParts;

public @Nullable String getBaseForm() {
return baseForm;
Expand Down Expand Up @@ -209,6 +211,14 @@ public void setRequireFollowingVerb(@Nullable String requireFollowingVerb) {
this.requireFollowingVerb = requireFollowingVerb;
}

/**
 * Sets the base-form parts of this analysis.
 *
 * The given list is stored by reference; no copy is made.
 *
 * @param baseFormParts the parts to store, or {@code null} when there are none
 */
public void setBaseFormParts(@Nullable List<String> baseFormParts) {
this.baseFormParts = baseFormParts;
}

/**
 * Returns the base-form parts of this analysis.
 *
 * The returned list is the same instance that was passed to
 * {@link #setBaseFormParts(List)}; no copy is made.
 *
 * @return the stored parts, or {@code null} if none have been set
 */
public @Nullable List<String> getBaseFormParts() {
return baseFormParts;
}

@Override
public @NotNull Analysis clone() {
try {
Expand Down Expand Up @@ -277,6 +287,7 @@ public String toString() {
", malagaVapaaJalkiosa=" + malagaVapaaJalkiosa +
", possibleGeographicalName=" + possibleGeographicalName +
", requireFollowingVerb='" + requireFollowingVerb + '\'' +
", baseFormParts='" + baseFormParts + '\'' +
'}';
}
}
2 changes: 0 additions & 2 deletions src/main/java/fi/evident/raudikko/Analyzer.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ public interface Analyzer {

/**
* Analyze given word and return a list of possible interpretations.
*
* At most {@code maxResults} results are returned.
*/
default @NotNull List<String> baseForms(@NotNull CharSequence word) {
List<Analysis> analyses = analyze(word);
Expand Down
107 changes: 107 additions & 0 deletions src/main/java/fi/evident/raudikko/AnalyzerConfiguration.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* The contents of this file are subject to the Mozilla Public License Version
* 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* https://www.mozilla.org/en-US/MPL/2.0/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Libvoikko: Library of natural language processing tools.
* The Initial Developer of the Original Code is Harri Pitkänen <hatapitk@iki.fi>.
* Portions created by the Initial Developer are Copyright (C) 2012
* the Initial Developer. All Rights Reserved.
*
* Raudikko, the Java port of the Initial Code is Copyright (C) 2020 by
* Evident Solutions Oy. All Rights Reserved.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*/

package fi.evident.raudikko;

/**
 * Mutable set of switches selecting which parts of an analysis are produced.
 *
 * A freshly constructed instance has every switch turned on. If this
 * configuration is modified after constructing the analyzer the behaviour
 * is undefined.
 */
public final class AnalyzerConfiguration {

    private boolean includeStructure = true;
    private boolean includeBaseForm = true;
    private boolean includeBasicAttributes = true;
    private boolean includeOrganizationNameAnalysis = true;
    private boolean includeFstOutput = true;
    private boolean includeBaseFormParts = true;

    /**
     * @return whether the structure should be included in the analysis
     */
    public boolean isIncludeStructure() {
        return this.includeStructure;
    }

    /**
     * @param includeStructure whether the structure should be included in the analysis
     */
    public void setIncludeStructure(boolean includeStructure) {
        this.includeStructure = includeStructure;
    }

    /**
     * @return whether the base form should be included in the analysis
     */
    public boolean isIncludeBaseForm() {
        return this.includeBaseForm;
    }

    /**
     * @param includeBaseForm whether the base form should be included in the analysis
     */
    public void setIncludeBaseForm(boolean includeBaseForm) {
        this.includeBaseForm = includeBaseForm;
    }

    /**
     * @return whether the basic attributes should be included in the analysis
     */
    public boolean isIncludeBasicAttributes() {
        return this.includeBasicAttributes;
    }

    /**
     * @param includeBasicAttributes whether the basic attributes should be included in the analysis
     */
    public void setIncludeBasicAttributes(boolean includeBasicAttributes) {
        this.includeBasicAttributes = includeBasicAttributes;
    }

    /**
     * @return whether the organization name analysis should be performed
     */
    public boolean isIncludeOrganizationNameAnalysis() {
        return this.includeOrganizationNameAnalysis;
    }

    /**
     * @param includeOrganizationNameAnalysis whether the organization name analysis should be performed
     */
    public void setIncludeOrganizationNameAnalysis(boolean includeOrganizationNameAnalysis) {
        this.includeOrganizationNameAnalysis = includeOrganizationNameAnalysis;
    }

    /**
     * @return whether the FST output should be included in the analysis
     */
    public boolean isIncludeFstOutput() {
        return this.includeFstOutput;
    }

    /**
     * @param includeFstOutput whether the FST output should be included in the analysis
     */
    public void setIncludeFstOutput(boolean includeFstOutput) {
        this.includeFstOutput = includeFstOutput;
    }

    /**
     * @return whether the base-form parts should be included in the analysis
     */
    public boolean isIncludeBaseFormParts() {
        return this.includeBaseFormParts;
    }

    /**
     * @param includeBaseFormParts whether the base-form parts should be included in the analysis
     */
    public void setIncludeBaseFormParts(boolean includeBaseFormParts) {
        this.includeBaseFormParts = includeBaseFormParts;
    }

    /**
     * Renders every switch as {@code name=value}, in declaration order.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("AnalyzerConfiguration{");
        text.append("includeStructure=").append(includeStructure);
        text.append(", includeBaseForm=").append(includeBaseForm);
        text.append(", includeBasicAttributes=").append(includeBasicAttributes);
        text.append(", includeOrganizationNameAnalysis=").append(includeOrganizationNameAnalysis);
        text.append(", includeFstOutput=").append(includeFstOutput);
        text.append(", includeBaseFormParts=").append(includeBaseFormParts);
        return text.append('}').toString();
    }
}
12 changes: 11 additions & 1 deletion src/main/java/fi/evident/raudikko/Morphology.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,16 @@ private Morphology(@NotNull UnweightedTransducer transducer) {
* shared between threads.
*/
public @NotNull Analyzer newAnalyzer() {
return new FinnishVfstAnalyzer(transducer);
return newAnalyzer(new AnalyzerConfiguration());
}

/**
 * Create a new {@link Analyzer} for this morphology using the given configuration.
 *
 * The analyzer is a mutable object that can be used repeatedly, but may not be
 * shared between threads.
 *
 * @param configuration the configuration handed to the created analyzer
 * @return a new analyzer backed by this morphology's transducer
 */
public @NotNull Analyzer newAnalyzer(@NotNull AnalyzerConfiguration configuration) {
return new FinnishVfstAnalyzer(transducer, configuration);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.util.ArrayList;
import java.util.List;

import static fi.evident.raudikko.internal.utils.StringUtils.*;
import static java.lang.Character.isDigit;
import static java.lang.Character.toUpperCase;
Expand Down Expand Up @@ -165,6 +168,22 @@ else if (!classTagSeen && tag.matches(Tags.lu)) {
return baseform.toString();
}

/**
 * Collects the contents of every {@code Tags.xp} tag in the buffer.
 *
 * The buffer is rewound to its start and scanned token by token. For each
 * token whose current tag matches {@code Tags.xp}, the X-tag contents are
 * read and passed through {@code withoutChar(..., '=')} (presumably this
 * strips the {@code '='} characters; confirm against the helper). Results
 * that are empty after that step are skipped.
 *
 * @param tokenizer the symbol buffer to scan; its read position is changed
 * @return the non-empty parts in the order they were encountered, possibly empty
 */
static @NotNull List<String> parseBaseFormParts(@NotNull SymbolBuffer tokenizer) {
    tokenizer.moveToStart();
    List<String> result = new ArrayList<>();

    while (tokenizer.nextToken()) {
        Symbol current = tokenizer.getCurrentTag();

        // Only xp tags carry a base-form part; everything else is ignored.
        if (current == null || !current.matches(Tags.xp))
            continue;

        String contents = withoutChar(tokenizer.readXTagContents(), '=');
        if (!contents.isEmpty())
            result.add(contents);
    }

    return result;
}

private static final class StructureIterator {

private final @NotNull String structure;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,31 +34,36 @@

import fi.evident.raudikko.Analysis;
import fi.evident.raudikko.Analyzer;
import fi.evident.raudikko.AnalyzerConfiguration;
import fi.evident.raudikko.internal.fst.Symbol;
import fi.evident.raudikko.internal.fst.UnweightedTransducer;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

import static fi.evident.raudikko.internal.morphology.BaseForm.parseBaseFormParts;
import static fi.evident.raudikko.internal.morphology.BaseForm.parseBaseform;
import static fi.evident.raudikko.internal.morphology.BasicAttributes.parseBasicAttributes;
import static fi.evident.raudikko.internal.morphology.Organization.organizationNameAnalysis;
import static fi.evident.raudikko.internal.morphology.Structure.parseStructure;
import static fi.evident.raudikko.internal.morphology.Validator.isValidAnalysis;
import static java.util.Objects.requireNonNull;

public final class FinnishVfstAnalyzer implements Analyzer {

private final @NotNull UnweightedTransducer transducer;
private final @NotNull List<Symbol> inputBuffer = new ArrayList<>(2000);
private final @NotNull Symbol[] output = new Symbol[2000];
private final @NotNull SymbolBuffer buffer = new SymbolBuffer(2000);
private final @NotNull AnalyzerConfiguration configuration;
private final short[] flags;
private static final int MAX_WORD_LENGTH = 255;

public FinnishVfstAnalyzer(@NotNull UnweightedTransducer transducer) {
public FinnishVfstAnalyzer(@NotNull UnweightedTransducer transducer, @NotNull AnalyzerConfiguration configuration) {
this.transducer = transducer;
this.flags = new short[transducer.flagDiacriticFeatureCount];
this.configuration = configuration;
}

@Override
Expand All @@ -72,7 +77,7 @@ public FinnishVfstAnalyzer(@NotNull UnweightedTransducer transducer) {
buffer.reset(output, depth);

if (isValidAnalysis(buffer))
createAnalysis(buffer, word.length(), results);
createAnalysis(buffer, word.length(), results, configuration);
});

return results;
Expand Down Expand Up @@ -100,20 +105,40 @@ public FinnishVfstAnalyzer(@NotNull UnweightedTransducer transducer) {
return results;
}

private static void createAnalysis(@NotNull SymbolBuffer buffer, int wordLength, @NotNull List<Analysis> results) {
String structure = parseStructure(buffer, wordLength);

private static void createAnalysis(@NotNull SymbolBuffer buffer,
int wordLength,
@NotNull List<Analysis> results,
@NotNull AnalyzerConfiguration configuration) {
Analysis analysis = new Analysis();
analysis.setStructure(structure);
analysis.setFstOutput(buffer.fullContents());
analysis.setBaseForm(parseBaseform(buffer, structure));

parseBasicAttributes(analysis, buffer);
boolean dependsOnStructure =
configuration.isIncludeStructure()
|| configuration.isIncludeBaseForm()
|| configuration.isIncludeOrganizationNameAnalysis();

String structure = dependsOnStructure ? parseStructure(buffer, wordLength) : null;

if (configuration.isIncludeStructure())
analysis.setStructure(requireNonNull(structure));

if (configuration.isIncludeBaseForm())
analysis.setBaseForm(parseBaseform(buffer, requireNonNull(structure)));

if (configuration.isIncludeFstOutput())
analysis.setFstOutput(buffer.fullContents());

if (configuration.isIncludeBaseFormParts())
analysis.setBaseFormParts(parseBaseFormParts(buffer));

if (configuration.isIncludeBasicAttributes())
parseBasicAttributes(analysis, buffer);

results.add(analysis);

Analysis organizationNameAnalysis = organizationNameAnalysis(analysis, buffer, structure);
if (organizationNameAnalysis != null)
results.add(organizationNameAnalysis);
if (configuration.isIncludeOrganizationNameAnalysis()) {
Analysis organizationNameAnalysis = organizationNameAnalysis(analysis, buffer, requireNonNull(structure));
if (organizationNameAnalysis != null)
results.add(organizationNameAnalysis);
}
}
}