Skip to content

Commit

Permalink
Add fuzzy search support to "items" correlator
Browse files Browse the repository at this point in the history
Not quite working yet because of query interpretation issues;
see the disabled TestCorrelators#test220CorrelateByNameFuzzy.
  • Loading branch information
mederly committed Aug 9, 2022
1 parent 6cb231d commit 6df0bd4
Show file tree
Hide file tree
Showing 10 changed files with 256 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
xmlns:a="http://prism.evolveum.com/xml/ns/public/annotation-3"
xmlns:c="http://midpoint.evolveum.com/xml/ns/public/common/common-3"
xmlns:t="http://prism.evolveum.com/xml/ns/public/types-3"
xmlns:mr="http://prism.evolveum.com/xml/ns/public/matching-rule-3"
xmlns:jaxb="http://java.sun.com/xml/ns/jaxb"
elementFormDefault="qualified"
xmlns:xjc="http://java.sun.com/xml/ns/jaxb/xjc"
Expand Down Expand Up @@ -473,10 +474,10 @@
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="index" type="xsd:string" minOccurs="0">
<xsd:element name="search" type="tns:ItemSearchDefinitionType" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
What index to use when matching this item? If none is specified, the default one is used. (TODO)
How to search for the item values?
</xsd:documentation>
</xsd:annotation>
</xsd:element>
Expand Down Expand Up @@ -1557,4 +1558,129 @@
</xsd:extension>
</xsd:complexContent>
</xsd:complexType>

<!--
Search settings for a single item: an optional index name, followed by
either a matching rule or a fuzzy search definition.
-->
<xsd:complexType name="ItemSearchDefinitionType">
<xsd:annotation>
<xsd:documentation>
Specifies how to search for the item.
</xsd:documentation>
<xsd:appinfo>
<a:since>4.6</a:since>
<a:container>true</a:container>
<a:experimental>true</a:experimental>
</xsd:appinfo>
</xsd:annotation>
<xsd:sequence>
<xsd:element name="index" type="xsd:string" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
What index to use when matching this item? If none is specified, then:
If the item is indexed, the default index is used. If it is not, the original value is used.
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<!--
"matchingRule" and "fuzzy" are alternatives: the choice admits at most one of them.
Both carry minOccurs="0", so specifying neither is valid as well.
-->
<xsd:choice>
<!-- The default value is a QName, resolved through the "mr" prefix declared on the schema element. -->
<xsd:element name="matchingRule" type="xsd:QName" minOccurs="0" default="mr:default">
<xsd:annotation>
<xsd:documentation>
What matching rule to use?
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="fuzzy" type="tns:FuzzySearchDefinitionType" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
Specification of the fuzzy search to be used.
</xsd:documentation>
</xsd:annotation>
</xsd:element>
</xsd:choice>
</xsd:sequence>
<xsd:attribute name="id" type="xsd:long"/>
</xsd:complexType>

<!--
Selects the fuzzy matching algorithm: Levenshtein edit distance or trigram similarity.
-->
<xsd:complexType name="FuzzySearchDefinitionType">
<xsd:annotation>
<xsd:documentation>
Specifies the fuzzy search algorithm to use for searching.
</xsd:documentation>
<xsd:appinfo>
<a:since>4.6</a:since>
<a:container>true</a:container>
<a:experimental>true</a:experimental>
</xsd:appinfo>
</xsd:annotation>
<xsd:sequence>
<!--
The two algorithms are alternatives (at most one may be present). Because both have
minOccurs="0", the schema also accepts an element with neither of them; the consuming
code (CorrelationItem#getFuzzyMatchingMethod) rejects that case with a ConfigurationException.
-->
<xsd:choice>
<xsd:element name="levenshtein" type="tns:LevenshteinDistanceSearchDefinitionType" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
Search using Levenshtein edit distance.
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<xsd:element name="similarity" type="tns:TrigramSimilaritySearchDefinitionType" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
Search using trigram similarity.
</xsd:documentation>
</xsd:annotation>
</xsd:element>
</xsd:choice>
</xsd:sequence>
<xsd:attribute name="id" type="xsd:long"/>
</xsd:complexType>

<!--
Parameters of a Levenshtein (edit distance) fuzzy search.
-->
<xsd:complexType name="LevenshteinDistanceSearchDefinitionType">
<xsd:annotation>
<xsd:documentation>
Specifies the use of Levenshtein edit distance for searching.
</xsd:documentation>
<xsd:appinfo>
<a:since>4.6</a:since>
<a:container>true</a:container>
<a:experimental>true</a:experimental>
</xsd:appinfo>
</xsd:annotation>
<xsd:sequence>
<!--
Optional as far as the schema is concerned (minOccurs="0"), but the consuming code
(CorrelationItem#getFuzzyMatchingMethod) insists on a value and reports
"Please specify Levenshtein edit distance threshold" when it is missing.
-->
<xsd:element name="threshold" type="xsd:int" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
Upper limit on the edit distance to be matched. (Inclusive.)
</xsd:documentation>
</xsd:annotation>
</xsd:element>
</xsd:sequence>
<xsd:attribute name="id" type="xsd:long"/>
</xsd:complexType>

<!--
Parameters of a trigram-similarity fuzzy search.
-->
<xsd:complexType name="TrigramSimilaritySearchDefinitionType">
<xsd:annotation>
<xsd:documentation>
Specifies the use of trigram similarity for searching.
</xsd:documentation>
<xsd:appinfo>
<a:since>4.6</a:since>
<a:container>true</a:container>
<a:experimental>true</a:experimental>
</xsd:appinfo>
</xsd:annotation>
<xsd:sequence>
<!--
Optional as far as the schema is concerned (minOccurs="0"), but the consuming code
(CorrelationItem#getFuzzyMatchingMethod) insists on a value and reports
"Please specify trigram similarity threshold" when it is missing.
-->
<xsd:element name="threshold" type="xsd:float" minOccurs="0">
<xsd:annotation>
<xsd:documentation>
Lower limit on the similarity to be matched.
</xsd:documentation>
</xsd:annotation>
</xsd:element>
<!--
The consuming code treats only an explicit "false" as exclusive; an absent value
behaves like the declared default of "true".
-->
<xsd:element name="inclusive" type="xsd:boolean" minOccurs="0" default="true">
<xsd:annotation>
<xsd:documentation>
Is the value of "threshold" meant to be inclusive?
</xsd:documentation>
</xsd:annotation>
</xsd:element>
</xsd:sequence>
<xsd:attribute name="id" type="xsd:long"/>
</xsd:complexType>
</xsd:schema>
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,7 @@ public interface Normalization {

@NotNull PrismPropertyDefinition<String> getIndexItemDefinition();

@NotNull String normalize(@NotNull String input, Task task, OperationResult result) throws SchemaException, ExpressionEvaluationException, CommunicationException, SecurityViolationException, ConfigurationException, ObjectNotFoundException;
@NotNull String normalize(@NotNull String input, Task task, OperationResult result)
throws SchemaException, ExpressionEvaluationException, CommunicationException,
SecurityViolationException, ConfigurationException, ObjectNotFoundException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@

import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import javax.xml.namespace.QName;

import com.evolveum.midpoint.model.api.identities.IdentityItemConfiguration;
import com.evolveum.midpoint.model.api.indexing.IndexingItemConfiguration;
import com.evolveum.midpoint.model.api.indexing.Normalization;
import com.evolveum.midpoint.model.impl.lens.identities.IndexingManager;

import com.evolveum.midpoint.prism.query.FuzzyStringMatchFilter;
import com.evolveum.midpoint.prism.query.FuzzyStringMatchFilter.FuzzyMatchingMethod;
import com.evolveum.midpoint.schema.result.OperationResult;
import com.evolveum.midpoint.task.api.Task;

import com.evolveum.midpoint.util.exception.*;

import com.evolveum.midpoint.xml.ns._public.common.common_3.*;

import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

Expand All @@ -37,8 +41,6 @@
import com.evolveum.midpoint.util.MiscUtil;
import com.evolveum.midpoint.util.logging.Trace;
import com.evolveum.midpoint.util.logging.TraceManager;
import com.evolveum.midpoint.xml.ns._public.common.common_3.ItemCorrelationType;
import com.evolveum.midpoint.xml.ns._public.common.common_3.ObjectType;
import com.evolveum.prism.xml.ns._public.types_3.ItemPathType;

/**
Expand All @@ -60,23 +62,25 @@ public class CorrelationItem implements DebugDumpable {
@Nullable private final Normalization normalization;

// TODO
@Nullable private final IdentityItemConfiguration identityItemConfiguration;
@Nullable private final IndexingItemConfiguration indexingItemConfiguration;

/** Note we ignore "index" from this configuration. It is already processed into {@link #normalization} field. */
@NotNull private final ItemSearchDefinitionType searchDefinitionBean;

// TODO
@NotNull private final List<? extends PrismValue> prismValues;

private CorrelationItem(
@NotNull String name,
@NotNull ItemPath itemPath,
@Nullable Normalization normalization,
@Nullable IdentityItemConfiguration identityItemConfiguration,
@Nullable ItemSearchDefinitionType searchDefinitionBean,
@Nullable IndexingItemConfiguration indexingItemConfiguration,
@NotNull List<? extends PrismValue> prismValues) {
this.name = name;
this.itemPath = itemPath;
this.normalization = normalization;
this.identityItemConfiguration = identityItemConfiguration;
this.searchDefinitionBean = searchDefinitionBean != null ? searchDefinitionBean : new ItemSearchDefinitionType();
this.indexingItemConfiguration = indexingItemConfiguration;
this.prismValues = prismValues;
}
Expand All @@ -88,15 +92,21 @@ public static CorrelationItem create(
throws ConfigurationException {
ItemPath path = getPath(itemBean);
IndexingItemConfiguration indexingConfig = getIndexingItemConfiguration(itemBean, correlatorContext);
String explicitIndexName = getExplicitIndexName(itemBean);
return new CorrelationItem(
getName(itemBean),
path,
getNormalization(indexingConfig, itemBean.getIndex(), path),
getIdentityItemConfiguration(itemBean, correlatorContext),
getNormalization(indexingConfig, explicitIndexName, path),
itemBean.getSearch(),
indexingConfig,
getPrismValues(preFocus, path));
}

/**
 * Reads the index name that was explicitly configured for the given correlation item.
 *
 * @param itemBean correlation item configuration; its "search" part may be missing
 * @return the value of {@code search/index}, or {@code null} when there is no "search" part
 * (or when the "search" part itself carries no index)
 */
private static String getExplicitIndexName(ItemCorrelationType itemBean) {
    ItemSearchDefinitionType searchDefinition = itemBean.getSearch();
    if (searchDefinition == null) {
        // No search definition at all, hence no explicit index.
        return null;
    }
    return searchDefinition.getIndex();
}

private static Normalization getNormalization(IndexingItemConfiguration indexingConfig, String index, ItemPath path)
throws ConfigurationException {
if (indexingConfig == null) {
Expand All @@ -114,16 +124,6 @@ private static Normalization getNormalization(IndexingItemConfiguration indexing
}
}

private static IdentityItemConfiguration getIdentityItemConfiguration(
@NotNull ItemCorrelationType itemBean, @NotNull CorrelatorContext<?> correlatorContext) {
ItemPathType itemPathBean = itemBean.getPath();
if (itemPathBean != null) {
return correlatorContext.getIdentityManagementConfiguration().getForPath(itemPathBean.getItemPath());
} else {
return null;
}
}

private static IndexingItemConfiguration getIndexingItemConfiguration(
@NotNull ItemCorrelationType itemBean, @NotNull CorrelatorContext<?> correlatorContext) {
ItemPathType itemPathBean = itemBean.getPath();
Expand Down Expand Up @@ -223,20 +223,54 @@ S_FilterExit addClauseToQueryBuilder(
if (indexingItemConfiguration != null) {
assert normalization != null;
ItemPath normalizedItemPath = normalization.getIndexItemPath();
Object normalizedValue = IndexingManager.normalizeValue(valueToFind, normalization, task, result);
String normalizedValue = IndexingManager.normalizeValue(valueToFind, normalization, task, result);
LOGGER.trace("Will look for normalized value '{}' in '{}' (of '{}')", normalizedValue, normalizedItemPath, itemPath);
ItemDefinition<?> normalizedItemDefinition = normalization.getIndexItemDefinition();
return builder
.item(normalizedItemPath, normalizedItemDefinition)
.eq(normalizedValue);
// TODO matching rule

FuzzySearchDefinitionType fuzzyDef = searchDefinitionBean.getFuzzy();
if (fuzzyDef != null) {
return builder
.item(normalizedItemPath, normalizedItemDefinition)
.fuzzyString(normalizedValue, getFuzzyMatchingMethod(fuzzyDef));
} else {
return builder
.item(normalizedItemPath, normalizedItemDefinition)
.eq(normalizedValue)
.matching(getMatchingRuleName());
}
} else {
LOGGER.trace("Will look for value '{}' of '{}'", valueToFind, itemPath);
return builder
.item(itemPath)
.eq(valueToFind);
// TODO matching rule
.eq(valueToFind)
.matching(getMatchingRuleName());
}
}

/**
 * Translates the configured fuzzy search definition into the matching method used by the query filter.
 *
 * The Levenshtein definition takes precedence; the trigram similarity definition is consulted only
 * when no Levenshtein definition is present.
 *
 * @param fuzzyDef fuzzy search configuration (dereferenced without a null check)
 * @return a {@link FuzzyStringMatchFilter.Levenshtein} or {@link FuzzyStringMatchFilter.Similarity} instance
 * @throws ConfigurationException if neither method is configured; a missing threshold is handed to
 * {@code MiscUtil.configNonNull}, which is relied upon to reject it (presumably with this exception as well)
 */
private FuzzyMatchingMethod getFuzzyMatchingMethod(FuzzySearchDefinitionType fuzzyDef) throws ConfigurationException {
    LevenshteinDistanceSearchDefinitionType levenshteinDef = fuzzyDef.getLevenshtein();
    if (levenshteinDef == null) {
        TrigramSimilaritySearchDefinitionType similarityDef = fuzzyDef.getSimilarity();
        if (similarityDef == null) {
            throw new ConfigurationException(
                    "Please specify Levenshtein or trigram similarity fuzzy string matching method");
        }
        return new FuzzyStringMatchFilter.Similarity(
                MiscUtil.configNonNull(
                        similarityDef.getThreshold(),
                        () -> "Please specify trigram similarity threshold"),
                // Only an explicit "false" turns this off; an unset value counts as "true".
                !Boolean.FALSE.equals(similarityDef.isInclusive()));
    }
    return new FuzzyStringMatchFilter.Levenshtein(
            MiscUtil.configNonNull(
                    levenshteinDef.getThreshold(),
                    () -> "Please specify Levenshtein edit distance threshold"),
            // Fixed second argument; presumably the "inclusive" flag (the schema documents the threshold as inclusive).
            true);
}

/**
 * Determines the matching rule to be applied to the equality clause.
 *
 * @return the matching rule from the search definition, or
 * {@code PrismConstants.DEFAULT_MATCHING_RULE_NAME} when none is configured
 */
private QName getMatchingRuleName() {
    QName configuredRule = searchDefinitionBean.getMatchingRule();
    if (configuredRule != null) {
        return configuredRule;
    }
    return PrismConstants.DEFAULT_MATCHING_RULE_NAME;
}

/**
Expand Down Expand Up @@ -275,7 +309,8 @@ public String toString() {
return "CorrelationItem{" +
"name=" + name +
", itemPath=" + itemPath +
", identityConfig=" + identityItemConfiguration +
", normalization=" + normalization +
", indexing=" + indexingItemConfiguration +
'}';
}

Expand All @@ -293,8 +328,9 @@ public String debugDump(int indent) {
StringBuilder sb = DebugUtil.createTitleStringBuilderLn(getClass(), indent);
DebugUtil.debugDumpWithLabelLn(sb, "name", name, indent + 1);
DebugUtil.debugDumpWithLabelLn(sb, "itemPath", String.valueOf(itemPath), indent + 1);
DebugUtil.debugDumpWithLabelLn(sb, "normalization", String.valueOf(normalization), indent + 1);
DebugUtil.debugDumpWithLabelLn(
sb, "identityItemConfiguration", String.valueOf(identityItemConfiguration), indent + 1);
sb, "indexingItemConfiguration", String.valueOf(indexingItemConfiguration), indent + 1);
DebugUtil.debugDumpWithLabel(sb, "values", prismValues, indent + 1);
return sb.toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ public ItemPath getIndexItemPath() {
public String toString() {
return "Normalization{" +
"name='" + name + '\'' +
",default='" + isDefault() + '\'' +
(isDefault() ? ",default" : "") +
'}';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ String execute(@NotNull String input, Task task, OperationResult result) {

@Override
String asSuffix() {
return NAME_PREFIX + "." + length;
return NAME_PREFIX + length;
}
}

Expand Down

0 comments on commit 6df0bd4

Please sign in to comment.