Skip to content

Commit

Permalink
Provide default fuzzy confidence conversions
Browse files Browse the repository at this point in the history
For Levenshtein distance, 1/(input+1) is used.
For trigram similarity, the value is used "as is".
  • Loading branch information
mederly committed Sep 13, 2022
1 parent e44004c commit 643799e
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -269,11 +269,6 @@ private SearchSpec createSearchSpec(Task task, OperationResult result)
throw new ConfigurationException("Please specify Levenshtein or trigram similarity fuzzy string matching method");
}

/**
 * Returns the confidence expression explicitly configured in the search definition,
 * or {@code null} when no confidence definition is present.
 */
private ExpressionType getConfidenceExpression() {
    ItemSearchConfidenceDefinitionType configured = searchDefinitionBean.getConfidence();
    if (configured == null) {
        return null;
    }
    return configured.getExpression();
}

private QName getMatchingRuleName() {
return Objects.requireNonNullElse(
searchDefinitionBean.getMatchingRule(),
Expand Down Expand Up @@ -311,6 +306,36 @@ public boolean isApplicable() throws SchemaException {
return resultingConfidence;
}

/**
 * Returns the confidence expression to use for this item.
 *
 * An expression explicitly present in the confidence definition takes precedence. If there is
 * none, the result of {@code getDefaultConfidenceExpression()} is returned, which may be
 * {@code null}.
 */
private ExpressionType getConfidenceExpression() {
    ItemSearchConfidenceDefinitionType confidenceDef = searchDefinitionBean.getConfidence();
    if (confidenceDef != null) {
        ExpressionType explicit = confidenceDef.getExpression();
        if (explicit != null) {
            return explicit;
        }
    }
    // Nothing configured explicitly: fall back to the default.
    return getDefaultConfidenceExpression();
}

/**
 * Provides the default confidence expression for a fuzzy search.
 *
 * When a Levenshtein definition is present, the script {@code 1/(input+1)} is used. Otherwise,
 * when a similarity definition is present, the script {@code input} is used.
 *
 * @return the default expression, or {@code null} if there is no fuzzy definition or it
 * specifies neither Levenshtein nor similarity
 */
private ExpressionType getDefaultConfidenceExpression() {
    FuzzySearchDefinitionType fuzzy = searchDefinitionBean.getFuzzy();
    if (fuzzy == null) {
        return null;
    }
    if (fuzzy.getLevenshtein() != null) {
        return createConfidenceExpression("1/(input+1)");
    }
    if (fuzzy.getSimilarity() != null) {
        return createConfidenceExpression("input");
    }
    // Neither method is specified; this should not occur anyway.
    return null;
}

/**
 * Builds an expression whose evaluator is a script with the given code.
 *
 * @param code the script source to put into the script evaluator
 * @return a new expression holding the script evaluator
 */
private ExpressionType createConfidenceExpression(String code) {
    ExpressionType expression = new ExpressionType();
    ObjectFactory factory = new ObjectFactory();
    ScriptExpressionEvaluatorType script = new ScriptExpressionEvaluatorType().code(code);
    return expression.expressionEvaluator(factory.createScript(script));
}

/** Returns the values of given metric (e.g. Levenshtein distance) for given candidate for this item. */
private @NotNull List<Double> computeMatchMetricValues(ObjectType candidate, Task task, OperationResult result)
throws SchemaException, ExpressionEvaluationException, CommunicationException, SecurityViolationException,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ public class TestCorrelators extends AbstractInternalModelIntegrationTest {
new File(TEST_DIR, "correlator-by-name-original.xml"),
USER_TEMPLATE_ORIGINAL_INDEXING);

private static final File FILE_ACCOUNTS_BY_NAME_FUZZY = new File(TEST_DIR, "accounts-by-name-fuzzy.csv");
private static final TestCorrelator CORRELATOR_BY_NAME_FUZZY =
private static final File FILE_ACCOUNTS_BY_NAME_FUZZY_FIXED = new File(TEST_DIR, "accounts-by-name-fuzzy-fixed.csv");
private static final TestCorrelator CORRELATOR_BY_NAME_FUZZY_FIXED =
new TestCorrelator(
new File(TEST_DIR, "correlator-by-name-fuzzy.xml"),
new File(TEST_DIR, "correlator-by-name-fuzzy-fixed.xml"),
USER_TEMPLATE_DEFAULT_INDEXING);

private static final File FILE_ACCOUNTS_BY_NAME_FUZZY_GRADUAL =
Expand Down Expand Up @@ -241,9 +241,9 @@ public void test210CorrelateByNameOriginal() throws Exception {
}

@Test
public void test220CorrelateByNameFuzzy() throws Exception {
public void test220CorrelateByNameFuzzyFixed() throws Exception {
skipIfNotNativeRepository();
executeTest(CORRELATOR_BY_NAME_FUZZY, FILE_USERS_ITEMS, FILE_ACCOUNTS_BY_NAME_FUZZY);
executeTest(CORRELATOR_BY_NAME_FUZZY_FIXED, FILE_USERS_ITEMS, FILE_ACCOUNTS_BY_NAME_FUZZY_FIXED);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Users: John Smith, Mary Smith
# Normalization: poly string norm
# Search: given name with levenshtein(1), family name with similarity(0.5, inclusive) - unrealistic settings; just to test the algorithm
# local confidences fixed to 1.0

uid | givenName | familyName | expCandidates | expResult | _note
1 | John | Smith | smith1:1.0 | smith1 | Exact match
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Users: John Smith, Mary Smith
# Normalization: poly string norm
# Search: given name with levenshtein(4, exclusive), family name with similarity(0.2, inclusive) - unrealistic settings; just to test the algorithm
# local confidences using default algorithms

uid | givenName | familyName | expCandidates | expResult | _note
1 | John | Smith | smith1:1.0 | smith1 | Exact match
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

<itemsCorrelator
xmlns="http://midpoint.evolveum.com/xml/ns/public/common/common-3">
<name>by-name-levenshtein</name>
<name>by-name-fuzzy-fixed</name>
<item>
<ref>givenName</ref>
<search>
Expand All @@ -16,6 +16,13 @@
<threshold>1</threshold>
</levenshtein>
</fuzzy>
<confidence>
<expression>
<script>
<code>1</code> <!-- overriding the default computation -->
</script>
</expression>
</confidence>
</search>
</item>
<item>
Expand All @@ -26,6 +33,13 @@
<threshold>0.5</threshold>
</similarity>
</fuzzy>
<confidence>
<expression>
<script>
<code>1</code> <!-- overriding the default computation -->
</script>
</expression>
</confidence>
</search>
</item>
</itemsCorrelator>
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@
<inclusive>false</inclusive> <!-- just testing this -->
</levenshtein>
</fuzzy>
<confidence>
<expression>
<script>
<code>1 / (input+1)</code>
</script>
</expression>
</confidence>
</search>
</item>
<item>
Expand All @@ -35,13 +28,6 @@
<threshold>0.2</threshold>
</similarity>
</fuzzy>
<confidence>
<expression>
<script>
<code>input</code>
</script>
</expression>
</confidence>
</search>
</item>
</itemsCorrelator>

0 comments on commit 643799e

Please sign in to comment.