-
Notifications
You must be signed in to change notification settings - Fork 188
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/master'
- Loading branch information
Showing
3 changed files
with
160 additions
and
0 deletions.
There are no files selected for viewing
37 changes: 37 additions & 0 deletions
37
infra/schema/src/main/java/com/evolveum/midpoint/schema/util/CorrelationAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package com.evolveum.midpoint.schema.util; | ||
|
||
import com.evolveum.midpoint.schema.util.correlation.Levenshtein; | ||
import com.evolveum.midpoint.schema.util.correlation.TriGram; | ||
|
||
import java.util.List; | ||
|
||
public class CorrelationAlgorithm { | ||
|
||
Levenshtein levenshtein; | ||
TriGram triGram; | ||
|
||
public CorrelationAlgorithm() { | ||
levenshtein = new Levenshtein(); | ||
triGram = new TriGram(); | ||
} | ||
|
||
public double triGramSimilarity(String lObject, String rObject) { | ||
return triGram.getSimilarity(lObject, rObject); | ||
} | ||
|
||
public List<String> triGramForm(String object) { | ||
return triGram.generateTriGram(object); | ||
} | ||
|
||
public double levenshteinSimilarity(String lObject, String rObject) { | ||
int levenshteinDistance = levenshteinDistance(lObject, rObject); | ||
return levenshtein.computeLevenshteinSimilarity(lObject, rObject, levenshteinDistance); | ||
} | ||
|
||
public int levenshteinDistance(String lObject, String rObject) { | ||
return levenshtein.computeLevenshteinDistance(lObject, rObject); | ||
} | ||
|
||
|
||
|
||
} |
45 changes: 45 additions & 0 deletions
45
infra/schema/src/main/java/com/evolveum/midpoint/schema/util/correlation/Levenshtein.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* | ||
* Copyright (C) 2010-2022 Evolveum and contributors | ||
* | ||
* This work is dual-licensed under the Apache License 2.0 | ||
* and European Union Public License. See LICENSE file for details. | ||
*/ | ||
package com.evolveum.midpoint.schema.util.correlation; | ||
|
||
import java.util.stream.IntStream; | ||
|
||
/**
 * Levenshtein (edit) distance and a similarity score derived from it.
 *
 * <p>Strings are compared {@code char} by {@code char} (UTF-16 code units), and
 * every insertion, deletion and substitution costs 1.
 */
public class Levenshtein {

    /**
     * Converts an edit distance into a similarity score using
     * {@code 1 - distance / max(length(lObject), length(rObject))}.
     *
     * <p>Two empty strings are treated as identical and score {@code 1.0};
     * without that guard the formula would evaluate {@code 0.0 / 0} and yield
     * {@code NaN}.
     *
     * @param lObject first compared string, must not be {@code null}
     * @param rObject second compared string, must not be {@code null}
     * @param levenshteinDistance edit distance between the two strings; the value
     *        is used as given and is not re-validated against the strings
     * @return the similarity score; {@code 1.0} when the distance is 0
     * @throws NullPointerException if either string is {@code null}
     */
    public double computeLevenshteinSimilarity(String lObject, String rObject, int levenshteinDistance) {
        int longestLength = Math.max(lObject.length(), rObject.length());
        if (longestLength == 0) {
            // Both strings are empty: nothing differs, and dividing by zero must be avoided.
            return 1.0;
        }
        return 1 - (((double) levenshteinDistance) / longestLength);
    }

    /**
     * Computes the Levenshtein distance between two strings: the minimum number
     * of single-character insertions, deletions and substitutions that turn one
     * string into the other.
     *
     * @param lObject first string, must not be {@code null}
     * @param rObject second string, must not be {@code null}
     * @return the edit distance; 0 when the strings are equal
     * @throws NullPointerException if either string is {@code null}
     */
    public int computeLevenshteinDistance(String lObject, String rObject) {
        if (lObject.equals(rObject)) {
            return 0;
        }
        // Against an empty string the distance is simply the other string's length.
        if (lObject.isEmpty()) {
            return rObject.length();
        }
        if (rObject.isEmpty()) {
            return lObject.length();
        }

        int rows = lObject.length();
        int columns = rObject.length();

        // Only the previous and the current row of the dynamic-programming table
        // are ever read, so two rolling rows replace the full matrix.
        int[] previousRow = new int[columns + 1];
        int[] currentRow = new int[columns + 1];

        // Row 0: transforming the empty prefix of lObject into a prefix of rObject
        // of length j takes j insertions.
        for (int j = 0; j <= columns; j++) {
            previousRow[j] = j;
        }

        for (int i = 1; i <= rows; i++) {
            // Column 0: transforming a prefix of lObject of length i into the empty
            // string takes i deletions.
            currentRow[0] = i;
            char leftChar = lObject.charAt(i - 1);

            for (int j = 1; j <= columns; j++) {
                int substitutionCost = (leftChar == rObject.charAt(j - 1)) ? 0 : 1;

                int deletion = previousRow[j] + 1;
                int insertion = currentRow[j - 1] + 1;
                int substitution = previousRow[j - 1] + substitutionCost;

                currentRow[j] = Math.min(Math.min(deletion, insertion), substitution);
            }

            // The row just filled becomes the "previous" row of the next iteration.
            int[] swap = previousRow;
            previousRow = currentRow;
            currentRow = swap;
        }

        // After the final swap the last computed row lives in previousRow.
        return previousRow[columns];
    }
}
78 changes: 78 additions & 0 deletions
78
infra/schema/src/main/java/com/evolveum/midpoint/schema/util/correlation/TriGram.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/* | ||
* Copyright (C) 2010-2022 Evolveum and contributors | ||
* | ||
* This work is dual-licensed under the Apache License 2.0 | ||
* and European Union Public License. See LICENSE file for details. | ||
*/ | ||
package com.evolveum.midpoint.schema.util.correlation; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
/**
 * Trigram decomposition of strings and a set-based similarity built on it.
 *
 * <p>Input is split into lower-cased alphanumeric words, each word is padded
 * with a space on both sides, and every run of {@value #N_GRAM_VALUE}
 * consecutive characters of a padded word is one trigram.
 */
public class TriGram {

    private static final int N_GRAM_VALUE = 3;

    /**
     * Produces the distinct trigrams of the given string, in order of first
     * occurrence.
     *
     * @param object string to decompose, must not be {@code null}
     * @return a new mutable list of distinct trigrams; empty when no padded word
     *         is at least {@value #N_GRAM_VALUE} characters long (for example
     *         for an empty string)
     * @throws NullPointerException if {@code object} is {@code null}
     */
    public List<String> generateTriGram(String object) {
        String[] normalizedInput = normalization(object);
        List<String> triGrams = new ArrayList<>();
        // Companion set gives constant-time duplicate detection while the list
        // keeps first-occurrence order.
        Set<String> alreadySeen = new HashSet<>();

        for (String preparedString : normalizedInput) {
            for (int start = 0; start + N_GRAM_VALUE <= preparedString.length(); start++) {
                String candidate = preparedString.substring(start, start + N_GRAM_VALUE);
                if (alreadySeen.add(candidate)) {
                    triGrams.add(candidate);
                }
            }
        }
        return triGrams;
    }

    /**
     * Splits the input into padded, lower-cased words: every character outside
     * {@code \p{Alnum}} becomes a space, whitespace runs collapse to one space,
     * the result is trimmed and split on spaces, and each piece is wrapped in
     * spaces.
     *
     * @param object raw input string
     * @return the padded words; a single padded empty word for input without
     *         any alphanumeric character
     */
    private String[] normalization(String object) {
        String removeNonAlpha = object.replaceAll("[^\\p{Alnum}]", " ");
        String normalizeWhiteSpaces = removeNonAlpha.replaceAll("\\s{2,}", " ").trim();
        String[] strArray = normalizeWhiteSpaces.split(" ");

        for (int i = 0; i < strArray.length; i++) {
            String normalizedString = " " + strArray[i] + " ";
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (a Turkish default would otherwise map 'I' to dotless 'ı').
            strArray[i] = normalizedString.toLowerCase(java.util.Locale.ROOT);
        }

        return strArray;
    }

    /**
     * Computes the similarity of two strings as the number of trigrams they
     * share divided by the number of distinct trigrams in either of them.
     *
     * <p>When neither string yields a trigram the ratio would be {@code 0 / 0};
     * {@code 0.0} is returned in that case instead of {@code NaN}.
     *
     * @param lObject first string, must not be {@code null}
     * @param rObject second string, must not be {@code null}
     * @return a value between {@code 0.0} (no shared trigram) and {@code 1.0}
     *         (identical trigram sets)
     * @throws NullPointerException if either string is {@code null}
     */
    public double getSimilarity(String lObject, String rObject) {
        List<String> firstTriGrams = generateTriGram(lObject);
        List<String> secondTriGrams = generateTriGram(rObject);

        List<String> unionList = union(firstTriGrams, secondTriGrams);
        if (unionList.isEmpty()) {
            // No trigram on either side: avoid the 0/0 division.
            return 0.0;
        }

        double intersectionListSize = intersection(firstTriGrams, secondTriGrams).size();
        double unionListSize = unionList.size();

        return intersectionListSize / unionListSize;
    }

    /** Returns the distinct elements contained in either list, in no particular order. */
    private <T> List<T> union(List<T> list1, List<T> list2) {
        Set<T> set = new HashSet<>(list1);
        set.addAll(list2);
        return new ArrayList<>(set);
    }

    /** Returns the elements of {@code list1}, in order, that also occur in {@code list2}. */
    private <T> List<T> intersection(List<T> list1, List<T> list2) {
        // Hash lookup instead of a linear scan of list2 for every element.
        Set<T> lookup = new HashSet<>(list2);
        List<T> list = new ArrayList<>();
        for (T t : list1) {
            if (lookup.contains(t)) {
                list.add(t);
            }
        }
        return list;
    }
}