-
Notifications
You must be signed in to change notification settings - Fork 188
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add correlation internal algorithms.
- Loading branch information
1 parent
a28e7d1
commit 69e30d9
Showing
3 changed files
with
148 additions
and
0 deletions.
There are no files selected for viewing
37 changes: 37 additions & 0 deletions
37
infra/schema/src/main/java/com/evolveum/midpoint/schema/util/CorrelationAlgorithm.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package com.evolveum.midpoint.schema.util; | ||
|
||
import com.evolveum.midpoint.schema.util.correlation.Levenshtein; | ||
import com.evolveum.midpoint.schema.util.correlation.TriGram; | ||
|
||
import java.util.List; | ||
|
||
public class CorrelationAlgorithm { | ||
|
||
Levenshtein levenshtein; | ||
TriGram triGram; | ||
|
||
public CorrelationAlgorithm() { | ||
levenshtein = new Levenshtein(); | ||
triGram = new TriGram(); | ||
} | ||
|
||
public double triGramSimilarity(String lObject, String rObject) { | ||
return triGram.getSimilarity(lObject, rObject); | ||
} | ||
|
||
public List<String> triGramForm(String object) { | ||
return triGram.generateTriGram(object); | ||
} | ||
|
||
public double levenshteinSimilarity(String lObject, String rObject) { | ||
int levenshteinDistance = levenshteinDistance(lObject, rObject); | ||
return levenshtein.computeLevenshteinSimilarity(lObject, rObject, levenshteinDistance); | ||
} | ||
|
||
public int levenshteinDistance(String lObject, String rObject) { | ||
return levenshtein.computeLevenshteinDistance(lObject, rObject); | ||
} | ||
|
||
|
||
|
||
} |
39 changes: 39 additions & 0 deletions
39
infra/schema/src/main/java/com/evolveum/midpoint/schema/util/correlation/Levenshtein.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
package com.evolveum.midpoint.schema.util.correlation; | ||
|
||
import java.util.stream.IntStream; | ||
|
||
/**
 * Levenshtein (edit) distance between two strings and a similarity score derived from it.
 *
 * <p>Strings are compared {@code char} by {@code char} (UTF-16 code units). The class keeps
 * no state.
 */
public class Levenshtein {

    /**
     * Turns an already computed edit distance into a similarity score.
     *
     * <p>The score is {@code 1 - distance / max(lObject.length(), rObject.length())}, i.e.
     * {@code 1.0} for a distance of zero and {@code 0.0} when the distance equals the length
     * of the longer string.
     *
     * @param lObject first string; must not be {@code null}
     * @param rObject second string; must not be {@code null}
     * @param levenshteinDistance edit distance between the two strings, normally obtained
     *         from {@link #computeLevenshteinDistance(String, String)}
     * @return the similarity score; {@code 1.0} when both strings are empty
     * @throws NullPointerException if either string is {@code null}
     */
    public double computeLevenshteinSimilarity(String lObject, String rObject, int levenshteinDistance) {
        int longerLength = Math.max(lObject.length(), rObject.length());
        if (longerLength == 0) {
            // Two empty strings are identical; without this guard the division below is 0/0 = NaN.
            return 1.0;
        }
        return 1 - (((double) levenshteinDistance) / longerLength);
    }

    /**
     * Computes the minimum number of single-character insertions, deletions and
     * substitutions needed to turn {@code lObject} into {@code rObject}.
     *
     * @param lObject first string; must not be {@code null}
     * @param rObject second string; must not be {@code null}
     * @return the edit distance; {@code 0} for equal strings
     * @throws NullPointerException if either string is {@code null}
     */
    public int computeLevenshteinDistance(String lObject, String rObject) {
        if (lObject.equals(rObject)) {
            return 0;
        }

        int lLength = lObject.length();
        int rLength = rObject.length();

        // Against an empty string the distance is the length of the other one.
        if (lLength == 0) {
            return rLength;
        }
        if (rLength == 0) {
            return lLength;
        }

        // Only the previous and the current row of the distance table are ever read,
        // so two rows are kept instead of the full (lLength + 1) x (rLength + 1) matrix.
        int[] previousRow = new int[rLength + 1];
        int[] currentRow = new int[rLength + 1];

        // Row 0: turning the empty prefix into the first j characters takes j insertions.
        for (int j = 0; j <= rLength; j++) {
            previousRow[j] = j;
        }

        for (int i = 1; i <= lLength; i++) {
            // Column 0: turning the first i characters into the empty prefix takes i deletions.
            currentRow[0] = i;
            char lChar = lObject.charAt(i - 1);

            for (int j = 1; j <= rLength; j++) {
                int substitutionCost = (lChar == rObject.charAt(j - 1)) ? 0 : 1;

                int deletion = previousRow[j] + 1;
                int insertion = currentRow[j - 1] + 1;
                int substitution = previousRow[j - 1] + substitutionCost;

                currentRow[j] = Math.min(Math.min(deletion, insertion), substitution);
            }

            // The finished row becomes "previous"; the old one is recycled as scratch space.
            int[] recycled = previousRow;
            previousRow = currentRow;
            currentRow = recycled;
        }

        return previousRow[rLength];
    }

}
72 changes: 72 additions & 0 deletions
72
infra/schema/src/main/java/com/evolveum/midpoint/schema/util/correlation/TriGram.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package com.evolveum.midpoint.schema.util.correlation; | ||
|
||
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
|
||
/**
 * Splits strings into tri-grams (three-character substrings of their padded, lower-cased
 * words) and scores two strings by how many distinct tri-grams they share.
 */
public class TriGram {

    /** Length of every generated n-gram. */
    private static final int N_GRAM_SIZE = 3;

    /**
     * Matches every character that is not alphanumeric.
     * NOTE(review): without UNICODE_CHARACTER_CLASS, {@code \p{Alnum}} covers ASCII letters
     * and digits only, so letters such as "é" or "ü" are treated as separators - confirm
     * that this is intended.
     */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^\\p{Alnum}]");

    /** Matches a run of two or more whitespace characters. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s{2,}");

    /**
     * Produces the distinct tri-grams of the given string, in order of first occurrence.
     *
     * @param object string to decompose; must not be {@code null}
     * @return a new mutable list of distinct tri-grams; empty when no padded word reaches
     *         three characters
     * @throws NullPointerException if {@code object} is {@code null}
     */
    public List<String> generateTriGram(String object) {
        // LinkedHashSet drops duplicates in O(1) while keeping first-occurrence order.
        Set<String> triGrams = new LinkedHashSet<>();

        for (String paddedWord : normalization(object)) {
            for (int start = 0; start + N_GRAM_SIZE <= paddedWord.length(); start++) {
                triGrams.add(paddedWord.substring(start, start + N_GRAM_SIZE));
            }
        }
        return new ArrayList<>(triGrams);
    }

    /**
     * Replaces non-alphanumeric characters with spaces, collapses the resulting whitespace,
     * splits the text into words and returns each word lower-cased and padded with a
     * leading and a trailing space.
     */
    private String[] normalization(String object) {
        String separatorsAsSpaces = NON_ALPHANUMERIC.matcher(object).replaceAll(" ");
        String collapsed = WHITESPACE_RUN.matcher(separatorsAsSpaces).replaceAll(" ").trim();
        String[] words = collapsed.split(" ");

        for (int i = 0; i < words.length; i++) {
            String padded = " " + words[i] + " ";
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (under a Turkish locale "I" would otherwise lower-case to a dotless "ı").
            words[i] = padded.toLowerCase(Locale.ROOT);
        }

        return words;
    }

    /**
     * Scores two strings as {@code |shared tri-grams| / |all distinct tri-grams|}.
     *
     * @param lObject first string; must not be {@code null}
     * @param rObject second string; must not be {@code null}
     * @return a value between {@code 0.0} and {@code 1.0}; {@code 0.0} when neither string
     *         yields any tri-gram, because there is then nothing to compare
     * @throws NullPointerException if either string is {@code null}
     */
    public double getSimilarity(String lObject, String rObject) {
        List<String> firstTriGrams = generateTriGram(lObject);
        List<String> secondTriGrams = generateTriGram(rObject);

        double unionSize = union(firstTriGrams, secondTriGrams).size();
        if (unionSize == 0) {
            // Without this guard the division below is 0/0 = NaN.
            return 0.0;
        }

        double intersectionSize = intersection(firstTriGrams, secondTriGrams).size();
        return intersectionSize / unionSize;
    }

    /** Returns the distinct elements that occur in at least one of the two lists. */
    private <T> List<T> union(List<T> list1, List<T> list2) {
        Set<T> merged = new HashSet<>(list1);
        merged.addAll(list2);
        return new ArrayList<>(merged);
    }

    /** Returns the elements of {@code list1}, in order, that also occur in {@code list2}. */
    private <T> List<T> intersection(List<T> list1, List<T> list2) {
        // Hash lookup instead of List.contains keeps this linear in the list sizes.
        Set<T> lookup = new HashSet<>(list2);
        List<T> shared = new ArrayList<>();
        for (T candidate : list1) {
            if (lookup.contains(candidate)) {
                shared.add(candidate);
            }
        }
        return shared;
    }

}