Skip to content

Commit

Permalink
Add correlation internal algorithms.
Browse files Browse the repository at this point in the history
  • Loading branch information
tchrapovic committed Aug 15, 2022
1 parent a28e7d1 commit 69e30d9
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package com.evolveum.midpoint.schema.util;

import com.evolveum.midpoint.schema.util.correlation.Levenshtein;
import com.evolveum.midpoint.schema.util.correlation.TriGram;

import java.util.List;

/**
 * Single entry point to the string-comparison helpers used for correlation.
 *
 * <p>Every method simply forwards to a {@link Levenshtein} or {@link TriGram}
 * instance created in the constructor; no additional logic is applied here.
 */
public class CorrelationAlgorithm {

    /** Helper that performs the edit-distance based computations. */
    Levenshtein levenshtein;

    /** Helper that performs the trigram based computations. */
    TriGram triGram;

    /** Creates the facade together with fresh instances of both helpers. */
    public CorrelationAlgorithm() {
        this.triGram = new TriGram();
        this.levenshtein = new Levenshtein();
    }

    /**
     * Returns the trigram similarity of the two values as reported by
     * {@link TriGram#getSimilarity(String, String)}.
     *
     * @param lObject left-hand value
     * @param rObject right-hand value
     * @return similarity score produced by the trigram helper
     */
    public double triGramSimilarity(String lObject, String rObject) {
        final double similarity = this.triGram.getSimilarity(lObject, rObject);
        return similarity;
    }

    /**
     * Returns the trigrams of the value as produced by
     * {@link TriGram#generateTriGram(String)}.
     *
     * @param object value to split into trigrams
     * @return trigram list produced by the trigram helper
     */
    public List<String> triGramForm(String object) {
        final List<String> triGrams = this.triGram.generateTriGram(object);
        return triGrams;
    }

    /**
     * Returns the Levenshtein similarity of the two values. The edit distance
     * is obtained first via {@link #levenshteinDistance(String, String)} and
     * then handed to the Levenshtein helper together with both values.
     *
     * @param lObject left-hand value
     * @param rObject right-hand value
     * @return similarity score produced by the Levenshtein helper
     */
    public double levenshteinSimilarity(String lObject, String rObject) {
        final int distance = this.levenshteinDistance(lObject, rObject);
        return this.levenshtein.computeLevenshteinSimilarity(lObject, rObject, distance);
    }

    /**
     * Returns the Levenshtein distance of the two values as reported by
     * {@link Levenshtein#computeLevenshteinDistance(String, String)}.
     *
     * @param lObject left-hand value
     * @param rObject right-hand value
     * @return edit distance produced by the Levenshtein helper
     */
    public int levenshteinDistance(String lObject, String rObject) {
        final int distance = this.levenshtein.computeLevenshteinDistance(lObject, rObject);
        return distance;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.evolveum.midpoint.schema.util.correlation;

import java.util.stream.IntStream;

/**
 * Levenshtein (edit) distance between two strings and a similarity score
 * derived from it.
 *
 * <p>Strings are compared {@code char} by {@code char} (UTF-16 code units).
 * Neither method accepts {@code null}; passing one results in a
 * {@link NullPointerException}.
 */
public class Levenshtein {

    /**
     * Converts an edit distance into a similarity score:
     * {@code 1 - distance / max(length(lObject), length(rObject))}.
     *
     * <p>The distance is taken as given and is not re-validated against the
     * two strings.
     *
     * @param lObject left-hand string, must not be {@code null}
     * @param rObject right-hand string, must not be {@code null}
     * @param levenshteinDistance edit distance between the two strings
     * @return the similarity score; {@code 1.0} when both strings are empty
     */
    public double computeLevenshteinSimilarity(String lObject, String rObject, int levenshteinDistance) {
        int longerLength = Math.max(lObject.length(), rObject.length());
        if (longerLength == 0) {
            // Two empty strings are identical; without this guard the formula
            // below would evaluate 0.0 / 0 and return NaN.
            return 1.0;
        }
        return 1 - (((double) levenshteinDistance) / longerLength);
    }

    /**
     * Computes the number of single-character insertions, deletions and
     * substitutions needed to turn {@code lObject} into {@code rObject}.
     *
     * @param lObject left-hand string, must not be {@code null}
     * @param rObject right-hand string, must not be {@code null}
     * @return the edit distance; {@code 0} for equal strings
     */
    public int computeLevenshteinDistance(String lObject, String rObject) {
        if (lObject.equals(rObject)) {
            return 0;
        }
        if (lObject.isEmpty()) {
            return rObject.length();
        }
        if (rObject.isEmpty()) {
            return lObject.length();
        }

        int leftLength = lObject.length();
        int rightLength = rObject.length();

        // Only the previous and the current row of the distance table are ever
        // read, so two rows are kept instead of the full table.
        int[] previousRow = new int[rightLength + 1];
        int[] currentRow = new int[rightLength + 1];

        // Distance from the empty prefix of lObject to each prefix of rObject.
        for (int j = 0; j <= rightLength; j++) {
            previousRow[j] = j;
        }

        for (int i = 1; i <= leftLength; i++) {
            // Distance from the i-character prefix of lObject to the empty prefix of rObject.
            currentRow[0] = i;
            char leftChar = lObject.charAt(i - 1);

            for (int j = 1; j <= rightLength; j++) {
                int substitutionCost = (leftChar == rObject.charAt(j - 1)) ? 0 : 1;

                int deletion = previousRow[j] + 1;
                int insertion = currentRow[j - 1] + 1;
                int substitution = previousRow[j - 1] + substitutionCost;

                currentRow[j] = Math.min(Math.min(deletion, insertion), substitution);
            }

            int[] swap = previousRow;
            previousRow = currentRow;
            currentRow = swap;
        }

        // After the final swap the last computed row is in previousRow.
        return previousRow[rightLength];
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package com.evolveum.midpoint.schema.util.correlation;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Trigram generation and trigram-based similarity of two strings.
 *
 * <p>Input is normalized before trigrams are extracted: every character that
 * is not matched by {@code \p{Alnum}} becomes a separator, the text is split
 * into words, and each word is lower-cased and padded with one space on each
 * side.
 *
 * <p>Neither public method accepts {@code null}; passing one results in a
 * {@link NullPointerException}.
 */
public class TriGram {

    /** Length of the character windows extracted from each padded word. */
    private static final int N_GRAM_SIZE = 3;

    // NOTE(review): \p{Alnum} without UNICODE_CHARACTER_CLASS matches ASCII
    // letters and digits only, so accented/non-Latin letters are treated as
    // separators. Confirm this is intended for the data being correlated.
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^\\p{Alnum}]");

    private static final Pattern REPEATED_WHITESPACE = Pattern.compile("\\s{2,}");

    /**
     * Produces the distinct trigrams of the normalized input, in order of
     * first occurrence.
     *
     * @param object text to split into trigrams, must not be {@code null}
     * @return distinct trigrams; empty when the input contains no
     *         alphanumeric character
     */
    public List<String> generateTriGram(String object) {
        List<String> triGrams = new ArrayList<>();
        // Tracks what is already in the list so duplicates are rejected in
        // constant time instead of scanning the list for every candidate.
        Set<String> seen = new HashSet<>();

        for (String paddedWord : normalization(object)) {
            for (int start = 0; start + N_GRAM_SIZE <= paddedWord.length(); start++) {
                String triGram = paddedWord.substring(start, start + N_GRAM_SIZE);
                if (seen.add(triGram)) {
                    triGrams.add(triGram);
                }
            }
        }
        return triGrams;
    }

    /**
     * Splits the input into lower-cased words, each padded with a single
     * leading and trailing space.
     *
     * <p>For input without any alphanumeric character the result is a single
     * two-space element, which is too short to yield a trigram.
     */
    private String[] normalization(String object) {
        String alphanumericOnly = NON_ALPHANUMERIC.matcher(object).replaceAll(" ");
        String collapsed = REPEATED_WHITESPACE.matcher(alphanumericOnly).replaceAll(" ").trim();
        String[] words = collapsed.split(" ");

        for (int i = 0; i < words.length; i++) {
            // Locale.ROOT keeps the result independent of the JVM default
            // locale (e.g. Turkish would map "I" to dotless "ı").
            words[i] = (" " + words[i] + " ").toLowerCase(Locale.ROOT);
        }

        return words;
    }

    /**
     * Computes the ratio of shared trigrams to all distinct trigrams of the
     * two inputs (size of intersection divided by size of union).
     *
     * @param lObject left-hand text, must not be {@code null}
     * @param rObject right-hand text, must not be {@code null}
     * @return a value between {@code 0.0} and {@code 1.0}; {@code 0.0} when
     *         neither input yields any trigram
     */
    public double getSimilarity(String lObject, String rObject) {
        Set<String> leftTriGrams = new HashSet<>(generateTriGram(lObject));
        Set<String> rightTriGrams = new HashSet<>(generateTriGram(rObject));

        Set<String> allTriGrams = new HashSet<>(leftTriGrams);
        allTriGrams.addAll(rightTriGrams);

        if (allTriGrams.isEmpty()) {
            // Nothing to compare; without this guard the ratio below would be
            // 0.0 / 0.0 and return NaN.
            return 0.0;
        }

        Set<String> sharedTriGrams = new HashSet<>(leftTriGrams);
        sharedTriGrams.retainAll(rightTriGrams);

        return (double) sharedTriGrams.size() / allTriGrams.size();
    }

}

0 comments on commit 69e30d9

Please sign in to comment.