-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(distances): add hamming, jaro and jaro-winkler
- add new metrics: hamming, jaro and jaro-winkler - add utils and functional api
- Loading branch information
1 parent
87783fe
commit d9f7154
Showing
17 changed files
with
1,315 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -181,5 +181,8 @@ cython_debug/ | |
# scripts | ||
scripts | ||
|
||
### IDE | ||
# IDE | ||
.idea | ||
|
||
# notes | ||
notes/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// | ||
// Created by Benjamin Tenmann on 20/02/2022. | ||
// | ||
|
||
#ifndef SETRIQ_HAMMING_H | ||
#define SETRIQ_HAMMING_H | ||
|
||
#include "utils/type_defs.h" | ||
|
||
namespace metric {
    /*!
     * Hamming distance between two strings, scaled by a configurable per-mismatch penalty.
     */
    class Hamming {
    private:
        // amount added to the distance for every position at which the two inputs differ
        double mismatch_score_{};

    public:
        /*!
         * @param mismatch_score: the penalty accumulated for each mismatching position
         */
        explicit Hamming(double mismatch_score) : mismatch_score_{mismatch_score} {}

        /*!
         * Compute the Hamming distance between two input strings.
         *
         * @param a: an input string to be compared
         * @param b: an input string to be compared
         * @return the number of mismatching positions multiplied by the mismatch score
         */
        double forward(const std::string &a, const std::string &b) const;
    };
}
|
||
#endif //SETRIQ_HAMMING_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// | ||
// Created by Benjamin Tenmann on 05/03/2022. | ||
// | ||
|
||
#ifndef SETRIQ_JARO_H | ||
#define SETRIQ_JARO_H | ||
|
||
#include <array> | ||
#include "utils/type_defs.h" | ||
|
||
// The three weights applied, in order, to the Jaro terms m / |a|, m / |b| and (m - t) / m.
using jaro_weighting_t = std::array<double, 3>;

namespace metric {
    /*!
     * Jaro distance between two strings, with an optional custom weighting of its three terms.
     */
    class Jaro {
    public:
        /*!
         * Build a Jaro metric in which every term carries the same weight (one third each).
         */
        Jaro() = default;

        /*!
         * @param weights: the weights to apply to the three Jaro terms instead of the uniform default
         */
        explicit Jaro(jaro_weighting_t weights) : weights_(weights) {}

        /*!
         * Compute the Jaro distance between two input strings.
         *
         * @param a: an input string to be compared
         * @param b: an input string to be compared
         */
        double forward(const std::string &a, const std::string &b) const;

    private:
        // uniform weighting unless the caller supplies something else
        jaro_weighting_t weights_{{1.0 / 3, 1.0 / 3, 1.0 / 3}};
    };
}
|
||
#endif //SETRIQ_JARO_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// | ||
// Created by Benjamin Tenmann on 21/02/2022. | ||
// | ||
|
||
#ifndef SETRIQ_JAROWINKLER_H | ||
#define SETRIQ_JAROWINKLER_H | ||
|
||
#include "utils/type_defs.h" | ||
#include "metrics/Jaro.h" | ||
|
||
namespace metric { | ||
class JaroWinkler { | ||
private: | ||
double p_ = 0.; | ||
size_t max_l_ = 4; | ||
Jaro jaro_{}; | ||
|
||
public: | ||
JaroWinkler() = default; | ||
|
||
explicit JaroWinkler(const double &p, const size_t &max_l, Jaro jaro) : p_{p}, max_l_{max_l}, jaro_{jaro} {}; | ||
|
||
double forward(const std::string &a, const std::string &b) const; | ||
}; | ||
} | ||
|
||
#endif //SETRIQ_JAROWINKLER_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// | ||
// Created by Benjamin Tenmann on 20/02/2022. | ||
// | ||
|
||
#include "metrics/Hamming.h" | ||
|
||
double metric::Hamming::forward(const std::string &a, const std::string &b) const { | ||
/*! | ||
* Compute the Hamming distance between two input strings. | ||
* | ||
* @param a: an input string to be compared | ||
* @param b: an input string to be compared | ||
*/ | ||
auto&& distance = 0.; | ||
for (auto i = 0ul; i < a.size(); i++) { | ||
if (a[i] != b[i]) | ||
distance += this->mismatch_score_; | ||
} | ||
return distance; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
// | ||
// Created by Benjamin Tenmann on 05/03/2022. | ||
// | ||
|
||
#include <cmath> | ||
#include "metrics/Jaro.h" | ||
|
||
#define either_zero(x, y) (x == 0) || (y == 0) | ||
#define max(x, y) x > y ? x : y | ||
#define min(x, y) x > y ? y : x | ||
|
||
|
||
void collapse_into_match_str(const std::string& sequence, const std::vector<size_t>& matches_idx, char* match_str) {
    /*!
     * Gather the matched characters of a sequence, in order, into a contiguous buffer.
     *
     * @param sequence: the string the matched characters are taken from
     * @param matches_idx: per-position markers; 0 means "not matched", any other value is the 1-based
     *                     index into `sequence` of a matched character
     * @param match_str: output buffer; it must have room for one char per non-zero entry of `matches_idx`
     */
    char* out = match_str;
    for (auto it = matches_idx.begin(); it != matches_idx.end(); ++it) {
        const size_t one_based = *it;
        if (one_based == 0)
            continue;  // position was never matched, nothing to copy
        *out = sequence[one_based - 1];
        ++out;
    }
}
|
||
double metric::Jaro::forward(const std::string &a, const std::string &b) const { | ||
/*! | ||
* Compute the Jaro distance between two input strings. | ||
* Adapted from https://github.com/markvanderloo/stringdist/blob/master/pkg/src/jaro.c | ||
* | ||
* @param a: an input string to be compared | ||
* @param b: an input string to be compared | ||
*/ | ||
const auto& s_i = a.size(); | ||
const auto& s_j = b.size(); | ||
if (either_zero(s_i, s_j)) | ||
// if one of the strings is of length 0 and the other isn't, then the distance is maximal (1) | ||
// if both are length 0, then the distance is minimal, i.e. 0 | ||
return (double) ((s_i > 0) || (s_j > 0)); | ||
|
||
const auto& max_len = s_i > s_j ? s_i : s_j; | ||
const auto& max_match_distance = (int) std::floor(max_len / 2) - 1; | ||
if (max_match_distance < 0) | ||
// catch the case when both strings are of length == 1 | ||
return a[0] == b[0] ? 0.0 : 1.0; | ||
|
||
auto&& matches_s_i = std::vector<size_t>(s_i, 0); | ||
auto&& matches_s_j = std::vector<size_t>(s_j, 0); | ||
|
||
auto&& n_matches = 0ul; | ||
for (auto i = 0; i < s_i; i++) { | ||
const auto& left = max((i - max_match_distance), 0); | ||
const auto& right = min((i + max_match_distance) + 1, s_j); | ||
// can we collapse this in some way? | ||
for (auto j = left; j < right; j++) { | ||
if ((a[i] == b[j]) && (matches_s_j[j] == 0)) { | ||
n_matches++; | ||
matches_s_i[i] = i + 1; | ||
matches_s_j[j] = j + 1; | ||
break; | ||
} | ||
} | ||
} | ||
if (n_matches == 0) | ||
return 1.0; | ||
|
||
char *match_str_i = new char[n_matches]; | ||
char *match_str_j = new char[n_matches]; | ||
|
||
collapse_into_match_str(a, matches_s_i, match_str_i); | ||
collapse_into_match_str(b, matches_s_j, match_str_j); | ||
|
||
auto&& t = 0.0; | ||
for (auto k = 0ul; k < n_matches; k++) { | ||
if (match_str_i[k] != match_str_j[k]) | ||
t += 0.5; | ||
} | ||
delete []match_str_i; | ||
delete []match_str_j; | ||
|
||
const auto& m = (double) n_matches; | ||
// allow arbitrary weighting | ||
return 1 - (this->weights_[0] * (m / s_i) + this->weights_[1] * (m / s_j) + this->weights_[2] * ((m - t) / m)); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
// | ||
// Created by Benjamin Tenmann on 21/02/2022. | ||
// | ||
|
||
#include "metrics/JaroWinkler.h" | ||
|
||
size_t min_sequence_len(const std::string& a, const std::string& b) {
    /*!
     * Length of the shorter of the two input strings (their common length when they tie).
     *
     * @param a: an input string
     * @param b: an input string
     */
    if (b.size() <= a.size())
        return b.size();
    return a.size();
}
|
||
double metric::JaroWinkler::forward(const std::string &a, const std::string &b) const { | ||
/*! | ||
* Compute the Jaro-Winkler distance between two input strings. | ||
* | ||
* @param a: an input string to be compared | ||
* @param b: an input string to be compared | ||
*/ | ||
const auto& jaro_distance = this->jaro_.forward(a, b); | ||
const auto& min_length = min_sequence_len(a, b); | ||
|
||
auto&& l = 0ul; | ||
while ((a[l] == b[l]) && (l < min_length) && (l < this->max_l_)) | ||
l++; | ||
return jaro_distance * (1 - l * this->p_); | ||
} |
Oops, something went wrong.