Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ These are for demonstration purposes only.
- [x] [Reverse](./src/string/reverse.rs)
- [x] [Run Length Encoding](./src/string/run_length_encoding.rs)
- [x] [Hamming Distance](./src/string/hamming_distance.rs)
- [x] [Jaro-Winkler Distance](./src/string/jaro_winkler_distance.rs)
- [x] [Suffix Tree](./src/string/suffix_tree.rs)
- [x] [Suffix Array](./src/string/suffix_array.rs)

Expand Down
85 changes: 85 additions & 0 deletions src/string/jaro_winkler_distance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// In computer science and statistics,
// the Jaro–Winkler distance is a string metric measuring an edit distance
// between two sequences.
// It is a variant proposed in 1990 by William E. Winkler
// of the Jaro distance metric (1989, Matthew A. Jaro).

/// Computes the Jaro–Winkler score of `str1` and `str2`.
///
/// Despite the name, the returned value behaves like a similarity: identical
/// non-empty strings score `1.0`, and strings with no matching characters
/// (or an empty input on either side) score `0.0`.
///
/// The comparison is done on Unicode scalar values (`char`s), not bytes, so
/// non-ASCII input is handled without panicking.
///
/// Implementation notes, kept so previously returned scores stay unchanged:
/// * the match window is `min(len1, len2) / 2` characters on each side;
/// * the number of out-of-order matches is halved with integer division;
/// * the Winkler bonus uses a scaling factor of `0.1` and a common prefix of
///   at most four characters.
pub fn jaro_winkler_distance(str1: &str, str2: &str) -> f64 {
    /// Returns the characters of `s1`, in order, that can be paired with a
    /// not-yet-used equal character of `s2` lying inside the match window.
    fn get_matched_characters(s1: &[char], s2: &[char]) -> Vec<char> {
        // `None` marks a character of `s2` that has already been paired.
        // A dedicated marker (instead of overwriting with ' ') guarantees a
        // consumed slot can never be mistaken for a real space.
        let mut available: Vec<Option<char>> = s2.iter().copied().map(Some).collect();
        let limit = s1.len().min(s2.len()) / 2;
        let mut matched = Vec::new();

        for (i, &c) in s1.iter().enumerate() {
            let left = i.saturating_sub(limit);
            let right = (i + limit + 1).min(available.len());
            // When `s1` is much longer than `s2` the window slides past the
            // end of `s2`; there is nothing left to match against.
            if left >= right {
                continue;
            }
            // Consume the first free occurrence *inside the window* so the
            // same character of `s2` cannot be paired twice.
            if let Some(slot) = available[left..right]
                .iter_mut()
                .find(|slot| **slot == Some(c))
            {
                *slot = None;
                matched.push(c);
            }
        }
        matched
    }

    // Work on characters so that indices are never byte offsets.
    let chars1: Vec<char> = str1.chars().collect();
    let chars2: Vec<char> = str2.chars().collect();
    if chars1.is_empty() || chars2.is_empty() {
        return 0.0;
    }

    let matching_1 = get_matched_characters(&chars1, &chars2);
    let matching_2 = get_matched_characters(&chars2, &chars1);
    let match_count = matching_1.len();
    if match_count == 0 {
        return 0.0;
    }

    // Matched characters that appear in a different order in the two strings,
    // counted pairwise and halved (integer division).
    let transpositions = matching_1
        .iter()
        .zip(&matching_2)
        .filter(|(c1, c2)| c1 != c2)
        .count()
        / 2;

    let m = match_count as f64;
    let jaro: f64 = (1_f64 / 3_f64)
        * (m / chars1.len() as f64
            + m / chars2.len() as f64
            + (match_count - transpositions) as f64 / m);

    // Winkler adjustment: reward a shared prefix of up to four characters.
    let prefix_len = chars1
        .iter()
        .zip(&chars2)
        .take(4)
        .take_while(|(c1, c2)| c1 == c2)
        .count() as f64;

    jaro + (0.1 * prefix_len * (1.0 - jaro))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the score against a fixed table of known input pairs.
    #[test]
    fn test_jaro_winkler_distance() {
        // Each entry is (first string, second string, expected score).
        let cases: [(&str, &str, f64); 6] = [
            ("hello", "world", 0.4666666666666666),
            ("martha", "marhta", 0.9611111111111111),
            ("martha", "marhat", 0.9611111111111111),
            ("test", "test", 1.0),
            ("test", "", 0.0),
            ("hello world", "HeLLo W0rlD", 0.6363636363636364),
        ];

        for &(lhs, rhs, expected) in cases.iter() {
            let score = jaro_winkler_distance(lhs, rhs);
            assert_eq!(score, expected);
        }
    }
}
2 changes: 2 additions & 0 deletions src/string/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod aho_corasick;
mod burrows_wheeler_transform;
mod hamming_distance;
mod jaro_winkler_distance;
mod knuth_morris_pratt;
mod manacher;
mod rabin_karp;
Expand All @@ -15,6 +16,7 @@ pub use self::burrows_wheeler_transform::{
burrows_wheeler_transform, inv_burrows_wheeler_transform,
};
pub use self::hamming_distance::hamming_distance;
pub use self::jaro_winkler_distance::jaro_winkler_distance;
pub use self::knuth_morris_pratt::knuth_morris_pratt;
pub use self::manacher::manacher;
pub use self::rabin_karp::rabin_karp;
Expand Down