/
string_metrics.py
96 lines (76 loc) · 3.08 KB
/
string_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def commoncharacters(s1: str, s2: str) -> int:
    """Count positions where both strings hold exactly the same character.

    Positions beyond the end of the shorter string are ignored.
    """
    matches = 0
    for ch1, ch2 in zip(s1, s2):
        if ch1 == ch2:
            matches += 1
    return matches
def leftcommonsubstring(s1: str, s2: str) -> int:
    """Length of the shared prefix of two strings.

    "foo", "bar" => 0; "built", "build" => 4; "cat", "cats" => 3.
    """
    limit = min(len(s1), len(s2))
    i = 0
    # Scan forward while the characters keep matching.
    while i < limit and s1[i] == s2[i]:
        i += 1
    return i
def ngram(max_ngram_size: int, s1: str, s2: str, *,
          weighted: bool = False, any_mismatch: bool = False, longer_worse: bool = False) -> int:
    """
    Calculates how many of n-grams of s1 are contained in s2 (the more the number, the more words
    are similar).
    Args:
        max_ngram_size: n in ngram
        s1: string to compare
        s2: string to compare
        weighted: substract from result for ngrams *not* contained
        longer_worse: add a penalty when second string is longer
        any_mismatch: add a penalty for any string length difference
    FIXME: Actually, the last two settings do NOT participate in ngram counting by themselves, they
    are just adjusting the final score, but that's how it was structured in Hunspell.
    """
    len2 = len(s2)
    if not len2:
        return 0
    len1 = len(s1)
    score = 0
    # Try every ngram size from 1 up to the requested maximum.
    for size in range(1, max_ngram_size + 1):
        found = 0
        # Slide a window of the current size over the first string...
        for start in range(len1 - size + 1):
            # ...scoring +1 whenever that ngram occurs ANYWHERE in the second string.
            if s1[start:start + size] in s2:
                found += 1
            elif weighted:
                # In "weighted" mode a missing ngram costs a point...
                found -= 1
                if start == 0 or start + size == len1:
                    # ...and one more if it sat at the very start or end of s1.
                    found -= 1
        score += found
        # Hunspell's shortcut: with fewer than two matches at this size,
        # larger sizes cannot match either (unweighted mode only).
        if found < 2 and not weighted:
            break
    # longer_worse penalizes only a longer second string; any_mismatch
    # penalizes any length difference. Both forgive a difference of up to 2.
    if longer_worse:
        diff_penalty = (len2 - len1) - 2
    elif any_mismatch:
        diff_penalty = abs(len2 - len1) - 2
    else:
        diff_penalty = 0
    if diff_penalty > 0:
        score -= diff_penalty
    return score
def lcslen(s1: str, s2: str) -> int:
    """
    Classic "LCS (longest common subsequence) length" algorithm.

    Standard dynamic programming: c[i][j] holds the LCS length of the
    prefixes s1[:i] and s2[:j]; row 0 and column 0 are the empty-prefix
    base case. Runs in O(m*n) time and space.

    Note: the previous version indexed `c[i-1][j-1]` with i or j equal
    to 0, silently wrapping around to the never-written last row/column
    of the table — correct only by accident. This version uses explicit
    one-based offsets so the boundary zeros are intentional.
    """
    m = len(s1)
    n = len(s2)
    c = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            if s1[i] == s2[j]:
                # Characters match: extend the LCS of both shorter prefixes.
                c[i + 1][j + 1] = c[i][j] + 1
            else:
                # No match: best of dropping one character from either string.
                c[i + 1][j + 1] = max(c[i][j + 1], c[i + 1][j])
    return c[m][n]