In [None]:
!pip install fuzzywuzzy python_Levenshtein hmni rapidfuzz

# 1) Levenshtein Distance

The Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (i.e., insertions, deletions, or substitutions) required to change one word into the other.

![](https://www.cuelogic.com/wp-content/uploads/2021/06/L1.jpg.webp)

[Levenshtein Algorithm](https://www.cuelogic.com/blog/the-levenshtein-algorithm#:~:text=The%20Levenshtein%20distance%20is%20a,one%20word%20into%20the%20other.)

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Score for the full name against the same name without the middle initial.
ratio_score = fuzz.ratio("Catherine M Gitau", "Catherine Gitau")
print(ratio_score)

# Partial score for the same pair; note that the first argument here is
# written "M." (with a period), unlike the ratio call above.
partial_score = fuzz.partial_ratio("Catherine M. Gitau", "Catherine Gitau")
print(partial_score)

94
80


# 2) HMNI

[Fuzzy Name Matching with Machine Learning](https://towardsdatascience.com/fuzzy-name-matching-with-machine-learning-f09895dce7b4)

In [None]:
import pandas as pd
import hmni
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings("ignore")

# Matcher built with the 'latin' model; every example below reuses it.
matcher = hmni.Matcher(model='latin')

# Pairwise similarity: the default call, the same pair with prob=False, and a
# pair of full names with surname_first=False.
similarity_examples = (
    (('Alan', 'Al'), {}),
    (('Alan', 'Al'), {'prob': False}),
    (('Alan Turing', 'Al Turing'), {'surname_first': False}),
)
for name_pair, similarity_options in similarity_examples:
    print(matcher.similarity(*name_pair, **similarity_options))

# Name deduplication and normalization: the same list is deduplicated three
# times, once per option set.
names_list = ['Alan', 'Al', 'Al', 'James']
for dedupe_options in (
    {'keep': 'longest'},
    {'keep': 'frequent'},
    {'keep': 'longest', 'replace': True},
):
    print(matcher.dedupe(names_list, **dedupe_options))

# DataFrame example: fuzzy left-merge of two single-column frames on 'name'.
# The result is stored in `merged`; this cell does not display it.
df1 = pd.DataFrame(['Al', 'Mark', 'James', 'Harold'], columns=['name'])
df2 = pd.DataFrame(['Mark', 'Alan', 'James', 'Harold'], columns=['name'])
merged = matcher.fuzzymerge(df1, df2, on='name', how='left')

In [None]:
# Combine both methods: blend the fuzzywuzzy and HMNI scores into one value.
def similarity_calculator(word1, word2, fuzzy_weight=0.2, hmni_weight=0.8):
    """Return a weighted blend of the fuzzywuzzy and HMNI scores for two names.

    Parameters
    ----------
    word1, word2
        The two names to compare; both are passed unchanged to each scorer.
    fuzzy_weight : float, default 0.2
        Weight applied to the fuzzywuzzy ratio after it is divided by 100.
    hmni_weight : float, default 0.8
        Weight applied to the value returned by ``matcher.similarity``.

    Returns
    -------
    The value ``fuzzy_weight * (ratio / 100) + hmni_weight * hmni_score``.

    Notes
    -----
    * Relies on the notebook-level ``matcher`` (an ``hmni.Matcher``) defined
      in an earlier cell.
    * fuzzywuzzy is imported locally under its own alias because a later cell
      rebinds the notebook-level name ``fuzz`` to ``rapidfuzz.fuzz``; looking
      up the global ``fuzz`` at call time would then silently switch scorers.
    * The weights are not validated or normalized; presumably they should sum
      to 1 so the blended score stays on the same scale as the HMNI score.
    """
    from fuzzywuzzy import fuzz as fuzzywuzzy_fuzz

    # Divide by 100 before blending with the HMNI score, as the original did.
    score_1 = fuzzywuzzy_fuzz.ratio(word1, word2) / 100  # score from fuzzywuzzy
    score_2 = matcher.similarity(word1, word2)  # score from hmni
    return fuzzy_weight * score_1 + hmni_weight * score_2

# 3) RapidFuzz

In [None]:
from rapidfuzz import fuzz

# Each entry pairs a RapidFuzz scorer with the two strings it compares:
#   ratio            - normalized Indel similarity of the whole strings
#   partial_ratio    - optimal alignment of the shorter string in the longer one
#   token_sort_ratio - sorts the words in each string, then applies fuzz.ratio
#   token_set_ratio  - compares the strings based on unique and common words
rapidfuzz_examples = [
    (fuzz.ratio, "this is a test", "this is a test!"),
    (fuzz.partial_ratio, "this is a test", "this is a test!"),
    (fuzz.token_sort_ratio, "fuzzy was a bear", "fuzzy fuzzy was a bear"),
    (fuzz.token_set_ratio, "fuzzy was a bear", "fuzzy fuzzy was a bear!"),
]

# Print one score per line, in the order listed above.
for scorer, first_text, second_text in rapidfuzz_examples:
    print(scorer(first_text, second_text))

96.55172413793103
100.0
84.21052631578947
100.0
