### <a href='https://pypi.org/project/fuzzywuzzy/' style='text-decoration:none'>fuzzywuzzy</a> 

In [4]:
#!pip install fuzzywuzzy[speedup]

In [1]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

#### Simple Ratio

In [2]:
fuzz.ratio("this is a test", "this is a test!")

97

In [3]:
fuzz.ratio("ed wood film", "ed wood")

74

In [5]:
fuzz.ratio("wrights morning glory", "long name morning glory")

68

In [6]:
fuzz.ratio('hollywood', 'wood')

62

In [14]:
fuzz.ratio('bussinesswoman', 'woman')

53

In [2]:
fuzz.ratio('fantasy book', 'book')

50

In [3]:
fuzz.ratio('fantasy', 'science fantasy young adult series')

34

In [5]:
fuzz.ratio('first three book seer', 'book')

32

In [6]:
fuzz.ratio('handbook', 'book')

67

In [7]:
fuzz.ratio('first novel', 'first')

62

In [9]:
fuzz.ratio('first two issue', 'first')

50

#### Partial Ratio

In [7]:
fuzz.partial_ratio("this is a test", "this is a test!") 

100

In [8]:
fuzz.partial_ratio("ed wood film", "ed wood")

100

In [9]:
fuzz.partial_ratio("wrights morning glory", "morning")

100

In [10]:
fuzz.partial_ratio("long name morning glory", "morning")

100

In [11]:
fuzz.partial_ratio("morning", "long name morning glory")

100

In [12]:
 fuzz.partial_ratio('hollywood', 'wood')

100

#### Token Set Ratio

In [13]:
fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")

100

In [14]:
 fuzz.token_set_ratio('cult filmmaker ed wood', 'wood')

100

In [15]:
  fuzz.token_set_ratio('wood', 'wood plantation')

100

In [16]:
 fuzz.token_set_ratio('hollywood', 'wood')

62

In [17]:
fuzz.token_set_ratio("wrights morning glory", "morning")

100

In [2]:
  fuzz.token_set_ratio('bussinesswoman', 'woman')

53

#### Token Sort Ratio

In [18]:
fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")

100

In [19]:
fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")

84

In [20]:
 fuzz.token_sort_ratio('scott derricksons', 'derrickson')

74

In [21]:
 fuzz.token_sort_ratio('ed wood', 'scott derrickson')

26

In [22]:
fuzz.token_sort_ratio('woodson', 'wood')

73

In [3]:
fuzz.token_sort_ratio("wrights morning glory", " longer name morning glory")

61

#### <a href='https://github.com/seatgeek/fuzzywuzzy/blob/2188520502b86375cf2610b5100a56935417671f/fuzzywuzzy/process.py' style='text-decoration:none'>Process</a> 


In [24]:
# Candidate strings that the queries below are matched against.
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]

# limit=2 keeps only the two best-scoring choices; each result is a (choice, score) tuple.
process.extract("new york jets", choices, limit=2) 

[('New York Jets', 100), ('New York Giants', 79)]

In [25]:
process.extractOne("cowboys", choices)

('Dallas Cowboys', 90)

In [26]:
process.extractOne("cowboys", choices)[0]

'Dallas Cowboys'

In [4]:
# With token_sort_ratio the extra tokens in the longer candidates lower their score,
# so the closest-length candidate ('ed wood sr') is the single best match.
process.extractOne("ed wood", ['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr'], scorer=fuzz.token_sort_ratio)

('ed wood sr', 82)

In [28]:
# With token_set_ratio a candidate that contains all query tokens scores 100 (see the
# Token Set Ratio examples), so several candidates presumably tie here and the first
# one in list order is returned — verify the tie-breaking if the order matters.
process.extractOne("ed wood", ['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr'], scorer=fuzz.token_set_ratio)

('ed wood film', 100)

In [9]:
# Edge case: empty choice list. The cell displays no result, which suggests that
# extractOne returned None — check for that before indexing the result with [0].
process.extractOne("ed wood", [], scorer=fuzz.token_set_ratio)

In [29]:
 # Edge case: the query is only U+200E (LEFT-TO-RIGHT MARK), which has nothing to
 # score against; the first choice comes back with a score of 0.
 process.extractOne('\u200e', ['james iroha uchechukwu', '2006 census'], scorer=fuzz.token_sort_ratio)



('james iroha uchechukwu', 0)

In [2]:
from fuzzywuzzy import utils

# A query consisting only of a non-printing character (U+200E, LEFT-TO-RIGHT MARK).
invalid_query = "\u200e"
# full_process() is used as a pre-check: its result is falsy for this query,
# so the branch below is skipped and the cell prints nothing.
if utils.full_process(invalid_query):
    print("not executed")

In [41]:
# Guard the lookup with the full_process() pre-check so that a query with nothing
# to score never reaches extractOne.
if utils.full_process(invalid_query):
    # Not reached for this query: extractOne is never called, so no warning is emitted.
    process.extractOne(invalid_query, ['james iroha uchechukwu', '2006 census'], scorer=fuzz.token_sort_ratio)

In [15]:
process.extractBests("ed wood", ['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr', 'irrelevant'], scorer=fuzz.partial_ratio)

[('ed wood film', 100),
 ('cult filmmaker ed wood', 100),
 ('ed wood bestknown film', 100),
 ('ed wood sr', 100),
 ('irrelevant', 14)]

In [17]:
# Score every candidate with partial_ratio and keep only the phrases that
# reach a perfect score of 100.
[
    phrase
    for phrase, score in process.extractBests(
        "ed wood",
        ['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr', 'irrelevant'],
        scorer=fuzz.partial_ratio,
    )
    if score == 100
]

['ed wood film',
 'cult filmmaker ed wood',
 'ed wood bestknown film',
 'ed wood sr']

In [18]:
process.extractBests("ed wood", ['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr', 'irrelevant'], scorer=fuzz.ratio)

[('ed wood sr', 82),
 ('ed wood film', 74),
 ('cult filmmaker ed wood', 48),
 ('ed wood bestknown film', 48),
 ('irrelevant', 12)]

In [2]:
# Collapse near-duplicate entries, using token_set_ratio with threshold=70 as the
# similarity cut-off. As the output shows, the result is a dict_keys view rather
# than a list — wrap the call in list() if indexing or a real list is needed.
process.dedupe(['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr', 'irrelevant'], threshold=70, scorer=fuzz.token_set_ratio)

dict_keys(['cult filmmaker ed wood', 'ed wood bestknown film', 'irrelevant'])

In [4]:
# Step through the duplicate search by hand: score every entry against the whole
# list with the plain ratio scorer and print the ranked matches for inspection.
contains_dupes = ['ed wood film', 'cult filmmaker ed wood', 'ed wood bestknown film',  'ed wood sr', 'irrelevant']

# Similarity cut-off above which two entries count as duplicates. It was never
# assigned in this notebook, so the cell raised NameError on a fresh kernel;
# 70 matches the threshold passed to process.dedupe in the preceding cell.
threshold = 70

# NOTE(review): initialised but not filled in this cell.
extractor = []
# iterate over items in *contains_dupes*
for item in contains_dupes:
    # return all duplicate matches found (limit=None disables the result cap)
    matches = process.extract(item, contains_dupes, limit=None, scorer=fuzz.ratio)
    print(matches)
    # filter matches based on the threshold; every entry matches itself with 100,
    # so *filtered* always contains at least the item itself
    filtered = [x for x in matches if x[1] > threshold]

[('ed wood film', 100), ('ed wood sr', 73), ('ed wood bestknown film', 71), ('cult filmmaker ed wood', 41), ('irrelevant', 18)]
[('cult filmmaker ed wood', 100), ('ed wood sr', 44), ('ed wood film', 41), ('ed wood bestknown film', 32), ('irrelevant', 19)]
[('ed wood bestknown film', 100), ('ed wood film', 71), ('ed wood sr', 56), ('cult filmmaker ed wood', 32), ('irrelevant', 19)]
[('ed wood sr', 100), ('ed wood film', 73), ('ed wood bestknown film', 56), ('cult filmmaker ed wood', 44), ('irrelevant', 10)]
[('irrelevant', 100), ('cult filmmaker ed wood', 19), ('ed wood bestknown film', 19), ('ed wood film', 18), ('ed wood sr', 10)]
