# Applying Fuzzywuzzy and TF-IDF on MWDSBE
* Using a Fuzzywuzzy score cutoff of 95 combined with TF-IDF score cutoffs of 90 and 85

In [1]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import time

# Load the two tables that will be matched against each other.
registry = mwdsbe.load_registry()  # geopandas df
# NOTE(review): `license` shadows Python's interactive `license` builtin; the
# name is kept because other cells may refer to it.
license = licenses.CommercialActivityLicenses().download()

# clean data
# Words handed to skool.clean_strings as the ignore list (presumably stripped
# from the names before matching — confirm against the schuylkill docs).
ignore_words = [
    'inc',
    'group',
    'llc',
    'corp',
    'pc',
    'incorporated',
    'ltd',
    'co',
    'associates',
    'services',
    'company',
    'enterprises',
    'enterprise',
    'service',
    'corporation',
]

# Name columns to clean in each table: the registry has both a company name
# and a DBA name, the license table is cleaned on its company name only.
registry_name_columns = ['company_name', 'dba_name']
license_name_columns = ['company_name']

# The third positional argument (True) is passed through unchanged; its
# meaning is defined by skool.clean_strings — TODO confirm.
cleaned_registry = skool.clean_strings(
    registry, registry_name_columns, True, ignore_words
)
cleaned_license = skool.clean_strings(
    license, license_name_columns, True, ignore_words
)

In [2]:
# Report the number of rows in the cleaned registry table.
registry_row_count = len(cleaned_registry)
print('Total number of cleaned registry:', registry_row_count)

Total number of cleaned registry: 3119


In [3]:
# Report the number of rows in the cleaned license table.
license_row_count = len(cleaned_license)
print('Total number of cleaned license:', license_row_count)

Total number of cleaned license: 203578


## 1. Fuzz 95 + TF-IDF 90

In [4]:
# Time the whole matching pipeline. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value cannot be distorted by a system
# clock adjustment while this long-running cell executes (time.time() can be).
t1 = time.perf_counter()

# Four matching passes chained with .pipe; each .pipe call hands the result of
# the previous pass to the next skool function as its first argument, followed
# by the cleaned registry and license tables.
#   1. fuzzy match, registry company_name vs. license company_name (cutoff 95)
#   2. fuzzy match, registry dba_name     vs. license company_name (cutoff 95)
#   3. TF-IDF match, registry company_name vs. license company_name (cutoff 90)
#   4. TF-IDF match, registry dba_name     vs. license company_name (cutoff 90)
# NOTE(review): how each pass combines with earlier results is decided inside
# schuylkill — confirm against its documentation.
merged = (
    skool.fuzzy_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, on="company_name", score_cutoff=90)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=90)
)

# Elapsed time in seconds.
t = time.perf_counter() - t1

Execution time: 1114.783467054367 sec


In [5]:
# `t` holds the elapsed seconds of the merge above; show it in minutes.
elapsed_minutes = t / 60
print('Execution time:', elapsed_minutes, 'min')

Execution time: 18.579724450906117 min


In [6]:
# Keep only the rows whose `company_name_y` is populated.
# Presumably `company_name_y` is the license-side company name added by the
# merge, so a missing value means no match was found — TODO confirm.
match_indicator_columns = ['company_name_y']
matched = merged.dropna(subset=match_indicator_columns)

In [7]:
# Compare the number of matched rows with the size of the cleaned registry.
matched_count = len(matched)
registry_count = len(cleaned_registry)
print('Match:', matched_count, 'out of', registry_count)

Match: 1480 out of 3119


In [8]:
# matched.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\fuzz_and_tfidf\fuzz95_tfidf90.xlsx', index = None, header=True)

## 2. Fuzz 95 + TF-IDF 85

In [9]:
# Time the whole matching pipeline. perf_counter() is a monotonic,
# high-resolution clock, so the elapsed value cannot be distorted by a system
# clock adjustment while this long-running cell executes (time.time() can be).
t1 = time.perf_counter()

# Same four-pass pipeline as section 1, but with the TF-IDF cutoff lowered
# from 90 to 85; each .pipe call hands the result of the previous pass to the
# next skool function as its first argument.
#   1. fuzzy match, registry company_name vs. license company_name (cutoff 95)
#   2. fuzzy match, registry dba_name     vs. license company_name (cutoff 95)
#   3. TF-IDF match, registry company_name vs. license company_name (cutoff 85)
#   4. TF-IDF match, registry dba_name     vs. license company_name (cutoff 85)
# NOTE(review): how each pass combines with earlier results is decided inside
# schuylkill — confirm against its documentation.
merged = (
    skool.fuzzy_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=95)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, on="company_name", score_cutoff=85)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=85)
)

# Elapsed time in seconds.
t = time.perf_counter() - t1

In [10]:
# `t` holds the elapsed seconds of the merge above; show it in minutes.
elapsed_minutes = t / 60
print('Execution time:', elapsed_minutes, 'min')

Execution time: 18.664799781640372 min


In [11]:
# Keep only the rows whose `company_name_y` is populated.
# Presumably `company_name_y` is the license-side company name added by the
# merge, so a missing value means no match was found — TODO confirm.
match_indicator_columns = ['company_name_y']
matched = merged.dropna(subset=match_indicator_columns)

In [12]:
# Compare the number of matched rows with the size of the cleaned registry.
matched_count = len(matched)
registry_count = len(cleaned_registry)
print('Match:', matched_count, 'out of', registry_count)

Match: 1566 out of 3119


In [13]:
# matched.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\fuzz_and_tfidf\fuzz95_tfidf85.xlsx', index = None, header=True)