# TF-IDF
Joining the registry and license data using the TF-IDF string-matching algorithm, and comparing the number of matches obtained at score cutoffs of 90, 85, 80, and 75.

In [7]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import time

In [2]:
# Load the two datasets that the rest of the notebook joins on company name.
registry = mwdsbe.load_registry() # geopandas df
# Downloaded via CommercialActivityLicenses; presumably also a DataFrame -- TODO confirm.
# NOTE(review): the name `license` shadows Python's interactive `license` builtin.
license = licenses.CommercialActivityLicenses().download()

In [90]:
# Clean the name columns of both tables before matching.
# These words are handed to the cleaner as the list of terms to ignore.
ignore_words = [
    'inc', 'group', 'llc', 'corp', 'pc',
    'incorporated', 'ltd', 'co', 'associates', 'services',
    'company', 'enterprises', 'enterprise', 'service', 'corporation',
]

# The registry has two name columns (legal name and DBA name); the license data has one.
# NOTE(review): the third positional argument (True) is an opaque flag from this view --
# confirm its meaning against skool.clean_strings.
cleaned_registry = skool.clean_strings(
    registry,
    ['company_name', 'dba_name'],
    True,
    ignore_words,
)
cleaned_license = skool.clean_strings(
    license,
    ['company_name'],
    True,
    ignore_words,
)

In [91]:
# Report the size (len) of the cleaned registry table.
print('Total number of cleaned registry:', len(cleaned_registry))

Total number of cleaned registry: 3119


In [92]:
# Report the size (len) of the cleaned license table.
print('Total number of cleaned license:', len(cleaned_license))

Total number of cleaned license: 203578


## 1. Score-cutoff 90

In [93]:
# Timed two-pass TF-IDF join at score_cutoff=90.
# Pass 1 matches on `company_name` in both tables; its result is forwarded by `.pipe`
# as the leading argument of pass 2, which matches the registry's `dba_name` against
# the license `company_name`.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval is not
# affected by system clock adjustments the way time.time() can be.
t1 = time.perf_counter()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=90)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=90)
)
# Elapsed seconds for both passes.
t = time.perf_counter() - t1

In [94]:
# `t` holds the elapsed seconds recorded around the merge in the previous cell.
print('Execution time:', t, 'sec')

Execution time: 186.29000186920166 sec


In [95]:
# Keep only the rows that received a license-side name, i.e. rows where
# `company_name_y` is not null.
has_license_name = merged['company_name_y'].notna()
matched = merged[has_license_name]
del has_license_name  # do not leave the helper mask in the notebook namespace

In [96]:
# Rows with a non-null `company_name_y` versus the size of the cleaned registry.
print('Match:', len(matched), 'out of', len(cleaned_registry))

Match: 1391 out of 3119


In [97]:
# Rows whose score is below 0.999999 are treated as non-exact matches; only the two
# names and the score are kept so each pair can be inspected side by side.
non_exact_match = matched.loc[
    matched['match_probability'] < 0.999999,
    ['company_name_x', 'match_probability', 'company_name_y'],
]
print(
    'Non-exact match above 90:',
    len(non_exact_match),
    'out of',
    len(matched),
)

Non-exact match above 90: 88 out of 1391


In [98]:
# Disabled export of the non-exact matches to Excel for manual review.
# NOTE(review): the target is an absolute, user-specific Windows path; switch to a
# project-relative path before re-enabling.
# non_exact_match.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-90.xlsx', index = None, header=True)

## 2. Score-cutoff 85

In [99]:
# Timed two-pass TF-IDF join at score_cutoff=85.
# Pass 1 matches on `company_name` in both tables; its result is forwarded by `.pipe`
# as the leading argument of pass 2, which matches the registry's `dba_name` against
# the license `company_name`.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval is not
# affected by system clock adjustments the way time.time() can be.
t1 = time.perf_counter()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=85)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=85)
)
# Elapsed seconds for both passes.
t = time.perf_counter() - t1

In [100]:
# `t` holds the elapsed seconds recorded around the merge in the previous cell.
print('Execution time:', t, 'sec')

Execution time: 187.34773302078247 sec


In [101]:
# Keep only the rows that received a license-side name, i.e. rows where
# `company_name_y` is not null.
has_license_name = merged['company_name_y'].notna()
matched = merged[has_license_name]
del has_license_name  # do not leave the helper mask in the notebook namespace

In [102]:
# Rows with a non-null `company_name_y` versus the size of the cleaned registry.
print('Match:', len(matched), 'out of', len(cleaned_registry))

Match: 1499 out of 3119


In [103]:
# Rows scoring below 0.9: the matches this run admits that the printed label describes
# as lying between the 85 and 90 cutoffs. Only the two names and the score are kept.
# NOTE(review): assumes match_probability is on a 0-1 scale while score_cutoff is 0-100 -- confirm.
match_to_check = matched.loc[
    matched['match_probability'] < 0.9,
    ['company_name_x', 'match_probability', 'company_name_y'],
]
print(
    'Match between 85 and 90:',
    len(match_to_check),
    'out of',
    len(matched),
)

Match between 85 and 90: 111 out of 1499


In [104]:
# Disabled export of the rows to check to Excel for manual review.
# NOTE(review): the target is an absolute, user-specific Windows path; switch to a
# project-relative path before re-enabling.
# match_to_check.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-85.xlsx', index = None, header=True)

## 3. Score-cutoff 80

In [105]:
# Timed two-pass TF-IDF join at score_cutoff=80.
# Pass 1 matches on `company_name` in both tables; its result is forwarded by `.pipe`
# as the leading argument of pass 2, which matches the registry's `dba_name` against
# the license `company_name`.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval is not
# affected by system clock adjustments the way time.time() can be.
t1 = time.perf_counter()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=80)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=80)
)
# Elapsed seconds for both passes.
t = time.perf_counter() - t1

In [106]:
# `t` holds the elapsed seconds recorded around the merge in the previous cell.
print('Execution time:', t, 'sec')

Execution time: 188.21181917190552 sec


In [107]:
# Keep only the rows that received a license-side name, i.e. rows where
# `company_name_y` is not null.
has_license_name = merged['company_name_y'].notna()
matched = merged[has_license_name]
del has_license_name  # do not leave the helper mask in the notebook namespace

In [108]:
# Rows with a non-null `company_name_y` versus the size of the cleaned registry.
print('Match:', len(matched), 'out of', len(cleaned_registry))

Match: 1666 out of 3119


In [109]:
# Rows scoring below 0.85: the matches this run admits that the printed label describes
# as lying between the 80 and 85 cutoffs. Only the two names and the score are kept.
# NOTE(review): assumes match_probability is on a 0-1 scale while score_cutoff is 0-100 -- confirm.
match_to_check = matched.loc[
    matched['match_probability'] < 0.85,
    ['company_name_x', 'match_probability', 'company_name_y'],
]
print(
    'Match between 80 and 85:',
    len(match_to_check),
    'out of',
    len(matched),
)

Match between 80 and 85: 172 out of 1666


In [110]:
# Disabled export of the rows to check to Excel for manual review.
# NOTE(review): the target is an absolute, user-specific Windows path; switch to a
# project-relative path before re-enabling.
# match_to_check.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-80.xlsx', index = None, header=True)

## 4. Score-cutoff 75

In [111]:
# Timed two-pass TF-IDF join at score_cutoff=75.
# Pass 1 matches on `company_name` in both tables; its result is forwarded by `.pipe`
# as the leading argument of pass 2, which matches the registry's `dba_name` against
# the license `company_name`.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval is not
# affected by system clock adjustments the way time.time() can be.
t1 = time.perf_counter()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=75)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=75)
)
# Elapsed seconds for both passes.
t = time.perf_counter() - t1

In [112]:
# `t` holds the elapsed seconds recorded around the merge in the previous cell.
print('Execution time:', t, 'sec')

Execution time: 186.20661854743958 sec


In [113]:
# Keep only the rows that received a license-side name, i.e. rows where
# `company_name_y` is not null.
has_license_name = merged['company_name_y'].notna()
matched = merged[has_license_name]
del has_license_name  # do not leave the helper mask in the notebook namespace

In [114]:
# Rows with a non-null `company_name_y` versus the size of the cleaned registry.
print('Match:', len(matched), 'out of', len(cleaned_registry))

Match: 1868 out of 3119


In [115]:
# Rows scoring below 0.8: the matches this run admits that the printed label describes
# as lying between the 75 and 80 cutoffs. Only the two names and the score are kept.
# NOTE(review): assumes match_probability is on a 0-1 scale while score_cutoff is 0-100 -- confirm.
match_to_check = matched.loc[
    matched['match_probability'] < 0.8,
    ['company_name_x', 'match_probability', 'company_name_y'],
]
print(
    'Match between 75 and 80:',
    len(match_to_check),
    'out of',
    len(matched),
)

Match between 75 and 80: 208 out of 1868


In [116]:
# Disabled export of the rows to check to Excel for manual review.
# NOTE(review): the target is an absolute, user-specific Windows path; switch to a
# project-relative path before re-enabling.
# match_to_check.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-75.xlsx', index = None, header=True)