# TF-IDF
Joining the registry and license data using the TF-IDF string-matching algorithm.

In [7]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import time

In [2]:
# Load the two inputs that the rest of the notebook joins.
registry = mwdsbe.load_registry()  # geopandas df
license_dataset = licenses.CommercialActivityLicenses()
# NOTE(review): `license` shadows the interactive `license` builtin; the name is
# kept because later cells read it.
license = license_dataset.download()

In [34]:
# Clean the name columns of both datasets before matching.
# Tokens handed to skool.clean_strings as its last argument (original order kept).
ignore_words = [
    'inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd',
    'co', 'associates', 'services', 'company', 'enterprises',
    'enterprise', 'service',
]
registry_name_columns = ['company_name', 'dba_name']
license_name_columns = ['company_name']
# The third positional argument (True) is forwarded unchanged; its meaning is
# defined by skool.clean_strings, which is not visible in this notebook.
cleaned_registry = skool.clean_strings(registry, registry_name_columns, True, ignore_words)
cleaned_license = skool.clean_strings(license, license_name_columns, True, ignore_words)

In [35]:
# Report how many rows the cleaned registry holds.
registry_row_count = len(cleaned_registry)
print('Total number of cleaned registry:', registry_row_count)

Total number of cleaned registry: 3119


In [36]:
# Report how many rows the cleaned license data holds.
license_row_count = len(cleaned_license)
print('Total number of cleaned license:', license_row_count)

Total number of cleaned license: 203578


## 1. Score-cutoff 90

In [64]:
# Two-pass TF-IDF merge at a score cutoff of 90, timed end to end.
SCORE_CUTOFF = 90  # single source for the cutoff shared by both passes below

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if that clock
# is adjusted while the merge is running.
t1 = time.perf_counter()
merged = (
    # Pass 1: registry vs. license on the shared company_name column.
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=SCORE_CUTOFF)
    # Pass 2: .pipe hands the pass-1 result to tf_idf_merge as its leading
    # argument, this time pairing registry dba_name with license company_name.
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=SCORE_CUTOFF)
)
t = time.perf_counter() - t1  # elapsed seconds; read by the following print cell

In [65]:
# Show the elapsed time (in seconds) stored in `t` by the merge cell.
timing_report = ('Execution time:', t, 'sec')
print(*timing_report)

Execution time: 186.1156096458435 sec


In [66]:
# Keep only rows where company_name_y is populated (presumably the license-side
# name filled in by a successful match — TODO confirm against skool).
required_columns = ['company_name_y']
matched = merged.dropna(subset=required_columns)

In [67]:
# Matched rows relative to the size of the cleaned registry.
matched_count, registry_count = len(matched), len(cleaned_registry)
print('Match:', matched_count, 'out of', registry_count)

Match: 1376 out of 3119


In [68]:
# Rows scoring below 0.999999 are treated as non-exact matches. Only the upper
# bound is applied here; the lower bound comes from the cutoff used in the merge.
review_columns = ['company_name_x', 'match_probability', 'company_name_y']
below_exact = matched['match_probability'] < 0.999999
non_exact_match = matched[below_exact][review_columns]
print('Non-exact match above 90:', len(non_exact_match), 'out of', len(matched))

Non-exact match above 90: 88 out of 1376


### TODO: Check with Nick if this seems promising or not

In [69]:
# Export of the non-exact matches for manual review (currently disabled).
# NOTE(review): the target is a hard-coded, user-specific absolute Windows path;
# replace it with a configurable relative path before re-enabling this line.
# non_exact_match.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-90.xlsx', index = None, header=True)

## 2. Score-cutoff 85

In [70]:
# Two-pass TF-IDF merge at a score cutoff of 85, timed end to end.
SCORE_CUTOFF = 85  # single source for the cutoff shared by both passes below

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if that clock
# is adjusted while the merge is running.
t1 = time.perf_counter()
merged = (
    # Pass 1: registry vs. license on the shared company_name column.
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=SCORE_CUTOFF)
    # Pass 2: .pipe hands the pass-1 result to tf_idf_merge as its leading
    # argument, this time pairing registry dba_name with license company_name.
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=SCORE_CUTOFF)
)
t = time.perf_counter() - t1  # elapsed seconds; read by the following print cell

In [71]:
# Show the elapsed time (in seconds) stored in `t` by the merge cell.
timing_report = ('Execution time:', t, 'sec')
print(*timing_report)

Execution time: 190.3540334701538 sec


In [72]:
# Keep only rows where company_name_y is populated (presumably the license-side
# name filled in by a successful match — TODO confirm against skool).
required_columns = ['company_name_y']
matched = merged.dropna(subset=required_columns)

In [73]:
# Matched rows relative to the size of the cleaned registry.
matched_count, registry_count = len(matched), len(cleaned_registry)
print('Match:', matched_count, 'out of', registry_count)

Match: 1485 out of 3119


In [76]:
# Rows scoring under 0.9 are set aside for manual checking. Only the upper bound
# is applied here; the lower bound relies on the 85 cutoff used by the merge cell.
review_columns = ['company_name_x', 'match_probability', 'company_name_y']
below_upper_bound = matched['match_probability'] < 0.9
match_to_check = matched[below_upper_bound][review_columns]
print('Match between 85 and 90:', len(match_to_check), 'out of', len(matched))

Match between 85 and 90: 111 out of 1485


In [77]:
# Export of the rows selected for manual review (currently disabled).
# NOTE(review): the target is a hard-coded, user-specific absolute Windows path;
# replace it with a configurable relative path before re-enabling this line.
# match_to_check.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-85.xlsx', index = None, header=True)

## 3. Score-cutoff 80

In [78]:
# Two-pass TF-IDF merge at a score cutoff of 80, timed end to end.
SCORE_CUTOFF = 80  # single source for the cutoff shared by both passes below

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if that clock
# is adjusted while the merge is running.
t1 = time.perf_counter()
merged = (
    # Pass 1: registry vs. license on the shared company_name column.
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=SCORE_CUTOFF)
    # Pass 2: .pipe hands the pass-1 result to tf_idf_merge as its leading
    # argument, this time pairing registry dba_name with license company_name.
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=SCORE_CUTOFF)
)
t = time.perf_counter() - t1  # elapsed seconds; read by the following print cell

In [79]:
# Show the elapsed time (in seconds) stored in `t` by the merge cell.
timing_report = ('Execution time:', t, 'sec')
print(*timing_report)

Execution time: 192.23722171783447 sec


In [80]:
# Keep only rows where company_name_y is populated (presumably the license-side
# name filled in by a successful match — TODO confirm against skool).
required_columns = ['company_name_y']
matched = merged.dropna(subset=required_columns)

In [81]:
# Matched rows relative to the size of the cleaned registry.
matched_count, registry_count = len(matched), len(cleaned_registry)
print('Match:', matched_count, 'out of', registry_count)

Match: 1650 out of 3119


In [82]:
# Rows scoring under 0.85 are set aside for manual checking. Only the upper bound
# is applied here; the lower bound relies on the 80 cutoff used by the merge cell.
review_columns = ['company_name_x', 'match_probability', 'company_name_y']
below_upper_bound = matched['match_probability'] < 0.85
match_to_check = matched[below_upper_bound][review_columns]
print('Match between 80 and 85:', len(match_to_check), 'out of', len(matched))

Match between 80 and 85: 170 out of 1650


In [83]:
# Export of the rows selected for manual review (currently disabled).
# NOTE(review): the target is a hard-coded, user-specific absolute Windows path;
# replace it with a configurable relative path before re-enabling this line.
# match_to_check.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-80.xlsx', index = None, header=True)

## 4. Score-cutoff 75

In [84]:
# Two-pass TF-IDF merge at a score cutoff of 75, timed end to end.
SCORE_CUTOFF = 75  # single source for the cutoff shared by both passes below

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if that clock
# is adjusted while the merge is running.
t1 = time.perf_counter()
merged = (
    # Pass 1: registry vs. license on the shared company_name column.
    skool.tf_idf_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=SCORE_CUTOFF)
    # Pass 2: .pipe hands the pass-1 result to tf_idf_merge as its leading
    # argument, this time pairing registry dba_name with license company_name.
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=SCORE_CUTOFF)
)
t = time.perf_counter() - t1  # elapsed seconds; read by the following print cell

In [85]:
# Show the elapsed time (in seconds) stored in `t` by the merge cell.
timing_report = ('Execution time:', t, 'sec')
print(*timing_report)

Execution time: 191.79217743873596 sec


In [86]:
# Keep only rows where company_name_y is populated (presumably the license-side
# name filled in by a successful match — TODO confirm against skool).
required_columns = ['company_name_y']
matched = merged.dropna(subset=required_columns)

In [87]:
# Matched rows relative to the size of the cleaned registry.
matched_count, registry_count = len(matched), len(cleaned_registry)
print('Match:', matched_count, 'out of', registry_count)

Match: 1859 out of 3119


In [88]:
# Rows scoring under 0.8 are set aside for manual checking. Only the upper bound
# is applied here; the lower bound relies on the 75 cutoff used by the merge cell.
review_columns = ['company_name_x', 'match_probability', 'company_name_y']
below_upper_bound = matched['match_probability'] < 0.8
match_to_check = matched[below_upper_bound][review_columns]
print('Match between 75 and 80:', len(match_to_check), 'out of', len(matched))

Match between 75 and 80: 215 out of 1859


In [89]:
# Export of the rows selected for manual review (currently disabled).
# NOTE(review): the target is a hard-coded, user-specific absolute Windows path;
# replace it with a configurable relative path before re-enabling this line.
# match_to_check.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\tf-idf\tf-idf-75.xlsx', index = None, header=True)