Applying fuzzy matching (score cutoff 95) + TF-IDF matching (score cutoff 85) to all data
* Data
    * Registry
    * License Data
    * State License Data
    * Payments
    * Professional Services Payments

In [18]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import pandas as pd
import numpy as np
import time

In [11]:
# convert state_license column names from titlecase to snakecase
def to_snake_case(aList):
    """Convert space-separated labels (e.g. title-case column names) to snake_case.

    Each item is lower-cased and its whitespace-separated words are joined
    with a single underscore. Leading/trailing whitespace is discarded, and
    any run of whitespace (repeated spaces, tabs) counts as one separator, so
    ``"Legal  Name"`` becomes ``"legal_name"`` rather than ``"legal__name"``.
    Non-whitespace characters such as ``/`` are left untouched.

    Parameters
    ----------
    aList : iterable of str
        Labels to convert.

    Returns
    -------
    list of str
        Converted labels, in the same order as the input.
    """
    # str.split() with no separator both trims the ends and collapses
    # consecutive whitespace, which split(' ') does not.
    return ['_'.join(item.lower().split()) for item in aList]

In [3]:
# Locations of the datasets that are read straight from disk.
# NOTE(review): the payments and professional-services paths are absolute
# paths on a single Windows machine, so this cell will not run elsewhere
# as-is; consider pointing them at a shared, relative data directory.
state_license_csv = './data/state_business_license/Sales_Tax_Licenses_and_Certificates_Current_Monthly_County_Revenue.csv'
payments_csv = r'C:\Users\dabinlee\Documents\GitHub\mwdsbe\mwdsbe\data\Payments\city_payments_detailed_2017.csv'
ps_xlsx = r'C:\Users\dabinlee\Desktop\mwdsbe\data\professional_services\cleaned_ps.xlsx'

# Registry and commercial activity licenses come from the mwdsbe package.
registry = mwdsbe.load_registry()  # geopandas df
license = licenses.CommercialActivityLicenses().get()

# State licenses, payments and professional services come from local files.
state_license = pd.read_csv(state_license_csv)
payments = pd.read_csv(payments_csv)
ps = pd.read_excel(ps_xlsx)

In [12]:
# Replace the state license headers with their snake_case equivalents so the
# columns can be referenced as plain identifiers (e.g. legal_name, trade_name).
snake_case_columns = to_snake_case(list(state_license.columns))
state_license.columns = snake_case_columns

In [13]:
# Display the current column labels of the state license frame.
state_license.columns

Index(['county', 'legal_name', 'trade_name', 'postal_code', 'country',
       'account_number', 'license_type', 'expiration_date',
       'address_with_lat/long'],
      dtype='object')

In [15]:
# Words passed to skool.clean_strings as its fourth positional argument
# (business suffixes and other generic terms).
ignore_words = [
    'inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co',
    'associates', 'services', 'company', 'enterprises', 'enterprise',
    'service', 'corporation',
]

# Clean the name columns of every dataset, then discard rows whose primary
# name column is missing after cleaning.
# NOTE(review): the third positional argument (True) is passed through as in
# the original call; confirm its meaning against skool.clean_strings.
cleaned_registry = (
    skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
    .dropna(subset=['company_name'])
)
cleaned_license = (
    skool.clean_strings(license, ['company_name'], True, ignore_words)
    .dropna(subset=['company_name'])
)
# State license rows are kept as-is: no null filtering is applied on
# legal_name / trade_name.
cleaned_SL = skool.clean_strings(state_license, ['legal_name', 'trade_name'], True, ignore_words)
cleaned_payments = (
    skool.clean_strings(payments, ['vendor_name'], True, ignore_words)
    .dropna(subset=['vendor_name'])
)
cleaned_ps = (
    skool.clean_strings(ps, ['vendor'], True, ignore_words)
    .dropna(subset=['vendor'])
)

In [16]:
# Registry rows whose location_state is 'PA'; used as the left side of the
# state license merges.
is_pa = cleaned_registry.location_state == 'PA'
pa_registry = cleaned_registry[is_pa]

In [22]:
t1 = time.time()

# One entry per dataset: (left frame, right frame, column pairings).
# Each pairing holds the keyword arguments that name the columns to compare.
match_plan = [
    # license
    (cleaned_registry, cleaned_license, [
        {"on": "company_name"},
        {"left_on": "dba_name", "right_on": "company_name"},
    ]),
    # SL
    (pa_registry, cleaned_SL, [
        {"left_on": "company_name", "right_on": "legal_name"},
        {"left_on": "company_name", "right_on": "trade_name"},
        {"left_on": "dba_name", "right_on": "legal_name"},
        {"left_on": "dba_name", "right_on": "trade_name"},
    ]),
    # payments
    (cleaned_registry, cleaned_payments, [
        {"left_on": "company_name", "right_on": "vendor_name"},
        {"left_on": "dba_name", "right_on": "vendor_name"},
    ]),
    # ps
    (cleaned_registry, cleaned_ps, [
        {"left_on": "company_name", "right_on": "vendor"},
        {"left_on": "dba_name", "right_on": "vendor"},
    ]),
]

# For every dataset, all fuzzy merges (cutoff 95) run first, followed by all
# TF-IDF merges (cutoff 85).
matchers = [(skool.fuzzy_merge, 95), (skool.tf_idf_merge, 85)]

merged = None
for left_frame, right_frame, pairings in match_plan:
    for merge_fn, cutoff in matchers:
        for pairing in pairings:
            if merged is None:
                # The very first merge is called directly on the two frames.
                merged = merge_fn(left_frame, right_frame, **pairing, score_cutoff=cutoff)
            else:
                # Every later merge receives the running result as its first
                # argument via DataFrame.pipe.
                merged = merged.pipe(merge_fn, left_frame, right_frame, **pairing, score_cutoff=cutoff)
t = time.time() - t1

In [23]:
# Report the elapsed wall-clock time of the merge cell, converted from
# seconds to minutes.
elapsed_minutes = t / 60
print('Execution time:', elapsed_minutes, 'min')

Execution time: 69.15448158184687 min


In [26]:
# Display the frame produced by the merge chain.
merged

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,doc_ref_no_prefix,doc_ref_no_prefix_definition,contract_number,contract_description,transaction_amount,department_name,vendor,tot_payments,fy_year,fy_quarter
0,119 degrees architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,...,,,,,,,,,,
1,,,JEFFREY,YEKENCHIK,236 McKendimen Road,Medford Lakes,NJ,8055.0,236 McKendimen Road,Medford Lakes,...,,,,,,,,,,
2,,provisio,Kathrina,Nease,133 N. 21st Street,Camp Hill,PA,17011.0,133 N. 21st Street,Camp Hill,...,,,,,,,,,,
3,,two twelve,Ann,Harakawa,"236 W 27th Street, Suite 802",New York,NY,10001.0,"236 W 27th Street, Suite 802",New York,...,,,,,,,,,,
4,,,Dewain,Johnson,810 Felton Avenue,Sharon Hill,PA,19079.0,810 Felton Avenue,Sharon Hill,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3114,,,Jason,Wingard,"536 W. Moreland Avenue, Suite C.",Philadelphia,PA,19118.0,"536 W. Moreland Avenue, Suite C.",Philadelphia,...,,,,,,,,,,
3115,zones,,Mr.,Lalji,1102 15TH ST SW,Auburn,WA,98001.0,1102 15TH ST SW,Auburn,...,,,,,,,,,,
3116,,,Susan,Wilson,25 Willow Run Drive,Kane,PA,16735.0,25 Willow Run Drive,Kane,...,,,,,,,,,,
3117,zora,,Charles,O'Reilly,1901 Dorcas Lane,Wilmington,DE,19806.0,1901 Dorcas Lane,Wilmington,...,,,,,,,,,,


In [24]:
# Keep only the rows of `merged` that have a non-null match column for every
# dataset. The filters are applied one after another, so they combine with
# AND: a row must have a value in all four groups to be retained.
# NOTE(review): if the goal is "matched in at least one dataset", replace the
# chain with a single dropna(subset=[...all columns...], how='all').
matched = (
    merged
    # presumably the license-side name (suffix _y from the company_name merge)
    .dropna(subset=['company_name_y'])
    # State license match: either state license name column may carry it.
    # The check uses trade_name rather than dba_name, because dba_name is a
    # registry column and says nothing about whether a state license matched.
    .dropna(subset=['legal_name', 'trade_name'], how='all')
    # payments
    .dropna(subset=['vendor_name'])
    # professional services
    .dropna(subset=['vendor'])
)

In [25]:
# Number of rows remaining in `matched`.
len(matched)

0