In [1]:
import pandas as pd
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import time

In [31]:
def drop_duplicates_by_date(df, date_column):
    df.sort_values(by=date_column, ascending=False, inplace=True)
    df = df.loc[~df.index.duplicated(keep="first")]
    df.sort_index(inplace=True)
    return df

In [2]:
registry = mwdsbe.load_registry() # geopandas df
license = licenses.CommercialActivityLicenses().get()

In [3]:
state_license = pd.read_csv('./data/PAStateBusinessLicense/Sales_Tax_Licenses_and_Certificates_Current_Monthly_County_Revenue.csv')

In [4]:
# convert state_license column names from titlecase to snakecase
def to_snake_case(aList):
    res = []
    for item in aList:
        words = item.strip().lower().split(' ')
        item = '_'.join(words)
        res.append(item)
    return res

In [5]:
state_license.columns = to_snake_case(state_license.columns.tolist())

In [6]:
# clean data
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore_words)
cleaned_state_license = skool.clean_strings(state_license, ['legal_name', 'trade_name'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name'])
cleaned_license = cleaned_license.dropna(subset=['company_name'])

In [8]:
pa_registry = cleaned_registry[cleaned_registry.location_state == 'PA']

In [13]:
len(pa_registry)

1769

In [9]:
cleaned_state_license.head()

Unnamed: 0,county,legal_name,trade_name,postal_code,country,account_number,license_type,expiration_date,address_with_lat/long
0,ADAMS,nugen energy,nugen energy,17331,US,67**9551,Retail,09/30/2021,"1601 CARLISLE PIKE\nHANOVER, PA 17331\n(39.834..."
1,ALLEGHENY,cellux,mobilexpress west mifflin,15123,US,85**6245,Retail,01/31/2021,"3075 CLAIRON RD CENTURY III MA\nWEST MIFFLIN, ..."
2,ALLEGHENY,dina persichetti hill,nails at last,15101,US,81**6740,Retail,02/29/2024,"4085 WM FLYNN HWY STE 10A\nALLISON PARK, PA 15101"
3,ADAMS,elwood mummert,mummert elwood k,17331-7700,US,81**7441,Retail,01/31/2021,"700 PINE GROVE RD\nHANOVER, PA 17331-7700\n(39..."
4,ALLEGHENY,cosmograce,brillobox,15224,US,67**9185,Retail,09/30/2021,"4104 PENN AVE\nPITTSBURGH, PA 15224\n(40.46574..."


## Merge PA_Registry and State License with Fuzz95 and TF-IDF85

In [10]:
t1 = time.time()
merged = (
    skool.fuzzy_merge(pa_registry, cleaned_state_license, left_on="company_name", right_on="legal_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, pa_registry, cleaned_state_license, left_on="company_name", right_on="trade_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, pa_registry, cleaned_state_license, left_on="dba_name", right_on="legal_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, pa_registry, cleaned_state_license, left_on="dba_name", right_on="trade_name", score_cutoff=95)
    .pipe(skool.tf_idf_merge, pa_registry, cleaned_state_license, left_on="company_name", right_on="legal_name", score_cutoff=85)
    .pipe(skool.tf_idf_merge, pa_registry, cleaned_state_license, left_on="company_name", right_on="trade_name", score_cutoff=85)
    .pipe(skool.tf_idf_merge, pa_registry, cleaned_state_license, left_on="dba_name", right_on="legal_name", score_cutoff=85)
    .pipe(skool.tf_idf_merge, pa_registry, cleaned_state_license, left_on="dba_name", right_on="trade_name", score_cutoff=85)
)
t = time.time() - t1

In [22]:
# merged.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\state_license\pa-registry-full-state-license\fuzz95-tfidf85.xlsx', header=True)

In [11]:
print('Execution time:', t/60, 'min')

Execution time: 51.068942018349965 min


In [12]:
len(merged)

1769

In [46]:
matched = merged.dropna(subset=['legal_name', 'dba_name'], how='all')

In [47]:
len(matched)

583

In [43]:
# test = matched[['company_name', 'legal_name', 'dba_name', 'zip_code', 'postal_code_clean', 'match_probability']]

In [48]:
matched['expiration_date'] = pd.to_datetime(matched['expiration_date'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [49]:
matched = drop_duplicates_by_date(matched, 'expiration_date')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [50]:
len(matched)

583

##### Load OL data

In [36]:
matched_OL = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\license-opendataphilly\tf-idf\tf-idf-85.xlsx')
matched_OL = matched_OL.set_index('left_index')

In [37]:
len(matched_OL)

1642

In [38]:
matched_OL = drop_duplicates_by_date(matched_OL, "issue_date") # without duplicates

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [39]:
len(matched_OL)

1502

In [40]:
difference = matched_zip.index.difference(matched_OL.index).tolist()

In [41]:
len(difference)

98