In [1]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import numpy as np

In [2]:
# Load the registry dataset through the project's mwdsbe package.
registry = mwdsbe.load_registry()

In [3]:
# Size of the loaded registry (3119 in the recorded run).
len(registry)

3119

In [4]:
# Fetch the commercial activity licenses dataset via the project's dataset class.
# NOTE(review): the name `license` shadows Python's interactive `license`
# builtin; a rename (e.g. `license_data`) would have to be applied to every
# later cell that reads this variable.
license = licenses.CommercialActivityLicenses().download()

In [5]:
# Size of the licenses dataset (203121 in the recorded run).
len(license)

203121

In [6]:
def exact_match(data1, data2, on, how):
    """Return matched rows as a percentage of the rows in ``data1``.

    ``data1`` and ``data2`` are merged with ``skool.exact_merge`` and every
    merged row that has a non-null ``<on>_y`` value is counted as a match.

    Parameters
    ----------
    data1, data2 : table-like objects accepted by ``skool.exact_merge``
    on : str
        Name of the column to merge on.
    how : str
        One of ``'exact'``, ``'contains'`` or ``'startswith'``.

    Returns
    -------
    float
        ``100 * matched_rows / len(data1)``; ``0.0`` when ``data1`` is empty.
        The value counts merged rows, so it can exceed 100 when the merge
        yields more than one matched row per ``data1`` row.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported modes.
    """
    if how not in ["exact", "contains", "startswith"]:
        raise ValueError("how should be one of: 'exact', 'contains', 'startswith'")

    total_n = len(data1)
    if total_n == 0:
        # Nothing to match against; avoid dividing by zero.
        return 0.0

    merged = skool.exact_merge(data1, data2, on=on, how=how)
    # Derive the right-hand column name from `on` rather than hard-coding
    # 'company_name_y'. Assumes the merge suffixes the right key with "_y"
    # for every `on`, as it does for "company_name" — TODO confirm.
    matched = merged.dropna(subset=[on + "_y"])

    return len(matched) / total_n * 100

In [7]:
# find max target score cutoff using binary search
def find_max_fuzzy_match_score(data1, data2, target, on="company_name"):
    """Binary-search the highest fuzzy score cutoff that still reaches ``target``.

    For each candidate cutoff, ``data1`` is fuzzy-merged with ``data2`` via
    ``skool.fuzzy_merge`` and the share of ``data1`` rows with a non-null
    ``<on>_y`` value is compared with ``target``.

    Parameters
    ----------
    data1, data2 : table-like objects accepted by ``skool.fuzzy_merge``
    target : float
        Required matched proportion, as a fraction (``1`` means every row).
    on : str, optional
        Column to merge on. Defaults to ``"company_name"``.

    Returns
    -------
    float or int
        The largest cutoff probed in ``[0, 100]`` whose matched proportion is
        at least ``target``, or ``-1`` if no probed cutoff reaches it (also
        returned when ``data1`` is empty). The search stops once the interval
        is narrower than two score points, so the result is an approximation
        of the true maximum, not an exact value.

    Notes
    -----
    The search assumes the matched proportion does not increase as the
    cutoff is raised.
    """
    total_n = len(data1)
    if total_n == 0:
        # No rows means no proportion can be computed; report "not found".
        return -1

    start = 0
    end = 100
    max_score = -1

    while start <= end:
        mid = (start + end) / 2

        fuzzy_merged = skool.fuzzy_merge(data1, data2, on=on, score_cutoff=mid)
        # Assumes the merge suffixes the right-hand key with "_y" — TODO confirm
        # for columns other than "company_name".
        match_n = len(fuzzy_merged.dropna(subset=[on + "_y"]))
        match_prop = match_n / total_n

        if match_prop >= target:
            # Cutoff is good enough: remember it and try a stricter one.
            # Using >= (not ==) so targets that are never hit exactly still
            # yield a cutoff instead of -1.
            max_score = mid
            start = mid + 1
        else:
            # Too strict: too few rows matched, so relax the cutoff.
            end = mid - 1

    return max_score
        

In [86]:
# Percentage of merged registry rows whose company_name equals a license company_name.
# NOTE(review): this cell's execution count (In [86]) is out of sequence with its
# neighbours; re-run the notebook top to bottom before trusting the recorded outputs.
exact_match(registry, license, on="company_name", how="exact")

4.039756332157743

In [18]:
# Same comparison on the full datasets using the 'contains' matching mode.
exact_match(registry, license, on="company_name", how="contains")

  return func(self, *args, **kwargs)


5.706957358127605

In [19]:
# Same comparison on the full datasets using the 'startswith' matching mode.
exact_match(registry, license, on="company_name", how="startswith")

5.45046489259378

In [8]:
# Take the first 100 registry entries as a small sample for the experiments below.
mini_registry = registry[:100]

In [9]:
# Search for the highest fuzzy score cutoff at which the matched proportion of the
# 100-row sample equals the target of 1 (recorded result: 62.75).
find_max_fuzzy_match_score(mini_registry, license, 1)

62.75

In [10]:
# Clean the company_name column of both datasets before matching.
# The ignore list passed to skool.clean_strings is 'inc', 'group', 'llc' and a
# single space ' ' (the space entry was missing from the earlier comment).
# NOTE(review): punctuation cleaning is presumably what the positional `True`
# enables — confirm against skool.clean_strings' signature.
cleaned_mini_registry = skool.clean_strings(mini_registry, ['company_name'],True, ['inc', 'group', 'llc', ' '])
cleaned_license = skool.clean_strings(license, ['company_name'], True, ['inc', 'group', 'llc', ' '])

In [13]:
# Exact-match percentage on the cleaned 100-row sample (recorded result: 40.0).
exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="exact")

40.0

In [14]:
# 'contains' matching on the cleaned sample.
# NOTE(review): the recorded result is 631.0, i.e. above 100%. exact_match divides
# the number of matched merged rows by len(data1), so this indicates the merge
# returned more matched rows than there are rows in the sample.
exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="contains")

631.0

In [15]:
# 'startswith' matching on the cleaned sample.
# NOTE(review): the recorded result is 248.0, again above 100% because matched
# merged rows are counted rather than distinct sample rows.
exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="startswith")

248.0

In [17]:
# Repeat the fuzzy score-cutoff search on the cleaned sample and cleaned licenses,
# again with a target matched proportion of 1 (recorded result: 56.375).
find_max_fuzzy_match_score(cleaned_mini_registry, cleaned_license, 1)

56.375

In [21]:
# Export the registry to an Excel workbook for manual inspection.
# The file is written relative to the notebook's working directory instead of
# a user-specific absolute Desktop path, so the cell runs on any machine.
# to_excel returns None, so its result is not assigned; index=False omits the
# index column (same effect as the previous index=None).
registry.to_excel("export_dataframe.xlsx", index=False, header=True)

In [79]:
# compare both company_name & dba name
# Apparent intent: fuzzy-match on company_name (cutoff 70), then additionally
# match the registry's dba_name against the license company_name (cutoff 90).
#
# NOTE(review): in the recorded run this cell fails with
# "TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed".
# DataFrame.pipe passes the piped frame as the first positional argument, so
# the second skool.fuzzy_merge call receives three frames positionally
# (previous result, cleaned_mini_registry, cleaned_license), whereas every
# other call in this notebook passes two. Presumably the third frame lands on
# a parameter that expects a hashable value — confirm against the installed
# schuylkill version before changing the call.

# clean company_name and dba name
# NOTE(review): this rebinds cleaned_mini_registry and cleaned_license, which
# an earlier cell created with a different ignore list.
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd']
cleaned_mini_registry = skool.clean_strings(mini_registry, ['company_name', 'dba_name'],True, ignore)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore)

merged = (
    skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=70)
    .pipe(skool.fuzzy_merge, cleaned_mini_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=90)
)
# Keep only rows that found a license match (non-null right-hand company name).
matched = merged.dropna(subset=['company_name_y'])
matched

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed