In [1]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import numpy as np

In [2]:
registry = mwdsbe.load_registry()

In [3]:
len(registry)

3119

In [4]:
# Download the commercial activity licenses via the mwdsbe dataset wrapper.
# NOTE(review): the name `license` shadows the `license` helper that Python's `site`
# module adds to builtins. Later cells in this notebook read this name, so it is
# left unchanged here.
license = licenses.CommercialActivityLicenses().download()

In [5]:
len(license)

203121

In [6]:
def exact_match(data1, data2, on, how):
    """Return the percentage of merged rows that found a match.

    Merges ``data1`` with ``data2`` through ``skool.exact_merge`` and counts the
    merged rows whose ``company_name_y`` is not null.

    Parameters
    ----------
    data1, data2 : DataFrame
        Left and right frames handed to ``skool.exact_merge``.
    on : str
        Column name forwarded to ``skool.exact_merge``.
    how : {"exact", "contains", "startswith"}
        Matching strategy forwarded to ``skool.exact_merge``.

    Returns
    -------
    float
        ``len(matched) / len(data1) * 100``, or ``0.0`` when ``data1`` is empty.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported strategies.

    Notes
    -----
    The numerator counts merged rows, not distinct rows of ``data1``, so the
    result exceeds 100 whenever the merge returns more matched rows than
    ``data1`` has rows. The matched-row check is hard-coded to
    ``company_name_y`` regardless of ``on``.
    """
    if how not in ("exact", "contains", "startswith"):
        raise ValueError("how should be one of: 'exact', 'contains', 'startswith'")

    total = len(data1)
    if total == 0:
        # Nothing to match against; avoids a ZeroDivisionError below.
        return 0.0

    merged = skool.exact_merge(data1, data2, on=on, how=how)
    matched = merged.dropna(subset=['company_name_y'])

    return len(matched) / total * 100

In [7]:
# find max target score cutoff using binary search
def find_max_fuzzy_match_score(data1, data2, target):
    """Search for the highest score cutoff that still reaches ``target``.

    Bisects the cutoff range 0..100. At each probe it runs
    ``skool.fuzzy_merge`` on ``company_name`` and takes the share of merged
    rows with a non-null ``company_name_y`` relative to ``len(data1)``.

    Parameters
    ----------
    data1, data2 : DataFrame
        Left and right frames handed to ``skool.fuzzy_merge``.
    target : float
        Required match proportion (``1`` means every row of ``data1``).

    Returns
    -------
    float or int
        The largest probed cutoff whose match proportion is at least
        ``target``; ``-1`` when no probed cutoff reaches it or ``data1`` is
        empty.

    Notes
    -----
    Bounds move by +/-1 around a float midpoint, so the result is the best
    *probed* cutoff, not necessarily the exact maximum. The search assumes the
    match proportion does not increase as the cutoff rises (TODO confirm
    against ``skool.fuzzy_merge``).
    """
    total_n = len(data1)
    if total_n == 0:
        # No rows to match; avoids a ZeroDivisionError below.
        return -1

    low, high = 0, 100
    max_score = -1

    while low <= high:
        mid = (low + high) / 2

        fuzzy_merged = skool.fuzzy_merge(data1, data2, on="company_name", score_cutoff=mid)
        match_n = len(fuzzy_merged.dropna(subset=['company_name_y']))

        if match_n / total_n >= target:
            # Target reached: remember this cutoff and try a stricter one.
            # Using >= (not ==) so targets that are never hit exactly still
            # yield the best satisfying cutoff instead of -1.
            max_score = mid
            low = mid + 1
        else:
            # Too few matches: relax the cutoff.
            high = mid - 1

    return max_score
        

In [86]:
exact_match(registry, license, on="company_name", how="exact")

4.039756332157743

In [18]:
exact_match(registry, license, on="company_name", how="contains")

  return func(self, *args, **kwargs)


5.706957358127605

In [19]:
exact_match(registry, license, on="company_name", how="startswith")

5.45046489259378

In [8]:
# test for the first 100 registry data
mini_registry = registry[:100]

In [26]:
# Normalize names before matching.
# An earlier attempt cleaned only company_name with the ignore list
# ['inc', 'group', 'llc', ' ']; it is superseded by the calls below.

# Clean company_name and dba_name in the registry sample, and company_name in the
# license data, ignoring the corporate suffixes listed in `ignore`.
# NOTE(review): the positional `True` is presumably the punctuation-cleaning flag -
# confirm against skool.clean_strings.
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd']
cleaned_mini_registry = skool.clean_strings(mini_registry, ['company_name', 'dba_name'], True, ignore)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore)

In [27]:
find_max_fuzzy_match_score(cleaned_mini_registry, cleaned_license, 1)

56.375

In [28]:
# Fuzzy-merge at 56.375, the cutoff returned by find_max_fuzzy_match_score for a
# target of 1, then keep only the rows with a non-null company_name_y.
merged = skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=56.375)
matched = merged.dropna(subset=["company_name_y"])
matched

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
0,119 degrees architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,...,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304 39.96427),1.00,131113.0,480115,2009-07-31T00:00:00Z,Active,119 degrees architects
1,12bravo,,JEFFREY,YEKENCHIK,236 McKendimen Road,Medford Lakes,NJ,8055.0,236 McKendimen Road,Medford Lakes,...,,,,,0.93,200625.0,815623,2019-06-17T00:00:00Z,Active,12 bravo
2,1st choice financial,provisio,Kathrina,Nease,133 N. 21st Street,Camp Hill,PA,17011.0,133 N. 21st Street,Camp Hill,...,,,,,0.83,37281.0,174332,2004-10-12T00:00:00Z,Active,omni choice financial
3,212 harakawa,two twelve,Ann,Harakawa,"236 W 27th Street, Suite 802",New York,NY,10001.0,"236 W 27th Street, Suite 802",New York,...,,,,,0.60,99973.0,334667,2005-08-30T00:00:00Z,Active,1225 raw
4,215 media solutions,,Dewain,Johnson,810 Felton Avenue,Sharon Hill,PA,19079.0,810 Felton Avenue,Sharon Hill,...,,,,,0.83,123871.0,444271,2008-05-06T00:00:00Z,Active,medical solutions
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,advance detective bureau t a d o richardson,,David,Richardson,10 East Chestnut Hill Avenue,Philadelphia,PA,19118.0,10 East Chestnut Hill Avenue,Philadelphia,...,10 E CHESTNUT HILL AVE,40.078136,-75.209818,POINT (-75.20982 40.07814),0.72,180050.0,725552,2017-01-12T00:00:00Z,Active,advance detective bureau
96,advanced disposal solutions,,Suzanne,Dolaway,2200 Adams Avenue,Philadelphia,PA,19124.0,2200 Adams Avenue,Philadelphia,...,2200 ADAMS AVE,40.005190,-75.087504,POINT (-75.08750 40.00519),1.00,126252.0,135111,2008-08-22T00:00:00Z,Active,advanced disposal solutions
97,advanced hydraulic systems,,Judith,Ward,727 East 9th Street,Chester,PA,19013.0,727 East 9th Street,Chester,...,,,,,1.00,10624.0,104763,1997-07-07T00:00:00Z,Active,advanced hydraulic systems
98,advanced infrastructure design,,Mojgan,Mohtashami,1 Crossroads Drive,Hamilton,NJ,8691.0,1 Crossroads Drive,Hamilton,...,,,,,1.00,173126.0,689813,2016-01-28T00:00:00Z,Active,advanced infrastructure design


In [31]:
# Write the matched rows to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
matched.to_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge_56.xlsx', index=False, header=True)

In [13]:
exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="exact")

40.0

In [14]:
# NOTE(review): the result shown below (631.0) exceeds 100 %. exact_match divides the
# number of matched merged rows by len(data1), so this is a row-count ratio rather
# than the share of registry rows that found a match.
exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="contains")

631.0

In [15]:
# NOTE(review): the result shown below (248.0) exceeds 100 %. exact_match divides the
# number of matched merged rows by len(data1), so this is a row-count ratio rather
# than the share of registry rows that found a match.
exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="startswith")

248.0

In [32]:
# Repeat the fuzzy merge with a stricter cutoff (95) and keep only the rows with a
# non-null company_name_y.
merged = skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=95)
matched = merged.dropna(subset=["company_name_y"])
matched

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
0,119 degrees architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,...,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304 39.96427),1.0,131113.0,480115,2009-07-31T00:00:00Z,Active,119 degrees architects
5,22,,Caroline,Harper,757 Public Road,Bethlehem,PA,18015.0,757 Public Road,Bethlehem,...,,,,,1.0,168704.0,667661,2015-06-10T00:00:00Z,Active,22
7,24 hour cleaning services,,Mary Colleen,Zoltowski,14005 Barcalow Street,Philadelphia,PA,19116.0,14005 Barcalow Street,Philadelphia,...,14005 BARCALOW ST,40.131349,-75.014284,POINT (-75.01428 40.13135),0.98,17313.0,120711,1999-08-30T00:00:00Z,Active,24 hour cleaning service
8,259 strategies,,Chaka,"Fattah, Jr.","Two Logan Square, Suite 1900",Philadelphia,PA,19103.0,"Two Logan Square, Suite 1900",Philadelphia,...,100-20 N 18TH ST,39.955726,-75.169784,POINT (-75.16978 39.95573),1.0,102418.0,346817,2005-11-28T00:00:00Z,Active,259 strategies
12,4u services,stellar services,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,,1.0,141065.0,531189,2011-04-22T00:00:00Z,Active,4u services
15,521 management,,Kris,Bowman,"1000 1st Avenue, Suite 104",King Of Prussia,PA,19406.0,"1000 1st Avenue, Suite 104",King Of Prussia,...,,,,,1.0,25999.0,136428,2002-03-08T00:00:00Z,Active,521 management
16,6 degrees consulting,,Robert,Lawson,"6545 Hamiton Avenue, Suite 1A",Ptttsburgh,PA,15206.0,"6545 Hamiton Avenue, Suite 1A",Ptttsburgh,...,,,,,1.0,171406.0,681851,2015-10-26T00:00:00Z,Active,6 degrees consulting
17,84 lumber company,84 lumber company,Margaret,Hardy-Knox,1019 Route 519,Eighty Four,PA,15330.0,1019 Route 519,Eighty Four,...,,,,,1.0,111889.0,393009,2006-11-30T00:00:00Z,Active,84 lumber company
18,a a court reporting,andrea st john,Andrea,St. John,"303 Chestnut Street, Second Floor",Philadelphia,PA,19106.0,"P.O. Box 74, Suite 4",Moorestown,...,303 CHESTNUT ST,39.948849,-75.146173,POINT (-75.14617 39.94885),1.0,154368.0,107405,2013-07-29T00:00:00Z,Active,a a court reporting
21,a c environmental services,,ANTOINETTE,PATRICK,2045 NORTH LAWRENCE STREET,Philadelphia,PA,19122.0,2045 NORTH LAWRENCE STREET,Philadelphia,...,2045 N LAWRENCE ST,39.98229,-75.141137,POINT (-75.14114 39.98229),1.0,118976.0,122477,2007-09-07T00:00:00Z,Active,a c environmental services


In [33]:
# Write the matched rows to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
matched.to_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge_95.xlsx', index=False, header=True)

In [34]:
# Fuzzy-merge at cutoff 90 and keep only the rows with a non-null company_name_y.
merged = skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=90)
matched = merged.dropna(subset=["company_name_y"])

# Write the matched rows to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
matched.to_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge_90.xlsx', index=False, header=True)

In [35]:
# Fuzzy-merge at cutoff 85 and keep only the rows with a non-null company_name_y.
merged = skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=85)
matched = merged.dropna(subset=["company_name_y"])

# Write the matched rows to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
matched.to_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge_85.xlsx', index=False, header=True)

In [33]:
# Write the matched rows to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
matched.to_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge_95.xlsx', index=False, header=True)

In [34]:
# Fuzzy-merge at cutoff 90 and keep only the rows with a non-null company_name_y.
merged = skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=90)
matched = merged.dropna(subset=["company_name_y"])

# Write the matched rows to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
matched.to_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge_90.xlsx', index=False, header=True)

In [17]:
# fuzzy match of cleaned data
find_max_fuzzy_match_score(cleaned_mini_registry, cleaned_license, 1)

56.375

In [21]:
# Write the full registry frame to Excel without the index column.
# to_excel returns None, so its result is not bound to a name.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
registry.to_excel(r'C:\Users\dabinlee\Desktop\export_dataframe.xlsx', index=False, header=True)

In [79]:
# Compare on both company_name and dba_name by chaining two fuzzy merges.
# NOTE(review): this cell raises the TypeError shown below. DataFrame.pipe passes the
# piped frame as the first positional argument, so the second fuzzy_merge call
# receives three frames (the first merge result, cleaned_mini_registry and
# cleaned_license). The extra frame presumably lands in a non-frame parameter -
# TODO confirm against the skool.fuzzy_merge signature and drop one positional frame.

merged = (
    skool.fuzzy_merge(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=70)
    .pipe(skool.fuzzy_merge, cleaned_mini_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=90)
)
matched = merged.dropna(subset=['company_name_y'])
matched

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

In [39]:
registry[20:40]

Unnamed: 0_level_0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,mailing_state,mailing_zip,certification_type,capability,local,out_of_state,location_standard,lat,lng,geometry
registry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
20,"A & B Unique Construction / Home Repair, Inc.",,Barbara,Davis,5008 Wayne Avenue,Philadelphia,PA,19144.0,5008 Wayne Avenue,Philadelphia,PA,19144.0,MWBE,NAICS 238130 Carpentry Contractors,True,False,5008 WAYNE AVE,40.025369,-75.16807,POINT (-75.16807 40.02537)
21,"A & C ENVIRONMENTAL SERVICES, INC.",,ANTOINETTE,PATRICK,2045 NORTH LAWRENCE STREET,Philadelphia,PA,19122.0,2045 NORTH LAWRENCE STREET,Philadelphia,PA,19122.0,MWBE,Asphalt roof shingle installation ; Aerial or ...,True,False,2045 N LAWRENCE ST,39.98229,-75.141137,POINT (-75.14114 39.98229)
22,"A & I Security, LLC",,RAYMOND,ALVAREZ,10234 Dedater Street,Philadelphia,PA,19116.0,10234 Dedater Street,Philadelphia,PA,19116.0,MBE,Insurance investigation services (except claim...,True,False,10234 DEDATER ST,40.106057,-75.018803,POINT (-75.01880 40.10606)
23,A & O Recovery Services,,Kathy,Gallagher,3319 Kensington Avenue,Philadelphia,PA,19134.0,3319 Kensington Avenue,Philadelphia,PA,19134.0,WBE,"; 5416\tManagement, Scientific, and Technical ...",True,False,3319 KENSINGTON AVE,39.998001,-75.11018,POINT (-75.11018 39.99800)
24,A AND R FENCE AND GUIDERAIL LLC,,Anna,ESAU,564 13th street,Hammonton,NJ,8037.0,564 13th street,Hammonton,NJ,8037.0,WBE,"Highway, Street, and Bridge Construction",False,True,,,,
25,"A Bob's Towing, Inc.",,Mikina,Harrison,2220 Orthodox Street,Philadelphia,PA,19137.0,2220 Orthodox Street,Philadelphia,PA,19137.0,MWBE,NAICS\t48841\tMotor Vehicle Towing ; NAICS\t81...,True,False,2220 ORTHODOX ST,40.008166,-75.078701,POINT (-75.07870 40.00817)
26,"A Esteban & Co., Inc.",,Alfonso C.,Esteban,132 West 36th Street,New York,NY,10018.0,132 West 36th Street,New York,NY,10018.0,MBE,NAICS\t323111\tAddress lists commercial printi...,False,True,,,,
27,"A K Architecture, LLC",,Lisa,Armstrong,2425 Pine Street,Philadelphia,PA,19103.0,2425 Pine Street,Philadelphia,PA,19103.0,WBE,"NAICS\t5413 Architectural, Engineering, and ...",True,False,2425 PINE ST,39.947695,-75.18109,POINT (-75.18109 39.94770)
28,"A N S Steel Company, LLC",,Linda,Manning,4340 Sepviva Street,Philadelphia,PA,19124.0,4340 Sepviva Street,Philadelphia,PA,19124.0,WBE,NAICS\t2362\tNonresidential Building Construct...,True,False,4340 SEPVIVA ST,40.003004,-75.079992,POINT (-75.07999 40.00300)
29,A-Best Vending Corp,A-Best Vending & Coffee Service,Sue,Epstein,7336 State Rd,Philadelphia,PA,19136.0,7336 State Rd,Philadelphia,PA,19136.0,WBE,Vending machine installation,True,False,7336 STATE RD,40.026532,-75.030907,POINT (-75.03091 40.02653)
