In [2]:
import mwdsbe
import mwdsbe.datasets.licenses as licenses
import schuylkill as skool
import numpy as np
import pandas as pd

In [3]:
registry = mwdsbe.load_registry()
len(registry)

3119

In [4]:
license = licenses.CommercialActivityLicenses().download()
len(license)

203149

In [5]:
def exact_match(data1, data2, on, how):
    
    if how not in ["exact", "contains", "startswith"]:
        raise ValueError("how should be one of: 'exact', 'contains', 'startswith'")
    
    merged = skool.exact_merge(data1, data2, on=on, how=how)
    matched = merged.dropna(subset=['company_name_y'])
    
    return matched

In [6]:
def fuzzy_match(data1, data2, on, score_cutoff):
    
    merged = skool.fuzzy_merge(data1, data2, on=on, score_cutoff=score_cutoff)
    matched = merged.dropna(subset=["company_name_y"])
    
    return matched

In [7]:
def get_match_rate(original, matched):
    return len(matched) / len(original) * 100

In [8]:
mini_registry = registry[:100]

# clean company_name and dba name
ignore = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore)
cleaned_mini_registry = skool.clean_strings(mini_registry, ['company_name', 'dba_name'], True, ignore)
cleaned_license = skool.clean_strings(license, ['company_name'], True, ignore)

In [24]:
exact = exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="exact")

In [25]:
get_match_rate(cleaned_mini_registry, exact)

41.0

In [26]:
# exact_contains = exact_match(cleaned_mini_registry, cleaned_license, on="company_name", how="contains")

In [30]:
fuzzy_95 = fuzzy_match(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=95)

get_match_rate(cleaned_mini_registry, fuzzy_95)

48.0

In [18]:
fuzzy_90 = fuzzy_match(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=90)

In [23]:
get_match_rate(cleaned_mini_registry, fuzzy_90)

55.00000000000001

In [29]:
fuzzy_85 = fuzzy_match(cleaned_mini_registry, cleaned_license, on="company_name", score_cutoff=85)

get_match_rate(cleaned_mini_registry, fuzzy_85)

73.0

In [34]:
# exact match for full cleaned data
full_exact = exact_match(cleaned_registry, cleaned_license, on="company_name", how="exact")
get_match_rate(cleaned_registry, exact)

42.51362616223148

In [54]:
len(full_exact)

1326

In [11]:
# fuzzy match with 95 score_cutoff for full data
# full_fuzzy_95 = fuzzy_match(cleaned_registry, cleaned_license, on="company_name", score_cutoff=95)
# get_match_rate(cleaned_registry, full_fuzzy_95)

full_fuzzy_95_copy = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\full_clean_fuzzy_95.xlsx', index = None)

In [55]:
len(full_fuzzy_95)

1345

In [37]:
full_exact.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\full_clean_exact.xlsx', index = None, header=True)

In [10]:
full_fuzzy_95.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\full_clean_fuzzy_95.xlsx', index = None, header=True)

NameError: name 'full_fuzzy_95' is not defined

In [14]:
diff = full_fuzzy_95_copy[full_fuzzy_95_copy['match_probability'] < 1]
diff

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
2,24 hour cleaning services,,Mary Colleen,Zoltowski,14005 Barcalow Street,Philadelphia,PA,19116.0,14005 Barcalow Street,Philadelphia,...,14005 BARCALOW ST,40.131349,-75.014284,POINT (-75.0142839608247 40.1313485259829),0.98,17310,120711,1999-08-30T00:00:00Z,Active,24 hour cleaning service
11,a bob s towing,,Mikina,Harrison,2220 Orthodox Street,Philadelphia,PA,19137.0,2220 Orthodox Street,Philadelphia,...,2220 ORTHODOX ST,40.008166,-75.078701,POINT (-75.07870119360095 40.008166153533),0.96,134142,134939,2010-02-04T00:00:00Z,Active,a bobs towing
15,a m electric,,Anthony,Muhammad,2222 N. Marshall Street,Philadelphia,PA,19133.0,2222 N. Marshall Street,Philadelphia,...,2222 N MARSHALL ST,39.985489,-75.144539,POINT (-75.14453871694691 39.9854885343929),0.96,28712,154795,2002-11-27T00:00:00Z,Active,am electric
16,a m painting,,Abdullah,Muhammad,2745 North Garnet Street,Philadelphia,PA,19132.0,2745 North Garnet Street,Philadelphia,...,2745 N GARNET ST,39.996079,-75.162692,POINT (-75.16269177785369 39.99607919723243),0.96,118208,419397,2007-08-06T00:00:00Z,Active,am painting
17,a m truck auto repair,,CATHY,MATARAZZO-NICHOLAS,14065 Townsend Road,Philadelphia,PA,19154.0,14065 Townsend Road,Philadelphia,...,14065 TOWNSEND RD,40.108419,-74.975170,POINT (-74.97516992544176 40.10841900796122),0.98,19881,131320,2000-05-25T00:00:00Z,Active,am truck auto repair
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,vistaconnections,,Judith,Donnelly,12120 State Line Road #292,Leawood,KS,66209.0,12120 State Line Road #292,Leawood,...,,,,,0.97,169197,670243,2015-07-06T00:00:00Z,Active,vista connections
1304,vj associates of suffolk,,Vijay,Desai,"1090 King Georges Post Road, Suite 301",Edison,NJ,8837.0,"1090 King Georges Post Road, Suite 301",Edison,...,,,,,0.98,195398,794326,2018-11-13T00:00:00Z,Active,v j associates of suffolk
1315,west construction services,,Christine,Gallagher,264 Broadway Road,Merion,PA,19066.0,264 Broadway Road,Merion,...,,,,,0.96,105190,360580,2006-03-20T00:00:00Z,Active,best construction services
1321,williams electric,,Richard,Williams,17 Karen Drive,Williamstown,NJ,8094.0,1532 S 53rd Street,Philadelphia,...,,,,,0.97,125884,453068,2008-08-04T00:00:00Z,Active,williams electrica


In [15]:
diff.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\diff_exact_95.xlsx', index = None, header=True)

In [16]:
# fuzzy match with 90 score_cutoff for full data
full_fuzzy_90 = fuzzy_match(cleaned_registry, cleaned_license, on="company_name", score_cutoff=90)
get_match_rate(cleaned_registry, full_fuzzy_90)

52.48477075985893

In [17]:
full_fuzzy_90.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\full_clean_fuzzy_90.xlsx', index = None, header=True)

In [20]:
diff_exact_90 = full_fuzzy_90[full_fuzzy_90['match_probability'] < 1]
len(diff_exact_90)

429

In [19]:
diff_exact_90.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\diff_exact_90.xlsx', index = None, header=True)

In [27]:
diff_95_90 = full_fuzzy_90[full_fuzzy_90['match_probability'] < 1]
diff_95_90 = diff_95_90[diff_95_90['match_probability'] > 0.9]
diff_95_90

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
1,12bravo,,JEFFREY,YEKENCHIK,236 McKendimen Road,Medford Lakes,NJ,8055.0,236 McKendimen Road,Medford Lakes,...,,,,,0.93,200602.0,815623,2019-06-17T00:00:00Z,Active,12 bravo
7,24 hour cleaning services,,Mary Colleen,Zoltowski,14005 Barcalow Street,Philadelphia,PA,19116.0,14005 Barcalow Street,Philadelphia,...,14005 BARCALOW ST,40.131349,-75.014284,POINT (-75.01428 40.13135),0.98,17320.0,120711,1999-08-30T00:00:00Z,Active,24 hour cleaning service
25,a bob s towing,,Mikina,Harrison,2220 Orthodox Street,Philadelphia,PA,19137.0,2220 Orthodox Street,Philadelphia,...,2220 ORTHODOX ST,40.008166,-75.078701,POINT (-75.07870 40.00817),0.96,134133.0,134939,2010-02-04T00:00:00Z,Active,a bobs towing
34,a m electric,,Anthony,Muhammad,2222 N. Marshall Street,Philadelphia,PA,19133.0,2222 N. Marshall Street,Philadelphia,...,2222 N MARSHALL ST,39.985489,-75.144539,POINT (-75.14454 39.98549),0.96,28710.0,154795,2002-11-27T00:00:00Z,Active,am electric
35,a m painting,,Abdullah,Muhammad,2745 North Garnet Street,Philadelphia,PA,19132.0,2745 North Garnet Street,Philadelphia,...,2745 N GARNET ST,39.996079,-75.162692,POINT (-75.16269 39.99608),0.96,118226.0,419397,2007-08-06T00:00:00Z,Active,am painting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,webster associates,,Deborah,Webster,301 South Elm Street Suite 508,Greensboro,NC,27401.0,301 South Elm Street Suite 508,Greensboro,...,,,,,0.94,22355.0,131448,2001-01-22T00:00:00Z,Active,weber associates
3053,west construction services,,Christine,Gallagher,264 Broadway Road,Merion,PA,19066.0,264 Broadway Road,Merion,...,,,,,0.96,105222.0,360580,2006-03-20T00:00:00Z,Active,best construction services
3064,williams electric,,Richard,Williams,17 Karen Drive,Williamstown,NJ,8094.0,1532 S 53rd Street,Philadelphia,...,,,,,0.97,125874.0,453068,2008-08-04T00:00:00Z,Active,williams electrica
3073,wjj global enterprises,,Willie,Jones,1242 Pratt Street,Philadelphia,PA,19124.0,1242 Pratt Street,Philadelphia,...,1242 PRATT ST,40.025222,-75.083344,POINT (-75.08334 40.02522),0.91,172657.0,687493,2015-12-30T00:00:00Z,Active,cjr global enterprises


In [28]:
diff_95_90.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\diff_95_90.xlsx', index = None, header=True)

In [9]:
# match company_name and dba_name
merged = (
    skool.fuzzy_merge(cleaned_registry, cleaned_license, on="company_name", score_cutoff=95)
    .pipe(skool.fuzzy_merge, cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=95)
)
matched = merged.dropna(subset=['company_name_y'])
matched

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
0,119 degrees architects,,Rafael,Utrera,"1503 Green Street, Suite # 4",Philadelphia,PA,19130.0,"1503 Green Street, Suite # 4",Philadelphia,...,1503 GREEN ST,39.964275,-75.163042,POINT (-75.16304 39.96427),1.00,131109.0,480115,2009-07-31T00:00:00Z,Active,119 degrees architects
5,22,,Caroline,Harper,757 Public Road,Bethlehem,PA,18015.0,757 Public Road,Bethlehem,...,,,,,1.00,168696.0,667661,2015-06-10T00:00:00Z,Active,22
7,24 hour cleaning services,,Mary Colleen,Zoltowski,14005 Barcalow Street,Philadelphia,PA,19116.0,14005 Barcalow Street,Philadelphia,...,14005 BARCALOW ST,40.131349,-75.014284,POINT (-75.01428 40.13135),0.98,17318.0,120711,1999-08-30T00:00:00Z,Active,24 hour cleaning service
8,259 strategies,,Chaka,"Fattah, Jr.","Two Logan Square, Suite 1900",Philadelphia,PA,19103.0,"Two Logan Square, Suite 1900",Philadelphia,...,100-20 N 18TH ST,39.955726,-75.169784,POINT (-75.16978 39.95573),1.00,102402.0,346817,2005-11-28T00:00:00Z,Active,259 strategies
12,4u services,stellar services,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,,1.00,141065.0,531189,2011-04-22T00:00:00Z,Active,4u services
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3105,zana cakes,,Zana,Billue,"7715 Crittenden Street, #339",Philadelphia,PA,19118.0,"7715 Crittenden Street, #339",Philadelphia,...,7715 CRITTENDEN ST,40.074388,-75.193592,POINT (-75.19359 40.07439),1.00,97875.0,167047,2005-07-01T00:00:00Z,Active,zana cakes
3106,zanaras reporting video,,JULIE,ZANARAS,"1845 WALNUT STREET, Suite 938",Philadelphia,PA,19103.0,"1845 WALNUT STREET, Suite 938",Philadelphia,...,1845 WALNUT ST,39.950665,-75.171552,POINT (-75.17155 39.95067),1.00,113998.0,401529,2007-03-05T00:00:00Z,Active,zanaras reporting video
3107,zavorski masonry restoration,,Lisa,Zavorski,717 Ford Avenue,Langhorne,PA,19047.0,717 Ford Avenue,Langhorne,...,,,,,1.00,101646.0,343160,2005-10-31T00:00:00Z,Active,zavorski masonry restoration
3110,zenga engineering,,Gwendolyn,Lodise,313 East Broad Street,Palmyra,NJ,8065.0,313 East Broad Street,Palmyra,...,,,,,1.00,136809.0,509641,2010-07-26T00:00:00Z,Active,zenga engineering


In [13]:
dba = matched[matched['dba_name'].notna()]
dba

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
12,4u services,stellar services,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,,1.0,141065.0,531189,2011-04-22T00:00:00Z,Active,4u services
17,84 lumber company,84 lumber company,Margaret,Hardy-Knox,1019 Route 519,Eighty Four,PA,15330.0,1019 Route 519,Eighty Four,...,,,,,1.0,111886.0,393009,2006-11-30T00:00:00Z,Active,84 lumber company
18,a a court reporting,andrea st john,Andrea,St. John,"303 Chestnut Street, Second Floor",Philadelphia,PA,19106.0,"P.O. Box 74, Suite 4",Moorestown,...,303 CHESTNUT ST,39.948849,-75.146173,POINT (-75.14617 39.94885),1.0,154361.0,107405,2013-07-29T00:00:00Z,Active,a a court reporting
29,a best vending,a best vending coffee service,Sue,Epstein,7336 State Rd,Philadelphia,PA,19136.0,7336 State Rd,Philadelphia,...,7336 STATE RD,40.026532,-75.030907,POINT (-75.03091 40.02653),1.0,110609.0,122115,2006-10-06T00:00:00Z,Active,a best vending
48,abbadon corporation,superior facility service,Donna,Cohen,712 Bigler Street,Philadelphia,PA,19148.0,712 Bigler St,Philadelphia,...,712 BIGLER ST,39.912636,-75.161093,POINT (-75.16109 39.91264),1.0,103092.0,019267,2005-12-23T00:00:00Z,Active,abbadon corporation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3034,walls engineering pllc,richard walls,Richard,Walls,"10911 Raven Ridge Road, Suite 103-76",Raleigh,NC,27614.0,"10911 Raven Ridge Road, Suite 103-76",Raleigh,...,,,,,1.0,198351.0,806486,2019-03-21T00:00:00Z,Active,walls engineering pllc
3037,wash cycle laundry,wash cycle laundry,Gabriel,Mandujano,"1617 JFK Boulevard, Suite - 1855",Philadelphia,PA,19103.0,"1617 JFK Boulevard, Suite - 1855",Philadelphia,...,1617 JOHN F KENNEDY BLVD,39.953873,-75.167082,POINT (-75.16708 39.95387),1.0,147085.0,560293,2012-04-10T00:00:00Z,Active,wash cycle laundry
3051,wendy saltzman media,philly power media,Wendy,Saltzman,1112 South 17th Street,Philadelphia,PA,19146.0,1112 South 17th Street,Philadelphia,...,1112 S 17TH ST,39.937827,-75.171934,POINT (-75.17193 39.93783),1.0,190358.0,772115,2018-04-09T00:00:00Z,Active,wendy saltzman media
3066,willow accounting consulting,willow accounting consulting,Miranda,Willow,"4811 Jonestown Road, Suite 224",Harrisburg,PA,17109.0,"4811 Jonestown Road, Suite 224",Harrisburg,...,,,,,1.0,182953.0,737961,2017-05-09T00:00:00Z,Active,willow accounting consulting


In [15]:
# match just dba_name
fuzzy_dba_merged = skool.fuzzy_merge(cleaned_registry, cleaned_license, left_on="dba_name", right_on="company_name", score_cutoff=95)
fuzzy_dba_matched = fuzzy_dba_merged.dropna(subset=['company_name_y'])
fuzzy_dba_matched

Unnamed: 0,company_name_x,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,location_standard,lat,lng,geometry,match_probability,right_index,license_num,issue_date,license_status,company_name_y
17,84 lumber company,84 lumber company,Margaret,Hardy-Knox,1019 Route 519,Eighty Four,PA,15330.0,1019 Route 519,Eighty Four,...,,,,,1.00,111886.0,393009,2006-11-30T00:00:00Z,Active,84 lumber company
48,abbadon corporation,superior facility service,Donna,Cohen,712 Bigler Street,Philadelphia,PA,19148.0,712 Bigler St,Philadelphia,...,712 BIGLER ST,39.912636,-75.161093,POINT (-75.16109 39.91264),1.00,178872.0,720072,2016-11-16T00:00:00Z,Active,superior facility service
75,ack contracting,ack contracting,Brenda,Kellogg,18 Campus Blvd Ste 100,Newtown Square,PA,19073.0,18 Campus Blvd Ste 100,Newtown Square,...,,,,,1.00,197871.0,804592,2019-03-05T00:00:00Z,Active,ack contracting
169,alpha office supplies,alpha enterprise,Terrill L,Brown,"4950 Parkside Avenue, Suite 500",Philadelphia,PA,19131.0,P O BOX 2361,Bala Cynwyd,...,4950 PARKSIDE AVE,39.979714,-75.218697,POINT (-75.21870 39.97971),0.97,172754.0,687938,2016-01-06T00:00:00Z,Active,alpha enterprises
170,alpha omega systems services medical supply,aoss medical supply,Hon K,Liew,4971 Central Avenue,Monroe,LA,71203.0,4971 Central Avenue,Monroe,...,,,,,1.00,149286.0,573020,2012-09-06T00:00:00Z,Active,aoss medical supply
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3001,vic thompson company,vtc,Christine,Norton,"3751 New York Avenue, Suite 140",Arlington,TX,76014.0,"3751 New York Avenue, Suite 140",Arlington,...,,,,,1.00,145421.0,552631,2012-01-19T00:00:00Z,Active,vtc
3005,virgo iii,virgo iii,Anne,Manuel,766 Knox Court,Yardley,PA,19067.0,766 Knox Court,Yardley,...,,,,,1.00,94090.0,191740,2005-03-02T00:00:00Z,Active,virgo iii
3037,wash cycle laundry,wash cycle laundry,Gabriel,Mandujano,"1617 JFK Boulevard, Suite - 1855",Philadelphia,PA,19103.0,"1617 JFK Boulevard, Suite - 1855",Philadelphia,...,1617 JOHN F KENNEDY BLVD,39.953873,-75.167082,POINT (-75.16708 39.95387),1.00,147085.0,560293,2012-04-10T00:00:00Z,Active,wash cycle laundry
3066,willow accounting consulting,willow accounting consulting,Miranda,Willow,"4811 Jonestown Road, Suite 224",Harrisburg,PA,17109.0,"4811 Jonestown Road, Suite 224",Harrisburg,...,,,,,1.00,182953.0,737961,2017-05-09T00:00:00Z,Active,willow accounting consulting


In [16]:
fuzzy_dba_matched.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\full_clean_dba_fuzzy_95.xlsx', index = None, header=True)

In [19]:
# pipe all: exact, fuzzy_95 on company_name, fuzzy_95 on dba_name

In [None]:
# store data into excel

#     path = r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_fuzzy_merge'
#     path.join(['_', str(score_cutoff), '.xlsx'])
    
#     export_excel = matched.to_excel(path, index = None, header=True)

In [None]:
# get_match_rate(cleaned_mini_registry, exact_contains)

# exact_contains.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\clean_exact_contains.xlsx', index = None, header=True)