# Professional Services Contracts
The entire dataset for Professional Services Contracts by fiscal quarter - from 2013 Q4 to 2019 Q3

In [2]:
import mwdsbe
import schuylkill as skool
import pandas as pd
import glob
import time

## Functions

In [3]:
def drop_duplicates_by_date(df, date_column):
    df.sort_values(by=date_column, ascending=False, inplace=True)
    df = df.loc[~df.index.duplicated(keep="first")]
    df.sort_index(inplace=True)
    return df

## 1. Only read vendor column from Professional Services
In order to have a sense of how many matches we get from Professional Services data

## Data

In [4]:
registry = mwdsbe.load_registry() # geopandas df

In [5]:
path = r'C:\Users\dabinlee\Documents\GitHub\mwdsbe\mwdsbe\data\professional_services'
ps_vendor = pd.concat([pd.read_csv(file, usecols=['vendor']) for file in glob.glob(path + "/*.csv")], ignore_index = True)

In [6]:
ps_vendor

Unnamed: 0,vendor
0,Albert Scaperotto
1,"Ceisler Media & Issue Advocacy, LLC"
2,Charles Swanson
3,Cliftonlarsonallen LLP
4,Cliftonlarsonallen LLP
...,...
37681,"Whitman, Requardt and Associates, LLP"
37682,"Whitman, Requardt and Associates, LLP"
37683,"Wood Environment & Infrastructure Solutions, Inc."
37684,Zelenkofske Axelrod LLC


In [7]:
ps_vendor = ps_vendor.drop_duplicates()

In [8]:
len(ps_vendor)

2692

#### Clean Data

In [9]:
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_ps_vendor = skool.clean_strings(ps_vendor, ['vendor'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name'])
cleaned_ps_vendor = cleaned_ps_vendor.dropna(subset=['vendor'])

In [10]:
cleaned_ps_vendor = cleaned_ps_vendor.drop_duplicates()

In [11]:
len(cleaned_ps_vendor)

2269

## TF-IDF Merge Registry and Professional Services
on company_name and vendor before full merge

In [12]:
t1 = time.time()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_ps_vendor, left_on="company_name", right_on="vendor", score_cutoff=85)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_ps_vendor, left_on="dba_name", right_on="vendor", score_cutoff=85)
)
t = time.time() - t1

In [13]:
print('Execution time:', t, 'sec')

Execution time: 0.38282060623168945 sec


In [14]:
len(merged)

3119

In [15]:
matched_PS = merged.dropna(subset=['vendor'])

In [16]:
matched_PS

Unnamed: 0_level_0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,capability,local,out_of_state,location_standard,lat,lng,geometry,right_index,match_probability,vendor
registry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,"Internet service providers, using own operated...",False,True,,,,,31193.0,1.0,stellar
27,a k architecture,,Lisa,Armstrong,2425 Pine Street,Philadelphia,PA,19103.0,2425 Pine Street,Philadelphia,...,"NAICS\t5413 Architectural, Engineering, and ...",True,False,2425 PINE ST,39.947695,-75.181090,POINT (-75.18109 39.94770),25835.0,1.0,a k architecture
60,acacia financial,,Noreen,White,"6000 Midlantic Drive, Suite 410 North",Mt. Laurel,NJ,8054.0,"6000 Midlantic Drive, Suite 410 North",Mt. Laurel,...,Financial management consulting (except invest...,False,True,,,,,40.0,1.0,acacia financial
65,acclaim systems,,Kailash,Kalantri,110 EAST PENNSYLVANIA BOULEVARD,Feasterville,PA,19053.0,110 EAST PENNSYLVANIA BOULEVARD,Feasterville,...,NAICS\t5415\tComputer Systems Design and Relat...,False,False,,,,,3110.0,1.0,acclaim systems
86,adcon consultants,,Lawrence,Dibor,"2465 North 50th Street Bala Building, Suite #1...",Philadelphia,PA,19131.0,"2465 North 50th Street Bala Building, Suite #1...",Philadelphia,...,"Construction management, multifamily building ...",True,False,2465 N 50TH ST,39.999592,-75.227267,POINT (-75.22727 39.99959),16398.0,1.0,adcon consultants
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3007,viridian landscape studio,,Tavis,Dockwiller,3868 Terrace Street,Philadelphia,PA,19128.0,3868 Terrace Street,Philadelphia,...,Landscape architectural services,True,False,3868 TERRACE ST,40.020794,-75.213208,POINT (-75.21321 40.02079),7645.0,1.0,viridian landscape studio
3017,vkg,,Kimberly,Hawthorne,734 E. Dorset Street,Philadelphia,PA,19119.0,734 E. Dorset Street,Philadelphia,...,Computer Training ; Computer Training ; Prof...,True,False,734 DORSET ST,40.062765,-75.175337,POINT (-75.17534 40.06276),902.0,1.0,vkg
3056,wfgd studio,,Marcella,Coffey,"718 Arch Street, Suite 302s",Philadelphia,PA,19106.0,"718 Arch Street, Suite 302S",Philadelphia,...,"Art services, graphic ; Communication design s...",True,False,718 ARCH ST,39.952586,-75.152491,POINT (-75.15249 39.95259),7080.0,1.0,wfgd studio
3097,yikes,,Tracy,Levesque,204 East Girard Avenue,Philadelphia,PA,19125.0,204 East Girard Avenue,Philadelphia,...,"Advertising periodical publishers, exclusively...",True,False,204 E GIRARD AVE,39.968918,-75.133325,POINT (-75.13332 39.96892),8495.0,1.0,yikes


In [17]:
len(matched_PS)

205

#### New matches

In [18]:
matched_OL = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\license-opendataphilly\tf-idf\tf-idf-85.xlsx')
matched_OL = matched_OL.set_index('left_index')

In [19]:
matched_OL = drop_duplicates_by_date(matched_OL, "issue_date") # without duplicates

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [20]:
len(matched_OL)

1502

In [21]:
new_matches = matched_PS.index.difference(matched_OL.index).tolist()

In [22]:
len(new_matches)

71

## 2. Load useful columns
* vendor
* tot_payments
* department_name
* year
* fiscal quarter

In [58]:
all_files = glob.glob(path + "/*.csv")

li = []

for file in all_files:
    # get vendor, tot_payments, and department_name from original data
    df = pd.read_csv(file, usecols=['vendor', 'tot_payments', 'department_name'])
    
    file_name = file.split('\\')[-1]
    year = file_name.split('-')[1]
    quarter = file_name.split('-')[2].split('.')[0]
    
    df['fy_year'] = year
    df['fy_quarter'] = quarter
    
    li.append(df)

ps = pd.concat(li, ignore_index=False)

In [59]:
# save cleaned professional services
ps.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\professional_services\cleaned_ps.xlsx', header=True, index=False)

In [60]:
ps = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\professional_services\cleaned_ps.xlsx')

In [62]:
ps

Unnamed: 0,department_name,vendor,tot_payments,fy_year,fy_quarter
0,City Controllers Office,Albert Scaperotto,11908,2013,Q4
1,City Controllers Office,"Ceisler Media & Issue Advocacy, LLC",0,2013,Q4
2,City Controllers Office,Charles Swanson,0,2013,Q4
3,City Controllers Office,Cliftonlarsonallen LLP,0,2013,Q4
4,City Controllers Office,Cliftonlarsonallen LLP,0,2013,Q4
...,...,...,...,...,...
37681,WATER,"Whitman, Requardt and Associates, LLP",0,2019,Q3
37682,WATER,"Whitman, Requardt and Associates, LLP",4594.5,2019,Q3
37683,WATER,"Wood Environment & Infrastructure Solutions, Inc.",0,2019,Q3
37684,WATER,Zelenkofske Axelrod LLC,410182,2019,Q3


## Full Merge with Registry
* TF-IDF 85
* on company_name and vendor

In [65]:
# clean ps vendor column
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_ps = skool.clean_strings(ps, ['vendor'], True, ignore_words)
cleaned_ps = cleaned_ps.dropna(subset=['vendor'])

keep duplicates: one vendor can have multiple payments

In [78]:
t1 = time.time()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_ps, left_on="company_name", right_on="vendor", score_cutoff=85, max_matches = 100)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_ps, left_on="dba_name", right_on="vendor", score_cutoff=85, max_matches = 100)
)
t = time.time() - t1

In [79]:
print('Execution time:', t, 'sec')

Execution time: 67.21271276473999 sec


In [80]:
len(merged)

6295

In [81]:
matched = merged.dropna(subset=['vendor'])

In [82]:
len(matched)

3382

In [84]:
matched.head()

Unnamed: 0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,lat,lng,geometry,right_index,match_probability,department_name,vendor,tot_payments,fy_year,fy_quarter
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,33050.0,1.0,DISTRICT ATTORNEY,stellar,129210,2019.0,Q1
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,31193.0,1.0,DISTRICT ATTORNEY,stellar,60000,2018.0,Q4
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,36125.0,1.0,DISTRICT ATTORNEY,stellar,138150,2019.0,Q3
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,,,,34416.0,1.0,DISTRICT ATTORNEY,stellar,129210,2019.0,Q2
27,a k architecture,,Lisa,Armstrong,2425 Pine Street,Philadelphia,PA,19103.0,2425 Pine Street,Philadelphia,...,39.947695,-75.18109,POINT (-75.18109 39.94770),37354.0,1.0,PUBLIC PROPERTY,a k architecture,45719,2019.0,Q3


In [86]:
# save cleaned professional services
matched.to_excel (r'C:\Users\dabinlee\Desktop\mwdsbe\data\professional_services\matched.xlsx', header=True)

In [91]:
matched = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\professional_services\matched.xlsx')

In [92]:
matched.rename(columns={'Unnamed: 0': 'left_index'}, inplace=True)

In [93]:
matched.set_index('left_index', inplace=True)

In [94]:
matched

Unnamed: 0_level_0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,lat,lng,geometry,right_index,match_probability,department_name,vendor,tot_payments,fy_year,fy_quarter
left_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018,"70 West 36th Street, Ste. #702",New York,...,,,,33050,1.0,DISTRICT ATTORNEY,stellar,129210,2019,Q1
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018,"70 West 36th Street, Ste. #702",New York,...,,,,31193,1.0,DISTRICT ATTORNEY,stellar,60000,2018,Q4
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018,"70 West 36th Street, Ste. #702",New York,...,,,,36125,1.0,DISTRICT ATTORNEY,stellar,138150,2019,Q3
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018,"70 West 36th Street, Ste. #702",New York,...,,,,34416,1.0,DISTRICT ATTORNEY,stellar,129210,2019,Q2
27,a k architecture,,Lisa,Armstrong,2425 Pine Street,Philadelphia,PA,19103,2425 Pine Street,Philadelphia,...,39.947695,-75.181090,POINT (-75.18109018817358 39.94769504710167),37354,1.0,PUBLIC PROPERTY,a k architecture,45719,2019,Q3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3118,zweig ramick,,Jeanne,Zweig,2320 Faunce Street,Philadelphia,PA,19152,2320 Faunce Street,Philadelphia,...,40.051443,-75.055115,POINT (-75.05511537466825 40.05144316591304),6340,1.0,COMMERCE,zweig ramick,32062.2,2014,Q4
3118,zweig ramick,,Jeanne,Zweig,2320 Faunce Street,Philadelphia,PA,19152,2320 Faunce Street,Philadelphia,...,40.051443,-75.055115,POINT (-75.05511537466825 40.05144316591304),4663,1.0,Commerce,zweig ramick,32062.2,2014,Q3
3118,zweig ramick,,Jeanne,Zweig,2320 Faunce Street,Philadelphia,PA,19152,2320 Faunce Street,Philadelphia,...,40.051443,-75.055115,POINT (-75.05511537466825 40.05144316591304),3109,1.0,Commerce,zweig ramick,26552.2,2014,Q2
3118,zweig ramick,,Jeanne,Zweig,2320 Faunce Street,Philadelphia,PA,19152,2320 Faunce Street,Philadelphia,...,40.051443,-75.055115,POINT (-75.05511537466825 40.05144316591304),204,1.0,Commerce,zweig ramick,,2013,Q4
