# Professional Services Contracts
The entire dataset for Professional Services Contracts by fiscal quarter - from 2013 Q4 to 2019 Q3

In [48]:
import mwdsbe
import schuylkill as skool
import pandas as pd
import glob
import time

## Functions

In [130]:
def drop_duplicates_by_date(df, date_column):
    df.sort_values(by=date_column, ascending=False, inplace=True)
    df = df.loc[~df.index.duplicated(keep="first")]
    df.sort_index(inplace=True)
    return df

## 1. Only read vendor column from Professional Services
In order to have a sense of how many matches we get from Professional Services data

## Data

In [82]:
registry = mwdsbe.load_registry() # geopandas df

In [101]:
path = r'C:\Users\dabinlee\Documents\GitHub\mwdsbe\mwdsbe\data\professional_services'
ps_vendor = pd.concat([pd.read_csv(file, usecols=['vendor']) for file in glob.glob(path + "/*.csv")], ignore_index = True)

In [102]:
ps_vendor

Unnamed: 0,vendor
0,Albert Scaperotto
1,"Ceisler Media & Issue Advocacy, LLC"
2,Charles Swanson
3,Cliftonlarsonallen LLP
4,Cliftonlarsonallen LLP
...,...
37681,"Whitman, Requardt and Associates, LLP"
37682,"Whitman, Requardt and Associates, LLP"
37683,"Wood Environment & Infrastructure Solutions, Inc."
37684,Zelenkofske Axelrod LLC


In [103]:
ps_vendor = ps_vendor.drop_duplicates()

In [104]:
len(ps_vendor)

2692

#### Clean Data

In [118]:
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_ps_vendor = skool.clean_strings(ps_vendor, ['vendor'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name'])
cleaned_ps_vendor = cleaned_ps_vendor.dropna(subset=['vendor'])

In [119]:
cleaned_ps_vendor = cleaned_ps_vendor.drop_duplicates()

In [120]:
len(cleaned_ps_vendor)

2269

## TF-IDF Merge Registry and Professional Services
on company_name and vendor

In [122]:
t1 = time.time()
merged = (
    skool.tf_idf_merge(cleaned_registry, cleaned_ps_vendor, left_on="company_name", right_on="vendor", score_cutoff=85)
    .pipe(skool.tf_idf_merge, cleaned_registry, cleaned_ps_vendor, left_on="dba_name", right_on="vendor", score_cutoff=85)
)
t = time.time() - t1

In [123]:
print('Execution time:', t, 'sec')

Execution time: 0.39903974533081055 sec


In [124]:
len(merged)

3119

In [133]:
matched_PS = merged.dropna(subset=['vendor'])

In [134]:
matched_PS

Unnamed: 0_level_0,company_name,dba_name,owner_first,owner_last,location,location_city,location_state,zip_code,mailing_address,mailing_city,...,capability,local,out_of_state,location_standard,lat,lng,geometry,right_index,match_probability,vendor
registry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,4u,stellar,Liang,Chen,"70 West 36th Street, Ste. #702",New York,NY,10018.0,"70 West 36th Street, Ste. #702",New York,...,"Internet service providers, using own operated...",False,True,,,,,31193.0,1.0,stellar
27,a k architecture,,Lisa,Armstrong,2425 Pine Street,Philadelphia,PA,19103.0,2425 Pine Street,Philadelphia,...,"NAICS\t5413 Architectural, Engineering, and ...",True,False,2425 PINE ST,39.947695,-75.181090,POINT (-75.18109 39.94770),25835.0,1.0,a k architecture
60,acacia financial,,Noreen,White,"6000 Midlantic Drive, Suite 410 North",Mt. Laurel,NJ,8054.0,"6000 Midlantic Drive, Suite 410 North",Mt. Laurel,...,Financial management consulting (except invest...,False,True,,,,,40.0,1.0,acacia financial
65,acclaim systems,,Kailash,Kalantri,110 EAST PENNSYLVANIA BOULEVARD,Feasterville,PA,19053.0,110 EAST PENNSYLVANIA BOULEVARD,Feasterville,...,NAICS\t5415\tComputer Systems Design and Relat...,False,False,,,,,3110.0,1.0,acclaim systems
86,adcon consultants,,Lawrence,Dibor,"2465 North 50th Street Bala Building, Suite #1...",Philadelphia,PA,19131.0,"2465 North 50th Street Bala Building, Suite #1...",Philadelphia,...,"Construction management, multifamily building ...",True,False,2465 N 50TH ST,39.999592,-75.227267,POINT (-75.22727 39.99959),16398.0,1.0,adcon consultants
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3007,viridian landscape studio,,Tavis,Dockwiller,3868 Terrace Street,Philadelphia,PA,19128.0,3868 Terrace Street,Philadelphia,...,Landscape architectural services,True,False,3868 TERRACE ST,40.020794,-75.213208,POINT (-75.21321 40.02079),7645.0,1.0,viridian landscape studio
3017,vkg,,Kimberly,Hawthorne,734 E. Dorset Street,Philadelphia,PA,19119.0,734 E. Dorset Street,Philadelphia,...,Computer Training ; Computer Training ; Prof...,True,False,734 DORSET ST,40.062765,-75.175337,POINT (-75.17534 40.06276),902.0,1.0,vkg
3056,wfgd studio,,Marcella,Coffey,"718 Arch Street, Suite 302s",Philadelphia,PA,19106.0,"718 Arch Street, Suite 302S",Philadelphia,...,"Art services, graphic ; Communication design s...",True,False,718 ARCH ST,39.952586,-75.152491,POINT (-75.15249 39.95259),7080.0,1.0,wfgd studio
3097,yikes,,Tracy,Levesque,204 East Girard Avenue,Philadelphia,PA,19125.0,204 East Girard Avenue,Philadelphia,...,"Advertising periodical publishers, exclusively...",True,False,204 E GIRARD AVE,39.968918,-75.133325,POINT (-75.13332 39.96892),8495.0,1.0,yikes


In [135]:
len(matched_PS)

205

#### New matches

In [128]:
matched_OL = pd.read_excel(r'C:\Users\dabinlee\Desktop\mwdsbe\data\license-opendataphilly\tf-idf\tf-idf-85.xlsx')
matched_OL = matched_OL.set_index('left_index')

In [131]:
matched_OL = drop_duplicates_by_date(matched_OL, "issue_date") # without duplicates

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [132]:
len(matched_OL)

1502

In [136]:
new_matches = matched_PS.index.difference(matched_OL.index).tolist()

In [137]:
len(new_matches)

71

## 2. Load useful columns
* vendor
* tot_payments
* department_name
* year
* fiscal quarter

In [158]:
all_files = glob.glob(path + "/*.csv")

li = []

for file in all_files:
    # get vendor, tot_payments, and department_name from original data
    df = pd.read_csv(file, usecols=['vendor', 'tot_payments', 'department_name'])
    
    file_name = file.split('\\')[-1]
    year = file.split('-', '.')[1]
    quarter = file.split('-', '.')[2]
    print(year)
    print(quarter)
    
    li.append(df)

ps = pd.concat(li, ignore_index=False)

TypeError: 'str' object cannot be interpreted as an integer

In [147]:
ps

Unnamed: 0,department_name,vendor,tot_payments
0,City Controllers Office,Albert Scaperotto,11908
1,City Controllers Office,"Ceisler Media & Issue Advocacy, LLC",0
2,City Controllers Office,Charles Swanson,0
3,City Controllers Office,Cliftonlarsonallen LLP,0
4,City Controllers Office,Cliftonlarsonallen LLP,0
...,...,...,...
1815,WATER,"Whitman, Requardt and Associates, LLP",0
1816,WATER,"Whitman, Requardt and Associates, LLP",4594.5
1817,WATER,"Wood Environment & Infrastructure Solutions, Inc.",0
1818,WATER,Zelenkofske Axelrod LLC,410182


In [44]:
# print all columns
# ps.columns

In [45]:
# ps

In [11]:
# ps1 = pd.read_csv(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe\mwdsbe\data\professional_services\FY-2019-Q3.csv')

In [34]:
# # diff btw ps columns and ps1 columns
# set1 = set(ps.columns)
# set2 = set(ps1.columns)

In [17]:
# set1.difference(set2)

{'Unnamed: 15', 'contract_amount', 'short desc'}

In [18]:
# ps1

Unnamed: 0,original_contract_id,current_item_id,department_name,vendor,contract_structure_type,short_desc,start_dt,end_dt,days_remaining,amt,tot_payments,orig_vendor,exempt_status,adv_or_exempt,profit_status
0,MPXX19000133,MPXX19000133,CITY CONTROLLERS OFFICE,Becker Professional Development Corporation,Miscellaneous Order,Provide CPA Exam Review,7/31/2018,6/30/2019,90,20000.00,2793.00,Becker Professional Development Corporation,102.0,EXEMPT,For Profit
1,MPXX19000321,MPXX19000321,CITY CONTROLLERS OFFICE,Becker Professional Development Corporation,Miscellaneous Order,Provide CPE courses,3/13/2019,6/30/2019,90,9625.00,0.00,Becker Professional Development Corporation,102.0,EXEMPT,For Profit
2,1520399,152039903,CITY CONTROLLERS OFFICE,CCH Inc,Computer and Information Svcs,Engagement Maint/Knowledg,3/31/2018,3/30/2019,0,50940.79,44674.00,CCH Inc,1.0,ADVERTISED,For Profit
3,MPXX19000091,MPXX19000091,CITY CONTROLLERS OFFICE,CliftonLarsonAllen LLP,Miscellaneous Order,Technical Consulting Serv,7/13/2018,6/30/2019,90,29700.00,18298.50,CliftonLarsonAllen LLP,102.0,EXEMPT,For Profit
4,MPXX19000060,MPXX19000060,CITY CONTROLLERS OFFICE,James J. McNichol,Miscellaneous Order,Perform reviews of audits,7/2/2018,6/30/2019,90,32000.00,17750.00,James J. McNichol,102.0,EXEMPT,For Profit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1815,1620356,162035602,WATER,"Whitman, Requardt and Associates, LLP",Architect and Engineer Svcs,Capital Improvements,2/1/2018,1/31/2019,0,150000.00,0.00,"Whitman, Requardt and Associates, LLP",1.0,ADVERTISED,For Profit
1816,1620356,162035603,WATER,"Whitman, Requardt and Associates, LLP",Architect and Engineer Svcs,Capital Improvements,2/1/2019,1/31/2020,305,150000.00,4594.50,"Whitman, Requardt and Associates, LLP",1.0,ADVERTISED,For Profit
1817,1620328,162032803,WATER,"Wood Environment & Infrastructure Solutions, Inc.",Architect and Engineer Svcs,GES Planning & Design,1/1/2019,12/31/2019,274,0.00,0.00,"Wood Environment & Infrastructure Solutions, Inc.",1.0,ADVERTISED,For Profit
1818,1820268,1820268,WATER,Zelenkofske Axelrod LLC,General Consultant Services,Accounting Services,6/1/2018,5/31/2019,60,500000.00,410182.50,Zelenkofske Axelrod LLC,1.0,ADVERTISED,For Profit


In [21]:
# ps['Unnamed: 15'].dropna()

18831    Non Profit
Name: Unnamed: 15, dtype: object