# Top Neighborhoods for Registry + General Funds Matching

In [29]:
import pandas as pd
import mwdsbe
import schuylkill as skool
import time
import geopandas as gpd

In [13]:
registry = mwdsbe.load_registry() # geopandas df
gf = pd.read_excel(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\mwdsbe\data\cwedp_37_report.xlsx', sheet_name='general_funds')

In [14]:
# filter general funds data
gf = gf.loc[gf['MAJ_CLASS'] != 1]
gf = gf.loc[gf['VEND_NAME'].dropna().index]

In [15]:
# clean data
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_gf = skool.clean_strings(gf, ['VEND_NAME'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name'])
cleaned_gf = cleaned_gf.dropna(subset=['VEND_NAME'])

In [6]:
matches = pd.read_excel(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\fuzz95_tfidf913.xlsx')

In [7]:
matches.rename(columns={'Unnamed: 0': 'left_index'}, inplace=True)
matches.set_index('left_index', inplace=True)

In [9]:
len(matches)

127

In [11]:
unique_vendors = matches['VEND_NAME'].tolist()

In [16]:
full_matches = cleaned_gf.loc[cleaned_gf['VEND_NAME'].apply(lambda x : x in unique_vendors)]

In [17]:
full_matches

Unnamed: 0,DR_ACCT,CR_ACCT,CURR_PRIOR_FLAG,POSTED_DATE,DEPT,SEC_FD,INDEX,PROGRAM_CD,MAJ_CLASS,CLASS,DOC_NUM,TRANS_DESC,DOC_REF_NUM,VEND_NAME,VEND_NUM,AMT,DOC_NUM4,DOC_REF4
76494,433,201,PRIOR,2018-07-05,1,10,10058,,2,0258,VCXX1900008101,*JUN34828S* CRT RPRTING SVS-6/18,POXX1812349101,strehlow,233036541 01,11125.54,VCXX,POXX
76510,433,201,PRIOR,2018-07-19,1,10,10058,,2,0258,VCXX1900408101,*JUL35226S* CRT RPRTING SVS-6/18,POXX1812349101,strehlow,233036541 01,3273.93,VCXX,POXX
76511,433,201,PRIOR,2018-07-19,1,10,10058,,2,0258,VCXX1900383201,*MAY34385S* CRT RPRTING SVS-4/18&...,POXX1812349101,strehlow,233036541 01,18024.30,VCXX,POXX
76694,431,201,CURR,2018-08-10,1,10,10058,,2,0258,VCXX1901049401,*JUL35228S* CRT RRT WAIT TIME-6/18,POXX1910295101,strehlow,233036541 01,70.00,VCXX,POXX
76695,431,201,CURR,2018-08-13,1,10,10058,,2,0258,VCXX1901062201,*JUL35391S* CRT RPRTING SVS-7/18,POXX1910293901,strehlow,233036541 01,587.10,VCXX,POXX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317129,433,201,PRIOR,2018-07-27,23,10,230073,,4,0427,VCXX1900606001,*B08480763* B08480763/EQUIP/06/18,POXX1812162501,shi international,223009648 01,21700.00,VCXX,POXX
317142,431,201,CURR,2019-06-27,22,10,226280,,4,0427,VCXX1909362302,#B10002045# SCANNER,POXX1912118102,shi international,223009648 01,168.80,VCXX,POXX
317146,431,201,CURR,2019-06-18,22,10,226280,,4,0427,VCXX1909085702,#B10068691# WEBCAM,POXX1912141802,shi international,223009648 01,166.00,VCXX,POXX
317156,433,201,PRIOR,2019-01-08,22,10,226280,,4,0427,VCXX1903988102,#B08440820# ENDRUN,POXX1812335002,shi international,223009648 01,1785.53,VCXX,POXX


In [19]:
data = matches.merge(full_matches, how='right', on='VEND_NAME')

In [24]:
data.drop(data.columns[40:], axis=1, inplace=True)

In [33]:
data_geo = data.dropna(subset=['geometry'])

In [45]:
len(data_geo)

2656

In [40]:
# drop existing geometry and create a new one
data_geo.drop('geometry', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [41]:
data_geo['geometry'] = list(zip(data_geo['lng'], data_geo['lat']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
from shapely.geometry import Point
data_geo['geometry'] = data_geo['geometry'].apply(Point)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [43]:
data_gdf = gpd.GeoDataFrame(data_geo, geometry='geometry', crs={"init": "epsg:4326"})

In [46]:
len(data_gdf)

2656

In [49]:
zillow = gpd.read_file('data/neighborhood/zillow_neighborhoods.geojson')
# zillow = zillow.to_crs(epsg=3857)

In [50]:
joined = gpd.sjoin(data_gdf, zillow, op='within', how='left')

In [52]:
joined.columns

Index(['company_name', 'dba_name', 'owner_first', 'owner_last', 'location',
       'location_city', 'location_state', 'zip_code', 'mailing_address',
       'mailing_city', 'mailing_state', 'mailing_zip', 'certification_type',
       'capability', 'local', 'out_of_state', 'location_standard', 'lat',
       'lng', 'match_probability', 'right_index', 'DR_ACCT_x', 'CR_ACCT_x',
       'CURR_PRIOR_FLAG_x', 'POSTED_DATE_x', 'DEPT_x', 'SEC_FD_x', 'INDEX_x',
       'PROGRAM_CD_x', 'MAJ_CLASS_x', 'CLASS_x', 'DOC_NUM_x', 'TRANS_DESC_x',
       'DOC_REF_NUM_x', 'VEND_NAME', 'VEND_NUM_x', 'AMT_x', 'DOC_NUM4_x',
       'DOC_REF4_x', 'geometry', 'index_right', 'ZillowName'],
      dtype='object')

In [152]:
total_amt = joined.groupby('ZillowName')['AMT_x'].sum()
total_amt = total_amt.to_frame('AMT').reset_index()
total_amt.head()

Unnamed: 0,ZillowName,AMT
0,Allegheny West,1214066.88
1,Callow Hill,15332.85
2,Chestnut Hill,600.0
3,East Passyunk,530.0
4,Fishtown,37576.0


In [153]:
total_n = joined.groupby('ZillowName').size()
total_n = total_n.to_frame('n_vendors').reset_index()
total_n.head()

Unnamed: 0,ZillowName,n_vendors
0,Allegheny West,243
1,Callow Hill,117
2,Chestnut Hill,2
3,East Passyunk,2
4,Fishtown,2


In [154]:
joined.certification_type.unique()

array(['WBE', 'MBE', 'MWBE'], dtype=object)

In [155]:
total_cert = joined.groupby(['ZillowName', 'certification_type'])['AMT_x'].sum()
total_cert = total_cert.to_frame('amt_cert_type').reset_index()
total_cert.head()

Unnamed: 0,ZillowName,certification_type,amt_cert_type
0,Allegheny West,MBE,1214066.88
1,Callow Hill,MBE,15332.85
2,Chestnut Hill,MWBE,600.0
3,East Passyunk,MBE,530.0
4,Fishtown,WBE,37576.0


In [156]:
total_mbe_amt = total_cert.loc[total_cert['certification_type'] == 'MBE']

In [157]:
total_mbe_amt.drop('certification_type', axis=1, inplace=True)

In [158]:
total_mbe_amt.rename(columns={'amt_cert_type': 'mbe_amt'}, inplace=True)

In [159]:
total_wbe_amt = total_cert.loc[total_cert['certification_type'] == 'WBE']
total_wbe_amt.drop('certification_type', axis=1, inplace=True)
total_wbe_amt.rename(columns={'amt_cert_type': 'wbe_amt'}, inplace=True)

In [160]:
total_wbe_amt

Unnamed: 0,ZillowName,wbe_amt
4,Fishtown,37576.0
6,Germantown West Central,1147.74
8,Holmesburg,16863.39
11,Logan Square,7863.12
15,Northeast Airport,9062.77
16,Northern Liberties,1740.0
19,Old City,34930.25
21,Packer Park,744900.68
24,Rittenhouse,185834.8
27,Somerton,86328.0


In [161]:
total_mwbe_amt = total_cert.loc[total_cert['certification_type'] == 'MWBE']
total_mwbe_amt.drop('certification_type', axis=1, inplace=True)
total_mwbe_amt.rename(columns={'amt_cert_type': 'mwbe_amt'}, inplace=True)

In [162]:
total_cert = joined.groupby(['ZillowName', 'certification_type']).size()
total_cert = total_cert.to_frame('n_cert_type').reset_index()
total_cert.head()

Unnamed: 0,ZillowName,certification_type,n_cert_type
0,Allegheny West,MBE,243
1,Callow Hill,MBE,117
2,Chestnut Hill,MWBE,2
3,East Passyunk,MBE,2
4,Fishtown,WBE,2


In [163]:
total_mbe_n = total_cert.loc[total_cert['certification_type'] == 'MBE']
total_mbe_n.drop('certification_type', axis=1, inplace=True)
total_mbe_n.rename(columns={'n_cert_type': 'mbe_n'}, inplace=True)

In [164]:
total_wbe_n = total_cert.loc[total_cert['certification_type'] == 'WBE']
total_wbe_n.drop('certification_type', axis=1, inplace=True)
total_wbe_n.rename(columns={'n_cert_type': 'wbe_n'}, inplace=True)

In [165]:
total_mwbe_n = total_cert.loc[total_cert['certification_type'] == 'MWBE']
total_mwbe_n.drop('certification_type', axis=1, inplace=True)
total_mwbe_n.rename(columns={'n_cert_type': 'mwbe_n'}, inplace=True)

In [169]:
totals = zillow.merge(total_amt, on='ZillowName', how='left')
totals = totals.merge(total_mbe_amt, on='ZillowName', how='left')
totals = totals.merge(total_wbe_amt, on='ZillowName', how='left')
totals = totals.merge(total_mwbe_amt, on='ZillowName', how='left')
totals = totals.merge(total_n, on='ZillowName', how='left')
totals = totals.merge(total_mbe_n, on='ZillowName', how='left')
totals = totals.merge(total_wbe_n, on='ZillowName', how='left')
totals = totals.merge(total_mwbe_n, on='ZillowName', how='left')

In [170]:
totals

Unnamed: 0,ZillowName,geometry,AMT,mbe_amt,wbe_amt,mwbe_amt,n_vendors,mbe_n,wbe_n,mwbe_n
0,Academy Gardens,"POLYGON ((-74.99851 40.06435, -74.99456 40.061...",,,,,,,,
1,Airport,"POLYGON ((-75.19728 39.89252, -75.19617 39.892...",,,,,,,,
2,Allegheny West,"POLYGON ((-75.16592 40.00327, -75.16596 40.003...",1214066.88,1214066.88,,,243.0,243.0,,
3,Andorra,"POLYGON ((-75.22463 40.06686, -75.22588 40.065...",,,,,,,,
4,Aston Woodbridge,"POLYGON ((-75.00860 40.05369, -75.00861 40.053...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
153,Wister,"POLYGON ((-75.17210 40.03840, -75.17197 40.037...",,,,,,,,
154,Woodland Terrace,"POLYGON ((-75.20399 39.94968, -75.20396 39.949...",4776.66,4776.66,,,9.0,9.0,,
155,Wynnefield,"POLYGON ((-75.22070 40.00392, -75.21792 39.994...",,,,,,,,
156,Wynnefield Heights,"POLYGON ((-75.20103 40.00419, -75.20410 40.000...",,,,,,,,


In [171]:
totals['AMT'] = totals['AMT'].fillna(0)
totals['mbe_amt'] = totals['mbe_amt'].fillna(0)
totals['wbe_amt'] = totals['wbe_amt'].fillna(0)
totals['mwbe_amt'] = totals['mwbe_amt'].fillna(0)
totals['n_vendors'] = totals['n_vendors'].fillna(0)
totals['mbe_n'] = totals['mbe_n'].fillna(0)
totals['wbe_n'] = totals['wbe_n'].fillna(0)
totals['mwbe_n'] = totals['mwbe_n'].fillna(0)

In [172]:
totals

Unnamed: 0,ZillowName,geometry,AMT,mbe_amt,wbe_amt,mwbe_amt,n_vendors,mbe_n,wbe_n,mwbe_n
0,Academy Gardens,"POLYGON ((-74.99851 40.06435, -74.99456 40.061...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
1,Airport,"POLYGON ((-75.19728 39.89252, -75.19617 39.892...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
2,Allegheny West,"POLYGON ((-75.16592 40.00327, -75.16596 40.003...",1214066.88,1214066.88,0.0,0.0,243.0,243.0,0.0,0.0
3,Andorra,"POLYGON ((-75.22463 40.06686, -75.22588 40.065...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
4,Aston Woodbridge,"POLYGON ((-75.00860 40.05369, -75.00861 40.053...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
153,Wister,"POLYGON ((-75.17210 40.03840, -75.17197 40.037...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
154,Woodland Terrace,"POLYGON ((-75.20399 39.94968, -75.20396 39.949...",4776.66,4776.66,0.0,0.0,9.0,9.0,0.0,0.0
155,Wynnefield,"POLYGON ((-75.22070 40.00392, -75.21792 39.994...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
156,Wynnefield Heights,"POLYGON ((-75.20103 40.00419, -75.20410 40.000...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0


In [173]:
totals.to_file(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\totals_nhoods2.geojson', driver='GeoJSON')