# Top Neighborhoods for Registry + General Funds Matching

In [2]:
import pandas as pd
import mwdsbe
import schuylkill as skool
import time
import geopandas as gpd

## Data
* registry
* general funds
* matches: matching between registry and general funds
* full_matches: full funds per vendor

In [5]:
registry = mwdsbe.load_registry() # geopandas df
# gf = pd.read_excel(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\mwdsbe\data\cwedp_37_report.xlsx', sheet_name='general_funds')
gf = pd.read_excel('/Users/dabinlee/Documents/github/MWDSBE/mwdsbe/data/cwedp_37_report.xlsx', sheet_name='general_funds')

In [6]:
# filter general funds data
gf = gf.loc[gf['MAJ_CLASS'] != 1]
gf = gf.loc[gf['VEND_NAME'].dropna().index]

In [7]:
# clean data
ignore_words = ['inc', 'group', 'llc', 'corp', 'pc', 'incorporated', 'ltd', 'co', 'associates', 'services', 'company', 'enterprises', 'enterprise', 'service', 'corporation']
cleaned_registry = skool.clean_strings(registry, ['company_name', 'dba_name'], True, ignore_words)
cleaned_gf = skool.clean_strings(gf, ['VEND_NAME'], True, ignore_words)

cleaned_registry = cleaned_registry.dropna(subset=['company_name'])
cleaned_gf = cleaned_gf.dropna(subset=['VEND_NAME'])

In [8]:
# matches = pd.read_excel(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\fuzz95_tfidf913.xlsx')
matches = pd.read_excel('/Users/dabinlee/Documents/github/MWDSBE/analysis/data/general_funds/fuzz95_tfidf913.xlsx')

In [9]:
matches.rename(columns={'Unnamed: 0': 'left_index'}, inplace=True)
matches.set_index('left_index', inplace=True)

In [10]:
len(matches)

127

In [12]:
unique_vendors = matches['VEND_NAME'].tolist()

In [13]:
full_matches = cleaned_gf.loc[cleaned_gf['VEND_NAME'].apply(lambda x : x in unique_vendors)]

In [14]:
full_matches

Unnamed: 0,DR_ACCT,CR_ACCT,CURR_PRIOR_FLAG,POSTED_DATE,DEPT,SEC_FD,INDEX,PROGRAM_CD,MAJ_CLASS,CLASS,DOC_NUM,TRANS_DESC,DOC_REF_NUM,VEND_NAME,VEND_NUM,AMT,DOC_NUM4,DOC_REF4
76494,433,201,PRIOR,2018-07-05,1,10,10058,,2,0258,VCXX1900008101,*JUN34828S* CRT RPRTING SVS-6/18,POXX1812349101,strehlow,233036541 01,11125.54,VCXX,POXX
76510,433,201,PRIOR,2018-07-19,1,10,10058,,2,0258,VCXX1900408101,*JUL35226S* CRT RPRTING SVS-6/18,POXX1812349101,strehlow,233036541 01,3273.93,VCXX,POXX
76511,433,201,PRIOR,2018-07-19,1,10,10058,,2,0258,VCXX1900383201,*MAY34385S* CRT RPRTING SVS-4/18&...,POXX1812349101,strehlow,233036541 01,18024.30,VCXX,POXX
76694,431,201,CURR,2018-08-10,1,10,10058,,2,0258,VCXX1901049401,*JUL35228S* CRT RRT WAIT TIME-6/18,POXX1910295101,strehlow,233036541 01,70.00,VCXX,POXX
76695,431,201,CURR,2018-08-13,1,10,10058,,2,0258,VCXX1901062201,*JUL35391S* CRT RPRTING SVS-7/18,POXX1910293901,strehlow,233036541 01,587.10,VCXX,POXX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317129,433,201,PRIOR,2018-07-27,23,10,230073,,4,0427,VCXX1900606001,*B08480763* B08480763/EQUIP/06/18,POXX1812162501,shi international,223009648 01,21700.00,VCXX,POXX
317142,431,201,CURR,2019-06-27,22,10,226280,,4,0427,VCXX1909362302,#B10002045# SCANNER,POXX1912118102,shi international,223009648 01,168.80,VCXX,POXX
317146,431,201,CURR,2019-06-18,22,10,226280,,4,0427,VCXX1909085702,#B10068691# WEBCAM,POXX1912141802,shi international,223009648 01,166.00,VCXX,POXX
317156,433,201,PRIOR,2019-01-08,22,10,226280,,4,0427,VCXX1903988102,#B08440820# ENDRUN,POXX1812335002,shi international,223009648 01,1785.53,VCXX,POXX


In [15]:
data = matches.merge(full_matches, how='right', on='VEND_NAME')

In [16]:
data.drop(data.columns[40:], axis=1, inplace=True)

data has invalid geometry. Drop the existing geometry column and recreate it using lng and lat

In [18]:
data_geo = data.dropna(subset=['geometry'])

In [19]:
len(data_geo)

2656

In [20]:
# drop existing geometry and create a new one
data_geo.drop('geometry', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
data_geo['geometry'] = list(zip(data_geo['lng'], data_geo['lat']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
from shapely.geometry import Point
data_geo['geometry'] = data_geo['geometry'].apply(Point)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
data_gdf = gpd.GeoDataFrame(data_geo, geometry='geometry', crs={"init": "epsg:4326"})

In [26]:
len(data_gdf)

2656

In [27]:
zillow = gpd.read_file('data/neighborhood/zillow_neighborhoods.geojson')
# zillow = zillow.to_crs(epsg=3857)

In [28]:
joined = gpd.sjoin(data_gdf, zillow, op='within', how='left')

In [52]:
len(joined)

2656

In [50]:
# joined with unique companies for exact number of vendors by neighborhoods
unique_joined = joined.drop_duplicates(subset=['VEND_NAME'])

In [51]:
len(unique_joined)

52

In [29]:
joined.columns

Index(['company_name', 'dba_name', 'owner_first', 'owner_last', 'location',
       'location_city', 'location_state', 'zip_code', 'mailing_address',
       'mailing_city', 'mailing_state', 'mailing_zip', 'certification_type',
       'capability', 'local', 'out_of_state', 'location_standard', 'lat',
       'lng', 'match_probability', 'right_index', 'DR_ACCT_x', 'CR_ACCT_x',
       'CURR_PRIOR_FLAG_x', 'POSTED_DATE_x', 'DEPT_x', 'SEC_FD_x', 'INDEX_x',
       'PROGRAM_CD_x', 'MAJ_CLASS_x', 'CLASS_x', 'DOC_NUM_x', 'TRANS_DESC_x',
       'DOC_REF_NUM_x', 'VEND_NAME', 'VEND_NUM_x', 'AMT_x', 'DOC_NUM4_x',
       'DOC_REF4_x', 'geometry', 'index_right', 'ZillowName'],
      dtype='object')

In [30]:
total_amt = joined.groupby('ZillowName')['AMT_x'].sum()
total_amt = total_amt.to_frame('total_amt').reset_index()
total_amt.head()

Unnamed: 0,ZillowName,total_amt
0,Allegheny West,1214066.88
1,Callow Hill,15332.85
2,Chestnut Hill,600.0
3,East Passyunk,530.0
4,Fishtown,37576.0


In [53]:
total_n = unique_joined.groupby('ZillowName').size()
total_n = total_n.to_frame('total_n').reset_index()
total_n.head()

Unnamed: 0,ZillowName,total_n
0,Allegheny West,1
1,Callow Hill,1
2,Chestnut Hill,1
3,East Passyunk,1
4,Fishtown,1


In [32]:
joined.certification_type.unique()

array(['WBE', 'MBE', 'MWBE'], dtype=object)

In [33]:
total_cert = joined.groupby(['ZillowName', 'certification_type'])['AMT_x'].sum()
total_cert = total_cert.to_frame('amt_cert_type').reset_index()
total_cert.head()

Unnamed: 0,ZillowName,certification_type,amt_cert_type
0,Allegheny West,MBE,1214066.88
1,Callow Hill,MBE,15332.85
2,Chestnut Hill,MWBE,600.0
3,East Passyunk,MBE,530.0
4,Fishtown,WBE,37576.0


In [34]:
total_mbe_amt = total_cert.loc[total_cert['certification_type'] == 'MBE']
total_mbe_amt.drop('certification_type', axis=1, inplace=True)
total_mbe_amt.rename(columns={'amt_cert_type': 'mbe_amt'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [37]:
total_mbe_amt.head()

Unnamed: 0,ZillowName,mbe_amt
0,Allegheny West,1214066.88
1,Callow Hill,15332.85
3,East Passyunk,530.0
7,Holmesburg,47040.0
10,Logan Square,28465.4


In [35]:
total_wbe_amt = total_cert.loc[total_cert['certification_type'] == 'WBE']
total_wbe_amt.drop('certification_type', axis=1, inplace=True)
total_wbe_amt.rename(columns={'amt_cert_type': 'wbe_amt'}, inplace=True)

In [38]:
total_wbe_amt.head()

Unnamed: 0,ZillowName,wbe_amt
4,Fishtown,37576.0
6,Germantown West Central,1147.74
8,Holmesburg,16863.39
11,Logan Square,7863.12
15,Northeast Airport,9062.77


In [39]:
total_mwbe_amt = total_cert.loc[total_cert['certification_type'] == 'MWBE']
total_mwbe_amt.drop('certification_type', axis=1, inplace=True)
total_mwbe_amt.rename(columns={'amt_cert_type': 'mwbe_amt'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [40]:
total_mwbe_amt.head()

Unnamed: 0,ZillowName,mwbe_amt
2,Chestnut Hill,600.0
5,Germantown Southwest,895.83
9,Lawndale,1271.0
12,Mount Airy East,112500.0
13,Mount Airy West,58350.0


In [54]:
total_cert_n = unique_joined.groupby(['ZillowName', 'certification_type']).size()
total_cert_n = total_cert_n.to_frame('n_cert_type').reset_index()
total_cert_n.head()

Unnamed: 0,ZillowName,certification_type,n_cert_type
0,Allegheny West,MBE,1
1,Callow Hill,MBE,1
2,Chestnut Hill,MWBE,1
3,East Passyunk,MBE,1
4,Fishtown,WBE,1


In [55]:
total_mbe_n = total_cert_n.loc[total_cert_n['certification_type'] == 'MBE']
total_mbe_n.drop('certification_type', axis=1, inplace=True)
total_mbe_n.rename(columns={'n_cert_type': 'mbe_n'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [56]:
total_mbe_n.head()

Unnamed: 0,ZillowName,mbe_n
0,Allegheny West,1
1,Callow Hill,1
3,East Passyunk,1
7,Holmesburg,1
10,Logan Square,3


In [57]:
total_wbe_n = total_cert_n.loc[total_cert_n['certification_type'] == 'WBE']
total_wbe_n.drop('certification_type', axis=1, inplace=True)
total_wbe_n.rename(columns={'n_cert_type': 'wbe_n'}, inplace=True)

In [58]:
total_wbe_n.head()

Unnamed: 0,ZillowName,wbe_n
4,Fishtown,1
6,Germantown West Central,1
8,Holmesburg,3
11,Logan Square,1
15,Northeast Airport,1


In [59]:
total_mwbe_n = total_cert_n.loc[total_cert_n['certification_type'] == 'MWBE']
total_mwbe_n.drop('certification_type', axis=1, inplace=True)
total_mwbe_n.rename(columns={'n_cert_type': 'mwbe_n'}, inplace=True)

In [60]:
total_mwbe_n.head()

Unnamed: 0,ZillowName,mwbe_n
2,Chestnut Hill,1
5,Germantown Southwest,1
9,Lawndale,1
12,Mount Airy East,1
13,Mount Airy West,1


In [61]:
totals = zillow.merge(total_amt, on='ZillowName', how='left')
totals = totals.merge(total_mbe_amt, on='ZillowName', how='left')
totals = totals.merge(total_wbe_amt, on='ZillowName', how='left')
totals = totals.merge(total_mwbe_amt, on='ZillowName', how='left')
totals = totals.merge(total_n, on='ZillowName', how='left')
totals = totals.merge(total_mbe_n, on='ZillowName', how='left')
totals = totals.merge(total_wbe_n, on='ZillowName', how='left')
totals = totals.merge(total_mwbe_n, on='ZillowName', how='left')

In [64]:
totals['total_amt'] = totals['total_amt'].fillna(0)
totals['mbe_amt'] = totals['mbe_amt'].fillna(0)
totals['wbe_amt'] = totals['wbe_amt'].fillna(0)
totals['mwbe_amt'] = totals['mwbe_amt'].fillna(0)
totals['total_n'] = totals['total_n'].fillna(0)
totals['mbe_n'] = totals['mbe_n'].fillna(0)
totals['wbe_n'] = totals['wbe_n'].fillna(0)
totals['mwbe_n'] = totals['mwbe_n'].fillna(0)

In [65]:
totals

Unnamed: 0,ZillowName,geometry,total_amt,mbe_amt,wbe_amt,mwbe_amt,total_n,mbe_n,wbe_n,mwbe_n
0,Academy Gardens,"POLYGON ((-74.99851 40.06435, -74.99456 40.061...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
1,Airport,"POLYGON ((-75.19728 39.89252, -75.19617 39.892...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
2,Allegheny West,"POLYGON ((-75.16592 40.00327, -75.16596 40.003...",1214066.88,1214066.88,0.0,0.0,1.0,1.0,0.0,0.0
3,Andorra,"POLYGON ((-75.22463 40.06686, -75.22588 40.065...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
4,Aston Woodbridge,"POLYGON ((-75.00860 40.05369, -75.00861 40.053...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
153,Wister,"POLYGON ((-75.17210 40.03840, -75.17197 40.037...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
154,Woodland Terrace,"POLYGON ((-75.20399 39.94968, -75.20396 39.949...",4776.66,4776.66,0.0,0.0,1.0,1.0,0.0,0.0
155,Wynnefield,"POLYGON ((-75.22070 40.00392, -75.21792 39.994...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0
156,Wynnefield Heights,"POLYGON ((-75.20103 40.00419, -75.20410 40.000...",0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# totals.to_file(r'C:\Users\dabinlee\Documents\GitHub\mwdsbe_binny\MWDSBE\analysis\data\general_funds\totals_nhoods2.geojson', driver='GeoJSON')
totals.to_file('/Users/dabinlee/Documents/github/MWDSBE/analysis/data/general_funds/totals_nhoods.geojson', driver='GeoJSON')