# Allegheny County Farmers' Markets ingestion script prototype

This notebook is intended to prototype code snippets for an ingestion script aimed at "Allegheny_County_Farmers_Markets_Locations_2019.csv".

Note that I (Drew) found an updated input file here: https://data.wprdc.org/dataset/allegheny-county-farmers-markets-locations/resource/77f53ea5-9a40-4047-8235-8ca8a2712f16

Dependencies:
* pandas
* os

In [1]:
import pandas as pd
import os

In [2]:
in_path = '../food-data/PFPC_data_files/Allegheny_County_Farmers_Markets_Locations_2019.csv'
out_path = '../food-data/cleaned/allegheny_county_farmers_markets.csv'

final_cols = ['id', 'source_org', 'source_file', 'original_id', 'type', 'name', 'address', 'city', 
              'state', 'zip_code', 'county', 'location_description', 'phone', 'url', 'latitude', 
              'longitude', 'latlng_source', 'date_from', 'date_to', 'SNAP', 'WIC', 'FMNP', 
              'fresh_produce', 'food_bucks', 'free_distribution', 'open_to_spec_group', 'data_issues']

In [3]:
df = pd.read_csv(in_path)
df

Unnamed: 0,Name,Street_Address,City,State,Zip,Latitude,Longitude,Additional_Directions,Day_Time,Season,Affiliations
0,Ambridge Farmer's Market,624 Park Road,Ambridge,PA,15003,40.586729,-80.229576,"St. Mary's Church parking lot, near Do It Best",Thursday 4:00 PM-7:00 PM,June-November,
1,Aspinwall Flea Market,217 Commerical Avenue,Aspinwall,PA,15215,40.490575,-79.904305,Municipal parking lot,Sunday 7:00 AM-1:00 PM,June-November,
2,Bachman's Greenhouse and Nursery,2905 Freeport Road,Natrona Heights,PA,15065,40.640015,-79.711384,,Daily 8:00 AM-7:00 PM,June-October,
3,Bachman's Greenhouse and Nursery,Route 908 & Ekastown Road,Natrona Heights,PA,15065,40.667206,-79.730284,,Daily 10:00 AM-6:00 PM,July-October,
4,Beccari's Farm Market,5095 Thoms Run Road,Oakdale,PA,15071,40.373965,-80.133129,,Tuesday-Friday 12:00 PM-5:00 PM and Saturday-S...,June-October,
...,...,...,...,...,...,...,...,...,...,...,...
87,Wexford Farms,550 Warrendale Road,Wexford,PA,15090,40.647630,-80.100129,,Daily 8:00 AM-4:00 PM,June-October,
88,Wilkins Township Farmer's Market,333 Penn Center Boulevard,Pittsburgh,PA,15235,40.427682,-79.811636,Route 22 Penn Center,Wednesday 3:00 PM-6:30 PM,June-October,
89,Wilkinsburg Farmer's Market,225 Penn Avenue,Pittsburgh,PA,15221,40.446769,-79.891807,Life Care Hospital parking lot,Thursday 3:00 PM-6:30 PM,June-November,Just Harvest Fresh Access
90,Wilmerding Green Grocer,314 Commerce Street,Wilmerding,PA,15148,40.392927,-79.807721,Wilmerding Apartments,Friday 2:30 PM-3:30 PM,Year Round,Greater Pittsburgh Community Food Bank


In [4]:
# Drop that one empty row at the end
df = df[df['AgencyRef'].notna()]

# Keep only active sites
df = df[df['Food Bank - Inactive In Inventory System'] == 0] # what about all the blanks?

# Assign some columns to schema fields
df['original_id'] = df['AgencyRef']
df['name'] = df['AgencyName']
df['city'] = df['City']
df['state'] = df['State']
df['zip_code'] = df['Zip']
df['county'] = df['County']
df['latitude'] = df['Google Lat']
df['longitude'] = df['Google Long']

# Set some fields directly
df['source_org'] = 'Greater Pittsburgh Community Food Bank'
df['source_file'] = os.path.basename(in_path)
df['type'] = 'food bank site'
df['latlng_source'] = df['source_org']
df['free_distribution'] = 1
df['data_issues'] = '' # start with blank field, to populate later

# Set the fresh_produce flag
df['GroupTypes'] = df['GroupTypeOne'] + df['GroupTypeTwo'] + df['GroupTypeThree']
df['fresh_produce'] = 0
df.loc[df['GroupTypes'].str.contains('Grocery') | df['GroupTypes'].str.contains('Fresh Market'), 'fresh_produce'] = 1

# Clean up and concatenate address fields
df['Addr1'] = df['Addr1'].str.replace('  ', ' ').str.strip(' ')
df['Addr2'] = df['Addr2'].str.replace('  ', ' ').str.strip(' ')
df.loc[df['Addr2'].notna(), 'address'] = df['Addr1'] + ', ' + df['Addr2']
df.loc[df['Addr2'].isna(), 'address'] = df['Addr1']

# Reorder and add any missing columns
df = df.reindex(columns = final_cols)

# Identify which columns we have handled
handled_cols = df.columns[~df.isna().all()] # i.e. columns that aren't all NA

# Detect and document missingness in handled columns
for col in handled_cols:
    df.loc[df[col].isna(), 'data_issues'] += '{} missing;'.format(col)

# Detect some specific data issues 
df.loc[((df['latitude'] == 0) & (df['longitude'] == 0)), 'data_issues'] += 'latlng is (0,0);'

# Write out to CSV
df.to_csv(out_path, index = False)

df

Unnamed: 0,id,source_org,source_file,original_id,type,name,address,city,state,zip_code,...,date_from,date_to,SNAP,WIC,FMNP,fresh_produce,food_bucks,free_distribution,open_to_spec_group,data_issues
0,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PAGI2450-1,food bank site,A Giving Heart,816 Climax Street,Pittsburgh,PA,15210,...,,,,,,0,,1,,
1,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PADA2545-1,food bank site,Adagio Health,116 Browns Hill Road,Valencia,PA,16059,...,,,,,,0,,1,,
2,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PADE2513-1,food bank site,Adelphoi Education Millvale,608 Farragut Street,Pittsburgh,PA,15209,...,,,,,,0,,1,,
3,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PADO2493-1,food bank site,Adolecent Medicine UPMC,"4401 Penn Ave., Floor 3",Pittsburgh,PA,15224,...,,,,,,0,,1,,
4,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PAGH2543-1,food bank site,AGH Federal North,1307 Federal Street,Pittsburgh,PA,15212,...,,,,,,0,,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PWOM1387-1,food bank site,"Womanspace East, Inc.",,,,,...,,,,,,0,,1,,address missing;city missing;state missing;zip...
556,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PWOM1386-1,food bank site,Women's Center and Shelter of Greater Pittsburgh,,,,,...,,,,,,0,,1,,address missing;city missing;state missing;zip...
557,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PYMC1391-1,food bank site,YMCA New Kensington,800 Constitution Boulevard,New Kensington,PA,15068,...,,,,,,0,,1,,
558,,Greater Pittsburgh Community Food Bank,2019-10-10 PGH Food Bank Site Addresses.xlsx,PYOR2318-1,food bank site,York Commons,4003 Penn Ave.,Pittsburgh,PA,15224,...,,,,,,0,,1,,
