# Just Harvest Fresh Access Markets ingestion script prototype

This notebook is intended to prototype code snippets for an ingestion script aimed at "Just Harvest - Fresh Access Markets.xlsx". 

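Dependencies:
* pandas
* xlrd (Excel reader used through `pandas.read_excel`)
* re (Python standard library)
* os (Python standard library)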

In [1]:
import pandas as pd
import re
import os

In [2]:
# Source workbook to ingest, and where the cleaned CSV is written.
in_path = '../food-data/PFPC_data_files/Just Harvest - Fresh Access Markets.xlsx'
out_path = '../food-data/Cleaned_data_files/just_harvest_fresh_access_markets.csv'

# Column names of the cleaned table, in output order.
final_cols = [
    # provenance
    'id', 'source_org', 'source_file', 'original_id',
    # what and where
    'type', 'name', 'address', 'city', 'state', 'zip_code', 'county',
    'location_description', 'phone', 'url',
    # coordinates
    'latitude', 'longitude', 'latlng_source',
    # season
    'date_from', 'date_to',
    # programs and flags
    'SNAP', 'WIC', 'FMNP', 'fresh_produce', 'food_bucks',
    'free_distribution', 'open_to_spec_group',
    # notes on data quality
    'data_issues',
]

In [3]:
# Load the raw workbook at `in_path` into a DataFrame; no cleaning happens here.
df = pd.read_excel(io=in_path)

# Show the raw table as this cell's output.
df

Unnamed: 0,Market,address,street_one,street_two,city,state,zip_code,Season,Date/Time,weekday,open_time1,close_time1,description,Participates in Food Bucks SNAP Incentive program
0,Beechview Farmers Market,1563 Beechview Ave,Broadway Ave,Hampshire Ave,,PA,15216.0,June 20th-September 12th,Thursday 3pm-7pm,Thursday,15:00:00,19:00:00,,yes
1,Belllevue Farmers Market,34 N Balph Ave,,,Bellevue,PA,15202.0,June 5th-October 30th,Wednesdays 3pm-7pm,Wednesday,15:00:00,19:00:00,Bayne Park,yes
2,Bloomfield Saturday Market,5050 Liberty Ave,,,Pittsburgh,PA,15224.0,May 18th-November 2nd,Saturdays 9am-1pm,Saturday,09:00:00,13:00:00,,yes
3,Carrick Farmers Market,1529 Brownsville Road,,,Pittsburgh,PA,15210.0,June 19th-November 27th,Wednesdays 3pm-7pm,Wednesday,15:00:00,19:00:00,,yes
4,East Liberty Farmers Market,,Broad St,Station St,Pittsburgh,PA,15206.0,May 13th-November 25th,Mondays 3pm-7pm,Monday,15:00:00,19:00:00,Garland Parklet Lot. N. Euclid,yes
5,Garfield Farm Stand,,Wiclow St,Columbo St,Pittsburgh,PA,15224.0,June 5th-September 25th,Wednesdays 3pm-7:30pm,Wednesday,16:00:00,19:00:00,,yes
6,Green Tree Farmers Market,905 Greentree Road,,,Pittsburgh,PA,15220.0,May 23rd-October 31st,Thursdays 4pm-7pm,Thursday,15:00:00,19:00:00,,yes
7,Homewood Farmers Market,7139 Fransktown Ave,,,Pittsburgh,PA,15208.0,July 27th-October 26th,Last Saturdays 3pm-7pm,Saturday,15:00:00,17:30:00,Last Saturday,yes
8,Homewood Farmers Market,7139 Fransktown Ave,,,Pittsburgh,PA,15208.0,July 3rd-July 31st,Wednedays 3pm-5:30pm,Wednesday,15:00:00,19:00:00,,yes
9,Larimer Farmers Market,,Larimer Ave,Carver St,Pittsburgh,PA,,June 23rd-November 24th,4th Sundays 3pm-7pm,Sunday,16:00:00,19:00:00,4th Sunday,yes


In [4]:
# Build the cleaned table from the raw sheet held in `df`, record which handled
# fields are missing per row, and write the result to `out_path`.
#
# NOTE: this cell rebinds `df` to the cleaned frame (the reindex below drops the raw
# columns such as 'Market' and 'Season'), so re-run the load cell before re-running it.


def _season_endpoint(season, position):
    """Return one side of a 'start-end' season string, cut after its last digit.

    `position` 0 selects the text before the first '-', 1 the text right after it;
    e.g. 'June 20th-September 12th' gives 'June 20' and 'September 12'.
    Returns None when `season` is not a string, has no such side, or that side has
    no match, so the gap is reported in `data_issues` instead of raising.
    """
    if not isinstance(season, str):
        return None
    sides = season.split('-')
    if position >= len(sides):
        return None
    found = re.search(r'.+\d', sides[position])
    return found.group() if found else None


def _describe_location(description, schedule):
    """Join whichever of the two values are present with '; '; None if both are missing."""
    present = [str(part) for part in (description, schedule) if pd.notna(part)]
    return '; '.join(present) if present else None


# Assign some columns to schema fields
df['name'] = df['Market']

# Set some fields directly
df['source_org'] = 'Just Harvest'
df['source_file'] = os.path.basename(in_path)
df['type'] = "farmer's market"
df['county'] = 'Allegheny'
df['SNAP'] = 1 # because of food_bucks, see below
df['WIC'] = 1 # based on inspection of http://www.justharvest.org/fresh-access/local-farmers-markets-and-farm-stands-accepting-food-stamps/
df['FMNP'] = 1 # per rules from Cat
df['fresh_produce'] = 1
df['food_bucks'] = 1
df['free_distribution'] = 0
df['data_issues'] = '' # start with blank field, to populate later

# Fill missing addresses with intersections ("street_one & street_two")
no_address = df['address'].isna()
df.loc[no_address, 'address'] = (
    df.loc[no_address, 'street_one'] + ' & ' + df.loc[no_address, 'street_two']
)

# Calculate date_from and date_to from Season field.
# Missing or malformed seasons yield None rather than an exception.
df['date_from'] = df['Season'].apply(lambda season: _season_endpoint(season, 0))
df['date_to'] = df['Season'].apply(lambda season: _season_endpoint(season, 1))

# Someday we will handle opening days/times robustly. For now they are appended to
# the description. A description is kept even when 'Date/Time' is missing
# (concatenating with NaN would otherwise discard it).
df['location_description'] = [
    _describe_location(description, schedule)
    for description, schedule in zip(df['description'], df['Date/Time'])
]

# NOTE(review): zip_code arrives as a float (displayed as e.g. 15216.0) and is written
# to the CSV in that form -- confirm whether downstream expects a plain 5-digit string.

# Reorder and add any missing columns (columns not in final_cols are dropped)
df = df.reindex(columns = final_cols)

# Identify which columns we have handled
handled_cols = df.columns[~df.isna().all()] # i.e. columns that aren't all NA

# Detect and document missingness in handled columns
for col in handled_cols:
    df.loc[df[col].isna(), 'data_issues'] += '{} missing;'.format(col)

# Write out to CSV, creating the destination folder first if it does not exist
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(out_path, index = False)

df

Unnamed: 0,id,source_org,source_file,original_id,type,name,address,city,state,zip_code,...,date_from,date_to,SNAP,WIC,FMNP,fresh_produce,food_bucks,free_distribution,open_to_spec_group,data_issues
0,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Beechview Farmers Market,1563 Beechview Ave,,PA,15216.0,...,June 20,September 12,1,1,1,1,1,0,,city missing;
1,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Belllevue Farmers Market,34 N Balph Ave,Bellevue,PA,15202.0,...,June 5,October 30,1,1,1,1,1,0,,
2,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Bloomfield Saturday Market,5050 Liberty Ave,Pittsburgh,PA,15224.0,...,May 18,November 2,1,1,1,1,1,0,,
3,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Carrick Farmers Market,1529 Brownsville Road,Pittsburgh,PA,15210.0,...,June 19,November 27,1,1,1,1,1,0,,
4,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,East Liberty Farmers Market,Broad St & Station St,Pittsburgh,PA,15206.0,...,May 13,November 25,1,1,1,1,1,0,,
5,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Garfield Farm Stand,Wiclow St & Columbo St,Pittsburgh,PA,15224.0,...,June 5,September 25,1,1,1,1,1,0,,
6,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Green Tree Farmers Market,905 Greentree Road,Pittsburgh,PA,15220.0,...,May 23,October 31,1,1,1,1,1,0,,
7,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Homewood Farmers Market,7139 Fransktown Ave,Pittsburgh,PA,15208.0,...,July 27,October 26,1,1,1,1,1,0,,
8,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Homewood Farmers Market,7139 Fransktown Ave,Pittsburgh,PA,15208.0,...,July 3,July 31,1,1,1,1,1,0,,
9,,Just Harvest,Just Harvest - Fresh Access Markets.xlsx,,farmer's market,Larimer Farmers Market,Larimer Ave & Carver St,Pittsburgh,PA,,...,June 23,November 24,1,1,1,1,1,0,,zip_code missing;
