# Dataset ingestion script prototype

This notebook prototypes code snippets for an ingestion script that targets the source file "filename".

Dependencies:
* pandas
* os (Python standard library)

In [1]:
import pandas as pd
import os

In [2]:
# Where the raw source file is read from, and where the cleaned CSV is written
in_path = '../food-data/PFPC_data_files/input.file'
out_path = '../food-data/Cleaned_data_files/output.file'

# Columns of the cleaned output table, listed in output order
final_cols = [
    'id',
    'source_org',
    'source_file',
    'original_id',
    'type',
    'name',
    'address',
    'city',
    'state',
    'zip_code',
    'county',
    'location_description',
    'phone',
    'url',
    'latitude',
    'longitude',
    'latlng_source',
    'date_from',
    'date_to',
    'SNAP',
    'WIC',
    'FMNP',
    'fresh_produce',
    'food_bucks',
    'free_distribution',
    'open_to_spec_group',
    'data_issues',
]

In [3]:
# Load the raw source table.
# 'Zip' is read as text rather than as a number: ZIP codes are identifiers, and numeric
# parsing would drop leading zeros and turn the column into floats (e.g. 15003.0) as soon
# as one value is missing.
df = pd.read_csv(in_path, dtype={'Zip': str})
df

Unnamed: 0,Name,Street_Address,City,State,Zip,Latitude,Longitude,Additional_Directions,Day_Time,Season,Affiliations
0,Ambridge Farmer's Market,624 Park Road,Ambridge,PA,15003,40.586729,-80.229576,"St. Mary's Church parking lot, near Do It Best",Thursday 4:00 PM-7:00 PM,June-November,
1,Aspinwall Flea Market,217 Commerical Avenue,Aspinwall,PA,15215,40.490575,-79.904305,Municipal parking lot,Sunday 7:00 AM-1:00 PM,June-November,
2,Bachman's Greenhouse and Nursery,2905 Freeport Road,Natrona Heights,PA,15065,40.640015,-79.711384,,Daily 8:00 AM-7:00 PM,June-October,
3,Bachman's Greenhouse and Nursery,Route 908 & Ekastown Road,Natrona Heights,PA,15065,40.667206,-79.730284,,Daily 10:00 AM-6:00 PM,July-October,
4,Beccari's Farm Market,5095 Thoms Run Road,Oakdale,PA,15071,40.373965,-80.133129,,Tuesday-Friday 12:00 PM-5:00 PM and Saturday-S...,June-October,
...,...,...,...,...,...,...,...,...,...,...,...
87,Wexford Farms,550 Warrendale Road,Wexford,PA,15090,40.647630,-80.100129,,Daily 8:00 AM-4:00 PM,June-October,
88,Wilkins Township Farmer's Market,333 Penn Center Boulevard,Pittsburgh,PA,15235,40.427682,-79.811636,Route 22 Penn Center,Wednesday 3:00 PM-6:30 PM,June-October,
89,Wilkinsburg Farmer's Market,225 Penn Avenue,Pittsburgh,PA,15221,40.446769,-79.891807,Life Care Hospital parking lot,Thursday 3:00 PM-6:30 PM,June-November,Just Harvest Fresh Access
90,Wilmerding Green Grocer,314 Commerce Street,Wilmerding,PA,15148,40.392927,-79.807721,Wilmerding Apartments,Friday 2:30 PM-3:30 PM,Year Round,Greater Pittsburgh Community Food Bank


In [5]:
# Copy source columns into their schema fields (schema field -> source column)
schema_from_source = {
    'name': 'Name',
    'address': 'Street_Address',
    'city': 'City',
    'state': 'State',
    'zip_code': 'Zip',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'location_description': 'Additional_Directions',
}
for schema_field, source_col in schema_from_source.items():
    df[schema_field] = df[source_col]

# Fields that receive the same value on every row
provenance_values = {
    'source_org': 'Allegheny County Health Department',
    'source_file': os.path.basename(in_path),  # file name only, without the directory
    'type': "farmer's market",
    'county': 'Allegheny',
}
for schema_field, constant in provenance_values.items():
    df[schema_field] = constant

# The coordinates are attributed to the same organization as the rest of the record
df['latlng_source'] = df['source_org']

program_flags = {
    'SNAP': 1,  # because of food_bucks, see below
    'WIC': 1,
    'FMNP': 1,
    'fresh_produce': 1,
    'food_bucks': 1,  # per Cat email - but do all FMNP farmer's markets participate in Food Bucks??
    'free_distribution': 0,
}
for schema_field, constant in program_flags.items():
    df[schema_field] = constant

# Starts out empty; detected problems are appended to it later
df['data_issues'] = ''

# Derive date_from and date_to from the Season text.
# NOTE(review): try_except, string_to_start_date and string_to_end_date are not defined in
# the cells shown here -- presumably they come from an earlier cell (execution count In [4]
# is absent); confirm that cell runs before this one on a fresh kernel.

# Trim leading/trailing spaces, then split on '-' so each row holds a list of parts
df['Season'] = df['Season'].str.strip(' ').str.split('-')
season_parts = df['Season']

# 0 and -1 presumably select the first and last part of the split season -- confirm
# against try_except
df['date_from'] = season_parts.apply(lambda parts: try_except(string_to_start_date, parts, 0))
df['date_to'] = season_parts.apply(lambda parts: try_except(string_to_end_date, parts, -1))

# Someday we will handle opening days/times robustly. For now they are appended to
# location_description.
has_directions = df['location_description'].notna()
has_hours = df['Day_Time'].notna()

# Both present: join them with '; '. Requiring Day_Time here keeps the directions intact
# when the hours are missing (text + NaN evaluates to NaN, which would blank the field).
df.loc[has_directions & has_hours, 'location_description'] = df['location_description'] + '; ' + df['Day_Time']

# No directions at all: fall back to the opening days/times on their own
df.loc[~has_directions, 'location_description'] = df['Day_Time']

# Conform to the output schema: keep only final_cols, in that order. Schema columns that
# do not exist yet are created and filled with NaN.
df = df.reindex(columns = final_cols)

# A column counts as "handled" when it holds at least one non-missing value
handled_cols = df.columns[df.notna().any()]

# For every handled column, note each missing value in that row's data_issues
for col in handled_cols:
    is_missing = df[col].isna()
    df.loc[is_missing, 'data_issues'] += '{} missing;'.format(col)

# Flag rows whose coordinates are exactly (0, 0)
lat_is_zero = df['latitude'] == 0
lng_is_zero = df['longitude'] == 0
df.loc[lat_is_zero & lng_is_zero, 'data_issues'] += 'latlng is (0,0);'

# Save the cleaned table, leaving out the DataFrame index
df.to_csv(out_path, index = False)

df

Unnamed: 0,id,source_org,source_file,original_id,type,name,address,city,state,zip_code,...,date_from,date_to,SNAP,WIC,FMNP,fresh_produce,food_bucks,free_distribution,open_to_spec_group,data_issues
0,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Ambridge Farmer's Market,624 Park Road,Ambridge,PA,15003,...,June 1,November 30,1,1,1,1,1,0,,
1,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Aspinwall Flea Market,217 Commerical Avenue,Aspinwall,PA,15215,...,June 1,November 30,1,1,1,1,1,0,,
2,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Bachman's Greenhouse and Nursery,2905 Freeport Road,Natrona Heights,PA,15065,...,June 1,October 31,1,1,1,1,1,0,,
3,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Bachman's Greenhouse and Nursery,Route 908 & Ekastown Road,Natrona Heights,PA,15065,...,July 1,October 31,1,1,1,1,1,0,,
4,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Beccari's Farm Market,5095 Thoms Run Road,Oakdale,PA,15071,...,June 1,October 31,1,1,1,1,1,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Wexford Farms,550 Warrendale Road,Wexford,PA,15090,...,June 1,October 31,1,1,1,1,1,0,,
88,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Wilkins Township Farmer's Market,333 Penn Center Boulevard,Pittsburgh,PA,15235,...,June 1,October 31,1,1,1,1,1,0,,
89,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Wilkinsburg Farmer's Market,225 Penn Avenue,Pittsburgh,PA,15221,...,June 1,November 30,1,1,1,1,1,0,,
90,,Allegheny County Health Department,Allegheny_County_Farmers_Markets_Locations_201...,,farmer's market,Wilmerding Green Grocer,314 Commerce Street,Wilmerding,PA,15148,...,January 1,December 31,1,1,1,1,1,0,,
