# Dataset ingestion script prototype

This notebook is intended to prototype code snippets for an ingestion script aimed at "Summer_Meal_Sites_2019.csv". 

Dependencies:
* pandas
* os

In [1]:
import pandas as pd
import os

In [2]:
in_path = '../food-data/PFPC_data_files/Summer_Meal_Sites_2019.csv'
out_path = '../food-data/Cleaned_data_files/summer_meal_sites_2019.csv'

final_cols = ['id', 'source_org', 'source_file', 'original_id', 'type', 'name', 'address', 'city', 
              'state', 'zip_code', 'county', 'location_description', 'phone', 'url', 'latitude', 
              'longitude', 'latlng_source', 'date_from', 'date_to', 'SNAP', 'WIC', 'FMNP', 
              'fresh_produce', 'food_bucks', 'free_distribution', 'open_to_spec_group', 'data_issues']

In [3]:
df = pd.read_csv(in_path, encoding = 'ansi')
df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,X,Y,OBJECTID,siteName,siteStatus,siteAddress,siteAddress2,siteCity,siteState,siteZip,...,snackTimePM,dinnerSupperTime,mealTypesServed,cycleNumber,RecordStatus,Country,FNSID,Created,Season,County
0,-79.095379,34.480939,49094,Fairmont High,Open,"5419 Old Stage Roae Fairmont, NC 28340",,Fairmont,NC,28340,...,,,"B,L",2,N,US,75727,,2019,Robeson
1,-79.031752,34.603793,49095,Lumberton Junior High,Open,"82 Marion Road Lumberton, NC 28358",,Lumberton,NC,28358,...,,,"B,L",2,N,US,16116,,2019,Robeson
2,-79.003666,34.712291,49096,Magnolia Elementary,Open,"10928 US 301 North Lumberton, NC 28360",,Lumberton,NC,28360,...,,,"B,L",2,N,US,16134,,2019,Robeson
3,-79.194384,34.676639,49097,Pembroke Elementary,Open,"505 SOUTH JONES STREET PEMBROKE, NC 28372",,PEMBROKE,NC,28372,...,,,"B,L",2,N,US,16405,,2019,Robeson
4,-79.232282,34.733090,49098,Prospect Elementary,Open,"4024 MISSOURI ROAD PEMBROKE, NC 28372",,PEMBROKE,NC,28372,...,,,"B,L",2,N,US,76614,,2019,Robeson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57583,-98.504059,29.433697,140165,BARKLEY/RUIZ EL,Open,"2 Haven for Hope Way San Antonio, TX 78207",,San Antonio,TX,78207,...,,,,11,U,US,132450,,2019,Bexar
57584,-97.484003,32.665264,140166,BENBROOK MIDDLE/HIGH SCHOOL,Open,"201 Overcrest Benbrook, TX 76126",,Benbrook,TX,76126,...,,,,11,U,US,25834,,2019,Tarrant
57585,-93.888960,30.672447,140167,Bleakwood Community Center,Open,"266 FM 363 Kirbyville, TX 75956",,Kirbyville,TX,75956,...,,,,11,U,US,120536,,2019,Jasper
57586,-93.927672,29.928196,140168,Brittany Place,Open,"3500 Normandy Place Port Arthur, TX 77642",,Port Arthur,TX,77642,...,,,,11,U,US,3497,,2019,Jefferson


In [4]:
# Filter down to Allegheny County sites only
df = df.loc[(df['siteState'] == 'PA') & (df['County'] == 'Allegheny')]

# Assign some columns to schema fields
df['original_id'] = df['FNSID']
df['name'] = df['siteName'].str.title() # deal with ALL CAPS
df['city'] = df['siteCity'].str.title()
df['state'] = df['siteState']
df['zip_code'] = df['siteZip'].str.zfill(5)
df['county'] = df['County']
df['latitude'] = df['Y']
df['longitude'] = df['X']

# Handle phone numbers with and without extensions
df.loc[df['ext'].notnull(), 'phone'] = df.loc[df['ext'].notnull(), 'sitePhone'].astype('int64').astype(str) + \
                                       'x' + df.loc[df['ext'].notnull(), 'ext'].astype('int64').astype(str)
df.loc[df['ext'].isnull(), 'phone'] = df.loc[df['ext'].isnull(), 'sitePhone']

# Handle address, fun text processing
df['address'] = df.apply(lambda x: x['siteAddress'].split(' ' + x['siteCity'])[0].title(), axis = 1)

# Set some fields directly
df['source_org'] = 'USDA Food and Nutrition Service'
df['source_file'] = os.path.basename(in_path)
df['type'] = "summer meal site"
df['latlng_source'] = df['source_org']

df['SNAP'] = 0
df['WIC'] = 0
df['FMNP'] = 0
# df['fresh_produce'] = 1
df['food_bucks'] = 0
df['free_distribution'] = 1
df['open_to_spec_group'] = 'children and teens 18 and younger'
df['data_issues'] = '' # start with blank field, to populate later

# Calculate date_from and date_to
df['date_from'] = df['startDate'].str.split('T').str[0]
df['date_to'] = df['endDate'].str.split('T').str[0]

# Reorder and add any missing columns
df = df.reindex(columns = final_cols)

# Identify which columns we have handled
handled_cols = df.columns[~df.isna().all()] # i.e. columns that aren't all NA

# Detect and document missingness in handled columns
for col in handled_cols:
    df.loc[df[col].isna(), 'data_issues'] += '{} missing;'.format(col)

# Detect some specific data issues 
df.loc[((df['latitude'] == 0) & (df['longitude'] == 0)), 'data_issues'] += 'latlng is (0,0);'

# Write out to CSV
df.to_csv(out_path, index = False)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

Unnamed: 0,id,source_org,source_file,original_id,type,name,address,city,state,zip_code,...,date_from,date_to,SNAP,WIC,FMNP,fresh_produce,food_bucks,free_distribution,open_to_spec_group,data_issues
1555,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,4808,summer meal site,Hillel Academy Of Pittsburgh,5685 Beacon Street,Pittsburgh,PA,15217,...,2019-06-24,2019-08-02,0,0,0,,0,1,children and teens 18 and younger,
2253,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,41960,summer meal site,Castle Shannon Library,3677 Myrtle Avenue,Pgh,PA,15234,...,2019-06-10,2019-08-16,0,0,0,,0,1,children and teens 18 and younger,
2254,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,40957,summer meal site,Alvern Gardens,1000 Vermont Ave,Pittsburgh,PA,15234,...,2019-06-10,2019-08-16,0,0,0,,0,1,children and teens 18 and younger,
2255,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,43679,summer meal site,Green Tree Wilson Park,10 West Manilla Dr,Pgh,PA,15220,...,2019-06-10,2019-08-16,0,0,0,,0,1,children and teens 18 and younger,
2259,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,42688,summer meal site,Dormont Park / Pool & Rec Center,1801 Dormont Ave,Pgh,PA,15216,...,2019-06-10,2019-08-16,0,0,0,,0,1,children and teens 18 and younger,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57459,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,84713,summer meal site,Allegheny Ymca,600 W. North Avenue,Pittsburgh,PA,,...,2019-06-17,2019-08-23,0,0,0,,0,1,children and teens 18 and younger,zip_code missing;
57460,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,92916,summer meal site,Burgwin Spray Park/Playground,Johnston Avenue @ Mansion Street,Pittsburgh,PA,,...,2019-06-26,2019-08-23,0,0,0,,0,1,children and teens 18 and younger,zip_code missing;
57469,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,149550,summer meal site,Yp Northview,525 Mt. Pleasant Road,Pittsburgh,PA,,...,2019-06-24,2019-08-02,0,0,0,,0,1,children and teens 18 and younger,zip_code missing;
57470,,USDA Food and Nutrition Service,Summer_Meal_Sites_2019.csv,104548,summer meal site,Mount Ararat,745 N. Negley Avenue,Pittsburgh,PA,,...,2019-06-24,2019-08-16,0,0,0,,0,1,children and teens 18 and younger,zip_code missing;
