# Dataset ingestion script prototype

This notebook is intended to prototype code snippets for an ingestion script aimed at "filename". 

Dependencies:
* pandas
* os
* openpyxl

In [1]:
import pandas as pd
import os

In [7]:
in_path = '../food-data/PFPC_data_files/Bridgeway Capital - HFFI Funds.xlsx'
out_path = '../food-data/Cleaned_data_files/bridgeway_capital_hffi.csv'

final_cols = ['id', 'source_org', 'source_file', 'original_id', 'type', 'name', 'address', 'city', 
              'state', 'zip_code', 'county', 'location_description', 'phone', 'url', 'latitude', 
              'longitude', 'latlng_source', 'date_from', 'date_to', 'SNAP', 'WIC', 'FMNP', 
              'fresh_produce', 'food_bucks', 'free_distribution', 'open_to_spec_group', 'data_issues']

In [5]:
df = pd.read_excel(in_path, engine='openpyxl')
df

Unnamed: 0,Neighborhood,Store Name,Address,City,Zip,Unnamed: 5
0,Lawrenceville,52nd Street Market,601 52nd Street,Pittsburgh,15201.0,
1,Homewood,Perry's Honeydripper,7006 Frankstown Ave.,Pittsburgh,15208.0,(Moving)
2,Hazelwood,Dylamatos,5414 Second Ave.,Pittsburgh,15207.0,
3,Lawrenceville,Butcher on Butler,5145 Butler St,Pittsburgh,15201.0,
4,Garfield,Gluten Free Goat,4905 Penn Ave.,Pittsburgh,15224.0,
...,...,...,...,...,...,...
994,,,,,,
995,,,,,,
996,,,,,,
997,,,,,,


In [6]:
# Keep only non-empty rows
df = df[df['Store Name'].notnull()]
df

Unnamed: 0,Neighborhood,Store Name,Address,City,Zip,Unnamed: 5
0,Lawrenceville,52nd Street Market,601 52nd Street,Pittsburgh,15201.0,
1,Homewood,Perry's Honeydripper,7006 Frankstown Ave.,Pittsburgh,15208.0,(Moving)
2,Hazelwood,Dylamatos,5414 Second Ave.,Pittsburgh,15207.0,
3,Lawrenceville,Butcher on Butler,5145 Butler St,Pittsburgh,15201.0,
4,Garfield,Gluten Free Goat,4905 Penn Ave.,Pittsburgh,15224.0,
5,Larimer,LA Market,511 Larimer Ave.,Pittsburgh,15206.0,
6,Allentown,Day La Soul,829-831 E. Warrington Ave,Pittsburgh,15210.0,


In [8]:
# Assign some columns to schema fields
df['name'] = df['Store Name']
df['address'] = df['Address']
df['city'] = df['City']
df['zip_code'] = df['Zip']

# Set some fields directly
df['source_org'] = 'Pittsburgh Food Policy Council'
df['source_file'] = os.path.basename(in_path)
df['type'] = 'supermarket'
df['state'] = 'PA'
df['county'] = 'Allegheny'
df['fresh_produce'] = 1
df['free_distribution'] = 0
df['data_issues'] = '' # start with blank field, to populate later

# Reorder and add any missing columns
df = df.reindex(columns = final_cols)

# Identify which columns we have handled
handled_cols = df.columns[~df.isna().all()] # i.e. columns that aren't all NA

# Detect and document missingness in handled columns
for col in handled_cols:
    df.loc[df[col].isna(), 'data_issues'] += '{} missing;'.format(col)

# Write out to CSV
df.to_csv(out_path, index = False)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'] = df['Store Name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['address'] = df['Address']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['city'] = df['City']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

Unnamed: 0,id,source_org,source_file,original_id,type,name,address,city,state,zip_code,...,date_from,date_to,SNAP,WIC,FMNP,fresh_produce,food_bucks,free_distribution,open_to_spec_group,data_issues
0,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,52nd Street Market,601 52nd Street,Pittsburgh,PA,15201.0,...,,,,,,1,,0,,
1,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,Perry's Honeydripper,7006 Frankstown Ave.,Pittsburgh,PA,15208.0,...,,,,,,1,,0,,
2,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,Dylamatos,5414 Second Ave.,Pittsburgh,PA,15207.0,...,,,,,,1,,0,,
3,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,Butcher on Butler,5145 Butler St,Pittsburgh,PA,15201.0,...,,,,,,1,,0,,
4,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,Gluten Free Goat,4905 Penn Ave.,Pittsburgh,PA,15224.0,...,,,,,,1,,0,,
5,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,LA Market,511 Larimer Ave.,Pittsburgh,PA,15206.0,...,,,,,,1,,0,,
6,,Pittsburgh Food Policy Council,Bridgeway Capital - HFFI Funds.xlsx,,supermarket,Day La Soul,829-831 E. Warrington Ave,Pittsburgh,PA,15210.0,...,,,,,,1,,0,,
