## Creating a new .csv file that contains California tract numbers and their respective counties
* pull our foodaccess2019.csv, which contains census tract numbers and county names (but does not cover all 9,129 rows in our CA tract data)
* pull our CA tract data that contains all modern census tracts
* based on a shared county FP column, map the county names from our foodaccess2019.csv file to every row in our CA tract data

In [55]:
import os
import sys
import pandas as pd
import geopandas as gpd

# Silence pandas PerformanceWarning, which is purely advisory and clutters cell output
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Adjust display options, helpful for long descriptions within ACS data
pd.set_option('display.max_colwidth', None)

# Add the directory two levels up to the import path (once) so that the
# project-local `scripts` package imported below can be resolved
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws
)

In [2]:
# S3 bucket and key prefix that hold the USDA food access files
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/usda/'

# Fetch the CSVs under that prefix; the helper reports each file it saves locally.
# NOTE(review): search_zipped=False presumably skips zip archives -- confirm in scripts.utils.file_helpers
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'foodaccess2019.csv'
Saved DataFrame as 'foodaccess2019_readme.csv'
Saved DataFrame as 'foodaccess2019_variable_names.csv'


In [3]:
# Load the food access table saved locally by the pull step.
# NOTE(review): CensusTract appears to be parsed as an integer (no leading zero in the
# preview), which is why the leading '0' is restored further down -- confirm dtype
food_access = pd.read_csv('foodaccess2019.csv')

In [4]:
# Preview the first rows to locate the columns used below (CensusTract, State, County)
food_access.head()

Unnamed: 0,CensusTract,State,County,Urban,Pop2010,OHU2010,GroupQuartersFlag,NUMGQTRS,PCTGQTRS,LILATracts_1And10,...,TractSeniors,TractWhite,TractBlack,TractAsian,TractNHOPI,TractAIAN,TractOMultir,TractHispanic,TractHUNV,TractSNAP
0,1001020100,Alabama,Autauga County,1,1912,693,0,0.0,0.0,0,...,221.0,1622.0,217.0,14.0,0.0,14.0,45.0,44.0,6.0,102.0
1,1001020200,Alabama,Autauga County,1,2170,743,0,181.0,8.34,1,...,214.0,888.0,1217.0,5.0,0.0,5.0,55.0,75.0,89.0,156.0
2,1001020300,Alabama,Autauga County,1,3373,1256,0,0.0,0.0,0,...,439.0,2576.0,647.0,17.0,5.0,11.0,117.0,87.0,99.0,172.0
3,1001020400,Alabama,Autauga County,1,4386,1722,0,0.0,0.0,0,...,904.0,4086.0,193.0,18.0,4.0,11.0,74.0,85.0,21.0,98.0
4,1001020500,Alabama,Autauga County,1,10766,4082,0,181.0,1.68,0,...,1126.0,8666.0,1437.0,296.0,9.0,48.0,310.0,355.0,230.0,339.0


In [52]:
# keep only the tract identifier, county name, and state columns
tract_county = food_access[['CensusTract', 'County', 'State']]

# restrict the table to rows whose State is California
california_tracts = tract_county.loc[tract_county['State'].eq('California')]



In [53]:
# Display the California subset (pandas truncates the middle rows of long frames)
california_tracts

Unnamed: 0,CensusTract,County,State
3551,6001400100,Alameda County,California
3552,6001400200,Alameda County,California
3553,6001400300,Alameda County,California
3554,6001400400,Alameda County,California
3555,6001400500,Alameda County,California
...,...,...,...
11570,6115040800,Yuba County,California
11571,6115040901,Yuba County,California
11572,6115040902,Yuba County,California
11573,6115041000,Yuba County,California


In [54]:
# Build the tract -> county table with a COUNTYFP join key.
# Selecting the columns and then using `.assign` yields an independent DataFrame,
# so nothing is written into a slice of `california_tracts` (no SettingWithCopyWarning)
# and the CensusTract column is cleanly replaced by its string form.
california_tracts_county = (
    california_tracts[['CensusTract', 'County']]
    .assign(
        # eliminate the ' County' suffix so only the bare county name remains
        County=lambda df: df['County'].str.replace(' County', '', regex=False),
        # the tract IDs lack the leading zero of the state code; convert to string
        # and left-pad to the full 11-character GEOID instead of blindly prepending '0'
        CensusTract=lambda df: df['CensusTract'].astype(str).str.zfill(11),
    )
    # splice the padded tract number to get the 3-character county FP code
    # (positions 2-4, immediately after the 2-character state code)
    .assign(COUNTYFP=lambda df: df['CensusTract'].str[2:5])
)
california_tracts_county.head(5)

Unnamed: 0,CensusTract,County,COUNTYFP
3551,6001400100,Alameda,1
3552,6001400200,Alameda,1
3553,6001400300,Alameda,1
3554,6001400400,Alameda,1
3555,6001400500,Alameda,1


## Now reading in our CA census tract file for processing

In [13]:
# read in CA census tiger file
# (the 2021 California tract data is read by geopandas straight from the S3 path below)
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

In [61]:
# keep the tract GEOID and county FP code, exposing GEOID under the name TRACT;
# `rename` returns a new frame, so `ca_boundaries` itself is left untouched
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'COUNTYFP']]
    .rename(columns={'GEOID': 'TRACT'})
)


In [62]:
# Display the tract / county FP table (pandas truncates the middle rows of long frames)
filtered_ca_boundaries

Unnamed: 0,TRACT,COUNTYFP
0,06085504321,085
1,06085504410,085
2,06085507003,085
3,06085507004,085
4,06085502204,085
...,...,...
9124,06059001303,059
9125,06059001304,059
9126,06059001401,059
9127,06013367200,013


## Creating a county column for filtered_ca_boundaries based on the shared COUNTYFP with california_tracts_county

In [63]:
# build a COUNTYFP -> county name lookup from california_tracts_county
# (a COUNTYFP that appears on many rows collapses to one entry; the last row wins)
county_mapping = dict(
    zip(california_tracts_county['COUNTYFP'], california_tracts_county['County'])
)

# translate each boundary row's COUNTYFP into a county name, stored in the 'County' column
filtered_ca_boundaries['County'] = filtered_ca_boundaries['COUNTYFP'].map(county_mapping)

In [65]:
# report how many tract rows the table holds after adding the County column
print(len(filtered_ca_boundaries))
# spot-check the last rows to see the mapped county names
filtered_ca_boundaries.tail(5)

9129


Unnamed: 0,TRACT,COUNTYFP,County
9124,6059001303,59,Orange
9125,6059001304,59,Orange
9126,6059001401,59,Orange
9127,6013367200,13,Contra Costa
9128,6037578100,37,Los Angeles


## Making sure all rows within the new county column are populated

In [47]:
# count the rows whose 'County' lookup came back empty
missing_counties = filtered_ca_boundaries['County'].isna().sum()

# a non-zero count means some COUNTYFP codes had no match in the mapping
if missing_counties:
    print(f"There are {missing_counties} rows in filtered_ca_boundaries with missing values in the 'County' column.")
else:
    print("All rows in filtered_ca_boundaries have been populated in the new 'County' column.")


All rows in filtered_ca_boundaries have been populated in the new 'County' column.


## Upload to AWS

In [67]:
# upload destination and the local file name used for the hand-off
bucket_name = 'ca-climate-index'
directory = '0_map_data'
file_name = 'ca_tracts_county.csv'

# save data as a csv (with default settings the DataFrame index is written as the first column)
filtered_ca_boundaries.to_csv(file_name)

# upload csv to aws
upload_csv_aws([file_name], bucket_name, directory)

# Remove final csv files from local directory
os.remove(file_name)

ca_tracts_county.csv uploaded to AWS
