## Creating a new .csv file that contains California tract numbers and their respective counties
* pull our foodaccess2019.csv, which contains census tract and county names (but does not have the full 9128 rows that our CA tract data has)
* pull our CA tract data that contains all modern census tracts
* based on a shared county FP column, map the county names from our foodaccess2019.csv file to every row in our CA tract data

In [7]:
import os
import sys
import pandas as pd
import geopandas as gpd

# Suppress pandas PerformanceWarning messages (purely educational warnings)
# for the rest of the session.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# Add the directory two levels above the working directory to the import
# path, presumably so the project-local `scripts` package can be imported.
# NOTE(review): expanduser() only expands a leading '~', so '../../' is
# appended unchanged.
sys.path.append(os.path.expanduser('../../'))

# Adjust display options so column contents are never truncated, helpful for
# long descriptions within ACS data
pd.set_option('display.max_colwidth', None)

from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws
)

In [None]:
# S3 bucket and key prefix of the USDA data, plus the folder name handed to
# the download helper.
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/vulnerable_populations/usda/'
output_folder = 'food_access'

# Fetch the CSV file(s) found under `aws_dir`; zipped files are not searched.
# NOTE(review): presumably the files are written into `output_folder`
# relative to the working directory -- confirm in scripts.utils.file_helpers.
pull_csv_from_directory(bucket_name, aws_dir, output_folder, search_zipped=False)

In [None]:
# Build the path to the downloaded data underneath the user's home directory.
# NOTE(review): the 'eagle/carb-climate-index-7' layout is specific to one
# machine, and this overrides the relative 'food_access' folder name used for
# the download -- confirm both point at the same location.
base_dir = os.path.expanduser("~")
project_folder = os.path.join(base_dir, 'eagle', 'carb-climate-index-7')
output_folder = os.path.join(project_folder, 'food_access')

# Tally the regular files (directories are not counted) as a sanity check
# that the download produced something.
file_count = sum(
    os.path.isfile(os.path.join(output_folder, entry))
    for entry in os.listdir(output_folder)
)
print(f"Number of files in the output folder: {file_count}")

# Read the 2019 food access table into a DataFrame.
food_access_csv_path = os.path.join(output_folder, 'foodaccess2019.csv')
food_access = pd.read_csv(food_access_csv_path)

In [None]:
# preview the first rows of the food access table to inspect the loaded columns
food_access.head()

In [None]:
# keep only the tract identifier, county name and state name
tract_county = food_access.loc[:, ['CensusTract', 'County', 'State']]

# restrict the table to rows whose state is California
is_california = tract_county['State'].eq('California')
california_tracts = tract_county.loc[is_california]

# display the result
california_tracts

In [None]:
# Select the relevant columns into an explicit copy. `california_tracts` is
# itself a filtered slice of `food_access`, so writing into a further slice of
# it triggers pandas' SettingWithCopyWarning.
california_tracts_county = california_tracts[['CensusTract', 'County']].copy()

# eliminate the ' County' suffix within the County column (literal match, not
# a regular expression)
california_tracts_county['County'] = california_tracts_county['County'].str.replace(
    ' County', '', regex=False
)

# Convert the tract IDs to 11-character strings. The leading '0' of the
# California state code is missing (presumably because read_csv parsed the
# column as integers), so pad on the left. zfill() is used rather than
# prepending '0' unconditionally so an ID that is already 11 characters long
# is left intact. Plain column assignment replaces the column outright instead
# of writing strings into the existing numeric column in place.
california_tracts_county['CensusTract'] = (
    california_tracts_county['CensusTract'].astype(str).str.zfill(11)
)

# splice the tract number to get the county FP code: characters 2-4 follow the
# two-digit state code
california_tracts_county['COUNTYFP'] = california_tracts_county['CensusTract'].str[2:5]
california_tracts_county.head(5)

#### Now reading in our CA census tract file for processing

In [14]:
# read in CA census tiger file
# S3 location of the California census tract data (2021 TIGER, per the path)
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
# load the tract data from that location with geopandas
ca_boundaries = gpd.read_file(census_shp_dir)

In [None]:
# Keep the tract identifier and county FP code, exposing GEOID under the name
# TRACT. rename() returns a new frame, so `ca_boundaries` is left untouched.
filtered_ca_boundaries = ca_boundaries[['GEOID', 'COUNTYFP']].rename(
    columns={'GEOID': 'TRACT'}
)
filtered_ca_boundaries

#### Creating a county column for filtered_ca_boundaries based on the shared COUNTYFP with california_tracts_county

In [16]:
# Build a COUNTYFP -> county name lookup from california_tracts_county. Many
# tracts share a county code; later rows simply overwrite earlier ones.
county_mapping = dict(
    zip(california_tracts_county['COUNTYFP'], california_tracts_county['County'])
)

# Attach the county name to every tract in filtered_ca_boundaries by looking up
# its COUNTYFP; codes absent from the lookup yield missing values.
filtered_ca_boundaries['County'] = filtered_ca_boundaries['COUNTYFP'].map(county_mapping)

In [None]:
# report the number of tract rows, then show the last five of them
print(filtered_ca_boundaries.shape[0])
filtered_ca_boundaries.tail()

#### Making sure all rows within the new county column are populated

In [None]:
# count the tracts whose COUNTYFP found no match, i.e. whose 'County' is missing
missing_counties = filtered_ca_boundaries['County'].isna().sum()

# report either the number of gaps or that the column is fully populated
if missing_counties:
    status_message = f"There are {missing_counties} rows in filtered_ca_boundaries with missing values in the 'County' column."
else:
    status_message = "All rows in filtered_ca_boundaries have been populated in the new 'County' column."
print(status_message)

#### Upload to AWS

In [None]:
# destination of the tract -> county lookup table on AWS
bucket_name = 'ca-climate-index'
directory = '0_map_data'
file_name = 'ca_tracts_county.csv'

# Write the table to a local csv first. With to_csv's default settings the
# DataFrame index is written as well, as an unnamed first column.
filtered_ca_boundaries.to_csv(file_name)

# upload csv to aws
upload_csv_aws([file_name], bucket_name, directory)

# the local copy is no longer needed once the upload call has returned
os.remove(file_name)