In [None]:
import geopandas as gpd
from geopandas.tools import overlay
import os
import matplotlib.pyplot as plt
import boto3
import zipfile
import sys
import xarray as xr
import pandas as pd
from datetime import datetime
from functools import wraps
import re

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

In [None]:
# Module-level S3 client.
# NOTE(review): no cell visible in this notebook reads this global
# (reproject_shapefile builds its own client) - confirm before removing.
s3_client = boto3.client('s3')
def list_geospatial_files(path, bucket_name='ca-climate-index'):
    """Build a list of geospatial file URIs found under an S3 prefix.

    Parameters
    ----------
    path : str
        Key prefix to search within the bucket, e.g. "1_pull_data".
    bucket_name : str, optional
        Bucket to list. Defaults to 'ca-climate-index', the bucket this
        notebook works against.

    Returns
    -------
    list of str
        One URI per matching object, in listing order. Keys ending in '.zip'
        become "zip+s3://<bucket>/<key>" and keys ending in '.shp' become
        "s3://<bucket>/<key>". Every other key is ignored.
    """
    # a session-scoped resource is used to enumerate the bucket contents
    bucket = boto3.Session().resource('s3').Bucket(bucket_name)

    # URI scheme per recognised extension; preceding the URI with 'zip' lets
    # you read in the file without downloading, unzipping, etc
    scheme_by_suffix = {'.zip': 'zip+s3', '.shp': 's3'}

    all_shapefiles = []
    for obj in bucket.objects.filter(Prefix=path):
        for suffix, scheme in scheme_by_suffix.items():
            if obj.key.endswith(suffix):
                all_shapefiles.append(f"{scheme}://{bucket_name}/{obj.key}")
                break
    return all_shapefiles

def _reprojected_s3_key(shp_fname, varname, bucket_name='ca-climate-index'):
    """Return the S3 object key under which the reprojected GeoPackage is stored.

    The key is the source key with the pipeline stage folder ('1_pull_data'
    or '2a_subset') swapped for '2b_reproject' and the final path component
    replaced by '<varname>.gpkg'.
    """
    key = shp_fname
    # drop the 'zip+' scheme prefix and the bucket root to get a bare key
    if key.startswith('zip+'):
        key = key[len('zip+'):]
    bucket_root = f's3://{bucket_name}/'
    if key.startswith(bucket_root):
        key = key[len(bucket_root):]
    key = re.sub(r'1_pull_data|2a_subset', '2b_reproject', key)
    # swap only the last path component; str.replace on the file name would
    # also rewrite any directory that happens to contain the same text
    parent, sep, _ = key.rpartition('/')
    return f"{parent}{sep}{varname}.gpkg"


# @append_metadata
def reproject_shapefile(shp_fname, ca_boundaries, varname='', additional_comments='N/A'):
    """Reproject a dataset to the census-tract CRS, clip it to California, and upload it.

    Steps: (1) read the dataset at `shp_fname`, (2) reproject it to the CRS of
    `ca_boundaries`, (3) intersect it with `ca_boundaries`, (4) write the
    result to '<varname>.gpkg' in the working directory and upload it to the
    'ca-climate-index' bucket, (5) delete the local file. Two diagnostic plots
    are shown and progress is printed along the way.

    Parameters
    ----------
    shp_fname : str
        URI of the source dataset, as produced by list_geospatial_files
        ("zip+s3://ca-climate-index/..." or "s3://ca-climate-index/...").
    ca_boundaries : GeoDataFrame
        California census tracts; supplies the target CRS and the clip geometry.
    varname : str, optional
        Variable name; used for the plot titles, the log messages and the
        name of the output file.
    additional_comments : str, optional
        Free-text note that is printed with the log output (eg, code rerun,
        bug fix, etc).

    Returns
    -------
    None
    """
    bucket_name = 'ca-climate-index'
    local_gpkg = f"{varname}.gpkg"

    # read in shapefile of interest from S3 and take a look at it
    print(f"Reading in shapefile: {shp_fname}")
    gdf = gpd.read_file(shp_fname)
    print(f"Original CRS of {varname}: {gdf.crs}")
    fig, ax = plt.subplots()
    gdf.plot(ax=ax, markersize=1)
    ax.set_title(f"{varname} on original projection")
    plt.show()
    # release the figure; this function is called in loops over many files
    plt.close(fig)

    # check the current coordinate system of the census tracts data
    print(f"CRS of Census Tracts Shapefile: {ca_boundaries.crs}")

    # reproject the data to the census tract CRS and clip to California
    gdf_reprojected = gdf.to_crs(ca_boundaries.crs)
    print(f"{varname} reprojected from {gdf.crs} to {gdf_reprojected.crs} with geopandas to_crs() function.")

    clipped_gdf = overlay(gdf_reprojected, ca_boundaries, how='intersection')
    print(f"{varname} clipped to California boundaries via geopandas overlay using the 'intersection' method.")
    print(f"Additional comments: {additional_comments}.") # eg, code rerun, bug fix, etc

    # visualize results
    fig, ax = plt.subplots()
    ca_boundaries.plot(ax=ax, color='white', edgecolor='black')
    clipped_gdf.plot(ax=ax, marker='o', color='red', markersize=1)
    ax.set_title(f"{varname} on new projection")
    plt.show()
    plt.close(fig)

    dest_path = _reprojected_s3_key(shp_fname, varname, bucket_name)

    # write the reprojected file to disk - still looking for a way around this
    try:
        clipped_gdf.to_file(local_gpkg, driver="GPKG")
        print(f"{varname}.gpkg has been made")
        print(f'Uploading file {varname}.gpkg to AWS')

        # upload it to S3
        s3_client = boto3.client('s3')
        s3_client.upload_file(local_gpkg, bucket_name, dest_path)
        print(f"Reprojected data called {varname}.gpkg sent to S3 bucket: {dest_path}")
        print('')
    finally:
        # never leave the scratch file behind, even when the write or upload fails
        if os.path.exists(local_gpkg):
            os.remove(local_gpkg)


In [None]:
# read in the CSV with the data details
# the repo root is spelled out here instead of being taken from sys.path[-1],
# which stops pointing at the repo as soon as anything else is appended to sys.path
repo_root = os.path.expanduser('../../')
ref_file = os.path.join(repo_root, 'metadata', 'Full Data Pipeline Notes - 1_ Pull.csv')
df = pd.read_csv(ref_file)

# subset for shapefiles
# missing cells become the string 'N/A' so later string comparisons never see NaN
ref_df = df.fillna('N/A')
# comment out for now as 'Pulled Format' column not updated
# ref_df = ref_df[ref_df["Pulled Format"].str.contains("shp")]

### Define the path
path1 = "1_pull_data"
path2 = "2a_subset"
# build a list of shapefiles in the above s3 paths (pulled data first, then subsets)
my_list = list_geospatial_files(path1) + list_geospatial_files(path2)

# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
# need to rename columns so we don't have any duplicates in the final geodatabase
column_names = ca_boundaries.columns
new_column_names = ["USCB_"+column for column in column_names if column != "geometry"]
# build the old -> new mapping per column name; zipping the full column list
# against the filtered one pairs names by position and is only correct when
# 'geometry' happens to be the last column
uscb_renames = {column: "USCB_"+column for column in column_names if column != "geometry"}
ca_boundaries = ca_boundaries.rename(columns=uscb_renames)

# Troubleshooting code

In [None]:
def troubleshoot_reproject(shp_fname, ca_boundaries, varname='', additional_comments='N/A', gdf=None):
    """Reproject a dataset to the census-tract CRS and clip it to California.

    Trimmed-down variant of the reprojection step for debugging: no plots,
    nothing written to disk and nothing uploaded to S3.

    Parameters
    ----------
    shp_fname : str
        URI of the dataset. It is read with geopandas when `gdf` is not given.
    ca_boundaries : GeoDataFrame
        California census tracts; supplies the target CRS and the clip geometry.
    varname : str, optional
        Label used in the printed messages only.
    additional_comments : str, optional
        Accepted for signature parity with the main routine; not used here.
    gdf : GeoDataFrame, optional
        Already-loaded data for `shp_fname`. Pass it to avoid reading the file
        a second time. Previously the function silently picked up a global
        named `gdf`, so it failed (or used stale data) unless a cell had
        assigned that global beforehand.

    Returns
    -------
    tuple
        (gdf_reprojected, clipped_gdf): the data in the census-tract CRS, and
        its intersection with `ca_boundaries`.
    """
    if gdf is None:
        gdf = gpd.read_file(shp_fname)

    # check the current coordinate system of the census tracts data
    print(f"CRS of Census Tracts Shapefile: {ca_boundaries.crs}")
    # reproject the data to the census tract CRS and clip to California
    gdf_reprojected = gdf.to_crs(ca_boundaries.crs)
    print(f"{varname} reprojected from {gdf.crs} to {gdf_reprojected.crs} with geopandas to_crs() function.")
    clipped_gdf = overlay(gdf_reprojected, ca_boundaries, how='intersection')
    return gdf_reprojected, clipped_gdf

# Ad-hoc check on a single dataset: the Bottlenecks archive in 1_pull_data
fpath = "zip+s3://ca-climate-index/1_pull_data/built_environment/transportation/cdot/Bottlenecks.zip"
# load the data into the notebook-level `gdf` before calling the helper
gdf = gpd.read_file(fpath)
# keep both intermediate results around for inspection in later cells
reproj, clipped_gdf = troubleshoot_reproject(fpath, ca_boundaries, varname='bottlenecks_test')

# Transportation

In [None]:
# Transportation datasets to reproject (sample for testing)
file_names = [
    'Bottlenecks.zip',
    'California_Rail_Network.zip',
    'Local_Bridges.zip',
    'National_Highway_System.zip',
    'Public_Airport.zip',
]

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # want the subsetted TV contours data in 2a_subset folder, not the raw pull
    if fname == "TV_Broadcast_Contours.zip":
        continue
    if fname == 'sta_tv_contours.zip':
        varname = "built_hifld_tv_contour"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

# Communication

In [None]:
# Communication: broadband dataset to reproject (sample for testing)
file_names = ['broadband_internet.gdb.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # look the variable name up in the reference sheet and echo it
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    print(varname)
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

In [None]:
# Communication: tower datasets to reproject (sample for testing)
file_names = [
    'FM_Transmission_Towers.zip',
    'Cellular_Towers.zip',
    'Microwave_Service_Towers.zip',
    'Paging_Transmission_Towers.zip',
    'Land_Mobile_Broadcast_Towers.zip',
]

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # want the subsetted TV contours data in 2a_subset folder, not the raw pull
    if fname == "TV_Broadcast_Contours.zip":
        continue
    if fname == 'sta_tv_contours.zip':
        varname = "built_hifld_tv_contour"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

In [None]:
# TV contours are run on their own: they do not work when grouped with the
# other communication file names
file_names = ['sta_tv_contours.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3; unlike the other cells, files that are not
# selected are passed over without a message
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    # want the subsetted TV contours data in 2a_subset folder, not the raw pull
    if fname not in file_names or fname == "TV_Broadcast_Contours.zip":
        continue
    if fname == 'sta_tv_contours.zip':
        varname = "built_hifld_tv_contour"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)


# Utilities

In [None]:
# Utilities datasets to reproject (sample for testing)
file_names = [
    'California_Electric_Transmission_Lines.zip',
    'California_Power_Plants.zip',
]

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # want the subsetted TV contours data in 2a_subset folder, not the raw pull
    if fname == "TV_Broadcast_Contours.zip":
        continue
    if fname == 'sta_tv_contours.zip':
        varname = "built_hifld_tv_contour"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)


# Climate Risks

## Extreme heat

In [None]:
# Extreme heat dataset to reproject (sample for testing)
file_names = ['extreme_heat_warnings_1986_2024.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # want the subsetted TV contours data in 2a_subset folder, not the raw pull
    if fname == "TV_Broadcast_Contours.zip":
        continue
    if fname == 'sta_tv_contours.zip':
        varname = "built_hifld_tv_contour"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)


## Flooding

In [None]:
# Flooding: vulnerable-facility datasets to reproject
file_names = [
    'vulnerable_fire_stations_2000.gdb.zip',
    'vulnerable_fire_stations_2100.gdb.zip',
    'vulnerable_hospitals_2000.gdb.zip',
    'vulnerable_hospitals_2100.gdb.zip',
    'vulnerable_police_stations_2000.gdb.zip',
    'vulnerable_police_stations_2100.gdb.zip',
    'vulnerable_schools_2000.gdb.zip',
    'vulnerable_schools_2100.gdb.zip',
    'vulnerable_slr_superfund_sites.gdb.zip',
    'vulnerable_wastewater_treatment_facilities.gdb.zip',
]

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # the per-type flood warning archives are never reprojected on their own
    # (presumably in favour of merged_flood.zip - confirm)
    if fname in (
        "flash_flood_warnings_1986_2024.zip",
        "fl_flood_warnings_1986_2024.zip",
        "fa_flood_warnings_1986_2024.zip",
    ):
        continue
    if fname == 'merged_flood.zip':
        varname = "climate_iowa_mesonet_flash_flood_warnings"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)


### Large Files

In [None]:
# Flooding: merged flood warnings, run in its own cell (sample for testing)
file_names = ['merged_flood.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # the per-type flood warning archives are never reprojected on their own
    # (presumably in favour of merged_flood.zip - confirm)
    if fname in (
        "flash_flood_warnings_1986_2024.zip",
        "fl_flood_warnings_1986_2024.zip",
        "fa_flood_warnings_1986_2024.zip",
    ):
        continue
    if fname == 'merged_flood.zip':
        varname = "climate_iowa_mesonet_flash_flood_warnings"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

In [None]:
# Flooding: FEMA 100-year floodplains, run in its own cell (sample for testing)
file_names = ['california-fema-100-year-floodplains.gdb.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # the per-type flood warning archives are never reprojected on their own
    # (presumably in favour of merged_flood.zip - confirm)
    if fname in (
        "flash_flood_warnings_1986_2024.zip",
        "fl_flood_warnings_1986_2024.zip",
        "fa_flood_warnings_1986_2024.zip",
    ):
        continue
    if fname == 'merged_flood.zip':
        varname = "climate_iowa_mesonet_flash_flood_warnings"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

## Wildfire

### Large file

In [None]:
# Wildfire: red flag warnings, run in its own cell (sample for testing)
file_names = ['red_flag_warnings_1986_2024.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3 and reproject the ones selected above
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        print(f"Skipping file {fname} as it is not in the list of files to process.")
        continue
    # the per-type flood warning archives are never reprojected on their own
    # (presumably in favour of merged_flood.zip - confirm)
    if fname in (
        "flash_flood_warnings_1986_2024.zip",
        "fl_flood_warnings_1986_2024.zip",
        "fa_flood_warnings_1986_2024.zip",
    ):
        continue
    if fname == 'merged_flood.zip':
        varname = "climate_iowa_mesonet_flash_flood_warnings"
    else:
        # look the variable name up in the reference sheet
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)


## Community Preparedness

### Large File

In [None]:
# Community preparedness: fuel treatment dataset (sample for testing)
file_names = ['usda_fuel_treatment.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# full list of file names here - still need to do some tweaking
# file_names = [name for name in ref_df['File Name'].values if name != 'N/A']

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        continue
    # look the variable name up in the reference sheet
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)


## Emergency Response

In [None]:
# Emergency response: fire stations dataset (sample for testing)
file_names = [
    'usgs_fire_stations.zip'
]

# defined here so the cell does not depend on a value left over from an
# earlier cell; forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    if fname not in file_names:
        continue
    # match up file name to variable name
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

## Natural Resources Conservation

In [None]:
# Natural resources conservation datasets (sample for testing)
file_names = [
    'cpad_2023_holdings.zip',
    'calfire_timber_management.zip'
]

# defined here so the cell does not depend on a value left over from an
# earlier cell; forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    if fname not in file_names:
        continue
    # match up file name to variable name
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

### Large file

In [None]:
# This archive is too big to share a cell with the others, so it runs separately
file_names = ['f2f2_assessment.gdb.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        continue
    # look the variable name up in the reference sheet
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

## Ecosystem Condition

In [None]:
# too big, running separately
file_names = [
    'ca_fish_wildlife_species_biodiversity.gdb.zip'
]

# defined here so the cell does not depend on a value left over from an
# earlier cell; forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    if fname not in file_names:
        continue
    # match up file name to variable name
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

## Ecosystem Conservation

In [None]:
# Ecosystem conservation: CPAD conservation holdings
file_names = ['cpad_2023_holdings_conservation.zip']

# note forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # the last path component is the bare file name
    fname = fpath.rsplit('/', 1)[-1]
    if fname not in file_names:
        continue
    # look the variable name up in the reference sheet
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

## Ecosystem Type

### Not working

In [None]:
# Ecosystem type: vegetation dataset
file_names = [
    'ca_dept_forestry_ecosystem_veg.gdb.zip'
]

# defined here so the cell does not depend on a value left over from an
# earlier cell; forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    if fname not in file_names:
        continue
    # match up file name to variable name
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)

### Troubleshooting

In [None]:
# Ad-hoc check on the ecosystem vegetation archive that the loop above cannot process
fpath = "zip+s3://ca-climate-index/1_pull_data/natural_systems/ecosystem_type/ca_dept_forestry_fire/ca_dept_forestry_ecosystem_veg.gdb.zip"
# load the data into the notebook-level `gdf` before calling the helper
gdf = gpd.read_file(fpath)
# label the log output after this dataset (it was copy-pasted as 'bottlenecks_test')
reproj, clipped_gdf = troubleshoot_reproject(fpath, ca_boundaries, varname='ecosystem_veg_test')

## Social Services

In [None]:
# Social services: health professional shortage area datasets
file_names = [
    'hpsa_mental_health.zip',
    'hpsa_primary_care.zip',
    'hpsa_narcotic_treatment_programs.zip'
]

# defined here so the cell does not depend on a value left over from an
# earlier cell; forwarded to reproject_shapefile and printed with its log output
additional_comments = "N/A"

# walk every URI found in S3; files that are not selected are passed over silently
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    if fname not in file_names:
        continue
    # match up file name to variable name
    varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]
    reproject_shapefile(fpath, ca_boundaries, varname=varname, additional_comments=additional_comments)