## Cal-CRAI Reprojection -- geospatial file inputs
This notebook processes geospatial data files of manageable size: each file is reprojected to the California census tract CRS where necessary and clipped to California, and the accompanying metadata is recorded for clarity.

In [1]:
import geopandas as gpd
from geopandas.tools import overlay
import os
import matplotlib.pyplot as plt
import boto3
import zipfile
import sys
import xarray as xr
import pandas as pd
from datetime import datetime
from functools import wraps
import re

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

In [6]:
# Module-level S3 client.
# NOTE(review): none of the code visible in this notebook reads this global
# (reproject_shapefile creates its own client) -- confirm it is still needed.
s3_client = boto3.client('s3')
def list_geospatial_files(path, bucket_name='ca-climate-index'):
    """
    Build a list of geospatial file URIs contained in an S3 folder.

    Parameters
    ----------
    path: string
        Key prefix (folder) within the bucket to search.
    bucket_name: string
        Name of the S3 bucket to search. Defaults to 'ca-climate-index'.

    Returns
    -------
    list of string
        One URI per object whose key ends in '.zip' or '.shp', in the order
        the bucket listing yields them. Zip archives get a 'zip+s3://' URI;
        shapefiles get a plain 's3://' URI.
    """
    # use a session to get the bucket resource
    bucket = boto3.Session().resource('s3').Bucket(bucket_name)

    all_shapefiles = []
    # iterate through every object under the prefix
    for obj in bucket.objects.filter(Prefix=path):
        if obj.key.endswith('.zip'):
            # preceding the URI with 'zip' lets you read in the file without downloading, unzipping, etc
            all_shapefiles.append(f"zip+s3://{bucket_name}/{obj.key}")
        elif obj.key.endswith('.shp'):
            all_shapefiles.append(f"s3://{bucket_name}/{obj.key}")
    return all_shapefiles

@append_metadata
def reproject_shapefile(shp_fname, ca_boundaries, varname='', export=False, additional_comments='N/A'):
    """
    Given S3 URI which corresponds to a data shapefile and a shapefile with California Census Tract:
    (1) reproject the data shapefile to the CRS of the California Census Tracts,
    (2) clip to California, and
    (3) send it off to S3.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in
    ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    Parameters
    ----------
    shp_fname: string
        S3 URI of the data file to reproject, optionally prefixed with 'zip+'.
    ca_boundaries: geopandas.GeoDataFrame
        CA census tract geometries; supplies the target CRS and the clipping extent.
    varname: string
        Variable name used in plot titles, log messages and as the stem of the output .gpkg file.
    export: bool
        If True, reprojects, clips and uploads the resulting file to S3.
        If False, the function body returns immediately without reading or writing any data.
    additional_comments: string
        Not read by the function body; presumably consumed by the append_metadata decorator (to be confirmed).

    Script
    ------
    geospatial_reproject.ipynb
    """
    # nothing is read, plotted or uploaded unless an export was requested
    if not export:
        return

    s3_client = boto3.client('s3')
    bucket_name = 'ca-climate-index'
    local_file = f"{varname}.gpkg"

    # read in shapefile of interest from S3 and take a look at it
    gdf = gpd.read_file(shp_fname)
    print(f"Original CRS of {varname}: {gdf.crs}")
    fig, ax = plt.subplots()
    gdf.plot(ax=ax, markersize=1)
    plt.title(f"{varname} on original projection")
    plt.show()

    # reproject the data to the census tract CRS and clip to California
    gdf_reprojected = gdf.to_crs(ca_boundaries.crs)
    print(f"{varname} reprojected from {gdf.crs} to {gdf_reprojected.crs} with geopandas to_crs() function.")

    clipped_gdf = overlay(gdf_reprojected, ca_boundaries, how='intersection')
    print(f"{varname} clipped to California boundaries via geopandas overlay using the 'intersection' method.")

    # visualize results
    fig, ax = plt.subplots()
    ca_boundaries.plot(ax=ax, color='white', edgecolor='black')
    clipped_gdf.plot(ax=ax, marker='o', color='red', markersize=1)
    plt.title(f"{varname} on new projection")
    plt.show()

    # build the destination key: strip the URI scheme/bucket from the front only,
    # so a matching substring elsewhere in the key is left untouched
    dest_path = shp_fname
    if dest_path.startswith('zip+'):
        dest_path = dest_path[len('zip+'):]
    bucket_prefix = f"s3://{bucket_name}/"
    if dest_path.startswith(bucket_prefix):
        dest_path = dest_path[len(bucket_prefix):]
    dest_path = re.sub(r'1_pull_data|2a_subset', '2b_reproject', dest_path)
    # swap only the final path component for the output file name
    parent, sep, _ = dest_path.rpartition('/')
    dest_path = f"{parent}{sep}{varname}.gpkg"

    try:
        # write the reprojected file to disk - still looking for a way around this
        clipped_gdf.to_file(local_file, driver="GPKG")
        print(f"{varname}.gpkg has been made")

        # upload it to S3
        s3_client.upload_file(local_file, bucket_name, dest_path)
        print(f"Reprojected data called {varname}.gpkg sent to S3 bucket: {dest_path}")
    finally:
        # always clean up the local copy, even if the write or upload failed
        if os.path.exists(local_file):
            os.remove(local_file)


In [5]:
# read in the CSV with the data details
# the path is relative to the repository root two levels up (the same '../../'
# location that is appended to sys.path), not a user-specific absolute path
ref_file = os.path.join('..', '..', 'metadata', 'Full Data Pipeline Notes - 1_ Pull.csv')
df = pd.read_csv(ref_file)

# subset for rows whose pulled format mentions a shapefile or a geodatabase
ref_df = df.fillna('N/A')
is_shapefile = ref_df["Pulled Format"].str.contains("shp")
is_geodatabase = ref_df["Pulled Format"].str.contains("gdb")
ref_df = ref_df[is_shapefile | is_geodatabase]

### Define the path
path1 = "1_pull_data"
path2 = "2a_subset"
#  build a list of shapefiles in the above s3 paths
my_list = list_geospatial_files(path1)
my_list += list_geospatial_files(path2)

# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
# need to rename columns so we don't have any duplicates in the final geodatabase;
# map each column to its own prefixed name so the result does not depend on
# where the geometry column sits in the column order
ca_boundaries = ca_boundaries.rename(
    columns={
        column: "USCB_" + column
        for column in ca_boundaries.columns
        if column != "geometry"
    }
)

# Run the reprojection code over the desired files

In [7]:
# comment string handed to the reprojection call
additional_comments = "N/A"

# collect every file name in the reference table, skipping rows without one
file_names = []
for name in ref_df['File Name'].values:
    if name != 'N/A':
        file_names.append(name)

# large files are reprojected elsewhere
large_files = [
    'climate_iowa_mesonet_flash_flood_warnings',
    'climate_koordinates_floodplain',
    'climate_iowa_mesonet_wildfire_warnings',
    'governance_usda_watershed_risk',
    'governance_usda_fuel_reduction',
]

# problematic variables are skipped
problem_vars = [
    "natural_calfire_vegetation_types",
    "natural_cnra_protected_areas",
]

# everything that should not be reprojected in this notebook
excluded_files = [*large_files, *problem_vars]

In [8]:
# iterate through the list
# build the lookup once so each membership test is O(1)
known_file_names = set(file_names)
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    if fname not in known_file_names:
        continue
    # want the subsetted TV contours data in 2a_subset folder
    if fname == "TV_Broadcast_Contours.zip":
        continue

    if fname == 'sta_tv_contours.zip':
        varname = "built_hifld_tv_contour"
    else:
        # match up file name to variable name
        varname = ref_df.loc[ref_df["File Name"] == fname, "Variable"].values[0]

    # exclude some files
    if varname in excluded_files:
        continue

    # called for every remaining file, including the subsetted TV contours,
    # which the previous nesting assigned a varname to but never processed
    reproject_shapefile(
        fpath,
        ca_boundaries,
        export=False,
        additional_comments=additional_comments,
        varname=varname,
    )