In [None]:
import geopandas as gpd
from geopandas.tools import overlay
import os
import matplotlib.pyplot as plt
import boto3
import zipfile
import sys
import xarray as xr
import pandas as pd

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)

In [None]:
def list_geospatial_files(path, bucket_name='ca-climate-index'):
    """ Build a list of shapefile URIs contained in S3 folder

    Parameters
    ----------
    path : str
        Key prefix inside the bucket; every object whose key starts with
        this prefix is inspected.
    bucket_name : str, optional
        Name of the S3 bucket to list. Defaults to 'ca-climate-index'.

    Returns
    -------
    list of str
        One URI per object whose key ends in '.zip' or '.shp', in the
        order the bucket listing yields them. Zip archives get a
        'zip+s3://' scheme, bare shapefiles get 's3://'.
    """
    # initiate s3 session and use it to get the bucket resource
    session = boto3.Session()
    my_bucket = session.resource('s3').Bucket(bucket_name)

    all_shapefiles = []
    # iterate through directory
    for obj in my_bucket.objects.filter(Prefix=path):
        key = obj.key
        if key.endswith('.zip'):
            # preceding the URI with 'zip' lets you read in the file
            # without downloading, unzipping, etc
            all_shapefiles.append(f"zip+s3://{bucket_name}/{key}")
        elif key.endswith('.shp'):
            all_shapefiles.append(f"s3://{bucket_name}/{key}")
    return all_shapefiles

# @append_metadata
def reproject_shapefile(shp_fname='', varname=''):
    """ Given S3 URI which corresponds to a shapefile, (1) reproject it
    to the CRS of the California Census Tracts, (2) clip to California,
    and (3) send it off to S3.

    Parameters
    ----------
    shp_fname : str
        URI of the input data, e.g. 's3://ca-climate-index/<key>.shp' or
        'zip+s3://ca-climate-index/<key>.zip'.
    varname : str
        Variable name for the dataset. Used in the printed output and as
        the stem of the GeoPackage file name ('<varname>.gpkg').

    Side effects
    ------------
    Shows two matplotlib figures (before / after reprojection), writes
    '<varname>.gpkg' to the working directory, uploads it to the
    'ca-climate-index' bucket and then deletes the local copy. The
    destination key is the input key with '1_pull_data' replaced by
    '2b_reproject' and the file name replaced by '<varname>.gpkg'.
    """
    bucket_name = 'ca-climate-index'

    # read in US census tract shapefile
    census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
    ca_boundaries = gpd.read_file(census_shp_dir)

    # read in shapefile of interest from S3 and take a look at it
    gdf = gpd.read_file(shp_fname)
    print(f"Original CRS of {varname}: ")
    print(gdf.crs)
    fig, ax = plt.subplots()
    gdf.plot(ax=ax, markersize=1)
    plt.title("Data on original projection")
    plt.show()
    # this function runs in a loop; release the figure once it is shown
    plt.close(fig)

    # check the current coordinate system of the census tracts data
    print("CRS of Census Tracts Data: ")
    print(ca_boundaries.crs)
    print("\n")

    # reproject the data to the census tract CRS and clip to California
    gdf_reprojected = gdf.to_crs(ca_boundaries.crs)
    clipped_gdf = overlay(gdf_reprojected, ca_boundaries, how='intersection')

    # visualize results
    fig, ax = plt.subplots()
    ca_boundaries.plot(ax=ax, color='white', edgecolor='black')
    clipped_gdf.plot(ax=ax, marker='o', color='red', markersize=1)
    plt.title("Data on new projection")
    plt.show()
    plt.close(fig)

    dest_path = _reprojected_s3_key(shp_fname, varname, bucket_name)
    local_path = f"{varname}.gpkg"
    try:
        # write the reprojected file to disk - still looking for a way around this
        clipped_gdf.to_file(local_path, driver="GPKG")

        # upload it to S3
        s3_client = boto3.client('s3')
        s3_client.upload_file(local_path, bucket_name, dest_path)
        print("Reprojected data sent to S3 bucket.")
    finally:
        # never leave the scratch file behind, even if the write or the
        # upload raised part-way through
        if os.path.exists(local_path):
            os.remove(local_path)


def _reprojected_s3_key(shp_fname, varname, bucket_name):
    """ Turn an input URI into the S3 key of its reprojected GeoPackage. """
    key = shp_fname
    # drop the 'zip+' scheme prefix whenever it is present, regardless of
    # how the URI ends
    if key.startswith('zip+'):
        key = key[len('zip+'):]
    bucket_prefix = f"s3://{bucket_name}/"
    if key.startswith(bucket_prefix):
        key = key[len(bucket_prefix):]
    key = key.replace('1_pull_data', '2b_reproject')
    # swap only the final path component; a str.replace on the file name
    # would also rewrite any directory that happens to contain it
    parent = key.rpartition('/')[0]
    out_name = f"{varname}.gpkg"
    return f"{parent}/{out_name}" if parent else out_name

In [None]:
# build a reference dataframe to map file names to variable names
# We will have a CSV with this information soon
variable_names = [
    "built_hifld_radio_towers",
    "built_hifld_cellular_towers",
    "built_hifld_microwave_towers",
    "built_hifld_paging_towers",
    "built_hifld_tv_towers",
    "built_hifld_mobile_towers"
]
file_names = [
    'FM_Transmission_Towers.zip',
    'Cellular_Towers.zip',
    'Microwave_Service_Towers.zip',
    'Paging_Transmission_Towers.zip',
    'TV_Broadcast_Contours.zip',
    'Land_Mobile_Broadcast_Towers.zip'
]
ref_df = pd.DataFrame(
    {'Variable' : variable_names,
     'Path' : file_names}
)
# plain file-name -> variable-name lookup, so a file that is missing from
# the table can be detected instead of raising an IndexError mid-loop
varname_by_file = dict(zip(ref_df["Path"], ref_df["Variable"]))

### Define the path
path = "1_pull_data/built_environment/communication_infrastructure/homeland_infrastructure_foundation_level_data/"
# first build a list of shapefiles in the above s3 path
my_list = list_geospatial_files(path)

# iterate through the list
for fpath in my_list:
    # get the file name by itself (no subdirectories)
    fname = fpath.split('/')[-1]
    # match up file name to variable name
    varname = varname_by_file.get(fname)
    if varname is None:
        # the S3 prefix holds a file the reference table does not know
        # about; report it and keep processing the remaining files
        print(f"Skipping {fname}: no variable name is mapped to this file.")
        continue
    print(fname, varname) # just to check
    reproject_shapefile(fpath, varname=varname)