### Cal-CRAI Reprojection -- CSV inputs
This notebook processes CSV data files, reprojecting them where necessary, and records the accompanying metadata for clarity.

In [2]:
import geopandas as gpd
from geopandas.tools import overlay
import os
import matplotlib.pyplot as plt
import boto3
import zipfile
import sys
import pandas as pd
from shapely.geometry import Point

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata  
)

In [3]:
@append_metadata
def reproject_csv_coordinates(file_name, aws_dir, output_file_name, lon='', lat='', export=False, varname=''):
    '''
    Pulls a csv file from AWS and establishes point geometries based off of the dataset's
    latitude and longitude coordinates. These point geometries are then reprojected to the
    coordinate reference system of the California census tract boundaries and spatially
    joined with them. The final .gpkg is uploaded to AWS.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in
    ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    
    Methods
    -------
    Utilized the file's lat and lon columns to create point geometries.
    Reproject file to the standardized coordinate reference system (CRS) 4269.
    Spatial join data with California boundary data (also at CRS:4269).
    Save resulting file to a .gpkg file.

    Script
    ------
    csv_coordinate_reproject.ipynb

    Parameters
    ----------
    file_name: string
        .csv file name to be reprojected   
    aws_dir: string
        AWS location of .csv file: 
        1_pull_data/governance/natural_resource_conservation/ca_state_water_resources_board
    output_file_name: string
        output file name (without the .gpkg extension)
    lon: string
        variable name containing the data's longitudinal coordinates
    lat: string
        variable name containing the data's latitudinal coordinates
    export: bool
        if True, processes the file and uploads the resulting .gpkg file to AWS;
        if False, only the transformation steps are printed and nothing is processed
    varname: string
        not used inside this function; presumably consumed by the append_metadata
        decorator (confirm against scripts/utils/write_metadata)

    Returns
    -------
    None
    '''
    print('Data transformation: create point geometry from datas lat/lon columns')
    print('Data transformation: reproject data to CRS:4269')
    print('Data transformation: spatial join with CA boundary data and save as .gpkg')
    if not export:
        # Dry run: the steps above are reported, but no data is touched.
        return

    s3_client = boto3.client('s3')
    bucket_name = 'ca-climate-index'
    gpkg_name = f'{output_file_name}.gpkg'

    # S3 key (path + file) of the source csv
    s3_key = f'{aws_dir}/{file_name}'
    if not os.path.exists(file_name):
        # File not found locally, download from S3
        s3_client.download_file(bucket_name, s3_key, file_name)

    csv_data = pd.read_csv(file_name)

    census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
    ca_boundaries = gpd.read_file(census_shp_dir)

    # Build point geometries from the lon/lat columns and label them as EPSG:4269,
    # using an explicit authority string rather than a bare code.
    gdf = gpd.GeoDataFrame(
        csv_data,
        geometry=gpd.points_from_xy(csv_data[lon], csv_data[lat]),
        crs='EPSG:4269',
    )

    # Align with the census tract CRS before joining; how='left' keeps every input
    # point, including those that fall outside all tracts.
    reproject_gdf = gdf.to_crs(ca_boundaries.crs)
    joined_data = gpd.sjoin(reproject_gdf, ca_boundaries, how='left', predicate='within')

    # Visual sanity check: census tracts with the joined points drawn on the same axes
    ax = ca_boundaries.plot(figsize=(10, 10), color='white', edgecolor='black')
    joined_data.plot(ax=ax, color='red', markersize=5)
    plt.title('Census Tracts with Joined Data Points')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()

    # Write the joined data locally; without this step the upload below has no file to open.
    joined_data.to_file(gpkg_name, driver='GPKG')

    # Replacing the starting aws directory path so output is placed in the reproject folder
    dest_path = aws_dir.replace('1_pull_data', '2b_reproject')

    # Upload the saved .gpkg file to AWS
    with open(gpkg_name, 'rb') as data:
        s3_client.upload_fileobj(data, bucket_name, f"{dest_path}/{gpkg_name}")

    print(f"Reprojected data called {output_file_name}.gpkg sent to S3 bucket: {dest_path}")

    # Remove local files
    os.remove(file_name)
    os.remove(gpkg_name)

In [4]:
# Inputs for the GAMA drinking-water csv: its file name and the S3 prefix it is stored under.
file_name = 'GAMA_division_drinking_water_2020_2024.csv'
aws_dir = "1_pull_data/governance/natural_resource_conservation/ca_state_water_resources_board"
# Names of the csv columns that hold the longitude and latitude values.
dataset_lon= 'gm_longitude'
dataset_lat= 'gm_latitude'
# Base name (without extension) used for the .gpkg output.
output_file_name = 'GAMA_division_drinking_water_reproject'

# With export=False the function only prints its transformation steps and returns
# before any download, join, plot, or upload is attempted.
# NOTE(review): varname is not read inside the function body; presumably it is consumed
# by the @append_metadata decorator -- confirm against scripts/utils/write_metadata.
reproject_csv_coordinates(file_name, aws_dir, output_file_name, dataset_lon, dataset_lat, export=False, varname='governance_swcrb_groundwater_quality')