# Upload USGS % impervious developed land to AWS
The USGS 2021 impervious lands dataset covers the entire conterminous US (CONUS) and is very large (24 GB). This notebook subsets the locally saved raster to California (CA), then writes the resulting zarr store to the S3 pull bucket.

In [1]:
import xarray as xr
import rioxarray as rxr
import geopandas as gpd
import os
import sys
import boto3

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import append_metadata
from scripts.utils.file_helpers import to_zarr

In [5]:
# @append_metadata
def subset_to_CA(input_file_name, save_name, export=False, varname=''):
    '''
    Subsets the USGS impervious lands raster to the bounding box of California,
    then writes the result as a zarr store to the AWS S3 pull bucket.

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.

    Parameters
    ----------
    input_file_name: string
        Path to the raster file covering CONUS (opened with rioxarray)
    save_name: string
        Base name (without extension) of the output .zarr store
    export: bool
        If True, runs code and uploads the output store to S3
        If False, nothing is read or written (useful when only generating metadata)
    varname: string
        Not used inside this function; presumably consumed by the
        append_metadata decorator when it is enabled -- TODO confirm

    Returns
    -------
    None

    Script
    ------
    usgs_impervious_lands_subset.ipynb
    '''
    ## set-up for AWS
    bucket_name = 'ca-climate-index'

    top_dir = '1_pull_data'
    domain = 'natural_systems'
    indicator = 'ecosystem_condition'
    data_source = 'usgs'
    output_file_name = 's3://{0}/{1}/{2}/{3}/{4}/{5}.zarr'.format(
        bucket_name, top_dir, domain, indicator, data_source, save_name
    )

    # run code if export is true, if false, nothing happens (useful when appending metadata)
    if not export:
        return

    # read in raster covering CONUS
    ds = rxr.open_rasterio(input_file_name).squeeze()

    # read in CA state shapefile
    ca_shp_dir = f's3://{bucket_name}/0_map_data/ca_state/'
    ca_shp = gpd.read_file(ca_shp_dir)
    # reproject CA boundaries to the raster's CRS
    ca_shp = ca_shp.to_crs(ds.rio.crs)
    # total_bounds spans every feature in the shapefile, so the extent is
    # still correct if the state boundary is stored as more than one row
    minx, miny, maxx, maxy = ca_shp.total_bounds

    # Label-based slicing must follow the direction of the coordinate index:
    # a slice running against it selects nothing. Fall back to the usual
    # north-up (descending y) order when the direction cannot be determined.
    y_vals = ds.y.values
    y_descending = y_vals.size < 2 or y_vals[0] > y_vals[-1]
    y_slice = slice(maxy, miny) if y_descending else slice(miny, maxy)

    # subset raster to CA only
    ds_bnd = ds.sel(x=slice(minx, maxx), y=y_slice)
    ds_bnd.name = "impervious_surface"
    ds_bnd = ds_bnd.to_dataset()
    ds_bnd = ds_bnd.chunk(chunks="auto")
    ds_bnd.to_zarr(output_file_name, mode='w')

    # only report success once the store has actually been written
    print(f'{output_file_name} uploaded to AWS.')


In [6]:
# read in raster file
# this is saved locally rather than in S3 because it is very large!
# The location is machine-specific, so it can be overridden by setting the
# NLCD_IMPERVIOUS_PATH environment variable; otherwise the original
# download location is used.
nlcd_product = "nlcd_2021_impervious_l48_20230630"
default_datpath = os.path.join(
    "/mnt", "c", "Users", "eliza", "Downloads",
    nlcd_product,
    nlcd_product + ".img",
)
datpath = os.environ.get("NLCD_IMPERVIOUS_PATH", default_datpath)
save_name = 'nlcd_ca_developed_impervious'

subset_to_CA(datpath, save_name, export=True, varname='test')

s3://ca-climate-index/1_pull_data/natural_systems/ecosystem_condition/usgs/nlcd_ca_developed_impervious.zarr uploaded to AWS.
