In [1]:
import pandas as pd
import os
import sys
import geopandas as gpd
import xarray as xr
import boto3
import re
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [4]:
# @append_metadata
def reproject_nlcd_impervious_lands(ds, ca_boundaries, run_code=True, varname=''):
    """
    Reprojects the CA-wide USGS impervious lands zarr to California Census Tract Coordinate Reference System, 
    then clips to these CA tracts, and uploads to AWS S3. This code differs from the 
    reproject_shapefile() function by utilizing dask-geopandas to manipulate large datasets and saving the result
    as 13 parquet files. 

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in
    ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
     
    Methods
    -------
    Use dask-geopandas to work with the large datasets
    
    Parameters
    ----------
    zarr_fname: string
        filename of the USGS impervious lands zarr
    ca_boundaries: 
        read-in gpd file of California Census Tracts
    run_code: bool
        if True, code will run. If false, just metadata file will be updated

    Script
    ------
    large_geospatial_reproject.ipynb    
    """
    s3_client = boto3.client('s3')  
    bucket_name = 'ca-climate-index' 
    var = 'natural_usgs_impervious'
    dest_f = in_fname.replace(
        in_fname.split('/')[-1],f"{var}.parquet.gzip")
    dest_f = re.sub(r'1_pull_data', '2b_reproject', dest_f)
                
    print('Data transformation: Reproject to standard coordinate reference system: 4269.')    
    print('Data transformation: sjoin large geodata with CA census tract boundaries data.')    
    print(
            "Data transformation: Saved as multiple parquet files because"
            +" the resulting dataset is too large to be saved as one file."
    )
    print(f"Parquets saved to: s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/")
        
    if run_code==True:
        orig_crs = ds.spatial_ref.attrs["crs_wkt"]
        cb_crs = ca_boundaries.crs
        ca_boundaries = ca_boundaries[["GEOID","geometry"]]

        da = ds.impervious_surface.chunk(chunks={'x':5000,'y':5000})
        df = da.to_dask_dataframe()
        df = df[["impervious_surface","x","y"]]
        print('made dask df')

        gdf_list = []
        for i in range(len(list(df.partitions))):
            print(f"reading in partition {i}")
            part_df = df.partitions[i].compute()
            part_df = part_df[part_df["impervious_surface"]!=127.0]
            gdf = gpd.GeoDataFrame(
                part_df, geometry=gpd.points_from_xy(part_df.x,part_df.y, crs=orig_crs)
            )
            gdf = gdf.to_crs(cb_crs)
            gdf = gdf.sjoin(ca_boundaries, how='inner', predicate='intersects')
            gdf = gdf.drop(columns=["index_right","x","y"])
            dest_f = dest_f.replace(
                dest_f.split('/')[-1],f"ca_clipped_{var}_partition_{i}.parquet.gzip")
            gdf.to_parquet(dest_f, compression='gzip')
            # gdf["pixel_area"] = 900
            # gdf["area_impervious"] = gdf["pixel_area"] * gdf["impervious_surface"]
            # gdf = gdf["area_impervious"].groupby(gdf["GEOID"]).sum()
            # gdf_list.append(gdf)

        gdf_merged = pd.concat(gdf_list)
            # raise Exception

        return gdf_list



In [5]:
# open NLCD zarr from our S3 bucket
in_fname = 's3://ca-climate-index/1_pull_data/natural_systems/ecosystem_condition/usgs/nlcd_ca_developed_impervious.zarr'
ds = xr.open_zarr(in_fname)
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
# ca_boundaries = ca_boundaries.to_crs(crs=3310)
varname = 'test'


In [6]:
rdf = reproject_nlcd_impervious_lands(ds, ca_boundaries, run_code=True, varname=varname)

Data transformation: Reproject to standard coordinate reference system: 4269.
Data transformation: sjoin large geodata with CA census tract boundaries data.
Data transformation: Saved as multiple parquet files because the resulting dataset is too large to be saved as one file.
Parquets saved to: s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/
made dask df
reading in partition 0


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array.reshape(shape)

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    >>> array.reshape(shape, limit='128 MiB')
  rdf = reproject_nlcd_impervious_lands(ds, ca_boundaries, run_code=True, varname=varname)
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array.reshape(shape)

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    >>> array.reshape(shape, limit='128 MiB')
  rdf = reproject_nlcd_impervious_lands(ds, ca_boundaries, run_code=True, varname=varname)


GEOID
06023011200    151798500.0
06023011502            0.0
06023990100            0.0
06045011102     41440500.0
06045990100            0.0
06097990100            0.0
Name: area_impervious, dtype: float64
reading in partition 1
GEOID
06023001300      2674800.0
06023010701     29214900.0
06023010702     64704600.0
06023010800    302330700.0
06023010901     16840800.0
06023010902      5352300.0
06023011000     24581700.0
06023011100     65029500.0
06023011200    298017000.0
06023011501      8081100.0
06023011502    168096600.0
06023990100            0.0
06045010200     74545200.0
06045010300    220070700.0
06045010400    238576500.0
06045010500    148572000.0
06045011001    139797900.0
06045011003    167482800.0
06045011004    137909700.0
06045011102    219902400.0
06045011200     12181500.0
06045990100            0.0
06075980401      1312200.0
06097154304      1839600.0
06097154307     51088500.0
06097154308     91913400.0
06097990100            0.0
Name: area_impervious, dtype: float6

AttributeError: 'Series' object has no attribute 'to_parquet'

In [7]:
rdf

NameError: name 'rdf' is not defined