In [1]:
import geopandas as gpd
import s3fs
import pandas as pd
import boto3
import dask_geopandas
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import sys
import re

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
def build_usgs_pqt_file_list(
    path='2b_reproject/natural_systems/ecosystem_condition/usgs'
):
    """ Build a list of parquet URIs contained in S3 folder """
    # initiate empty list for s3 URIs
    all_pqt = []
    bucket = 'ca-climate-index' 
    # initiate s3 session
    session = boto3.Session()
    # use the session to get the resource
    s3 = session.resource('s3')
    my_bucket = s3.Bucket(bucket)
    # iterate through directory
    for obj in my_bucket.objects.filter(
        Prefix=path):
        if obj.key.endswith('.parquet.gzip'):
            all_pqt.append( f's3://{bucket}/{obj.key}')
    return all_pqt

def natural_sort(l): 
    """ Sort list numerically despite missing leading 0s 
    
    Parameters
    ----------
    l: list
        list of strings containing numbers to naturally sort
        
    Acknowledgement
    ----------
    Special thanks to Mark Byers on stackoverflow.
    """
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(l, key=alphanum_key)

In [3]:
# make sorted list of the 45 parquets in the bucket
pqt_list = natural_sort(build_usgs_pqt_file_list())

# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)
# keep the columns we need
ca_boundaries = ca_boundaries[["GEOID","geometry"]]
# change to area-preserving CRS
ca_boundaries = ca_boundaries.to_crs("epsg:3310") # CA Albers
# calculate area of each tract
ca_boundaries["tract_area"] = ca_boundaries.area

In [10]:
%%time
gdf_list = []
for pqt in pqt_list:
    gdf = gpd.read_parquet(pqt)
    gdf["pixel_area"] = 900 # 30x30 m pixel
    gdf["impervious_surface"] = gdf["impervious_surface"]*1e-2
    gdf["pixel_area_impervious"] = gdf["pixel_area"] * gdf["impervious_surface"]
    agg_gdf = pd.Series(gdf["pixel_area_impervious"].groupby(gdf["GEOID"]).sum())
    # gdf = gdf[["GEOID","area_impervious"]]
    gdf_list.append(gdf)
    raise Exception

Exception: 

In [13]:
aa = pd.Series(gdf["pixel_area_impervious"].groupby(gdf["GEOID"]).sum())

In [None]:
aa["percent_impervious"] = 

In [9]:
gdf.area_impervious.unique()

array([nan])

In [5]:
gdf_agg = pd.concat(gdf_list)
gdf_agg

Unnamed: 0,area_impervious
6154,
6155,
6156,
6157,
6158,
...,...
963632076,
963632077,
963672507,
963672508,


In [None]:
# due to partitioning, many GEOIDs have more than one entry
# so we sum over each GEOID one more time

## Code to loop through the parquets, store them, then stitch together into one df

Couple of print statements to see how many unique entries we have for the impervious surface and census tract columns
* counts seem low

## Renaming and reading census data in for later

In [None]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry', 'COUNTYFP'})
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

ca_tract_county

In [None]:
impervious_surfaces_columns = impervious_surfaces_data[['GEOID', 'geometry', 'impervious_surface']]
impervious_surfaces_columns = impervious_surfaces_columns.rename(columns={'GEOID':'tract'})
impervious_surfaces_columns

In [None]:
grouped_impervious_surfaces = impervious_surfaces_columns.groupby('tract')['impervious_surface'].mean().reset_index()
grouped_impervious_surfaces

In [None]:
impervious_surface_merge = pd.merge(ca_tract_county, impervious_surfaces_columns, on='tract', how='left')
impervious_surface_merge