In [1]:
import geopandas as gpd
import s3fs
import pandas as pd
import boto3
import dask_geopandas
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import sys

# Silence pandas PerformanceWarning messages for the rest of the session;
# they are advisory only and would otherwise clutter the cell outputs.
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Put the directory two levels above the working directory on the import path
# so the `scripts` package below can be imported (presumably the repository
# root; this depends on where the kernel was started).
# NOTE(review): os.path.expanduser only expands a leading '~', so it has no
# effect on '../../'.
sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

## Original attempt to pull the parquet files (it only pulls the first parquet file)

In [27]:
# First attempt, kept for reference: hand the whole parquet directory prefix
# to geopandas in a single read_parquet call.
fs = s3fs.S3FileSystem()
bucket = 'ca-climate-index'
path = '2b_reproject/'
pqt_list = ['natural_systems/ecosystem_condition/usgs/parquet_files/']

for pqt in pqt_list:
    # Full key under the bucket, then the s3:// URI that geopandas reads from.
    ppath = f'{path}{pqt}'
    bucket_uri = f's3://{bucket}/{ppath}'
    print(pqt)
    df = gpd.read_parquet(bucket_uri)

# Keep whatever the last (here: only) read returned under a descriptive name
# and display it as the cell output.
impervious_surfaces_data_single_parquet = df
impervious_surfaces_data_single_parquet

natural_systems/ecosystem_condition/usgs/parquet_files/


Unnamed: 0,impervious_surface,geometry,index_right,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
2092,,POINT (-124.13651 41.45499),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
2093,,POINT (-124.13606 41.45481),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
2094,,POINT (-124.13561 41.45463),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
2095,,POINT (-124.13516 41.45445),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
2096,,POINT (-124.13471 41.45427),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11467,0.0,POINT (-120.00280 39.71763),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926
11468,0.0,POINT (-120.00237 39.71743),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926
11469,0.0,POINT (-120.00194 39.71724),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926
11470,0.0,POINT (-120.00151 39.71705),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926


## Code to loop through the parquet files, store them, then stitch them together into one dataframe

In [36]:
# Load every impervious-surface parquet partition from S3 and stitch the
# partitions together into a single GeoDataFrame.
fs = s3fs.S3FileSystem()
bucket = 'ca-climate-index'
path = '2b_reproject/'
pqt_dir = 'natural_systems/ecosystem_condition/usgs/parquet_files/'
pqt_pattern = 'ca_clipped_natural_usgs_impervious_*.parquet.gzip'


def _partition_number(key):
    """Return the integer N from a key that ends in '..._N.parquet.gzip'."""
    return int(key.rsplit('_', 1)[1].split('.', 1)[0])


# Discover the partitions with one listing call instead of hard-coding each
# key. fs.glob returns keys as '<bucket>/<key>' (no 's3://'), so the
# '<bucket>/<path>' prefix is stripped to keep pqt_list relative to `path`.
# The numeric sort keeps partitions in 0, 1, 2, ... order; a plain string sort
# would place '_10' before '_2' and change the row order after concatenation.
key_prefix = f'{bucket}/{path}'
pqt_list = sorted(
    (key[len(key_prefix):] for key in fs.glob(f'{key_prefix}{pqt_dir}{pqt_pattern}')),
    key=_partition_number,
)
if not pqt_list:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        f'no parquet partitions matched s3://{key_prefix}{pqt_dir}{pqt_pattern}'
    )

all_dfs = []
empty_files = []
for pqt in pqt_list:
    ppath = path + pqt
    file_uri = f's3://{bucket}/{ppath}'
    print(f" - {file_uri}")
    df = gpd.read_parquet(file_uri)
    print('length of df:', len(df))
    if df.empty:
        # Several partitions have loaded with zero rows in earlier runs; record
        # them so the gap is reported instead of silently shrinking the dataset.
        empty_files.append(file_uri)
    all_dfs.append(df)

print(len(all_dfs))
if empty_files:
    print(f'WARNING: {len(empty_files)} of {len(all_dfs)} partitions contained no rows:')
    for file_uri in empty_files:
        print(f'   {file_uri}')

impervious_surfaces_data = gpd.GeoDataFrame(pd.concat(all_dfs, ignore_index=True))
impervious_surfaces_data

 - s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/parquet_files/ca_clipped_natural_usgs_impervious_0.parquet.gzip
length of df: 9380
 - s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/parquet_files/ca_clipped_natural_usgs_impervious_1.parquet.gzip
length of df: 0
 - s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/parquet_files/ca_clipped_natural_usgs_impervious_2.parquet.gzip
length of df: 0
 - s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/parquet_files/ca_clipped_natural_usgs_impervious_3.parquet.gzip
length of df: 0
 - s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/parquet_files/ca_clipped_natural_usgs_impervious_4.parquet.gzip
length of df: 0
 - s3://ca-climate-index/2b_reproject/natural_systems/ecosystem_condition/usgs/parquet_files/ca_clipped_natural_usgs_impervious_5.parquet.gzip
length of df: 0
 - s3://ca-climate-index/2b_reproject/natur

Unnamed: 0,impervious_surface,geometry,index_right,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON
0,,POINT (-124.13651 41.45499),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
1,,POINT (-124.13606 41.45481),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
2,,POINT (-124.13561 41.45463),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
3,,POINT (-124.13516 41.45445),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
4,,POINT (-124.13471 41.45427),2619,06,023,990100,06023990100,9901,Census Tract 9901,G5020,S,0,1061851402,+40.7517379,-124.2478550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9375,0.0,POINT (-120.00280 39.71763),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926
9376,0.0,POINT (-120.00237 39.71743),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926
9377,0.0,POINT (-120.00194 39.71724),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926
9378,0.0,POINT (-120.00151 39.71705),3216,06,091,010000,06091010000,100,Census Tract 100,G5020,S,2468694582,23299110,+39.5769252,-120.5219926


A couple of print statements to see how many unique entries we have in the impervious surface and census tract columns.
* The counts seem low, possibly because several of the parquet files above loaded with 0 rows.

In [37]:
# Quick look at the impervious_surface column, one value per output line:
# number of distinct values (NaN counted as one), minimum, maximum, and the
# distinct values themselves.
print(
    impervious_surfaces_data['impervious_surface'].nunique(dropna=False),
    impervious_surfaces_data['impervious_surface'].min(),
    impervious_surfaces_data['impervious_surface'].max(),
    impervious_surfaces_data['impervious_surface'].unique(),
    sep='\n',
)

86
0.0
99.0
[nan  0. 22. 36. 31. 37. 13.  5. 20.  1. 11. 25.  2. 12. 21.  3.  8.  6.
 14. 19. 16. 61. 18.  9.  4. 10.  7. 39. 15. 24. 17. 32. 41. 33. 23. 26.
 27. 45. 28. 48. 66. 44. 29. 42. 46. 51. 55. 84. 52. 35. 59. 82. 60. 54.
 43. 47. 49. 40. 56. 38. 93. 98. 95. 87. 75. 88. 96. 94. 92. 85. 78. 83.
 99. 89. 86. 69. 57. 30. 34. 50. 58. 67. 65. 81. 53. 70.]


In [38]:
# How many distinct census tract GEOIDs are present, followed by the GEOIDs.
print(
    impervious_surfaces_data['GEOID'].nunique(dropna=False),
    impervious_surfaces_data['GEOID'].unique(),
    sep='\n',
)

22
['06023990100' '06023010200' '06023010102' '06105000200' '06093000800'
 '06105000102' '06105000101' '06089012400' '06089012500' '06089011803'
 '06089011802' '06089012606' '06089012605' '06089012604' '06089012603'
 '06103000100' '06063000501' '06063000502' '06063000400' '06063000300'
 '06035040600' '06091010000']


In [39]:
# Number of distinct county FIPS codes present (missing values counted as one).
print(impervious_surfaces_data['COUNTYFP'].nunique(dropna=False))

8


## Renaming columns and reading in census data for later use

In [None]:
# Keep only the tract id, geometry and impervious-surface value, and rename
# GEOID to 'tract' so the column can serve as the key in later joins.
impervious_surfaces_columns = (
    impervious_surfaces_data[['GEOID', 'geometry', 'impervious_surface']]
    .rename(columns={'GEOID': 'tract'})
)
impervious_surfaces_columns

In [None]:
# Average the impervious-surface values within each tract; reset_index turns
# 'tract' back into an ordinary column.
grouped_impervious_surfaces = (
    impervious_surfaces_columns
    .groupby('tract')['impervious_surface']
    .mean()
    .reset_index()
)
grouped_impervious_surfaces

In [None]:
# Read the CA census tract/county lookup table from S3.
# The URI gets its own name so `ca_tract_county` only ever refers to the table.
ca_tract_county_uri = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_uri)

# Drop the columns that are not needed downstream.
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry', 'COUNTYFP'])

# Lower-case the column names and every string value; non-string values pass
# through unchanged. Mapping column by column avoids DataFrame.applymap, which
# newer pandas releases deprecate.
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.apply(
    lambda column: column.map(lambda s: s.lower() if isinstance(s, str) else s)
)

ca_tract_county

In [None]:
# Attach the per-tract mean impervious surface to the census tract/county table.
# The tract-level means are merged (not the point-level frame), so each row of
# ca_tract_county appears exactly once in the result instead of once per sample
# point. The left join keeps tracts with no impervious-surface data, as NaN.
impervious_surface_merge = pd.merge(
    ca_tract_county,
    grouped_impervious_surfaces,
    on='tract',
    how='left',
    # The grouped frame has one row per tract by construction; validate makes
    # pandas raise if that ever stops being true.
    validate='many_to_one',
)
impervious_surface_merge