__get_WSEs__</br>
<img src="http://static1.squarespace.com/static/530979d9e4b04bff4a3aadf5/t/5446c34ce4b0e5a2c7ff2614/1413923661373/DewberryLogo_RGB.png?format=1500w" width="40%" align='right'/>
PYTHON 3.6</br></br>
__Overview: Retrieve water surface elevations from tifs and output to a single CSV__</br></br>
Updated 2018-12-14</br>
by Chris Maderia: cmaderia@dewberry.com</br>
by Stephen Duncan: sduncan@dewberry.com

# Import Python Libraries

In [1]:
import math
import os
import sys
import time
from io import BytesIO

import boto3
import geopandas as gpd
import numpy as np
import pandas as pd
from osgeo import gdal

%matplotlib inline

In [11]:
# Alias for the GeoDataFrame class; used as the annotation of the
# `structures` argument of getWSEfromS3.
shapefile = gpd.GeoDataFrame
# Module-level S3 resource; the signature of getRasData refers to s3.Object,
# so this cell must run before that function is defined.
s3 = boto3.resource('s3')

# Functions

In [2]:
def tprint(stime: float) -> None:
    '''Print the time elapsed since a start time.

    stime: start timestamp in seconds, as returned by time.time().

    Prints "Total Time: <elapsed> seconds" with six decimal places.
    '''
    elapsed = time.time() - stime
    print(f'Total Time: {elapsed:0.6f} seconds')

In [3]:
# https://stackoverflow.com/questions/2130016/splitting-a-list-into-n-parts-of-approximately-equal-length
def chunkIt(seq: list, num: int):
    '''Divide data into chunks - this is only for multiprocessing.

    Splits seq into num contiguous slices whose lengths differ by at most one.

    seq: any sliceable sequence that supports len().
    num: number of chunks wanted; converted with int() and must be >= 1.

    Returns a list of num slices of seq, or an empty list when seq is empty.
    Raises ValueError when num is smaller than 1 (a negative value would
    otherwise never terminate).
    '''
    parts = int(num)
    if parts < 1:
        raise ValueError('num must be at least 1, got {!r}'.format(num))
    total = len(seq)
    if total == 0:
        return []
    # Integer boundaries instead of an accumulated float: repeated float
    # addition can stop just short of len(seq) and produce an extra chunk.
    bounds = [total * i // parts for i in range(parts + 1)]
    return [seq[lo:hi] for lo, hi in zip(bounds, bounds[1:])]

In [12]:
def getRasData(s3path:s3.Object):
    '''Load a raster stored on S3 into GDAL's in-memory file system.

    s3path: object whose get()['Body'].read() yields the raster bytes.

    Returns a tuple (band 1, geotransform, dataset).
    '''
    # Fixed virtual path: every call writes to the same in-memory file name.
    vsimem_path = "/vsimem/wse.tif"
    raw_bytes = s3path.get()['Body'].read()
    gdal.FileFromMemBuffer(vsimem_path, raw_bytes)
    dataset = gdal.Open(vsimem_path)
    band = dataset.GetRasterBand(1)
    geotransform = dataset.GetGeoTransform()
    return band, geotransform, dataset

In [5]:
def get_files(bucketname:str, prefixname:str, textstring:str):
    '''For navigating "folder" structures in bucket, Get list of files from bucket/prefix in S3
    https://stackoverflow.com/questions/35803027/retrieving-subfolders-names-in-s3-bucket-from-boto3

    bucketname: name of the S3 bucket to list.
    prefixname: key prefix used to filter the listing.
    textstring: substring a key must contain to be kept.

    Returns a list of "bucket:key" strings for every key under the prefix that
    contains textstring and does not contain 'xml'. Prints an alert when the
    prefix holds no objects at all.
    '''
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(name=bucketname)
    wselist = []
    prefix_is_empty = True
    for obj in bucket.objects.filter(Prefix=prefixname):
        prefix_is_empty = False
        # Test the key itself, not str(obj): the repr also contains the class
        # and bucket names, which could match or exclude by accident.
        key = obj.key
        if textstring in key and 'xml' not in key:
            wselist.append('{0}:{1}'.format(bucket.name, key))
    if prefix_is_empty:
        # Report the bucket name rather than the Bucket object's repr.
        print("ALERT", "No file in {0}/{1}".format(bucket.name, prefixname))
    return wselist

In [6]:
def query(x:float, y:float, gt:"Sequence[float]", rb:"gdal.Band") -> float:
    '''Queries one specific cell in the rasterband given an x, y in the geotransform

    x, y: coordinates expressed in the raster's coordinate system.
    gt:   geotransform; gt[0] and gt[3] are used as the origin, gt[1] and
          gt[5] as the pixel sizes. gt[2] and gt[4] are not used.
    rb:   object exposing ReadAsArray(xoff, yoff, xsize, ysize).

    Returns element [0][0] of the 1x1 window read at the computed column/row.
    '''
    # floor() rather than int(): int() truncates toward zero, which would map
    # points up to one pixel left of / above the origin onto column/row 0.
    px = math.floor((x - gt[0]) / gt[1])
    py = math.floor((y - gt[3]) / gt[5])
    # NOTE(review): offsets outside the raster are passed through unchanged;
    # presumably ReadAsArray fails for them - confirm the desired handling.
    return rb.ReadAsArray(px, py, 1, 1)[0][0]

In [9]:
def s3Attributes(s3path:str, replace_str:str='', rtype=''):
    '''Creates the S3 object to write or read from AWS bucket

    s3path:      string of the form "<bucket>:<key>" (the format get_files returns).
    replace_str: text removed from the key when deriving the name, e.g. '.tif'.
    rtype:       'NAME', 'PARENT' or 'S3PATH' to get a single item back;
                 any other value returns the tuple (parent, name, s3 object).

    Raises IndexError when s3path contains no ':' separator.
    '''
    # Split on the first colon only so that keys containing ':' stay whole.
    pieces = s3path.split(':', 1)
    parent, key = pieces[0], pieces[1]
    # NOTE(review): the split is on a double slash, so ordinary keys keep their
    # full prefix in the name - confirm whether a basename was intended.
    name = key.split('//')[-1].replace(replace_str, '')
    if rtype == 'NAME':
        return name
    if rtype == 'PARENT':
        return parent
    # The boto3 resource is only built when the S3 object is actually needed.
    s3_object = boto3.resource('s3').Object(parent, key)
    if rtype == 'S3PATH':
        return s3_object
    return parent, name, s3_object

In [26]:
def getWSEfromS3(list_of_wses:list,structures:shapefile,bldgidfield:str,mod_value:int=False):
    '''For a given GeoDataFrame and list of s3 Tif paths, a list of dataframes is returned to the user.
    Each Dataframe contains the Tif value from the intersect and the Building x,y location.

    list_of_wses: "bucket:key" strings, one per tif.
    structures:   frame with a 'geometry' column of points (objects with .x and .y)
                  and a column named by bldgidfield.
    bldgidfield:  name of the id column; it becomes the index of every result.
    mod_value:    when truthy, progress is printed every mod_value rasters.

    Returns one DataFrame per tif, indexed by bldgidfield, with a single
    column named after the tif (its key with '.tif' removed).
    '''
    start_time, dflist = time.time(), []
    # Resolve every path up front (one call per path) so a malformed entry
    # fails before any raster is downloaded.
    attributes = [s3Attributes(wse, '.tif') for wse in list_of_wses]
    # Ids and coordinates are the same for every raster: extract them once,
    # by position, instead of per-row label lookups inside the raster loop.
    building_ids = list(structures[bldgidfield])
    coords = [(point.x, point.y) for point in structures['geometry']]
    for i, (_, s3_name, s3_obj) in enumerate(attributes):
        if mod_value and i % mod_value == 0:
            print(f' - Progress at {i}: {(time.time()-start_time)/60:0.2f} Minutes')
        # `ras` stays bound while the band is read - presumably the dataset
        # must outlive its band; confirm before dropping it.
        rb, gt, ras = getRasData(s3_obj)
        results = [[building_id, query(x, y, gt, rb)]
                   for building_id, (x, y) in zip(building_ids, coords)]
        df_temp = pd.DataFrame(results, columns=[bldgidfield, s3_name])
        df_temp.set_index(bldgidfield, inplace=True)
        dflist.append(df_temp)
    print(f'Total Time: {(time.time()-start_time)/60:0.2f} Minutes')
    return dflist

# Read in structures

In [15]:
# Shapefile of structure locations; this is a Windows drive path, so adjust
# it for the machine the notebook runs on.
structures_path = r'T:\CCSI\TECH\FEMA\2018_SO4_Innovation\AAL_Comps\LEVEE_AREAS\AR_LA_MS\structures\market_basket\all_risks_ucmb_MarketBasket_0_ARLAMS_REPROJ.shp'
# Load the shapefile into a GeoDataFrame.
gdf = gpd.read_file(structures_path)
# Preview the first two rows.
gdf.head(2)

Unnamed: 0,accntnum,location,BLDG_DED,BLDG_LIMIT,CNT_DED,CNT_LIMIT,STATE,POSTCODE,COUNTRY,LON,...,CONSTR_COD,NUM_STORIE,YEAR_BUILT,BasementFi,FIRST_FLOO,BASE_FLOOD,elev_ft,LEVEED_ID,FC_SYSTEM_,geometry
0,0-LA00972988,LA00972988,5000,105352,3000,52103,LA,71355,US,-91.884948,...,2,1,1934,0,0,0,40.954913,4406000000.0,4405001000.0,POINT (1284806.470806942 2922399.00763996)
1,0-LA00972988,LA00972988,5000,105352,3000,52103,LA,71355,US,-91.884948,...,2,1,1934,0,0,0,40.954913,5906000000.0,5905000000.0,POINT (1284806.470806942 2922399.00763996)


# Identify Amazon S3 Location of Data

In [23]:
# S3 location of the runs to read
bucket = 'probmodelingrepository'
prefix = 'AR-LA-MS-River/ProductionRuns/outputs/BaseAR'
search_str = 'WSE_'
# Pass search_str itself so the search and the message below cannot drift apart.
s3_wses = get_files(bucket, prefix, search_str)
print(f'Found {len(s3_wses)} "{search_str}" Tifs')

Found 300 "WSE_" Tifs


# Get and read values from the tifs, then concatenate the results into one dataframe

In [None]:
# Column of gdf that identifies each structure; getWSEfromS3 uses it as the
# index of every DataFrame it returns.
bldgidfield = 'accntnum'
# One DataFrame per tif; progress is printed every 15 rasters.
dflist = getWSEfromS3(s3_wses,gdf,bldgidfield,mod_value=15)

 - Progress at 0: 0.09 Minutes
 - Progress at 15: 2.18 Minutes
 - Progress at 30: 4.28 Minutes
 - Progress at 45: 6.46 Minutes
 - Progress at 60: 8.48 Minutes
 - Progress at 75: 10.52 Minutes
 - Progress at 90: 12.57 Minutes
 - Progress at 105: 14.83 Minutes
 - Progress at 120: 17.03 Minutes
 - Progress at 135: 19.28 Minutes
 - Progress at 150: 21.34 Minutes
 - Progress at 165: 23.43 Minutes
 - Progress at 180: 25.58 Minutes
 - Progress at 195: 27.79 Minutes
 - Progress at 210: 30.35 Minutes
 - Progress at 225: 32.75 Minutes
 - Progress at 240: 35.00 Minutes
 - Progress at 255: 37.12 Minutes
 - Progress at 270: 39.31 Minutes
 - Progress at 285: 41.57 Minutes


In [None]:
# Every frame in dflist holds one tif's column over the same building-id index,
# so join them side by side (axis=1). The default axis=0 would stack them into
# a mostly-NaN frame with one block of rows per tif.
df_full = pd.concat(dflist, axis=1)

In [None]:
# Preview the first rows of the combined frame.
df_full.head()