# Collect AORC v1.0 at lat/lon Points


In [2]:
%pip install dask[distributed] zarr xarray pandas s3fs kerchunk scikit-learn -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
import dask
import zarr
import numpy
import xarray
import pyproj
import pandas
from s3fs import S3FileSystem
from dask.distributed import Client, progress
from kerchunk.combine import MultiZarrToZarr
from sklearn.metrics import pairwise_distances_argmin


import pickle
from datetime import datetime, timedelta


import aorc1

Initiate the Dask client. This will enable us to parallelize our computations.

In [4]:
# use a try accept loop so we only instantiate the client
# if it doesn't already exist.
try:
    print(client.dashboard_link)
except:    
    # The client should be customized to your workstation resources.
    client = Client(n_workers=8) # per worker
    print(client.dashboard_link)


/user/castronova/proxy/8787/status


## Load AORC V1.0 from AWS

In [5]:
%%time
ds = aorc1.load_aorc_dataset('2010', '01', '01')

CPU times: user 4.94 s, sys: 297 ms, total: 5.24 s
Wall time: 8.47 s


In [38]:
training_path = "../Snow-Extrapolation/data/RegionTrain_SCA.pkl"
with open(training_path, 'rb') as f:
    region_train = pickle.load(f)

pts = []
key = 'N_Sierras'
region_train[key]['pt'] = list(zip(region_train[key].Long, region_train[key].Lat))
#region_train[key].loc[region_train[key].pt.unique()
#pts.extend(list(region_train[key]['pt'].unique()))


In [50]:
region_train[key].pt.unique()

array([(-121.9394134663883, 41.16664739027599),
       (-120.6188899987326, 39.675880337476684),
       (-120.87940143112729, 39.786416508865145),
       (-121.78669986808801, 40.78005174338435),
       (-120.17871550951399, 41.99314916228401),
       (-121.3195759203458, 39.81402285325959)], dtype=object)

In [48]:
len(region_train[key].index.unique())

6

In [59]:
# open locations
#df_meta = pandas.read_csv('../Snow-Extrapolation/data/PreProcessed/ground_measures_metadata.csv')
training_path = "../Snow-Extrapolation/data/RegionTrain_SCA.pkl"

with open(training_path, 'rb') as f:
    region_train = pickle.load(f)

pts = []
regions = []
for key in ['N_Sierras', 'S_Sierras_Low', 'S_Sierras_High']:
    region_train[key]['pt'] = list(zip(region_train[key].Long, region_train[key].Lat))
    region_train[key]['region'] = key
    region_train = region_train[key].pt.unique()
    
    regions.append(region_train[key])
    
    pts.extend()
    
#pts = list(set(pts))

In [70]:
len(regions[0].drop_duplicates(subset='pt'))

6

In [75]:
regions[0].region.unique()

array(['N_Sierras'], dtype=object)

In [71]:
len(regions[1].drop_duplicates(subset='pt'))

569

In [76]:
regions[1].region.unique()

array(['S_Sierras_Low'], dtype=object)

In [72]:
len(regions[2].drop_duplicates(subset='pt'))

324

In [77]:
regions[2].region.unique()

array(['S_Sierras_High'], dtype=object)

In [5]:
lats = df_meta.latitude.values
lons = df_meta.longitude.values

In [6]:
# function to collect all indexes

@dask.delayed
def extract_dask(search_points, all_points, final_shape):
    index = pairwise_distances_argmin(X=search_points,
                                      Y=all_points)
    i0, j0 = numpy.unravel_index(index, (final_shape))
    return(i0, j0)
#    return ds.isel(x=j0, y=i0).squeeze()

In [7]:
%%time

points = numpy.array(list(zip(lons, lats)))

# batch index collection using dask
pt_groups = numpy.array_split(numpy.array(points), 100)
all_pts = numpy.c_[ds['lon'].values.ravel(), ds['lat'].values.ravel()]
final_shape = ds['lon'].shape


print('scattering...', end='', flush=True)
all_pts_scattered = client.scatter(all_pts)
print('done')

futures = []
for grp in pt_groups:
    futures.append(extract_dask(grp, all_pts_scattered, final_shape)) 
    

scattering...done
CPU times: user 420 ms, sys: 243 ms, total: 663 ms
Wall time: 669 ms


In [8]:
%%time
results = dask.compute(futures)

CPU times: user 1.63 s, sys: 207 ms, total: 1.84 s
Wall time: 10.6 s


In [9]:
# put the x,y coordinates for the matching cells into lists
i_locs = []
j_locs = []
for grp in results[0]:
    num_elements = len(grp[0])
    for idx in range(0, num_elements):
        i_locs.append(grp[0][idx])
        j_locs.append(grp[1][idx])

In [20]:
# function to collect variables for time range


@dask.delayed
def get_data_dask(i_locs, j_locs, year='2010', month='01', day='01'):
    ds = aorc1.load_aorc_dataset(year, month, day)
    precip = ds.isel(x=i_locs, y=j_locs).squeeze().RAINRATE
    
    
    with open(f'{year}{month}{day}.pkl', 'wb') as f:
        pickle.dump(precip.values, f)
    
    return datetime(int(year), int(month), int(day)),
#            'precip': precip.values}


def get_data_daskbag(args):
    i_locs = args[0]
    j_locs = args[1]
    dt = args[2]
    
    # get the date parts
    month = f'{dt.month:02}'    
    day = f'{dt.day:02}'
    year = f'{dt.year:04}'

    ds = aorc1.load_aorc_dataset(year, month, day)
    
    precip = ds.isel(x=i_locs, y=j_locs) #.squeeze().RAINRATE
    precip = precip.RAINRATE.groupby('time.dayofyear').sum() * 24 * 3600
    pcp_df = precip.to_dataframe().reset_index()
    
    pcp_df['date'] = dt
    pcp_df = pcp_df[['lat', 'lon', 'RAINRATE', 'date']]
    pcp_df.rename(columns={'RAINRATE': 'RAINRATE [mm]'}, inplace=True)
    pcp_df.set_index('date', inplace=True)
    pcp_df.to_csv(f'{year}{month}{day}.csv')
    
    return f'{year}{month}{day}.csv'


In [21]:
%%time 

# isolate the lat/lon points that we're interested in
ind_x = xarray.DataArray(i_locs, dims=["pt"])
ind_y = xarray.DataArray(j_locs, dims=["pt"])


CPU times: user 232 µs, sys: 0 ns, total: 232 µs
Wall time: 238 µs


In [22]:
%%time

# batch variable collection

t = datetime(2010,1,1)
et = datetime(2010,2,1)

input_params = []
while t <= et:
    input_params.append([ind_x, ind_y, t])
    t += timedelta(days=1)
    
import dask.bag as db  
b = db.from_sequence(input_params, npartitions=6)
b = b.map(get_data_daskbag)


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.63 ms


In [23]:
%%time

results = b.compute()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 14.2 s, sys: 1.51 s, total: 15.7 s
Wall time: 2min 39s


In [None]:
results