In [1]:
# Standard Python modules
import os, sys
import glob
import numpy as np
import pandas as pd
import xarray as xr
import re

# extras
%matplotlib inline
import metpy.calc as mpcalc
from metpy.units import units
from scipy import stats
import dask.dataframe as dd

# Import my modules
sys.path.append('../modules') # Path to modules
from utils import find_closest_MERRA2_lon_df, find_closest_MERRA2_lon, MERRA2_range, roundPartial

# Render floats in pandas tables with a thousands separator and two decimals
pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
# Locations used throughout the notebook (all end with a trailing slash so
# file names can be appended by plain string concatenation)
path_to_data = '/data/projects/Comet/cwp140/'  # project data root
path_to_out = '../out/'    # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'  # figures

In [3]:
## Read the coastal lat/lon points (N. America West Coast) from a space-separated text file
textpts_fname = '../data/latlon_coast-modified.txt'
df = pd.read_csv(textpts_fname, header=None, sep=' ', names=['latitude', 'longitude'], engine='python')
## the longitudes in the file are multiplied by -1 before use
df['longitude'] *= -1

## snap every point to its closest MERRA2 longitude (one call per row)
df['MERRA2_lon'] = df.apply(find_closest_MERRA2_lon_df, axis=1)

## keep one row per unique (lat, MERRA2 lon) pair; the original row labels are retained
txtpts = (
    pd.DataFrame({'lat': df['latitude'], 'lon': df['MERRA2_lon']})
    .drop_duplicates(subset=['lat', 'lon'])
)
txtpts

Unnamed: 0,lat,lon
0,60.00,-140.00
1,59.50,-139.38
2,59.00,-138.75
3,58.50,-137.50
4,58.00,-136.25
...,...,...
109,30.50,-86.25
111,30.00,-85.62
112,29.50,-85.00
113,30.00,-84.38


In [17]:
## MERRA2 AR-scale dataset (same path the save cell at the end of this notebook writes to)
fname = path_to_data + 'preprocessed/MERRA2/MERRA2_ARScale_US-West.nc'
arscale = xr.open_dataset(fname)

## ERA5 trajectory for one HUC8 watershed; keep only the first start_date
HUC8_ID = 14080107 ## Upper Yampa
# NOTE(review): absolute path into a user home directory; the path_to_data-based
# alternative is kept commented out below
fname = '/home/dnash/comet_data/preprocessed/ERA5_trajectories/final/PRISM_HUC8_{0}.nc'.format(HUC8_ID)
# fname = path_to_data + 'preprocessed/ERA5_trajectories/final/PRISM_HUC8_{0}.nc'.format(HUC8_ID)
ERA5 = xr.open_dataset(fname).isel(start_date=0)

## closest MERRA2 longitude for every trajectory point
new_lst = [find_closest_MERRA2_lon(traj_lon) for traj_lon in ERA5.lon.values]

## trajectory time/lon/lat along a shared "location" dimension; the lon and lat
## coordinates hold the closest MERRA2 lon and the lat rounded with roundPartial(..., 0.5)
t = xr.DataArray(ERA5.time.values, dims=['location'], name='time')
x = xr.DataArray(
    ERA5.lon.values,
    dims=['location'],
    coords={'lon': xr.DataArray(new_lst, dims=['location'])},
    name='traj_lons',
)
y = xr.DataArray(
    ERA5.lat.values,
    dims=['location'],
    coords={'lat': xr.DataArray(roundPartial(ERA5.lat.values, 0.5), dims=['location'])},
    name='traj_lats',
)

# create a new dataset that has the trajectory lat and lons and the closest MERRA2 lat/lons as coords
z = xr.merge([x, y, t])

In [18]:
## Collect every (trajectory position, coastal point position) pair whose
## snapped lon AND lat are exactly equal.
##   i -> position along the trajectory dataset z
##   j -> row position in txtpts
idx_lst = [
    (i, j)
    for i, (traj_lon, traj_lat) in enumerate(zip(z.lon.values, z.lat.values))
    for j, (coast_lon, coast_lat) in enumerate(zip(txtpts.lon.values, txtpts.lat.values))
    if (traj_lon == coast_lon) & (traj_lat == coast_lat)
]

idx_lst

[(32, 65), (33, 65)]

In [19]:
### Can we select the surrounding grids from a specific point?
idx = idx_lst[0]
# idx[0] is the index from the trajectory
# idx[1] is the index from the coastal intersection values

## this is the time of the trajectory when it crosses west coast
time_match = z.sel(location=idx[0]).time.values

# get the location index value where the lat/lon matches the coastal intersection value
coast_pt = txtpts.iloc[idx[1]]
at_coast_pt = (arscale.lat == coast_pt.lat) & (arscale.lon == coast_pt.lon)
matching_locations = arscale.location.where(at_coast_pt, drop=True).values
if matching_locations.size != 1:
    # fail with a readable message instead of the opaque error int() gives on a
    # zero- or multi-element array
    raise ValueError(
        'expected exactly one arscale location at lat={0}, lon={1}; found {2}'.format(
            coast_pt.lat, coast_pt.lon, matching_locations.size))
idx_ds = int(matching_locations.item())

## select the surrounding grid points
# the lower bound is clamped so a match near the start of the location axis
# cannot produce a negative bound
# NOTE(review): whether this slice is positional or label-based depends on
# arscale having a `location` coordinate -- confirm the intended window width
tmp = arscale.sel(location=slice(max(idx_ds - 2, 0), idx_ds + 3))

## select the 12 hours on each side of the time step
sta = time_match - np.timedelta64(12,'h')
sto = time_match + np.timedelta64(12,'h')
print(sta, sto)
# xarray's max() skips NaN for float data, so a missing sample at one location
# or time does not turn the whole window maximum into NaN (which .values.max() would)
arscale_val = tmp.sel(time=slice(sta, sto)).ar_scale.max().item()
print(arscale_val)

## now put those values into the trajectory dataset
ERA5 = ERA5.assign(ar_scale=arscale_val)
    

2000-10-22T16:00:00.000000000 2000-10-23T16:00:00.000000000
2.0


In [20]:
# Display the trajectory time step at which the coastal match was found
time_match

numpy.datetime64('2019-12-11T02:00:00.000000000')

In [18]:
# Display the AR-scale value assigned in the cell above
arscale_val

In [21]:
# NOTE(review): the cells in this notebook that assign arscale_val store a plain
# value/array (the result of `.ar_scale.values...`), which has no `.ar_scale`
# attribute -- this line presumably dates from a version where arscale_val was
# still a Dataset; confirm it still runs after Restart & Run All.
arscale_val.ar_scale.values.max()

3.0

In [None]:

if len(idx_lst) > 0:
    ## take first time the trajectory crosses the coast
    idx = idx_lst[0]
    print(idx)
    ## this is the time of the trajectory when it crosses west coast
    time_match = z.sel(location=idx[0]).time.values
    ## idx[1] is a row position in txtpts (the coastal point list), not a
    ## position along arscale's location dimension, so find the arscale
    ## location whose lat/lon equals that coastal point
    coast_pt = txtpts.iloc[idx[1]]
    at_coast_pt = ((arscale.lat == coast_pt.lat) & (arscale.lon == coast_pt.lon)).values
    matching_locations = np.flatnonzero(at_coast_pt)
    if matching_locations.size == 0:
        raise ValueError(
            'no arscale location at lat={0}, lon={1}'.format(coast_pt.lat, coast_pt.lon))
    ## this is the value of MERRA2 AR scale etc. when the trajectory crosses the coast
    arscale_val = arscale.isel(location=int(matching_locations[0]))
    # now grab the nearest time since ERA5 is hourly and MERRA2 is 3-hourly
    arscale_val = arscale_val.sel(time=time_match, method='nearest').ar_scale.values
    print(arscale_val)
    ## now put those values into the trajectory dataset
    ERA5 = ERA5.assign(ar_scale=arscale_val)

In [24]:
## Build the list of MERRA2 AR-scale text files to read: one file per (lat, lon) row of new_df
## fname example: MERRA_ARCats_38.0_-123.125.txt, i.e. 'MERRA_ARCats_{0}_{1}.txt'.format(lat, lon)
# NOTE(review): new_df is not defined in any cell visible in this notebook --
# presumably a table with 'lat' and 'lon' columns built in a removed cell; confirm
fpath = '/data/downloaded/Reanalysis/MERRA2/ARScale/TimeSeries/'
fname_template = fpath + 'MERRA_ARCats_{0}_{1}.txt'
new_df['filename'] = new_df.apply(
    lambda row: fname_template.format(row['lat'], row['lon']),
    axis=1,
)

filenames = new_df['filename'].values
## Alternative (disabled): read every file in the directory instead, sorted by name:
# filenames = sorted(glob.glob(fpath + 'MERRA_ARCats_*.txt'))
# print(len(filenames))
new_df['filename'].loc[0]

'/data/downloaded/Reanalysis/MERRA2/ARScale/TimeSeries/MERRA_ARCats_59.0_-141.25.txt'

In [26]:
def preprocess_MERRA2_txt_file(fname):
    """Read one MERRA2 AR-scale text file into an xarray Dataset indexed by time.

    The file is parsed as space-delimited text without a header row. Its eight
    columns are named year, month, day, hour, ivt, ar_scale, tIVT and duration;
    the four date columns are combined into a single datetime ``time`` index.

    Parameters
    ----------
    fname : str
        Path to the text file. The last two numbers in the file's base name are
        read as latitude and longitude, in that order
        (e.g. ``MERRA_ARCats_59.0_-141.25.txt`` -> lat 59.0, lon -141.25).

    Returns
    -------
    xarray.Dataset
        Variables ``ivt``, ``ar_scale``, ``tIVT`` and ``duration`` along a
        ``time`` dimension, with scalar ``lat`` and ``lon`` coordinates.

    Raises
    ------
    ValueError
        If fewer than two numbers can be found in the file's base name.
    """
    date_cols = ['year', 'month', 'day', 'hour']

    ## read just one file
    df = pd.read_csv(fname, header=None, names=date_cols + ['ivt', 'ar_scale', 'tIVT', 'duration'], delimiter=' ')
    ## put time info into single column in datetime format
    df['time'] = pd.to_datetime(df[date_cols])
    df = df.drop(date_cols, axis=1) # drop the other columns

    ## get lat and lon values from fname
    # Only the base name is searched, so digits in the directory part
    # (e.g. ".../MERRA2/...") cannot shift which numbers are picked up.
    base_name = os.path.splitext(os.path.basename(fname))[0]
    numbers = re.findall(r"[-+]?\d*\.?\d+", base_name)
    if len(numbers) < 2:
        raise ValueError('cannot parse lat/lon from file name: {0}'.format(fname))
    lat_val, lon_val = float(numbers[-2]), float(numbers[-1])

    # convert to xarray with time as the dimension/index
    ds = df.set_index('time').to_xarray()
    ds = ds.assign_coords(lat=lat_val, lon=lon_val) # reassign lat and lon as coords

    return ds

def dask_2_xarray(ddf, indexname='index'):
    """Copy a dask dataframe into a new xarray Dataset, one variable per column.

    Parameters
    ----------
    ddf : dask dataframe
        Source table; its index and each of its columns are copied.
    indexname : str, optional
        Name of the dataset dimension that receives ``ddf.index`` and along
        which every column is stored. Defaults to ``'index'``.

    Returns
    -------
    xarray.Dataset
        Dataset holding the index under ``indexname`` plus one variable per
        column, each backed by a dask array with computed chunk sizes.
    """
    dataset = xr.Dataset()
    dataset[indexname] = ddf.index

    for column_name in ddf.columns:
        # chunk sizes must be computed so the array has a known length
        column_values = ddf[column_name].to_dask_array().compute_chunk_sizes()
        dataset[column_name] = (indexname, column_values)

    return dataset

def preprocess_MERRA2_txt_file_using_dask(fname, times=None):
    """Read one MERRA2 AR-scale text file into a dask-backed xarray Dataset.

    The timestamps are parsed from ``fname`` itself with pandas; the data
    columns (ivt, ar_scale, tIVT, duration) are read lazily with dask and
    converted through ``dask_2_xarray``.

    Parameters
    ----------
    fname : str
        Path to the space-delimited text file (no header row). The last two
        numbers in the file's base name are read as latitude and longitude, in
        that order (e.g. ``MERRA_ARCats_59.0_-141.25.txt``).
    times : optional
        Ignored. Kept only so existing two-argument calls keep working: the
        timestamps are always re-read from ``fname``, which is also what the
        previous implementation did (it overwrote this argument on entry).

    Returns
    -------
    xarray.Dataset
        Variables ``ivt``, ``ar_scale``, ``tIVT`` and ``duration`` along a
        ``time`` dimension, with ``lat`` and ``lon`` coordinates of length 1
        along a ``location`` dimension.

    Raises
    ------
    ValueError
        If fewer than two numbers can be found in the file's base name.
    """
    colnames = ['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration']
    date_cols = colnames[:4]

    ## get time information from file
    df = pd.read_csv(fname, header=None, names=colnames, delimiter=' ')
    ## put time info into single column in datetime format
    file_times = pd.to_datetime(df[date_cols])

    ## create a dask dataframe
    ddf = dd.read_csv(fname, header=None, names=colnames, delimiter=' ',
                      dtype={"year": int, "month": int, "day": int, "hour": int, "ivt": float, "ar_scale": int, "tIVT": float, "duration": int})

    ddf = ddf.drop(date_cols, axis=1) # drop the other columns
    ## convert to xarray
    ds = dask_2_xarray(ddf)

    ## replace the row-number index with the timestamps and call the dimension "time"
    ds = ds.assign_coords(index=file_times.to_numpy())
    ds = ds.rename({'index': 'time'})

    ## get lat and lon values from fname
    # Only the base name is searched, so digits in the directory part
    # (e.g. ".../MERRA2/...") cannot shift which numbers are picked up.
    base_name = os.path.splitext(os.path.basename(fname))[0]
    numbers = re.findall(r"[-+]?\d*\.?\d+", base_name)
    if len(numbers) < 2:
        raise ValueError('cannot parse lat/lon from file name: {0}'.format(fname))
    lat_val, lon_val = float(numbers[-2]), float(numbers[-1])

    x = xr.DataArray([lon_val], dims=['location'])
    y = xr.DataArray([lat_val], dims=['location'])
    ds = ds.assign_coords(lat=y, lon=x) # reassign lat and lon as coords
    print(len(ds.time), lat_val, lon_val)

    return ds

In [27]:
%%time
## Timestamps parsed from the first file; handed to the per-file reader below
df = pd.read_csv(
    filenames[0],
    header=None,
    names=['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration'],
    delimiter=' ',
)
## put time info into single column in datetime format
times = pd.to_datetime(df[['year', 'month', 'day', 'hour']])

## one dataset per text file, in the order given by filenames
ds_lst = [preprocess_MERRA2_txt_file_using_dask(fname, times) for fname in filenames]

## stack the per-file datasets along the "location" dimension
ds_final = xr.concat(ds_lst, dim="location")
ds_final

117608 59.0 -141.25
117608 59.5 -140.625
117608 60.0 -140.0
117128 60.5 -139.375
117128 61.0 -138.75
117608 58.5 -140.625
117608 59.0 -140.0
117608 59.5 -139.375
117608 60.0 -138.75
117128 60.5 -138.125
117608 58.0 -140.0
117608 58.5 -139.375
117608 59.0 -138.75
117608 59.5 -138.125
117608 60.0 -137.5
117608 57.5 -138.75
117608 58.0 -138.125
117608 58.5 -137.5
117608 59.0 -136.875
117608 59.5 -136.25
117608 57.0 -137.5
117608 57.5 -136.875
117608 58.0 -136.25
117608 58.5 -135.625
117608 59.0 -135.0
117608 56.5 -137.5
117608 57.0 -136.875
117608 57.5 -136.25
117608 58.0 -135.625
117608 58.5 -135.0
117608 56.0 -136.875
117608 56.5 -136.25
117608 57.0 -135.625
117608 57.5 -135.0
117608 58.0 -134.375
117608 55.5 -136.25
117608 56.0 -135.625
117608 56.5 -135.0
117608 57.0 -134.375
117608 57.5 -133.75
117608 55.0 -135.625
117608 55.5 -135.0
117608 56.0 -134.375
117608 56.5 -133.75
117608 57.0 -133.125
117608 54.5 -135.0
117608 55.0 -134.375
117608 55.5 -133.75
117608 56.0 -133.125
117608 56.

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4160 Tasks,517 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 507.12 MiB 0.98 MiB Shape (517, 128568) (1, 128568) Count 4160 Tasks 517 Chunks Type float64 numpy.ndarray",128568  517,

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4160 Tasks,517 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4677 Tasks,517 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 507.12 MiB 0.98 MiB Shape (517, 128568) (1, 128568) Count 4677 Tasks 517 Chunks Type float64 numpy.ndarray",128568  517,

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4677 Tasks,517 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4160 Tasks,517 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 507.12 MiB 0.98 MiB Shape (517, 128568) (1, 128568) Count 4160 Tasks 517 Chunks Type float64 numpy.ndarray",128568  517,

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4160 Tasks,517 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4677 Tasks,517 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 507.12 MiB 0.98 MiB Shape (517, 128568) (1, 128568) Count 4677 Tasks 517 Chunks Type float64 numpy.ndarray",128568  517,

Unnamed: 0,Array,Chunk
Bytes,507.12 MiB,0.98 MiB
Shape,"(517, 128568)","(1, 128568)"
Count,4677 Tasks,517 Chunks
Type,float64,numpy.ndarray


In [28]:
## Write the combined dataset to netCDF, replacing any existing file at that path
# NOTE(review): this is the same path an earlier cell opens as `arscale`;
# overwriting it while that dataset is still open may fail -- confirm
fname = path_to_data + 'preprocessed/MERRA2/MERRA2_ARScale_US-West.nc'
ds_final.to_netcdf(fname, mode='w', format='NETCDF4')