In [1]:
# Standard Python modules
import os, sys
import glob
import numpy as np
import pandas as pd
import xarray as xr
import re

# extras
%matplotlib inline
import metpy.calc as mpcalc
from metpy.units import units
from scipy import stats
import dask.dataframe as dd

# Import my modules
sys.path.append('../modules') # Path to modules
from utils import find_closest_MERRA2_lon_df

pd.options.display.float_format = "{:,.2f}".format # makes it so pandas tables display only first two decimals

In [2]:
# Directories used by this notebook (relative paths are relative to the notebook's working directory)
path_to_data = '/data/projects/Comet/cwp140/'  # base directory; the preprocessed netCDF is written beneath it
path_to_out = '../out/'    # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'  # figures

In [3]:
## Read the text file holding the coastal points along the N. America West Coast
## (space-separated, no header: latitude then longitude)
textpts_fname = '../data/latlon_coast.txt'
df = pd.read_csv(textpts_fname, header=None, sep=' ', names=['latitude', 'longitude'], engine='python')
## flip the sign of the longitudes read from the file
## (presumably they are stored as positive degrees West -- verify against the data file)
df['longitude'] = -df['longitude']

## add a column with the closest MERRA2 longitude for every point
df['MERRA2_lon'] = df.apply(find_closest_MERRA2_lon_df, axis=1)

## build the full path of the time-series file for every point
## fname example: MERRA_ARCats_38.0_-123.125.txt  (MERRA_ARCats_<lat>_<MERRA2_lon>.txt)
fpath = '/data/downloaded/Reanalysis/MERRA2/ARScale/TimeSeries/'
df['filename'] = df.apply(
    lambda row: f"{fpath}MERRA_ARCats_{row['latitude']}_{row['MERRA2_lon']}.txt",
    axis=1,
)

# ## FOR READING ALL THE FILES (alternative to the coastal subset; currently disabled)
# filename_pattern = fpath + 'MERRA_ARCats_*.txt'
#
# filenames = []
# for name in glob.glob(filename_pattern):
#     filenames.append(name)
# # sort filenames so they are in chronological order
# filenames = sorted(filenames)
# print(len(filenames))

filenames = df['filename'].values
## show the first filename as a sanity check
df.loc[0, 'filename']

'/data/downloaded/Reanalysis/MERRA2/ARScale/TimeSeries/MERRA_ARCats_60.0_-140.0.txt'

In [4]:
def preprocess_MERRA2_txt_file(fname):
    """Read one MERRA2 AR-scale text file into an xarray Dataset indexed by time.

    Parameters
    ----------
    fname : str
        Path to a space-delimited, header-less text file whose columns are
        year, month, day, hour, ivt, ar_scale, tIVT and duration, and whose
        file name ends in ``_<lat>_<lon>.txt``
        (e.g. ``MERRA_ARCats_38.0_-123.125.txt``).

    Returns
    -------
    xarray.Dataset
        The ``ivt``, ``ar_scale``, ``tIVT`` and ``duration`` columns along a
        ``time`` dimension, with scalar ``lat`` and ``lon`` coordinates taken
        from the file name.

    Raises
    ------
    ValueError
        If latitude and longitude cannot be parsed from the file name. This is
        checked before the file is read.
    """
    ## get lat and lon values from the file name itself rather than the whole path,
    ## so digits in directory names (e.g. "MERRA2") cannot shift which numbers are picked
    match = re.search(r"_([-+]?\d*\.?\d+)_([-+]?\d*\.?\d+)\.txt$", os.path.basename(fname))
    if match is None:
        raise ValueError(
            "cannot parse latitude/longitude from file name {!r}; "
            "expected a name ending in '_<lat>_<lon>.txt'".format(fname)
        )
    lat_val, lon_val = (float(val) for val in match.groups())

    ## read just one file
    df = pd.read_csv(fname, header=None, names=['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration'], delimiter=' ')
    ## put time info into single column in datetime format
    df['time'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
    df = df.drop(['year', 'month', 'day', 'hour'], axis=1) # drop the other columns

    # convert to xarray
    ds = df.to_xarray()
    ds = ds.assign(index=ds.time.values) # assign time values to index
    ds = ds.drop_vars(['time']) # drop time variable (drop_vars replaces the deprecated Dataset.drop)
    ds = ds.rename({'index':'time'}) # rename index to time
    ds = ds.assign_coords(lat=lat_val, lon=lon_val) # reassign lat and lon as coords

    return ds

def dask_2_xarray(ddf, indexname='index'):
    """Build an xarray Dataset from a dask DataFrame.

    Parameters
    ----------
    ddf : dask DataFrame
        Frame whose index and columns are copied into the Dataset.
    indexname : str, optional
        Name under which the frame's index is stored; it is also the
        dimension name used for every column (default ``'index'``).

    Returns
    -------
    xarray.Dataset
        One variable per column of ``ddf``, each a dask array along
        ``indexname`` whose chunk sizes have been computed.
    """
    out = xr.Dataset()
    out[indexname] = ddf.index
    for column in ddf.columns:
        # dask arrays derived from a dataframe have unknown chunk lengths until they are computed
        column_values = ddf[column].to_dask_array()
        out[column] = (indexname, column_values.compute_chunk_sizes())

    return out

def preprocess_MERRA2_txt_file_using_dask(fname, times=None):
    """Read one MERRA2 AR-scale text file into a dask-backed xarray Dataset.

    Parameters
    ----------
    fname : str
        Path to a space-delimited, header-less text file whose columns are
        year, month, day, hour, ivt, ar_scale, tIVT and duration, and whose
        file name ends in ``_<lat>_<lon>.txt``
        (e.g. ``MERRA_ARCats_38.0_-123.125.txt``).
    times : optional
        Ignored. Timestamps are always rebuilt from the year/month/day/hour
        columns of ``fname``. The parameter is kept only so existing calls
        that pass it continue to work.

    Returns
    -------
    xarray.Dataset
        The ``ivt``, ``ar_scale``, ``tIVT`` and ``duration`` columns as dask
        arrays along a ``time`` dimension, with ``lat`` and ``lon`` coordinates
        of length one along a ``location`` dimension.

    Raises
    ------
    ValueError
        If latitude and longitude cannot be parsed from the file name. This is
        checked before the file is read.
    """
    ## get lat and lon values from the file name itself rather than the whole path,
    ## so digits in directory names (e.g. "MERRA2") cannot shift which numbers are picked
    match = re.search(r"_([-+]?\d*\.?\d+)_([-+]?\d*\.?\d+)\.txt$", os.path.basename(fname))
    if match is None:
        raise ValueError(
            "cannot parse latitude/longitude from file name {!r}; "
            "expected a name ending in '_<lat>_<lon>.txt'".format(fname)
        )
    lat_val, lon_val = (float(val) for val in match.groups())

    columns = ['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration']
    date_columns = ['year', 'month', 'day', 'hour']

    ## get time information from this file (read eagerly with pandas)
    df = pd.read_csv(fname, header=None, names=columns, delimiter=' ')
    ## put time info into single column in datetime format
    file_times = pd.to_datetime(df[date_columns])

    ## create a dask dataframe
    ddf = dd.read_csv(fname, header=None, names=columns, delimiter=' ',
                      dtype={"year": int, "month": int, "day": int, "hour": int, "ivt": float, "ar_scale": int, "tIVT": float, "duration": int})

    ddf = ddf.drop(date_columns, axis=1) # drop the date columns
    ## convert to xarray
    ds = dask_2_xarray(ddf)

    ## replace the integer row index with the timestamps, then call that dimension "time"
    ds = ds.assign(index=file_times.values)
    ds = ds.rename({'index':'time'})

    ## attach lat and lon as length-one coordinates along "location"
    x = xr.DataArray([lon_val], dims=['location'])
    y = xr.DataArray([lat_val], dims=['location'])
    ds = ds.assign_coords(lat=y, lon=x) # reassign lat and lon as coords
    print(len(ds.time), lat_val, lon_val)

    return ds

In [5]:
## get time information from first file
## (stored under its own name so the coastal-point table `df` built earlier is not overwritten)
first_file_df = pd.read_csv(filenames[0], header=None, names=['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration'], delimiter=' ')
## put time info into single column in datetime format
times = pd.to_datetime(first_file_df[['year', 'month', 'day', 'hour']])

## build one dataset per file
## NOTE: the preprocessing function rebuilds the timestamps from each file itself,
## so `times` is passed only to satisfy its signature
ds_lst = []

for fname in filenames:
    ds = preprocess_MERRA2_txt_file_using_dask(fname, times)
    ds_lst.append(ds)

## display the first dataset as a sanity check
ds_lst[0]

117608 60.0 -140.0
117608 59.5 -139.375
117608 59.0 -138.75
117608 58.5 -137.5
117608 58.0 -136.25
117608 57.5 -136.25
117608 57.0 -135.625
117608 56.5 -135.0
117608 56.0 -134.375
117608 55.5 -133.75
117608 55.0 -133.125
117608 54.5 -130.625
117608 54.0 -130.0
117608 53.5 -130.0
117608 53.0 -129.375
111272 52.5 -128.75
128560 52.0 -128.125
128560 51.5 -128.125
128560 51.0 -128.125
128560 50.5 -127.5
128560 50.0 -126.875
128560 49.5 -126.25
128560 49.0 -125.0
117608 48.5 -124.375
117608 48.0 -124.375
117608 47.5 -124.375
117608 47.0 -123.75
117608 46.5 -123.75
117608 46.0 -123.75
117608 45.5 -123.75
117608 45.0 -123.75
117608 44.5 -123.75
128560 44.0 -123.75
117608 43.5 -123.75
117608 43.0 -124.375
117608 42.5 -124.375
117608 42.0 -123.75
117608 41.5 -123.75
117608 41.0 -123.75
117608 40.5 -123.75
117608 40.0 -123.75
117608 39.5 -123.75
117608 39.0 -123.75
117608 38.5 -123.125
117608 38.0 -123.125
117608 37.5 -122.5
117608 37.0 -122.5
117608 36.5 -121.875
117608 36.0 -121.25
117608 35.5

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 918.81 kiB 918.81 kiB Shape (117608,) (117608,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",117608  1,

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 918.81 kiB 918.81 kiB Shape (117608,) (117608,) Count 4 Tasks 1 Chunks Type int64 numpy.ndarray",117608  1,

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 918.81 kiB 918.81 kiB Shape (117608,) (117608,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",117608  1,

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 918.81 kiB 918.81 kiB Shape (117608,) (117608,) Count 4 Tasks 1 Chunks Type int64 numpy.ndarray",117608  1,

Unnamed: 0,Array,Chunk
Bytes,918.81 kiB,918.81 kiB
Shape,"(117608,)","(117608,)"
Count,4 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [7]:
## combine the per-file datasets along the "location" dimension
## NOTE(review): the per-file time axes differ in length (see the lengths printed above),
## so confirm that the way xr.concat aligns them is what is wanted here
ds_final = xr.concat(objs=ds_lst, dim="location")
ds_final

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,548 Tasks,67 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 65.72 MiB 0.98 MiB Shape (67, 128568) (1, 128568) Count 548 Tasks 67 Chunks Type float64 numpy.ndarray",128568  67,

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,548 Tasks,67 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,615 Tasks,67 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 65.72 MiB 0.98 MiB Shape (67, 128568) (1, 128568) Count 615 Tasks 67 Chunks Type float64 numpy.ndarray",128568  67,

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,615 Tasks,67 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,548 Tasks,67 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 65.72 MiB 0.98 MiB Shape (67, 128568) (1, 128568) Count 548 Tasks 67 Chunks Type float64 numpy.ndarray",128568  67,

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,548 Tasks,67 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,615 Tasks,67 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 65.72 MiB 0.98 MiB Shape (67, 128568) (1, 128568) Count 615 Tasks 67 Chunks Type float64 numpy.ndarray",128568  67,

Unnamed: 0,Array,Chunk
Bytes,65.72 MiB,0.98 MiB
Shape,"(67, 128568)","(1, 128568)"
Count,615 Tasks,67 Chunks
Type,float64,numpy.ndarray


In [8]:
## write the combined dataset to a netCDF4 file (mode 'w' overwrites an existing file)
fname = f"{path_to_data}preprocessed/MERRA2/MERRA2_ARScale_US-West.nc"
ds_final.to_netcdf(fname, mode='w', format='NETCDF4')