In [1]:
# Standard Python modules
import os, sys
import glob
import numpy as np
import pandas as pd
import xarray as xr
import re
import datetime

# extras
%matplotlib inline


# Import my modules
sys.path.append('../modules') # Path to modules
from utils import find_closest_MERRA2_lon_df, MERRA2_range

In [2]:
# Root directory of the project data; the preprocessed netCDF output of this
# notebook is written underneath it (see the "save file" cell).
path_to_data = '/data/projects/Comet/cwp140/' 
path_to_out  = '../out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'      # figures

In [3]:
# Glob pattern matching the Rutz AR-catalog (MERRA2) netCDF files
filename_pattern = '/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_*.nc'

# Collect every match and order the paths by name; the downstream
# concatenation relies on this ordering being chronological.
filenames = sorted(glob.glob(filename_pattern))
print(len(filenames))

44


In [4]:
## Read the text file of coastal points along the N. America West Coast.
## The file is space-separated with no header: latitude first, longitude second.
textpts_fname = '../data/latlon_coast-modified.txt'
df = pd.read_csv(textpts_fname, header=None, sep=' ', names=['latitude', 'longitude'], engine='python')
## flip the sign of the longitudes
df['longitude'] = df['longitude'].mul(-1)

## closest MERRA2 longitude for every point (row-wise helper from utils)
df['MERRA2_lon'] = df.apply(find_closest_MERRA2_lon_df, axis=1)

## surrounding grid cells for every point (row-wise helper from utils)
df['MERRA2_lons'] = df.apply(MERRA2_range, axis=1)

## Each 'MERRA2_lons' entry is indexed as [0] -> latitudes, [1] -> longitudes.
## Split them apart, flatten into one table of pairs, and drop repeated pairs.
lat_lst = [cells[0] for cells in df['MERRA2_lons']]
lon_lst = [cells[1] for cells in df['MERRA2_lons']]

d = {
    'lat': np.concatenate(lat_lst),
    'lon': np.concatenate(lon_lst),
}

new_df = pd.DataFrame(d).drop_duplicates(subset=['lat', 'lon'])

## Both selectors share the 'location' dimension so that they are used
## together as a list of (lat, lon) points rather than as an outer product.
x = xr.DataArray(new_df['lon'].values, dims=['location'])
y = xr.DataArray(new_df['lat'].values, dims=['location'])

In [5]:

def preprocess_Rutz_MERRA2(fname, x, y):
    """Load one Rutz AR-catalog file and keep only the requested grid points.

    Parameters
    ----------
    fname : str
        Path of a file readable by ``xr.open_dataset``. The file must provide
        the variables ``cal_year``, ``cal_mon``, ``cal_day``, ``cal_hour``,
        ``ARs``, ``IVT``, ``longitude`` and ``latitude``.
    x : xr.DataArray
        Longitudes of the points to extract (passed as ``lon=`` to ``.sel``).
    y : xr.DataArray
        Latitudes of the points to extract (passed as ``lat=`` to ``.sel``).

    Returns
    -------
    xr.Dataset
        Dataset with variables ``AR`` and ``IVT`` on a ``time`` coordinate
        built from the calendar variables, sampled at the nearest grid cell
        to every requested (lon, lat) pair.
    """
    # Read everything that is needed while the file is open, then let the
    # context manager close it so no file handle is leaked per call.
    with xr.open_dataset(fname) as src:
        # Calendar components; cast to integers before assembling timestamps
        calendar = pd.DataFrame({
            'year': src.cal_year.values,
            'month': src.cal_mon.values,
            'day': src.cal_day.values,
            'hour': src.cal_hour.values,
        }).astype('int64')

        data_array = src['ARs'].values
        IVT = src['IVT'].values
        lons = src.longitude.values
        lats = src.latitude.values

    # Vectorised assembly of timestamps from the year/month/day/hour columns
    dates = pd.to_datetime(calendar).values

    # Rebuild a dataset whose arrays are labelled (time, lat, lon) and whose
    # coordinates are indexable by .sel
    var_dict = {'AR': (['time', 'lat', 'lon'], data_array),
                'IVT': (['time', 'lat', 'lon'], IVT)}
    ds = xr.Dataset(var_dict,
                    coords={'time': (['time'], dates),
                            'lat': (['lat'], lats),
                            'lon': (['lon'], lons)})

    # Nearest-neighbour lookup of every requested point
    ds = ds.sel(lon=x, lat=y, method='nearest')

    return ds

In [6]:
%%time
# One point-subset dataset per input file, in the order of `filenames`
ds_lst = [preprocess_Rutz_MERRA2(fname, x, y) for fname in filenames]

# Join the per-file datasets end to end along the time dimension
ds_final = xr.concat(ds_lst, dim="time")
ds_final

CPU times: user 2min 2s, sys: 10min 36s, total: 12min 38s
Wall time: 45min 59s


In [7]:
## Write the concatenated dataset to disk as NETCDF4, replacing any existing file
fname = os.path.join(path_to_data, 'preprocessed/MERRA2/MERRA2_Rutz_US-West.nc')
ds_final.to_netcdf(path=fname, mode='w', format='NETCDF4')

In [3]:
## Sanity check: re-open the file written above and display its contents.
## Uses `xr` and `path_to_data` from the import and configuration cells at the
## top of the notebook (run those first when starting from a fresh kernel),
## so the path is defined in exactly one place.
fname = path_to_data + 'preprocessed/MERRA2/MERRA2_Rutz_US-West.nc'
ds = xr.open_dataset(fname)
ds