In [1]:
# Standard Python modules
import os, sys
import glob
import numpy as np
import pandas as pd
import xarray as xr
import re
import datetime

# extras
%matplotlib inline


# Make project-local modules importable (none are imported in the cells shown here).
# The path is relative, so it is resolved against the kernel's working directory.
sys.path.append('../modules') # Path to modules

In [2]:
# Project directories used throughout the notebook.

# project data root (preprocessed files are read from / written to subfolders of it)
path_to_data = '/data/projects/Comet/cwp140/'

# output files (numerical results, intermediate datafiles) -- read & write
path_to_out = '../out/'

# figures
path_to_figs = '../figs/'

In [3]:

# Annual Rutz AR catalog files (MERRA2); the wildcard matches years 202x
filename_pattern = '/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_202*.nc'

# glob returns matches in arbitrary order; sorting the year-stamped paths
# puts the files in chronological order
filenames = sorted(glob.glob(filename_pattern))
print(len(filenames))

5


In [4]:
# Eyeball the matched paths to confirm which annual files were picked up
print(filenames)

['/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_2020.nc', '/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_2021.nc', '/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_2022.nc', '/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_2023.nc', '/data/downloaded/Reanalysis/AR_Catalogs/Rutz/MERRA2/AnnualFiles/Rutz_ARCatalog_MERRA2_2024.nc']


In [5]:
def pull_latlons_ARscale():
    """Return the lat/lon extent of the preprocessed ERA5 AR-scale grid.

    Opens the WY2023 AR-scale file under ``path_to_data`` and reads the
    minimum and maximum of its ``lat`` and ``lon`` coordinates.

    Returns
    -------
    tuple
        ``(latmin, latmax, lonmin, lonmax)``, each the result of calling
        ``.min()`` / ``.max()`` on the corresponding coordinate.
    """
    # pull lat and lon values from preprocessed AR scale data
    fname = path_to_data + 'preprocessed/ARScale_ERA5/ERA5_ARScale_WY2023.nc'

    # The context manager closes the file once the extents are read; the
    # reductions are computed inside the block, so the returned values do
    # not need the file to remain open.
    with xr.open_dataset(fname) as ds1:
        latmin = ds1.lat.min()
        latmax = ds1.lat.max()
        lonmin = ds1.lon.min()
        lonmax = ds1.lon.max()

    return latmin, latmax, lonmin, lonmax

def preprocess_Rutz_MERRA2(fname, latmin=15., latmax=60., lonmin=-127., lonmax=-80.):
    """Load one Rutz MERRA2 AR catalog file and subset it to a lat/lon box.

    The source file stores its time axis as separate calendar components
    (``cal_year``, ``cal_mon``, ``cal_day``, ``cal_hour``). These are combined
    into a single datetime coordinate, and the ``ARs`` variable is re-wrapped
    in a new dataset with dimensions ``(time, lat, lon)`` before subsetting.

    Parameters
    ----------
    fname : str
        Path to the catalog netCDF file.
    latmin, latmax, lonmin, lonmax : float, optional
        Bounds of the label-based ``lat`` / ``lon`` slices. The defaults are
        the box that was previously hard-coded (15 to 60, -127 to -80); other
        extents, e.g. the values returned by ``pull_latlons_ARscale()``, can
        be passed instead.

    Returns
    -------
    xarray.Dataset
        Dataset holding the variable ``AR`` on ``(time, lat, lon)``,
        restricted to the requested box.
    """
    # Read everything that is needed while the file is open, then let the
    # context manager release the handle (the original never closed it).
    with xr.open_dataset(fname) as src:
        # int cast mirrors the per-element int() conversion used previously
        calendar = pd.DataFrame({'year': src.cal_year.values,
                                 'month': src.cal_mon.values,
                                 'day': src.cal_day.values,
                                 'hour': src.cal_hour.values}).astype(int)
        # assumes ARs is ordered (time, latitude, longitude) -- TODO confirm
        data_array = src['ARs'].values
        lons = src.longitude.values
        lats = src.latitude.values

    # Assemble timestamps from the year/month/day/hour columns in one
    # vectorized call instead of building datetimes row by row.
    dates = pd.to_datetime(calendar).values

    # put into a dataset
    var_dict = {'AR': (['time', 'lat', 'lon'], data_array)}
    ds = xr.Dataset(var_dict,
                    coords={'time': (['time'], dates),
                            'lat': (['lat'], lats),
                            'lon': (['lon'], lons)})

    # NOTE(review): slice(min, max) selects nothing if a coordinate is stored
    # in decreasing order -- confirm lat/lon are ascending in the source files.
    ds = ds.sel(lat=slice(latmin, latmax), lon=slice(lonmin, lonmax))

    return ds

In [6]:
%%time
# preprocess every annual file, then merge the pieces using their coordinates
ds_lst = [preprocess_Rutz_MERRA2(path) for path in filenames]

ds_final = xr.combine_by_coords(ds_lst)
ds_final

CPU times: user 1.18 s, sys: 23.4 s, total: 24.6 s
Wall time: 44 s


In [7]:
## write the combined, subsetted catalog to a single NetCDF4 file
fname = path_to_data + 'preprocessed/MERRA2/MERRA2_Rutz_latlon_2020-2024.nc'
ds_final.to_netcdf(
    path=fname,
    mode='w',
    format='NETCDF4',
)