In [1]:
# This notebook is a trial to mask raw Cordex data and extract timeseries

In [2]:
# navigate to correct location to access modules

In [3]:
cd /g/data/mn51/users/gt3409/plotting_maps/

/g/data/mn51/users/gt3409/plotting_maps


In [4]:
from acs_area_statistics import acs_regional_stats
import xarray as xr
import geopandas as gpd
import regionmask
from glob import glob
import numpy as np

import pandas as pd


In [5]:
# import dask
# from dask.distributed import Client, LocalCluster
# dask.config.set({'distributed.worker.daemon': False})
# # cluster = LocalCluster(n_workers=14, processes=True, threads_per_worker=1)
# client = Client()  # start distributed scheduler locally.
# client
# # client.cluster.adapt(minimum=0, maximum=24)

In [8]:
shapefile_list = ["aus_local_gov",
                  "aus_states_territories",
                  "australia", 
                  "broadacre_regions", 
                  "ncra_regions", 
                  "NCRA_regions_coastal_waters_GDA94", 
                  "nrm_regions", ]

name_dict = {"aus_local_gov":"LGA_NAME22", 
             "aus_states_territories":"STE_NAME21",
             "australia": "AUS_NAME21",
             "broadacre_regions": "name",
             "ncra_regions": "regionname", 
             "NCRA_regions_coastal_waters_GDA94": "regionname",
             "nrm_regions":"SubClusNm",
            }


abbr_dict = {"aus_local_gov":"LGA_CODE22", 
             "aus_states_territories":"ABBREV",
             "australia": "AUS_CODE21",
             "broadacre_regions": "aagis",
             "ncra_regions": "short_name", 
             "NCRA_regions_coastal_waters_GDA94": "NCRA",
             "nrm_regions": "SubClusAb",
            }

def get_regions(shapefiles):
    """
    This function takes a list of names of shape files from ia39 and
    returns a combined regionmask.
    
    Parameters
    -----------
    name: str
        one of "aus_local_gov", "aus_states_territories", "australia", 
        "nrm_regions", "ncra_regions","broadacre_regions",
        "NCRA_regions_coastal_waters_GDA94"

    Returns
    -------
    geopandas dataframe
    
    """
    gdfs = {}
    PATH = "/g/data/ia39/aus-ref-clim-data-nci/shapefiles/data"
    
    for i, shapefile in enumerate(shapefiles):
        gdfs[i] = gpd.read_file(glob(f"{PATH}/{shapefile}/*.shp")[0]).rename(columns = {name_dict[shapefile]:"NAME", abbr_dict[shapefile]:"abbrevs"}).to_crs(crs = "GDA2020")
    gdf = pd.concat(gdfs)
    gdf.index = np.arange(0, len(gdf))
    return regionmask.from_geopandas(gdf, names="NAME", abbrevs="abbrevs", name= "-".join(shapefiles), overlap=True) 


In [9]:
PATH = "/g/data/ia39/aus-ref-clim-data-nci/shapefiles/data"
name = "aus_local_gov"
gdf = gpd.read_file(f"{PATH}/{name}/{name}.shp")
gdf

Unnamed: 0,LGA_CODE22,LGA_NAME22,STE_CODE21,STE_NAME21,AREASQKM,LOCI_URI21,SHAPE_Leng,SHAPE_Area,geometry
0,10050,Albury,1,New South Wales,305.6386,https://linked.data.gov.au/dataset/asgsed3/LGA...,1.321768,0.030560,"POLYGON ((146.86566 -36.07292, 146.86512 -36.0..."
1,10180,Armidale,1,New South Wales,7809.4406,https://linked.data.gov.au/dataset/asgsed3/LGA...,6.034583,0.732825,"POLYGON ((152.38816 -30.52639, 152.38812 -30.5..."
2,10250,Ballina,1,New South Wales,484.9692,https://linked.data.gov.au/dataset/asgsed3/LGA...,1.511121,0.044843,"MULTIPOLYGON (((153.57106 -28.87381, 153.57106..."
3,10300,Balranald,1,New South Wales,21690.7493,https://linked.data.gov.au/dataset/asgsed3/LGA...,11.489912,2.115528,"POLYGON ((143.00433 -33.78164, 143.01538 -33.7..."
4,10470,Bathurst,1,New South Wales,3817.8645,https://linked.data.gov.au/dataset/asgsed3/LGA...,5.395114,0.370149,"POLYGON ((149.84877 -33.52784, 149.84864 -33.5..."
...,...,...,...,...,...,...,...,...,...
542,74660,West Arnhem,7,Northern Territory,49675.0342,https://linked.data.gov.au/dataset/asgsed3/LGA...,23.235319,4.130572,"MULTIPOLYGON (((132.99069 -11.07872, 132.99049..."
543,74680,West Daly,7,Northern Territory,14069.6980,https://linked.data.gov.au/dataset/asgsed3/LGA...,7.556529,1.178262,"MULTIPOLYGON (((129.69308 -14.80637, 129.69233..."
544,79399,Unincorporated NT,7,Northern Territory,19777.3595,https://linked.data.gov.au/dataset/asgsed3/LGA...,19.180110,1.648842,"MULTIPOLYGON (((130.74789 -12.42749, 130.74985..."
545,89399,Unincorporated ACT,8,Australian Capital Territory,2358.1330,https://linked.data.gov.au/dataset/asgsed3/LGA...,3.032133,0.234227,"POLYGON ((149.06239 -35.15910, 149.09134 -35.1..."


In [14]:
%%time

# for many regions

PATH = "/g/data/ia39/aus-ref-clim-data-nci/shapefiles/data"
name = ["aus_local_gov",
                  "aus_states_territories",
                  "australia", 
                  "broadacre_regions", 
                  "ncra_regions", 
                  "NCRA_regions_coastal_waters_GDA94", 
                  "nrm_regions", ][0]
regions = get_regions([name])


variable_id = "tasmaxAdjust"
institution_id = "BOM"
parent_model = "ACCESS-CM2"
downscaling_model = "BARPA-R"
member_id = "r4i1p1f1"

common_dir = f"/g/data/ia39/australian-climate-service/test-data/CORDEX-CMIP6/bias-adjustment-output/AGCD-05i"


start_year= 2000
end_year=2030

experiment_id = "historical"
filelist_hist = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                 for year in np.arange(start_year,2015,1)]
experiment_id = "ssp370"
filelist_ssp370 = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                   for year in np.arange(2015,end_year,1)]

filelist = filelist_hist + filelist_ssp370


ds = xr.open_dataset(filelist[0])

# mask = regions.mask_3D_frac_approx(ds)
mask = xr.open_dataset("/scratch/eg3/gt3409/mask_3D_frac_approx_aus_local_gov.nc")["mask"]

lat_weights = np.cos(np.deg2rad(ds["lat"]))

def _preprocess(ds, mask):
    return ds.chunk(chunks={"time":10, "lat": -1, "lon":-1}).weighted(mask * lat_weights)\
             .mean(["lat", "lon"])
    
ds_mf = xr.open_mfdataset(filelist, preprocess=lambda ds: _preprocess(ds[variable_id], mask),
                          autoclose=True, combine="nested", concat_dim="time")
ds_mf = ds_mf.compute()
ds_mf

  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blo

CPU times: user 1h 22min 54s, sys: 23min 21s, total: 1h 46min 15s
Wall time: 9min 16s


In [26]:
%%time 
#this is super memory intensive, so will save to avoid recalculating
mask = regions.mask_3D_frac_approx(ds)
mask.to_netcdf("/scratch/eg3/gt3409/mask_3D_frac_approx_aus_local_gov.nc")

CPU times: user 2min 59s, sys: 24.1 s, total: 3min 23s
Wall time: 3min 23s


In [24]:
mask.to_netcdf("/scratch/eg3/gt3409/mask_3D_aus_local_gov.nc")

In [9]:
filelist

['/g/data/ia39/australian-climate-service/test-data/CORDEX-CMIP6/bias-adjustment-output/AGCD-05i/BOM/ACCESS-CM2/historical/r4i1p1f1/BARPA-R/v1-r1-ACS-QME-AGCD-1960-2022/day/tasmaxAdjust/tasmaxAdjust_AGCD-05i_ACCESS-CM2_historical_r4i1p1f1_BOM_BARPA-R_v1-r1-ACS-QME-AGCD-1960-2022_day_20140101-20141231.nc',
 '/g/data/ia39/australian-climate-service/test-data/CORDEX-CMIP6/bias-adjustment-output/AGCD-05i/BOM/ACCESS-CM2/ssp370/r4i1p1f1/BARPA-R/v1-r1-ACS-QME-AGCD-1960-2022/day/tasmaxAdjust/tasmaxAdjust_AGCD-05i_ACCESS-CM2_ssp370_r4i1p1f1_BOM_BARPA-R_v1-r1-ACS-QME-AGCD-1960-2022_day_20150101-20151231.nc']

In [None]:
%%time
results = []
for file in filelist:
    results.append(_preprocess(ds, mask))
dask.compute(results)

In [None]:
%%time
ds_mf = ds_mf.compute()
ds_mf

In [None]:
%%time

# for many regions

PATH = "/g/data/ia39/aus-ref-clim-data-nci/shapefiles/data"
name = "aus_states_territories"
regions = get_regions([name])

variable_id = "tasmaxAdjust"

institution_id = "BOM"
parent_model = "ACCESS-CM2"
downscaling_model = "BARPA-R"
member_id = "r4i1p1f1"

common_dir = f"/g/data/ia39/australian-climate-service/test-data/CORDEX-CMIP6/bias-adjustment-output/AGCD-05i"

experiment_id = "historical"
filelist_hist = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                 for year in np.arange(2014,2015,1)]
experiment_id = "ssp370"
filelist_ssp370 = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                   for year in np.arange(2015,2016,1)]

filelist = filelist_hist + filelist_ssp370

ds = xr.open_dataset(filelist[0])

mask = regions.mask_3D_frac_approx(ds)

lat_weights = np.cos(np.deg2rad(ds["lat"]))

def _preprocess(ds, mask):
    return ds.weighted(mask * lat_weights)\
                          .mean(["lat", "lon"])
    
ds_mf = xr.open_mfdataset(filelist, preprocess=lambda ds: _preprocess(ds[variable_id], mask))
ds_mf = ds_mf.compute()
ds_mf

In [6]:
%%time

region_idx = 0

PATH = "/g/data/ia39/aus-ref-clim-data-nci/shapefiles/data"
name = "aus_local_gov"
gdf = gpd.read_file(f"{PATH}/{name}/{name}.shp")
max_bounds = gdf["geometry"][region_idx].bounds
regions = regionmask.from_geopandas(gdf.iloc[region_idx:region_idx+1], names ="LGA_NAME22", abbrevs="LGA_CODE22", name=name, overlap=True)

variable_id = "tasmaxAdjust"

institution_id = "BOM"
parent_model = "ACCESS-CM2"
downscaling_model = "BARPA-R"
member_id = "r4i1p1f1"

common_dir = f"/g/data/ia39/australian-climate-service/test-data/CORDEX-CMIP6/bias-adjustment-output/AGCD-05i"

experiment_id = "historical"
filelist_hist = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                 for year in np.arange(2012,2015,1)]
experiment_id = "ssp370"
filelist_ssp370 = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                   for year in np.arange(2015,2018,1)]

filelist = filelist_hist + filelist_ssp370

mask = regions.mask_3D_frac_approx(xr.open_dataset(filelist[0]).sel(lat = slice(max_bounds[1], max_bounds[3]),lon = slice(max_bounds[0], max_bounds[2])))\
              .sel(region=region_idx)

def _preprocess(ds, max_bounds, mask):
    return ds.sel(lat = slice(max_bounds[1], max_bounds[3]),lon = slice(max_bounds[0], max_bounds[2]))\
                          .chunk(chunks={"time":-1, "lat": -1, "lon":-1})\
                          .where(mask)\
                          .mean(["lat", "lon"])
    
ds_mf = xr.open_mfdataset(filelist, preprocess=lambda ds: _preprocess(ds[variable_id], max_bounds, mask))
ds_mf_bounds = ds_mf.compute()

CPU times: user 49.5 s, sys: 3.62 s, total: 53.1 s
Wall time: 1min 17s


In [29]:
ds_mf_bounds

In [33]:
%%time

region_idx = 0

PATH = "/g/data/ia39/aus-ref-clim-data-nci/shapefiles/data"
name = "aus_local_gov"
gdf = gpd.read_file(f"{PATH}/{name}/{name}.shp")
max_bounds = gdf["geometry"][region_idx].bounds
regions = regionmask.from_geopandas(gdf.iloc[region_idx:region_idx+1], names ="LGA_NAME22", abbrevs="LGA_CODE22", name=name, overlap=True)

variable_id = "tasmaxAdjust"

institution_id = "BOM"
parent_model = "ACCESS-CM2"
downscaling_model = "BARPA-R"
member_id = "r4i1p1f1"

common_dir = f"/g/data/ia39/australian-climate-service/test-data/CORDEX-CMIP6/bias-adjustment-output/AGCD-05i"

experiment_id = "historical"
filelist_hist = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                 for year in np.arange(2012,2015,1)]
experiment_id = "ssp370"
filelist_ssp370 = [f"{common_dir}/{institution_id}/{parent_model}/{experiment_id}/{member_id}/{downscaling_model}/v1-r1-ACS-QME-AGCD-1960-2022/day/{variable_id}/{variable_id}_AGCD-05i_{parent_model}_{experiment_id}_{member_id}_{institution_id}_{downscaling_model}_v1-r1-ACS-QME-AGCD-1960-2022_day_{year}0101-{year}1231.nc" 
                   for year in np.arange(2015,2018,1)]

filelist = filelist_hist + filelist_ssp370

ds = xr.open_dataset(filelist[0])

mask = regions.mask_3D_frac_approx(ds.sel(lat = slice(max_bounds[1], max_bounds[3]),lon = slice(max_bounds[0], max_bounds[2])))\
              .sel(region=region_idx)

lat_weights = np.cos(np.deg2rad(ds["lat"]))

def _preprocess(ds, mask):
    return ds.weighted(mask * lat_weights)\
                          .mean(["lat", "lon"])
    
ds_mf = xr.open_mfdataset(filelist, preprocess=lambda ds: _preprocess(ds[variable_id], mask))
ds_mf = ds_mf.compute()
ds_mf


KeyboardInterrupt



In [37]:
ds = xr.open_dataset(filelist[0])
ds

In [46]:
ds

In [None]:
%%time
var="tasmaxAdjust"
# calculate weights due to latitude
lat_weights = np.cos(np.deg2rad(ds["lat"]))

spatial_means = ds[var].weighted(mask_all * lat_weights).mean(dim=("lat", "lon"))

In [8]:
ds_mf

In [9]:
mask

In [10]:
regions = regionmask.from_geopandas(gdf, names ="LGA_NAME22", abbrevs="LGA_CODE22", name=name, overlap=True)
regions

<regionmask.Regions 'aus_local_gov'>
overlap:  True

Regions:
  0 10050                      Albury
  1 10180                    Armidale
  2 10250                     Ballina
  3 10300                   Balranald
  4 10470                    Bathurst
 ..   ...                         ...
542 74660                 West Arnhem
543 74680                   West Daly
544 79399           Unincorporated NT
545 89399          Unincorporated ACT
546 99399 Unincorp. Other Territories

[547 regions]

In [11]:
ds = xr.open_dataset(filelist[0])
ds

In [12]:
%%time
mask_all = regions.mask_3D(ds)
mask_all

CPU times: user 5.32 s, sys: 312 ms, total: 5.63 s
Wall time: 5.65 s


In [22]:
%%time
var="tasmaxAdjust"
# calculate weights due to latitude
lat_weights = np.cos(np.deg2rad(ds["lat"]))

spatial_means = ds[var].weighted(mask_all * lat_weights).mean(dim=("lat", "lon"))


CPU times: user 12.1 s, sys: 1.7 s, total: 13.8 s
Wall time: 13.8 s


In [21]:
spatial_means

In [14]:
%%time

ts_lga0 = ds.where(mask.sel(region = 0))[var].mean(["lat", "lon"]).compute()
ts_lga0

NameError: name 'ds' is not defined

In [9]:
%%time
# Fractional masking does not work unless the lat lons are a regular grid. 
# frac_mask = regions.mask_3D_frac_approx(ds)

CPU times: user 3 μs, sys: 1e+03 ns, total: 4 μs
Wall time: 6.44 μs


In [10]:
%%time
ts_lga0 = ds.where(mask.sel(region = 0)).pr.mean(["lat", "lon"]).compute()
ts_lga0

NameError: name 'ds' is not defined

In [11]:
mask.sel(region=0).total_bounds

KeyError: "no index found for coordinate 'region'"

In [None]:
ts_lga0.plot()

In [None]:
import ee

In [None]:
%%time
ds.where(mask.sel(region = 0)).pr.max().compute()

In [None]:
%%time
# Apply the mask and calculate regional means for all timesteps
var = "pr"
da_means = acs_regional_stats(ds, var=var, dims=("lat", "lon"), mask = mask, how = ["mean"], select_abbr="WANorth")
da_means

In [None]:
# Plot the timeseries of one of the regions
da_means.sel(region = 0)