In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import os
import time
import numpy as np
import xarray as xr
import geopandas as gpd 
from sklearn.cluster import AgglomerativeClustering
from dask.distributed import Client, progress

import FINE.spagat.utils as spu
import FINE.spagat.RE_representation_utils as RE_rep_utils

In [3]:
DATA_PATH = 'C:/Users/s.patil/Documents/code/fine/examples/SPAGAT/RE_representation/InputData/'
shapefile = os.path.join(DATA_PATH, 'Shapefiles/CZE.shp')
GRIDDED_WIND_DATA_PATH = os.path.join(DATA_PATH, 'CZE_wind.nc4') 

In [29]:
timeit = time.time()

In [30]:
@spu.timer
def represent_RE_technology(gridded_RE_ds, 
                            CRS_attr,
                            shp_file,
                            n_timeSeries_perRegion=1,  
                            capacity_var_name='capacity',
                            capfac_var_name='capacity factor',
                            longitude='x', 
                            latitude='y',
                            time='time',
                            index_col='region_ids', 
                            geometry_col='geometry',
                            linkage='average'):

    def _simply_aggregate_RE_technology(region):
    
        #STEP 1. Create resultant xarray dataset 
        time_steps = rasterized_RE_ds[time].values  
        n_timeSteps = len(time_steps)

        ## time series 
        data = np.zeros((n_timeSteps, 1))

        represented_timeSeries = xr.DataArray(data, [(time, time_steps),
                                                    ('region_ids', [region])])

        #capacities
        represented_capacities = xr.DataArray(0, [('region_ids', [region])])

        #STEP 2. Representation
        regional_ds = rasterized_RE_ds.sel(region_ids = region)
    
        regional_capfac_da = regional_ds[capfac_var_name].where(regional_ds.rasters == 1)
        regional_capacity_da = regional_ds[capacity_var_name].where(regional_ds.rasters == 1)

        #STEP 2b. Preprocess regional capfac and capacity dataArrays 

        #STEP 2b (i). Restructure data
        regional_capfac_da = regional_capfac_da.stack(x_y = [longitude, latitude]) 
        regional_capfac_da = regional_capfac_da.transpose(transpose_coords= True) 

        regional_capacity_da = regional_capacity_da.stack(x_y = [longitude, latitude])
        regional_capacity_da = regional_capacity_da.transpose(transpose_coords= True)

        #STEP 2b (ii). Remove all time series with 0 values 
        regional_capfac_da = regional_capfac_da.where(regional_capacity_da > 0)
        regional_capacity_da = regional_capacity_da.where(regional_capacity_da > 0)

        #STEP 2b (iii). Drop NAs 
        regional_capfac_da = regional_capfac_da.dropna(dim='x_y')
        regional_capacity_da = regional_capacity_da.dropna(dim='x_y')

        #Print out number of time series in the region 
        n_ts = len(regional_capfac_da['x_y'].values)
        print(f'Number of time series in {region}: {n_ts}')

        #STEP 2c. Get power curves from capacity factor time series and capacities 
        regional_power_da = regional_capacity_da * regional_capfac_da

        #STEP 2d. Aggregation
        ## capacity
        capacity_total = regional_capacity_da.sum(dim = 'x_y').values
        represented_capacities.loc[region] = capacity_total
        
        ## capacity factor 
        power_total = regional_power_da.sum(dim = 'x_y').values
        capfac_total = power_total/capacity_total
        
        represented_timeSeries.loc[:,region] = capfac_total

        #STEP 3. Create resulting dataset 
        regional_represented_RE_ds = xr.Dataset({capacity_var_name: represented_capacities,
                                        capfac_var_name: represented_timeSeries}) 

    
        return regional_represented_RE_ds 


    def _cluster_RE_technology(region):
        start_time = timeit
        
        #STEP 1. Create resultant xarray dataset 
        time_steps = rasterized_RE_ds[time].values  
        n_timeSteps = len(time_steps)

        TS_ids = [f'TS_{i}' for i in range(n_timeSeries_perRegion)] 

        ## time series 
        data = np.zeros((n_timeSteps, 1, n_timeSeries_perRegion))

        represented_timeSeries = xr.DataArray(data, [(time, time_steps),
                                                    ('region_ids', [region]),
                                                    ('TS_ids', TS_ids)])

        data = np.zeros((1, n_timeSeries_perRegion))

        #capacities
        represented_capacities = xr.DataArray(data, [('region_ids', [region]),
                                                    ('TS_ids', TS_ids)])

        #STEP 2. Representation
        regional_ds = rasterized_RE_ds.sel(region_ids = region)
    
        regional_capfac_da = regional_ds[capfac_var_name].where(regional_ds.rasters == 1)
        regional_capacity_da = regional_ds[capacity_var_name].where(regional_ds.rasters == 1)

        #STEP 2b. Preprocess regional capfac and capacity dataArrays 

        #STEP 2b (i). Restructure data
        regional_capfac_da = regional_capfac_da.stack(x_y = [longitude, latitude]) 
        regional_capfac_da = regional_capfac_da.transpose(transpose_coords= True) 

        regional_capacity_da = regional_capacity_da.stack(x_y = [longitude, latitude])
        regional_capacity_da = regional_capacity_da.transpose(transpose_coords= True)

        #STEP 2b (ii). Remove all time series with 0 values 
        regional_capfac_da = regional_capfac_da.where(regional_capacity_da > 0)
        regional_capacity_da = regional_capacity_da.where(regional_capacity_da > 0)

        #STEP 2b (iii). Drop NAs 
        regional_capfac_da = regional_capfac_da.dropna(dim='x_y')
        regional_capacity_da = regional_capacity_da.dropna(dim='x_y')

        #Print out number of time series in the region 
        n_ts = len(regional_capfac_da['x_y'].values)
        print(f'Number of time series in {region}: {n_ts}')

        #STEP 2c. Get power curves from capacity factor time series and capacities 
        regional_power_da = regional_capacity_da * regional_capfac_da
        
        end_time = timeit
        
        print(f'time taken for preprocessing: {end_time - start_time}')
        
        start_time = timeit
        
        #STEP 2d. Clustering  
        agg_cluster = AgglomerativeClustering(n_clusters=n_timeSeries_perRegion, 
                                              affinity="euclidean",  
                                              linkage=linkage)
        agglomerative_model = agg_cluster.fit(regional_capfac_da)
        
        end_time = timeit
        
        print(f'time taken for clustering: {end_time - start_time}')
        
        start_time = timeit
        #STEP 2e. Aggregation
        for i in range(np.unique(agglomerative_model.labels_).shape[0]):
            ## Aggregate capacities 
            cluster_capacity = regional_capacity_da[agglomerative_model.labels_ == i]
            cluster_capacity_total = cluster_capacity.sum(dim = 'x_y').values

            represented_capacities.loc[region, TS_ids[i]] = cluster_capacity_total

            #aggregate capacity factor 
            cluster_power = regional_power_da[agglomerative_model.labels_ == i]
            cluster_power_total = cluster_power.sum(dim = 'x_y').values
            cluster_capfac_total = cluster_power_total/cluster_capacity_total

            represented_timeSeries.loc[:,region, TS_ids[i]] = cluster_capfac_total
            
        #STEP 3. Create resulting dataset 
        regional_represented_RE_ds = xr.Dataset({capacity_var_name: represented_capacities,
                                        capfac_var_name: represented_timeSeries})  
        
        end_time = timeit
        
        print(f'time taken for aggregation: {end_time - start_time}')
        
        return regional_represented_RE_ds 


    #STEP 1. Rasterize the gridded dataset
    rasterized_RE_ds = RE_rep_utils.rasterize_xr_ds(gridded_RE_ds, 
                                                    CRS_attr,
                                                    shp_file, 
                                                    index_col, 
                                                    geometry_col,
                                                    longitude, 
                                                    latitude)


    region_ids = rasterized_RE_ds['region_ids'].values 


    
    results = []
    if n_timeSeries_perRegion==1:
        for region in region_ids:
            results.append(_simply_aggregate_RE_technology(region))
    else:
        for region in region_ids:
            results.append(_cluster_RE_technology(region))

    represented_RE_ds =  xr.merge(results)
   
    return represented_RE_ds 

In [31]:
represented_re_ds = represent_RE_technology(gridded_RE_ds = GRIDDED_WIND_DATA_PATH,
                                            CRS_attr = 'xy_reference_system',
                                            shp_file = shapefile,
                                            n_timeSeries_perRegion = 5,
                                            capacity_var_name='capacity',
                                            capfac_var_name='capfac',
                                            longitude='x', 
                                            latitude='y',
                                            time='time',
                                            index_col='e-id', 
                                            geometry_col='geometry',
                                            linkage='average')

Number of time series in 39_cz: 364
time taken for preprocessing: 0.0
time taken for clustering: 0.0
time taken for aggregation: 0.0
Number of time series in 40_cz: 213
time taken for preprocessing: 0.0
time taken for clustering: 0.0
time taken for aggregation: 0.0
elapsed time for represent_RE_technology: 0.05 minutes


In [None]:
def calculate_anomaly(da, groupby_type="time.month"):
    gb = da.groupby(groupby_type)
    clim = gb.mean(dim="time")
    return clim

In [None]:
time = xr.cftime_range("1990-01", "1992-01", freq="M")
month = xr.DataArray(time.month, coords={"time": time}, dims=["time"])
np.random.seed(123)
array = xr.DataArray(
    np.random.rand(len(time)),
     dims=["time"],
     coords={"time": time, "month": month},).chunk()

In [None]:
new_array = calculate_anomaly(array)

In [None]:
array 

In [None]:
array.map_blocks(calculate_anomaly, template=new_array).compute()