In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import geopandas
import warnings

from sklearn import preprocessing as prep
from scipy.cluster import hierarchy
from sklearn import metrics

import FINE.spagat.dataset as spd
from ipynb.fs.full import grouping_utils

## test dataset - (test_dataset2)

In [2]:
# Coordinate labels of the synthetic test dataset.
component = ['c1', 'c2', 'c3', 'c4']
Period = [0]
TimeStep = ['T0', 'T1']
space = ['01_reg', '02_reg', '03_reg']
space_2 = list(space)

# Time series variable with dims (component, Period, TimeStep, space).
# Components c1 and c3 contain only NaN.
nan_ts = np.full((1, 2, 3), np.nan)
demand = xr.DataArray(
    np.stack([nan_ts,
              [[[1, 0.9, 2],
                [1, 0, 0.9]]],
              nan_ts,
              [[[0, 1, 1],
                [0.3, 2, 1]]]]),
    coords=[component, Period, TimeStep, space],
    dims=['component', 'Period', 'TimeStep', 'space'],
)

# 1d variable with dims (component, space). Component c4 contains only NaN.
cap_1d = xr.DataArray(
    np.array([[0.9, 1, 0.9],
              [0, 0, 0],
              [0.9, 1, 0.9],
              [np.nan, np.nan, np.nan]]),
    coords=[component, space],
    dims=['component', 'space'],
)

# 2d variable with dims (component, space, space_2). Components c3 and c4 contain only NaN.
nan_2d = np.full((3, 3), np.nan)
dist_2d = xr.DataArray(
    np.stack([[[0, 1, 2], [1, 0, 10], [2, 10, 0]],
              [[0, 0.1, 0.2], [0.1, 0, 1], [0.2, 1, 0]],
              nan_2d,
              nan_2d]),
    coords=[component, space, space_2],
    dims=['component', 'space', 'space_2'],
)

ds = xr.Dataset({'operationFixRate': demand, '1d_capacity': cap_1d, '2d_distance': dist_2d})

# Wrap the xarray dataset in a SpagatDataset and keep a direct handle on it.
sds = spd.SpagatDataset()
sds.xr_dataset = ds
dataset = sds.xr_dataset

## data that comes from preprocessDataset()

In [3]:
# One bucket per kind of variable: time series, 1d and 2d.
vars_ts, vars_1d, vars_2d = {}, {}, {}

In [4]:
# Sort every data variable into a bucket according to the set of dimensions it has.
# Variables with any other combination of dimensions only trigger a warning.
for varname, da in dataset.data_vars.items():
    dims = set(da.dims)

    if dims == {'component', 'Period', 'TimeStep', 'space'}:
        # TODO: maybe 'space' should be generalized with an additional variable (dimension_description)?
        # Only the first entry along 'Period' is kept -> TODO: consider the Period dimension.
        da = da.transpose('Period', 'component', 'space', 'TimeStep')[0]
        vars_ts[varname] = da

    elif dims == {'component', 'space'}:
        vars_1d[varname] = da

    elif dims == {'component', 'space', 'space_2'}:
        vars_2d[varname] = da

    else:
        # The message used to contain a stray "+ " left over from string concatenation.
        warnings.warn(f"Variable '{varname}' has dimensions '{da.dims}' which are not considered for spatial aggregation.")

In [5]:
# Labels of the 'component' coordinate, as a plain list.
component_list = [comp for comp in dataset['component'].values]

## preprocessTimeSeries()

In [None]:
vars_ts

In [None]:
# Work on the time series variables; sizes of the region and component axes.
vars_dict = vars_ts
n_components = len(component_list)
n_regions = len(dataset['space'].values)

In [None]:
# Result container: variable name -> preprocessed matrix.
ds_ts = dict()

#### Preprocess the data array corresponding to each time series variable

In [None]:
# Print every time series variable. After the loop, `var` and `da` stay bound
# to the last entry of vars_dict; the step-by-step cells below work on them.
for var, da in vars_dict.items():
    print(f'var is {var} and data is {da}')

In [None]:
# Column vector of zeros with one row per region.
matrix_var = np.zeros((n_regions, 1))
print(f'matrix_var is {matrix_var}')

#### STEP 1. Find the valid components for each variable ( valid_component_weight=1, otherwise=0)

In [None]:

# Mean over space and then over time, one row per component.
var_mean_df = da.mean(dim="space").mean(dim="TimeStep").to_dataframe()
print(f'var_mean_df is {var_mean_df}')

# Number the components by position so they can be used to index `da`.
var_mean_df['component_id'] = np.arange(n_components)
print(f'var_mean_df is {var_mean_df}')

# Keep the positions of the components whose mean is not NaN.
has_data = var_mean_df[var].notna()
valid_component_ids = var_mean_df.loc[has_data, 'component_id'].tolist()
print(f'valid_component_ids is {valid_component_ids}')
    

In [None]:
da[1]

#### STEP 2. Preprocess data corresponding to each valid component 

In [None]:

# Build matrix_var from scratch inside this cell, starting from a zero-width
# float matrix with one row per region. The cell used to append to whatever
# matrix_var an earlier cell had left behind and then drop column 0, so running
# it a second time silently corrupted the result.
matrix_var = np.empty((n_regions, 0))

for comp_id in valid_component_ids:
    print(f'da[{comp_id}].values is {da[comp_id].values}')
    # Compute the standardized matrix for each valid component: rescale the matrix value to range [0,1]
    # -> the values in time series for this component should be in the same scaling: matrix_MinMaxScaler()
    #### STEP 2a. Obtain a scaled matrix for each valid component's matrix
    matrix_var_c = grouping_utils.matrix_MinMaxScaler(da[comp_id].values)
    print(f'matrix_var_c is {matrix_var_c}')

    #### STEP 2b. Join this matrix to the resultant matrix (column-wise) -> matrix of component1 | matrix of component2
    # Concatenate this matrix block of one component to the final matrix for this variable
    matrix_var = np.concatenate((matrix_var, matrix_var_c), axis=1)



In [None]:
matrix_var

#### STEP 3. Add it to the resultant dict 

In [None]:
# Store the assembled matrix under the name of the variable it was built from.
ds_ts[var] = matrix_var

In [None]:
ds_ts

In [None]:
# Build one matrix per time series variable and collect them in ds_ts.
for var, da in vars_dict.items():

    # Positions of the components whose mean over space and time is not NaN.
    var_mean_df = da.mean(dim="space").mean(dim="TimeStep").to_dataframe()
    var_mean_df['component_id'] = np.arange(n_components)
    valid_component_ids = var_mean_df.loc[var_mean_df[var].notna(), 'component_id'].tolist()

    # Zero-width float seed with one row per region: blocks can be appended
    # column-wise without a placeholder column that has to be removed afterwards.
    matrix_var = np.empty((n_regions, 0))

    for comp_id in valid_component_ids:
        # Rescale this component's matrix with matrix_MinMaxScaler() so that the
        # time series values of the component share one scaling.
        matrix_var_c = grouping_utils.matrix_MinMaxScaler(da[comp_id].values)

        # Append the block of this component to the right of the blocks so far.
        matrix_var = np.concatenate((matrix_var, matrix_var_c), axis=1)

    ds_ts[var] = matrix_var
           


In [None]:
ds_ts

## matrix_MinMaxScaler()

In [None]:
da[1]

In [None]:
# Sample input for the scaling formula: the entry at position 1 along the first dimension of `da`.
X = da[1]

In [None]:
# Bounds of the target range of the scaling.
x_min, x_max = 0, 1

In [None]:
# Min-max scaling: subtract the minimum over all of X, divide by the overall
# range (max - min), then stretch and shift the result into [x_min, x_max].
# NOTE(review): the division is by zero when all values of X are equal — confirm how the helper handles that.
((X - np.min(X)) / (np.max(X) - np.min(X))) * (x_max - x_min) + x_min

## preprocess1dVariables()

In [None]:
# Work on the 1d variables from here on.
n_components = len(component_list)
vars_dict = vars_1d

In [None]:
# Result container: variable name -> scaled matrix.
ds_1d = dict()

In [None]:
# Scaler with the default target range (0, 1), written out explicitly.
min_max_scaler = prep.MinMaxScaler(feature_range=(0, 1))

In [None]:
vars_dict

#### Preprocess the data array corresponding to each 1d variable

In [None]:
# Run through the 1d variables only to leave `var` and `da` bound to the last
# entry of vars_dict; the step-by-step cells below work on that entry.
for var, da in vars_dict.items():
    pass

#### STEP 1. Find the valid components for each variable ( valid_component_weight=1, otherwise=0)

In [None]:

# Mean over space, one row per component, numbered by position.
var_mean_df = da.mean(dim="space").to_dataframe()
var_mean_df['component_id'] = np.arange(n_components)

# Keep the positions of the components whose mean is not NaN.
has_data = var_mean_df[var].notna()
valid_component_ids = var_mean_df.loc[has_data, 'component_id'].tolist()
valid_component_ids

#### STEP 2. Retain only the valid components

In [None]:
# Select the rows (components) listed in valid_component_ids, all columns.
data = da.values[valid_component_ids, :]
data

#### STEP 3. Scale, transform, and add it to the resultant dict

In [None]:
# Transpose so that the components become the columns, then fit the scaler on
# that matrix and apply it to the same matrix.
data_by_region = data.T
ds_1d[var] = min_max_scaler.fit(data_by_region).transform(data_by_region)

In [None]:
ds_1d

## preprocess2dVariables()

In [27]:
# Work on the 2d variables from here on.
vars_dict = vars_2d
# Self-assignment: leaves component_list unchanged.
component_list = component_list

In [28]:
vars_dict

{'2d_distance': <xarray.DataArray '2d_distance' (component: 4, space: 3, space_2: 3)>
 array([[[ 0. ,  1. ,  2. ],
         [ 1. ,  0. , 10. ],
         [ 2. , 10. ,  0. ]],
 
        [[ 0. ,  0.1,  0.2],
         [ 0.1,  0. ,  1. ],
         [ 0.2,  1. ,  0. ]],
 
        [[ nan,  nan,  nan],
         [ nan,  nan,  nan],
         [ nan,  nan,  nan]],
 
        [[ nan,  nan,  nan],
         [ nan,  nan,  nan],
         [ nan,  nan,  nan]]])
 Coordinates:
   * component  (component) <U2 'c1' 'c2' 'c3' 'c4'
   * space      (space) <U6 '01_reg' '02_reg' '03_reg'
   * space_2    (space_2) <U6 '01_reg' '02_reg' '03_reg'}

In [29]:
# Number of components; used below to number the rows of the per-component mean table.
n_components = len(component_list)

In [30]:
# Result container: variable name -> {component position -> data}.
ds_2d = dict()

#### Preprocess the data array corresponding to each 2d variable

In [31]:
# Leave `var` and `da` bound to the last 2d variable and start an empty
# per-component result dict for it.
for var, da in vars_dict.items():
    ds_2d_var = {}

In [32]:
# Different region orders
space1 = da.space.values
space2 = da.space_2.values

#### STEP 1.  Find the valid components for each variable

In [33]:

# Mean over both spatial dimensions, one row per component, numbered by position.
var_mean_df = da.mean(dim="space").mean(dim="space_2").to_dataframe()
var_mean_df['component_id'] = np.arange(n_components)

# Keep the positions of the components whose mean is not NaN.
has_data = var_mean_df[var].notna()
valid_component_ids = var_mean_df.loc[has_data, 'component_id'].tolist()
valid_component_ids

[0, 1]

#### STEP 2. For each valid component: obtain the hollow and symmetric connectivity matrix, then scale the matrix

In [34]:

separator = '-----------------------------------------------------------------------------------'

for comp_id in valid_component_ids:
    var_matr = da[comp_id].values
    print(var_matr)
    print(separator)

    #### STEP 2a. Label the columns with space_2, then reorder them to the order
    #### of space so that rows and columns follow the same region order
    da_comp_df = pd.DataFrame(var_matr, columns=space2)
    print(da_comp_df)
    print(separator)
    da_comp_df = da_comp_df.loc[:, space1]
    print(da_comp_df)
    print(separator)

    #### STEP 2b. Scale the matrix
    # Standardize the matrix: keep all the values non-negative AND keep zeros as zeros
    # (so the meaning of connectivity is not changed) => scale the data to the range [0,1]
    scaled_matrix = grouping_utils.matrix_MinMaxScaler(da_comp_df.to_numpy())
    ds_2d_var[comp_id] = scaled_matrix
    print(scaled_matrix)
    print(separator)

ds_2d[var] = ds_2d_var



[[ 0.  1.  2.]
 [ 1.  0. 10.]
 [ 2. 10.  0.]]
-----------------------------------------------------------------------------------
   01_reg  02_reg  03_reg
0     0.0     1.0     2.0
1     1.0     0.0    10.0
2     2.0    10.0     0.0
-----------------------------------------------------------------------------------
   01_reg  02_reg  03_reg
0     0.0     1.0     2.0
1     1.0     0.0    10.0
2     2.0    10.0     0.0
-----------------------------------------------------------------------------------
[[0.  0.1 0.2]
 [0.1 0.  1. ]
 [0.2 1.  0. ]]
-----------------------------------------------------------------------------------
[[0.  0.1 0.2]
 [0.1 0.  1. ]
 [0.2 1.  0. ]]
-----------------------------------------------------------------------------------
   01_reg  02_reg  03_reg
0     0.0     0.1     0.2
1     0.1     0.0     1.0
2     0.2     1.0     0.0
-----------------------------------------------------------------------------------
   01_reg  02_reg  03_reg
0     0.0     0.1   

In [35]:
ds_2d

{'2d_distance': {0: array([[0. , 0.1, 0.2],
         [0.1, 0. , 1. ],
         [0.2, 1. , 0. ]]),
  1: array([[0. , 0.1, 0.2],
         [0.1, 0. , 1. ],
         [0.2, 1. , 0. ]])}}

#### STEP 3. Perform only if handle_mode='toDissimilarity': condense the matrix, convert from similarity to dissimilarity

In [36]:
# Replace every scaled square matrix in ds_2d by its condensed dissimilarity vector.
# ds_2d is updated in place (existing keys only, so iterating while assigning is safe).
separator = '-----------------------------------------------------------------------------------'

for var, var_dict in ds_2d.items():
    print(var)
    print(var_dict)
    print(separator)
    # Transform the symmetric connectivity matrix to 1-dim distance vector
    for c, data in var_dict.items():
        if np.ndim(data) != 2:
            # Already condensed by an earlier run of this cell. squareform() would
            # expand the vector back into a square matrix and `1 - x` would be
            # applied a second time, so leave the entry untouched.
            continue
        print(c)
        print(data)
        print(separator)
        #### STEP 3a. Condense the matrix
        # Obtain the vector form of this symmetric connectivity matrix, in the range [0,1]
        # Deactivate checks since small numerical errors can be in the dataset
        vec = hierarchy.distance.squareform(data, checks=False)
        print(vec)
        print(separator)
        #### STEP 3b. Convert the value of connectivity (similarity) to distance (dissimilarity)
        vec = 1 - vec
        print(vec)
        print(separator)
        # Distance vector for this 2d variable and this component: 1 means maximum distance!
        ds_2d[var][c] = vec
        print(ds_2d)
        print(separator)

2d_distance
{0: array([[0. , 0.1, 0.2],
       [0.1, 0. , 1. ],
       [0.2, 1. , 0. ]]), 1: array([[0. , 0.1, 0.2],
       [0.1, 0. , 1. ],
       [0.2, 1. , 0. ]])}
-----------------------------------------------------------------------------------
0
[[0.  0.1 0.2]
 [0.1 0.  1. ]
 [0.2 1.  0. ]]
-----------------------------------------------------------------------------------
[0.1 0.2 1. ]
-----------------------------------------------------------------------------------
[0.9 0.8 0. ]
-----------------------------------------------------------------------------------
{'2d_distance': {0: array([0.9, 0.8, 0. ]), 1: array([[0. , 0.1, 0.2],
       [0.1, 0. , 1. ],
       [0.2, 1. , 0. ]])}}
-----------------------------------------------------------------------------------
1
[[0.  0.1 0.2]
 [0.1 0.  1. ]
 [0.2 1.  0. ]]
-----------------------------------------------------------------------------------
[0.1 0.2 1. ]
---------------------------------------------------------------------

In [37]:
ds_2d

{'2d_distance': {0: array([0.9, 0.8, 0. ]), 1: array([0.9, 0.8, 0. ])}}

#### if handle_mode='toAffinity'

In [None]:
    # Branch for handle_mode == 'toAffinity' (see the section heading "preprocess2dVariables()").
    # Original matrices as adjacency matrices:
    #   - adjacency matrix: 0 means identical elements; high values means very similar elements
    #   - adjacency matrix of a graph: symmetric, diagonals = 0
    #   - add all matrices of different components for each variable
    #
    # The fragment used to end in `return ds_2d`, which is a SyntaxError at the top
    # level of a cell, and `handle_mode` was never defined in this notebook. The
    # mode is set here and the result is printed instead of returned.
    # NOTE(review): if the 'toDissimilarity' cell has been run before this one,
    # ds_2d already holds condensed vectors rather than matrices — confirm the intended cell order.
    handle_mode = 'toAffinity'
    if handle_mode == 'toAffinity':
        print(ds_2d)