In [44]:
import os, sys
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
sys.path.insert(0, './s2spy')
import s2spy
from s2spy import RGDR
from xeofs.xarray import EOF, Rotator
import utils, plotting
# Root directory of the observational data files opened in the cells below.
path_obs_data = '~/data/volume_2/observational/'
# Root directory of the subseasonal data.
# NOTE(review): unlike path_obs_data this has no leading '~', while the aggregation
# timestamps file further down is read from '~/data/volume_2/subseasonal/...' —
# confirm which location is intended.
path_subseasonal_data = '/data/volume_2/subseasonal/'

In [45]:
# Open the preprocessed precipitation target and display it.
# NOTE(review): the file name is spelled 'chrips' here, whereas the mask file opened
# in the next cell is spelled 'chirps' — confirm the name on disk.
target_path = os.path.join(
    path_obs_data,
    'preprocessed',
    'chrips_tp_1981-2021_africa_aggregated_intervals.nc',
)
target = xr.open_dataarray(target_path)
target

In [46]:
# Open the mask that selects the grid cells to keep.
hoa_mask_path = os.path.join(path_obs_data, 'chirps_hoa_dry_mask_0.25deg.nc')
hoa_mask = xr.open_dataarray(hoa_mask_path)

# Keep only the masked cells, average over both spatial dimensions and wrap the
# result in a Dataset.
target_masked = target.where(hoa_mask)
target_spatial_mean = target_masked.mean(dim=('latitude', 'longitude'))
target_timeseries = target_spatial_mean.to_dataset()

In [47]:
# Load the aggregation timestamps table, sorted by its index.
dfs = pd.read_hdf('~/data/volume_2/subseasonal/ecmwf/aggregated/aggregation_timestamps.h5').sort_index()
df = dfs[['aggregation_start_inclusive', 'aggregation_end_inclusive']].copy()
sel_months = [10, 11, 12]

# Vectorised month test (boolean numpy arrays, one entry per row of `dfs`):
# does the aggregation window start / end in one of the selected months?
mon_start_mask = df['aggregation_start_inclusive'].dt.month.isin(sel_months).to_numpy()
mon_end_mask = df['aggregation_end_inclusive'].dt.month.isin(sel_months).to_numpy()
# A row is kept only when both its start and its end fall in the selected months.
mon_mask = np.logical_and(mon_start_mask, mon_end_mask)

# Filter once and reuse the result, then keep the rows belonging to the first
# year present in the filtered table.
dfs_in_months = dfs[mon_mask]
df_sub = dfs_in_months[dfs_in_months.index.year == dfs_in_months.index.year[0]]
# (month, day) pairs of the remaining index entries.
anchor_dates = [(d.month, d.day) for d in df_sub.index]


In [48]:
# Use the first (month, day) pair from anchor_dates as the calendar anchor.
anchor_date = anchor_dates[0]

In [49]:
# Build the calendar from the chosen anchor with freq='30d' and map it onto the
# years 1981 through 2021; `cal` holds the return value of map_years.
cal = s2spy.time.AdventCalendar(anchor=anchor_date, freq='30d').map_years(1981, 2021)

In [50]:
# Directory holding the preprocessed observational fields.
preprocessed_dir = os.path.join(path_obs_data, 'preprocessed')

# Sea surface temperature: open the daily field and resample it onto the calendar.
sst_path = os.path.join(preprocessed_dir, 'era5_sst_1959-2021_1_12_daily_2.0deg.nc')
sst = xr.open_dataarray(sst_path)
sst_resampled = s2spy.time.resample(cal, sst)

# 200 hPa geopotential: same treatment.
z200_path = os.path.join(preprocessed_dir, 'era5_z_200hpa_1959-2021_1_12_daily_2.0deg.nc')
z200 = xr.open_dataarray(z200_path)
z200_resampled = s2spy.time.resample(cal, z200)

In [51]:
# from sklearn.model_selection import ShuffleSplit
# splitter = ShuffleSplit(n_splits=3)
# traintest_splitter = s2spy.traintest.traintest_splits(splitter, cal)

In [52]:
def pipeline_dimred(X_train, y, dimred_list=None):
    """Reduce each precursor field to features with its paired reduction method.

    Parameters
    ----------
    X_train : object or list/tuple of objects
        One precursor field (per the assertion message, an xr.DataArray) or a
        sequence of them. A single field is wrapped in a list.
    y : object
        Target passed to ``RGDR.fit`` together with each precursor field.
    dimred_list : list, optional
        One entry per field in ``X_train``. An ``RGDR`` instance is fitted on
        ``(field, y)`` and then used to transform the field. Any other entry is
        treated as an EOF-like class: it is called as
        ``entry(field, dim=['anchor_year'], weights='coslat', n_modes=2)``, then
        ``solve()`` is run and ``pcs()`` is taken as the features.
        Defaults to a single, newly created ``RGDR()``.

    Returns
    -------
    list
        The features obtained for each field, in the order of ``X_train``.

    Raises
    ------
    AssertionError
        If ``X_train`` and ``dimred_list`` differ in length.
    """
    # Build the default here rather than in the signature: a default list in the
    # signature is created once, so every call would share (and re-fit) the same
    # RGDR object.
    if dimred_list is None:
        dimred_list = [RGDR()]

    # Accept a bare field as well as a list/tuple of fields.
    if not isinstance(X_train, (list, tuple)):
        X_train = [X_train]
    assert len(X_train) == len(dimred_list), ('# of dimensionality reduction methods in dimred_list '
                                              'is unequal to the xr.DataArrays you want to reduce')

    features_list = []
    for X_, dimred in zip(X_train, dimred_list):
        if isinstance(dimred, RGDR):
            # fit precursor field vs y for RGDR method
            dimred.fit(X_, y)
            features = dimred.transform(X_)
        else:  # assuming EOF-like dimred from eofs package
            model = dimred(X_, dim=['anchor_year'], weights='coslat', n_modes=2)
            model.solve()
            features = model.pcs()

        features_list.append(features)

    return features_list


# Target period and precursor lags used for this trial run.
period = 0
lags = np.arange(1, 3)
target_1d = target_timeseries.sel(target_period=int(period))['precip']

# Precursor fields at the selected lags; the third entry is the z200 field
# averaged over the 'i_interval' dimension.
sst_lagged = sst_resampled['sst'].sel(i_interval=lags)
z200_lagged = z200_resampled['z'].sel(i_interval=lags)
X = [sst_lagged, z200_lagged, z200_lagged.mean(dim='i_interval')]

# One reducer per entry of X: RGDR instances for the lagged fields, the EOF class
# for the lag-averaged field.
dimred_methods = [RGDR(), RGDR(), EOF]
pipeline_dimred(X, target_1d, dimred_list=dimred_methods)


# X = [sst_resampled]
# for X_train, y_train, X_test, y_test in traintest_splitter.split_iterate(*X, y=target_1d['precip']):
#     print(y_train)
    


[<xarray.DataArray 'sst' (cluster_labels: 17, anchor_year: 41)>
 array([[-0.18934887, -0.83543317, -0.4811875 ,  0.49148616,  0.27662453,
          0.1986428 ,  0.342192  ,  0.67092628,  0.39378849, -0.03283613,
          0.32351848, -0.69084517,  0.03422231, -0.44619794,  0.17653516,
          0.41286142, -0.73635352,  0.18769055,  0.42137426, -0.01033876,
         -0.24144459, -0.13044724,  0.21833654, -0.12961105, -0.04646508,
          0.1933251 ,  0.00450743,  0.44455079, -0.30449347,  0.30428663,
          0.08320675, -0.29699049, -0.04306165, -0.30980811, -0.52202688,
          0.15972283,  0.31960813,  0.3617118 , -0.08878785,  0.29030877,
          1.09275076],
        [ 0.29132529, -0.67492192, -0.2422495 , -0.05537408, -0.03942376,
          0.28453196,  0.00522417, -0.05295241,  0.79216098,  0.37676817,
         -0.16400759,  0.06324448, -0.18058355, -0.86937798,  0.03132709,
          0.48205041, -0.86444804,  1.33586837,  0.10328334, -0.11925998,
          0.39466108,  0.

In [None]:
# For every fourth target period, correlate the precursor field at i_interval=3
# with the target over 'anchor_year' and plot the cells where pval < 0.05.
# NOTE(review): `var_resampled` and `var` are not defined in the cells visible
# above — confirm they exist before running this cell on a fresh kernel.
for period in target_timeseries.target_period.values[::4]:
    precursor_field = var_resampled[var].sel(i_interval=3)
    period_target = target_timeseries.sel(target_period=int(period))
    corr, pval = s2spy.rgdr.rgdr.correlation(
        precursor_field,
        period_target,
        corr_dim='anchor_year',
    )
    significant_corr = corr.where(pval < 0.05)
    plotting.plot_maps(significant_corr)

In [None]:
# For every fourth target period, plot the RGDR clusters of the precursor field
# at i_interval=1 against that period's target.
# NOTE(review): `var_resampled` and `var` are not defined in the cells visible
# above — confirm they exist before running this cell on a fresh kernel.
for period in target_timeseries.target_period.values[::4]:
    rg = s2spy.rgdr.rgdr.RGDR(eps_km=600, alpha=0.05, min_area_km2=3000**2)
    precursor_field = var_resampled[var].sel(i_interval=1)
    period_target = target_timeseries.sel(target_period=int(period))
    rg.plot_clusters(precursor_field, period_target)

In [19]:
# Create an RGDR with its default settings and show the signature and docstring
# of its `fit` method (`?` is IPython help syntax, not plain Python).
rg = RGDR()
rg.fit?

[0;31mSignature:[0m
[0mrg[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mprecursor[0m[0;34m:[0m [0mxarray[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdataarray[0m[0;34m.[0m[0mDataArray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtimeseries[0m[0;34m:[0m [0mxarray[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdataarray[0m[0;34m.[0m[0mDataArray[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Fits RGDR clusters to precursor data.

Performs DBSCAN clustering on a prepared DataArray, and then groups the data by
their determined clusters, using an weighted mean. The weight is based on the
area of each grid cell.

Density-Based Spatial Clustering of Applications with Noise (DBSCAN) clusters
gridcells together which are of the same sign and in proximity to
each other using DBSCAN.

Clusters labelled with a positive value represent a positive correlation with
the target timeseries, the clusters labelled wit