In [None]:
import sys
sys.path.append("..")
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_process import ArmaProcess

from timeseries import lowpass
from xr_regression import lag_linregress_3D

%matplotlib inline
%load_ext autoreload
%autoreload 2

# correlation significance testing under filtering
We always expect the distribution of p-values to be uniform on [0,1] when independent, randomly generated time series are correlated.
For example, for an x% threshold, ca. x% of the correlations should appear "significant" purely by chance.

### generating artificial time series; methods to calculate d.o.f. reduction; MC experiment + visualization

In [None]:
def AR1(n):
    """Draw a sample of length ``n`` from an AR(1) process.

    ``ArmaProcess`` is given the AR lag polynomial ``[1, -0.9]`` and the MA
    polynomial ``[1]``; the notebook refers to this as the ``r=0.9`` case.
    """
    ar_poly = np.array([1, -.9])
    ma_poly = np.array([1])
    process = ArmaProcess(ar_poly, ma_poly)
    return process.generate_sample(nsample=n)

def AR1_03(n):
    """Draw a sample of length ``n`` from a weakly autocorrelated AR(1) process.

    ``ArmaProcess`` is given the AR lag polynomial ``[1, -0.3]`` and the MA
    polynomial ``[1]``.
    """
    ar_poly = np.array([1, -.3])
    ma_poly = np.array([1])
    process = ArmaProcess(ar_poly, ma_poly)
    return process.generate_sample(nsample=n)
    
def white_noise(n):
    """Return ``n`` samples from ``np.random.rand`` (uniform on [0, 1)).

    Draws from NumPy's global random state, so the result is reproducible
    only if the caller seeds that state beforehand.
    """
    samples = np.random.rand(n)
    return samples

def filtered_noise_5(n):
    """Return ``n`` uniform random samples passed through ``lowpass(..., 5)``.

    NOTE(review): the second ``lowpass`` argument is presumably the cutoff
    period in time steps -- confirm against ``timeseries.lowpass``.
    """
    raw = np.random.rand(n)
    return lowpass(raw, 5)

def filtered_noise_10(n):
    """Return ``n`` uniform random samples passed through ``lowpass(..., 10)``.

    NOTE(review): the second ``lowpass`` argument is presumably the cutoff
    period in time steps -- confirm against ``timeseries.lowpass``.
    """
    raw = np.random.rand(n)
    return lowpass(raw, 10)

def filtered_noise_20(n):
    """Return ``n`` uniform random samples passed through ``lowpass(..., 20)``.

    NOTE(review): the second ``lowpass`` argument is presumably the cutoff
    period in time steps -- confirm against ``timeseries.lowpass``.
    """
    raw = np.random.rand(n)
    return lowpass(raw, 20)

def filtered_AR1(n):
    """Return an ``AR1`` sample of length ``n`` passed through ``lowpass(..., 10)``."""
    series = AR1(n)
    return lowpass(series, 10)

def filtered_AR1_03(n):
    """Return an ``AR1_03`` sample of length ``n`` passed through ``lowpass(..., 10)``."""
    series = AR1_03(n)
    return lowpass(series, 10)

In [None]:
def dof_filter(x, y, cutoffs):
    """Degrees-of-freedom correction factor derived from filter cutoffs.

    Parameters
    ----------
    x, y : any
        Unused; present so that all ``dof_*`` estimators share one signature.
    cutoffs : tuple or list of numbers, or any other object
        Cutoff values of the filters applied to the series.

    Returns
    -------
    float or int
        ``1 / min(cutoffs)`` when ``cutoffs`` is a non-empty tuple or list,
        otherwise ``1`` (no correction).
    """
    # isinstance instead of an exact type comparison; lists are accepted too so
    # that ``cutoffs=[10]`` is not silently treated as "no correction".
    # An empty sequence carries no cutoff information and falls through to 1
    # rather than raising inside min().
    if isinstance(cutoffs, (tuple, list)) and len(cutoffs) > 0:
        return 1. / min(cutoffs)
    return 1

def dof_ac_x(x, y, cutoffs=None):
    """Correction factor from the lag-1 autocorrelation of ``x`` only.

    With ``r`` the correlation between ``x[1:]`` and ``x[:-1]``, returns
    ``2 * (1 - |r|) / (1 + |r|)``. ``y`` and ``cutoffs`` are not used.
    """
    lag1 = abs(np.corrcoef(x[1:], x[:-1])[0, 1])
    ratio = (1 - lag1) / (1 + lag1)
    return ratio * 2

def dof_ac_y(x, y, cutoffs=None):
    """Correction factor from the lag-1 autocorrelation of ``y`` only.

    With ``r`` the correlation between ``y[1:]`` and ``y[:-1]``, returns
    ``2 * (1 - |r|) / (1 + |r|)``. ``x`` and ``cutoffs`` are not used.
    """
    lag1 = abs(np.corrcoef(y[1:], y[:-1])[0, 1])
    ratio = (1 - lag1) / (1 + lag1)
    return ratio * 2

def dof_ac_xy(x, y, cutoffs=None):
    """Correction factor from the lag-1 autocorrelations of both series.

    With ``rx`` and ``ry`` the lag-1 autocorrelations of ``x`` and ``y``,
    returns ``(1 - |rx*ry|) / (1 + |rx*ry|)``. ``cutoffs`` is not used.
    """
    def _lag1(series):
        # correlation of the series with itself shifted by one step
        return np.corrcoef(series[1:], series[:-1])[0, 1]

    r_prod = abs(_lag1(x) * _lag1(y))
    return (1 - r_prod) / (1 + r_prod)

def dof_choose_max(x, y, cutoffs=None):
    """Return the larger of the ``dof_ac_xy`` and ``dof_filter`` factors."""
    candidates = (
        dof_ac_xy(x, y, cutoffs=None),  # autocorrelation-based factor
        dof_filter(x, y, cutoffs),      # filter-based factor
    )
    return max(candidates)

def dof_choose_min(x, y, cutoffs=None):
    """Return the smaller of the ``dof_ac_xy`` and ``dof_filter`` factors."""
    candidates = (
        dof_ac_xy(x, y, cutoffs=None),  # autocorrelation-based factor
        dof_filter(x, y, cutoffs),      # filter-based factor
    )
    return min(candidates)

In [None]:
def plot_results(A, lengths=(100, 1000)):
    """Plot histograms of the Monte Carlo correlations and p-values.

    Parameters
    ----------
    A : array indexed as ``A[stat, length_index, iteration]``
        ``A[0]`` holds the correlation values and ``A[1]`` the p-values, as
        filled in by ``significance_experiment``.
    lengths : sequence of int, optional
        Series lengths belonging to the second axis of ``A``; used only for
        the legend labels. Defaults to the lengths used by
        ``significance_experiment``.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    f, ax = plt.subplots(1, 2, figsize=(12, 4))
    xlabels = ('correlation coefficient', 'p-value')
    for stat, (axis, xlabel) in enumerate(zip(ax, xlabels)):
        for k, length in enumerate(lengths):
            axis.hist(A[stat, k, :], alpha=.4, label='n={}'.format(length))
        # label both panels so the figure can be read on its own
        axis.set_xlabel(xlabel)
        axis.set_ylabel('count')
        axis.legend()

def significance_experiment(fcn1, fcn2, cutoffs=None, red_dof=None):
    """Monte Carlo experiment on correlation significance.

    For each series length (100 and 1000), 1000 pairs of series are generated
    with ``fcn1`` and ``fcn2`` and passed to ``lag_linregress_3D``. The
    resulting ``cor`` and ``pval`` are collected and handed to
    ``plot_results``.

    Parameters
    ----------
    fcn1, fcn2 : callable
        Called as ``fcn(length)``; the result is assigned to the values of a
        1-D ``xr.DataArray`` with a ``time`` dimension.
    cutoffs : any, optional
        Forwarded to ``red_dof`` when ``red_dof`` is callable.
    red_dof : callable or number, optional
        If callable, evaluated as ``red_dof(x, y, cutoffs)`` for every pair;
        otherwise used directly. ``None`` is replaced by ``1``. The value is
        passed to ``lag_linregress_3D`` as ``dof_corr``.

    Returns
    -------
    None
    """
    def _blank_series(length):
        # placeholder series whose values are overwritten in every trial
        return xr.DataArray(data=np.zeros((length)), coords={'time':range(length)}, dims='time')

    correction = 1 if red_dof is None else red_dof
    n_trials = 1000
    results = np.zeros((2, 2, n_trials))
    for col, length in enumerate([100, 1000]):
        x = _blank_series(length)
        y = _blank_series(length)
        for trial in range(n_trials):
            # generate x before y so the random draws happen in a fixed order
            x.values = fcn1(length)
            y.values = fcn2(length)
            dof_corr = correction(x, y, cutoffs) if callable(correction) else correction
            ds = lag_linregress_3D(x, y, dof_corr=dof_corr)
            results[0, col, trial] = ds.cor
            results[1, col, trial] = ds.pval
    plot_results(results)

## Experiment 1: correlate two white noise random time series

In [None]:
# Baseline: both series come from white_noise; red_dof is omitted, so the correction factor defaults to 1.
significance_experiment(white_noise, white_noise)

## Experiment 2: effects of filtering 

### Experiment 2.1: correlate two filtered time series
showing that we have to use reduced degrees of freedom in the calculation of the t-statistic

In [None]:
# Both series are low-pass filtered (filtered_noise_10); no correction is applied (red_dof defaults to 1).
significance_experiment(filtered_noise_10, filtered_noise_10)

with filtered data, the effective number of degrees of freedom is $N'=N\frac{\Delta T}{T_0}$, with $T_0=10$ being the cutoff period and $\Delta T=1$ in the following case

In [None]:
# Filter-based correction: dof_filter returns 1/min(cutoffs) = 1/10 here.
significance_experiment(filtered_noise_10, filtered_noise_10, cutoffs=(10,10), red_dof=dof_filter)

### Experiment 2.2: correlate one filtered with one unfiltered time series

In [None]:
# Mixed case: filtered_noise_10 against white_noise, without any correction (red_dof defaults to 1).
significance_experiment(filtered_noise_10, white_noise)

### Experiment 2.3: two different filter cutoff periods

In [None]:
# dof_filter uses min(cutoffs) = 5, so the correction factor is 1/5.
significance_experiment(filtered_noise_5, filtered_noise_20, cutoffs=(5,20), red_dof=dof_filter)

using the higher of the two reduced numbers of degrees of freedom results in somewhat overestimated p-values

### Exp. 2.4: Does the AR dof estimation method work for filtered time series? No

In [None]:
# Two filtered series, corrected with the lag-1 autocorrelation estimator dof_ac_xy; cutoffs is not passed.
significance_experiment(filtered_noise_5, filtered_noise_20, red_dof=dof_ac_xy)

## Experiment 3: autocorrelated time series from AR(1) process

with autocorrelation, one needs to apply a correction for the dof:

### Exp. 3.1: two AR(1) with $r=0.9$

In [None]:
# Uncorrected reference: two AR1 series, red_dof defaults to 1.
significance_experiment(AR1, AR1)

In [None]:
# Constant correction: a non-callable red_dof (0.1) is handed to lag_linregress_3D unchanged.
significance_experiment(AR1, AR1, red_dof=.1)

In [None]:
# Correction estimated from the lag-1 autocorrelation of x only (dof_ac_x).
significance_experiment(AR1, AR1, red_dof=dof_ac_x)  # same as `dof_ac_y`
# this is by chance correct, I think
# NOTE(review): for r = 0.9, 2*(1-r)/(1+r) and (1-r**2)/(1+r**2) both give ~0.105,
# which is presumably the coincidence meant here.

In [None]:
# Correction based on the product of both lag-1 autocorrelations (dof_ac_xy).
significance_experiment(AR1, AR1, red_dof=dof_ac_xy)

### Exp. 3.2: two different $r$-values

In [None]:
# AR1 against AR1_03 (different AR coefficients), corrected with dof_ac_xy.
significance_experiment(AR1, AR1_03, red_dof=dof_ac_xy)

## Experiment 4: one AR(1) time series and one filtered time series

In [None]:
# Uncorrected reference: filtered_noise_10 against AR1_03, red_dof defaults to 1.
significance_experiment(filtered_noise_10, AR1_03)

#### using the `dof_ac_xy` estimator

In [None]:
# filtered_noise_10 against AR1_03 with the autocorrelation-based estimator; cutoffs is not passed.
significance_experiment(filtered_noise_10, AR1_03, red_dof=dof_ac_xy)

In [None]:
# filtered_noise_5 against AR1 with the autocorrelation-based estimator; cutoffs is not passed.
significance_experiment(filtered_noise_5, AR1, red_dof=dof_ac_xy)

#### using the `dof_filter` estimator

In [None]:
# cutoffs must match the filter actually applied: filtered_noise_10 calls lowpass(..., 10),
# so dof_filter should return 1/10 (this was cutoffs=(5,), which belongs to filtered_noise_5).
significance_experiment(filtered_noise_10, AR1_03, cutoffs=(10,), red_dof=dof_filter)

In [None]:
# filtered_noise_5 against AR1; dof_filter yields 1/5 from cutoffs=(5,).
significance_experiment(filtered_noise_5, AR1, cutoffs=(5,), red_dof=dof_filter)

#### compromise: choose maximum of DOF correction factor

In [None]:
# dof_choose_max: the larger of the dof_ac_xy factor and the dof_filter factor (1/10 here).
significance_experiment(filtered_noise_10, AR1_03, cutoffs=(10,), red_dof=dof_choose_max)

In [None]:
# dof_choose_min: the smaller of the dof_ac_xy factor and the dof_filter factor (1/10 here).
significance_experiment(filtered_noise_10, AR1_03, cutoffs=(10,), red_dof=dof_choose_min)

In [None]:
# dof_choose_max: the larger of the dof_ac_xy factor and the dof_filter factor (1/5 here).
significance_experiment(filtered_noise_5, AR1, cutoffs=(5,), red_dof=dof_choose_max)  # this went wrong with `ac_xy`

In [None]:
# dof_choose_min: the smaller of the dof_ac_xy factor and the dof_filter factor (1/5 here).
significance_experiment(filtered_noise_5, AR1, cutoffs=(5,), red_dof=dof_choose_min)  # this went wrong with `ac_xy`

#### other cases

In [None]:
# filtered_noise_20 against AR1_03; larger of the dof_ac_xy and dof_filter (1/20) factors.
significance_experiment(filtered_noise_20, AR1_03, cutoffs=(20,), red_dof=dof_choose_max)

In [None]:
# filtered_noise_20 against AR1_03; smaller of the dof_ac_xy and dof_filter (1/20) factors.
significance_experiment(filtered_noise_20, AR1_03, cutoffs=(20,), red_dof=dof_choose_min)

In [None]:
# filtered_AR1 (AR1 passed through lowpass(..., 10)) against AR1_03; larger of the two factors.
significance_experiment(filtered_AR1, AR1_03, cutoffs=(10,), red_dof=dof_choose_max)

In [None]:
# filtered_AR1 (AR1 passed through lowpass(..., 10)) against AR1_03; smaller of the two factors.
significance_experiment(filtered_AR1, AR1_03, cutoffs=(10,), red_dof=dof_choose_min)

In [None]:
# filtered_AR1_03 (AR1_03 passed through lowpass(..., 10)) against unfiltered AR1_03; larger of the two factors.
significance_experiment(filtered_AR1_03, AR1_03, cutoffs=(10,), red_dof=dof_choose_max)

In [None]:
# filtered_AR1_03 (AR1_03 passed through lowpass(..., 10)) against unfiltered AR1_03; smaller of the two factors.
significance_experiment(filtered_AR1_03, AR1_03, cutoffs=(10,), red_dof=dof_choose_min)

In [None]:
# filtered_AR1_03 against AR1; larger of the dof_ac_xy and dof_filter (1/10) factors.
significance_experiment(filtered_AR1_03, AR1, cutoffs=(10,), red_dof=dof_choose_max)

In [None]:
# filtered_AR1_03 against AR1; smaller of the dof_ac_xy and dof_filter (1/10) factors.
significance_experiment(filtered_AR1_03, AR1, cutoffs=(10,), red_dof=dof_choose_min)

This seems to somewhat underestimate the tails of the p-value distribution, i.e. some correlations do not show up as significant when in fact they would be under a given threshold.