In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.linalg import toeplitz

from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.base import clone

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from dml.double_ml_data import DoubleMLData
from dml.double_ml_pliv import DoubleMLPLIV
from dml.double_ml_resampling import DoubleMLMultiwayResampling

## Generate data using the example from https://arxiv.org/pdf/1909.03489.pdf

In [None]:
# simulation parameters
N = 25       # size of the first cluster dimension (first level of the row index)
M = 25       # size of the second cluster dimension (second level of the row index)
C = 25       # NOTE(review): not referenced anywhere in the visible notebook - confirm it is needed
dim_X = 100  # number of covariates (columns of X)
K = 3        # passed to DoubleMLMultiwayResampling; K*K splits are plotted further below

# Seed NumPy's global random generator so that the np.random draws in the
# following cells produce the same data on every "Restart & Run All".
SEED = 1234
np.random.seed(SEED)

In [None]:
# additional fixed parameters
theta_0 = 1.0
pi_10 = theta_0

xx = np.arange(1,dim_X+1)
zeta_0 = np.power(0.5, xx)
pi_20 = zeta_0
xi_0 = zeta_0

omega_X = np.array([0.25, 0.5])
omega_eps = omega_X
omega_v = omega_X
omega_V = omega_X

s_X = 0.25
s_epsv = s_X

In [None]:
# np.tile() and np.repeat() repeat a vector in two different ways:
# np.tile([v1, v2, v3], 2)   -> [v1, v2, v3, v1, v2, v3]
# np.repeat([v1, v2, v3], 2) -> [v1, v1, v2, v2, v3, v3]

In [None]:
# Observation-level component: one standard normal draw per (i, j) cell.
alpha_V = np.random.normal(size=N * M)
# First-dimension component: one draw per i, each value repeated M times in a row.
alpha_V_i = np.random.normal(size=N).repeat(M)
# Second-dimension component: one draw per j, the whole vector repeated N times.
alpha_V_j = np.tile(np.random.normal(size=M), N)

In [None]:
# Covariance of the (eps, v) component pairs: unit variances, covariance s_epsv.
cov_mat = np.array([[1, s_epsv],
                    [s_epsv, 1]])

# Observation-level pairs: one bivariate normal draw per (i, j) cell.
alpha_eps_v = np.random.multivariate_normal(mean=np.zeros(2), cov=cov_mat, size=N * M)
alpha_eps, alpha_v = alpha_eps_v[:, 0], alpha_eps_v[:, 1]

# First-dimension pairs: one draw per i, each value repeated M times in a row.
alpha_eps_v_i = np.random.multivariate_normal(mean=np.zeros(2), cov=cov_mat, size=N)
alpha_eps_i, alpha_v_i = (np.repeat(column, M) for column in alpha_eps_v_i.T)

# Second-dimension pairs: one draw per j, the whole vector repeated N times.
alpha_eps_v_j = np.random.multivariate_normal(mean=np.zeros(2), cov=cov_mat, size=M)
alpha_eps_j, alpha_v_j = (np.tile(column, N) for column in alpha_eps_v_j.T)

In [None]:
# Toeplitz covariance for the X components: entry (a, b) equals s_X**|a - b|.
cov_mat = toeplitz(np.power(s_X, np.arange(dim_X)))

# Observation-level component: one dim_X-vector per (i, j) cell.
alpha_X = np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=N * M)
# First-dimension component: one vector per i, each row repeated M times in a row.
alpha_X_i = np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=N).repeat(M, axis=0)
# Second-dimension component: one vector per j, the whole block stacked N times.
alpha_X_j = np.tile(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=M), (N, 1))

In [None]:
# generate variables
X = (1 - omega_X[0] - omega_X[1]) * alpha_X \
      + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j

eps = (1 - omega_eps[0] - omega_eps[1]) * alpha_eps \
      + omega_eps[0] * alpha_eps_i + omega_eps[1] * alpha_eps_j

v = (1 - omega_v[0] - omega_v[1]) * alpha_v \
    + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j

V = (1 - omega_V[0] - omega_V[1]) * alpha_V \
    + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j

In [None]:
# Z is linear in X plus V.
Z = X @ xi_0 + V
# D is linear in Z and X plus v.
D = pi_10 * Z + X @ pi_20 + v
# Y is linear in D (coefficient theta_0) and X plus eps.
Y = theta_0 * D + X @ zeta_0 + eps

In [None]:
# Row index: every (i, j) pair, with the first level varying slowest.
ind = pd.MultiIndex.from_product([range(N), range(M)])

# Column labels: Y, D, Z, V followed by x1, ..., x<dim_X>.
cols = ['Y', 'D', 'Z', 'V']
cols.extend(f'x{k}' for k in range(1, dim_X + 1))

data = pd.DataFrame(data=np.column_stack((Y, D, Z, V, X)), index=ind, columns=cols)

In [None]:
# data with multi index for rows (tuples with two entries)
# With M = 25 the first 30 rows cover every row whose first index value is 0
# and the first five rows whose first index value is 1.
data.head(30)

## Initialize the objects of class DoubleMLData and DoubleMLPLIV

In [None]:
# collect data: labels of all columns whose name starts with 'x'
x_cols = data.columns[data.columns.str.startswith('x')].tolist()

# Set machine learning methods for m, g & r: each entry gets its own copy of
# the same base learner.
learner = RandomForestRegressor(max_depth=2, n_estimators=10)
ml_learners = {key: clone(learner) for key in ('ml_m', 'ml_g', 'ml_r')}

# draw_sample_splitting=False: the sample splitting is handed to the object
# further below via set_sample_splitting().
dml_pliv_obj = DoubleMLPLIV(
    data, x_cols, 'Y', ['D'], 'Z',
    ml_learners,
    inf_model='DML2018',
    dml_procedure='dml1',
    draw_sample_splitting=False,
)

## Split samples and transfer the sample splitting to the object

In [None]:
# Sizes of the two cluster dimensions, in the order of the row index levels.
smpl_sizes = [N, M]
obj_dml_multiway_resampling = DoubleMLMultiwayResampling(K, smpl_sizes)

# The first return value is indexed with (i, j) pairs in the tuple plot below;
# the second is used with row positions in the linear-index plot and is the
# one passed on to set_sample_splitting().
(smpls_multi_ind,
 smpls_lin_ind) = obj_dml_multiway_resampling.split_samples()

In [None]:
# Hand the linear-index splits to the model object, which was created with
# draw_sample_splitting=False.
# NOTE(review): the outer list presumably stands for a single repetition of
# the sample splitting - confirm against DoubleMLPLIV.set_sample_splitting.
dml_pliv_obj.set_sample_splitting([smpls_lin_ind])

## Fit the model and return the fitted parameter

In [None]:
# Fit the model; the sample splitting set above via set_sample_splitting()
# is presumably the one used here - confirm against DoubleMLPLIV.fit.
dml_pliv_obj.fit()

In [None]:
# Show the fitted parameter; for comparison, the simulated data use
# theta_0 = 1.0 as the coefficient on D.
dml_pliv_obj.coef

## Visualization of sample splitting with tuple and linear indexing

In [None]:
from matplotlib.colors import ListedColormap

# Discrete three-color scheme: the first, middle and last color of a
# 7-step "RdBu_r" palette.
x = sns.color_palette("RdBu_r", 7)
cMap = ListedColormap([x[pos] for pos in (0, 3, 6)])

In [None]:
# Default figure size (width 15, height 12) for the plots below.
plt.rc('figure', figsize=(15, 12))
# Seaborn defaults with fonts scaled by a factor of 1.3.
sns.set(font_scale=1.3)

### Visualize sample splitting with tuples (one plot per fold)

In [None]:
# One heatmap per split on a K x K grid of subplots. Cell values:
# -1 = labelled 'Nuisance', +1 = labelled 'Score', 0 = in neither set.
for i_split, this_split_ind in enumerate(smpls_multi_ind):
    ax = plt.subplot(K, K, i_split + 1)

    # (n, 2) integer arrays of (i, j) pairs; reshape keeps an empty set 2-D.
    ind_array_train = np.array([*this_split_ind[0]], dtype=int).reshape(-1, 2)
    ind_array_test = np.array([*this_split_ind[1]], dtype=int).reshape(-1, 2)

    # Mark exactly the listed (i, j) pairs. NumPy integer-array indexing pairs
    # the row and column positions element by element, whereas
    # df.loc[rows, cols] would fill the full cross product of rows and cols.
    fold_map = np.zeros([N, M])
    fold_map[ind_array_train[:, 0], ind_array_train[:, 1]] = -1.
    fold_map[ind_array_test[:, 0], ind_array_test[:, 1]] = 1.
    df = pd.DataFrame(fold_map)

    # Fixed color limits keep the three colorbar ticks centered on the three
    # color bands regardless of which values occur in this split.
    ax = sns.heatmap(df, cmap=cMap, vmin=-1., vmax=1., ax=ax)
    # The rows of df run along the y axis, so its extent is N (not M).
    # Setting both limits explicitly also puts row 0 at the bottom.
    ax.set_ylim([0, N])

    colorbar = ax.collections[0].colorbar
    colorbar.set_ticks([-0.667, 0, 0.667])
    if i_split % K == (K-1):
        # Only the last subplot of each grid row gets labelled colorbar ticks.
        colorbar.set_ticklabels(['Nuisance', '', 'Score'])
    else:
        colorbar.set_ticklabels(['', '', ''])

### Visualize sample splitting with linear indexing (one column per fold)

In [None]:
# One column per split, one row per observation (linear row position).
# Cell values: -1 = labelled 'Nuisance', +1 = labelled 'Score', 0 = in neither set.
df = pd.DataFrame(np.zeros([N*M, K*K]))
for i_split, this_split_ind in enumerate(smpls_lin_ind):
    for rows, value in ((this_split_ind[0], -1.), (this_split_ind[1], 1.)):
        df.loc[rows, i_split] = value

ax = sns.heatmap(data=df, cmap=cMap)
ax.invert_yaxis()
# Explicit limits: observation 0 at the bottom, N*M at the top.
ax.set_ylim(0, N*M)
colorbar = ax.collections[0].colorbar
colorbar.set_ticks([-0.667, 0, 0.667])
colorbar.set_ticklabels(['Nuisance', '', 'Score'])