In [None]:
## This Jupyter notebook reproduces the results for WDL in Table 1

In [None]:
# import packages
import os, sys
sys.path.append('../../../lib/')
import WDL as wp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split

In [None]:
# load data
def _read_matrix(csv_path):
    """Read a CSV file with pandas and return its contents as a NumPy array."""
    return pd.read_csv(csv_path).to_numpy()


# X and Y are indexed by row in the cross-validation cell (one row per distribution)
X = _read_matrix('../../../data/income/processed/dat_X.csv')
Y = _read_matrix('../../../data/income/processed/dat_Y.csv')
# fold label of every row, flattened to 1-D so it can be used as a boolean-mask source
loc_cv = _read_matrix('../../../data/income/processed/dat_CV.csv').flatten()

In [None]:
# nested cross validation
## hyper-parameter grid searched in the inner loop, and the iteration cap for that search
K_list = [2, 3, 5]
lr_list = [1e-1, 1e-2]
n_iter = 1000

## problem sizes
n_levs = 100
n_dist = len(Y)
# fold labels are used as 0-based indices below, so the fold count is max label + 1
n_fold = loc_cv.max() + 1
# quantile levels 1/n_levs, 2/n_levs, ..., (n_levs-1)/n_levs
q_vec = np.arange(1, n_levs) / n_levs

## transform Y: every row of Y is replaced by its empirical quantiles at q_vec
Q_mat = np.array([np.quantile(y_row, q_vec) for y_row in Y])

## result buffers
# Q_test collects the held-out predictions of every row; Q_train keeps, per fold
# (last axis), the in-sample predictions of the rows used for training in that fold
n_row, n_col = Q_mat.shape[0], Q_mat.shape[1]
Q_train = np.zeros((n_row, n_col, n_fold))
Q_test = np.zeros(Q_mat.shape)
## outer loop
def _predict_mixture(X_in, fit, n_comp, lr_vec, n_term):
    """Predict the mixture parameters of a fitted WDL model for every row of X_in.

    Parameters
    ----------
    X_in : array with one row per observation.
    fit : result returned by ``wp.WDL``; its 'alpha', 'mu' and 'sigma' entries
        are indexed by mixture component.
    n_comp : number of mixture components.
    lr_vec : learning-rate vector forwarded to ``wp.pred_boost`` as ``lr_``.
    n_term : number of terms forwarded to ``wp.pred_boost`` as ``n_term``.

    Returns
    -------
    alpha, mu, sigma, pi : arrays of shape (n_rows, n_comp). ``sigma`` is the
        exponential of the boosted prediction and ``pi`` is ``exp(alpha)``
        normalised so that every row sums to one.
    """
    n_rows = X_in.shape[0]
    alpha = np.zeros((n_rows, n_comp))
    mu = np.zeros((n_rows, n_comp))
    sigma = np.zeros((n_rows, n_comp))
    for k in range(n_comp):
        alpha[:, k] = wp.pred_boost(X_in, fit['alpha'][k], lr_=lr_vec, n_term=n_term)
        mu[:, k] = wp.pred_boost(X_in, fit['mu'][k], lr_=lr_vec, n_term=n_term)
        sigma[:, k] = np.exp(wp.pred_boost(X_in, fit['sigma'][k], lr_=lr_vec, n_term=n_term))
    # row-wise softmax of alpha gives the mixture weights
    pi = np.exp(alpha)
    pi = (pi.T / np.sum(pi, axis=1)).T
    return alpha, mu, sigma, pi


# the candidate grid does not depend on the fold, so build it once
par_combo = [(K, lr) for K in K_list for lr in lr_list]

time_start = datetime.now()
print('Start training:', time_start)
for i in range(n_fold):
    print('This is fold', str(i+1))
    X_train = X[loc_cv != i]
    Y_train = Q_mat[loc_cv != i]
    X_test = X[loc_cv == i]
    Y_test = Q_mat[loc_cv == i]

    n_test = Y_test.shape[0]
    n_train = Y_train.shape[0]

    ## inner parameter selection
    # hold out 25% of the training rows (fixed seed) to score every (K, lr) pair
    X_t_in, X_v_in, Y_t_in, Y_v_in = train_test_split(X_train, Y_train, test_size=0.25, random_state=2022)
    loss_ = []
    iters_ = []
    for K_mix, lr in par_combo:
        print(K_mix, lr)
        res_init = wp.WDL(X_t_in, Y_t_in, X_v_in, Y_v_in,
                          q_vec=q_vec, K=K_mix, max_iter=n_iter, warm_up=1, max_depth=1,
                          patience=10, lr=lr, random_state=2022)
        iters_.append(res_init['iter_best'])
        loss_.append(res_init['val_loss'][res_init['iter_best']])

    ## choose the best params
    # np.argmin would return the position of a NaN loss (a diverged run) as the
    # "minimum"; nanargmin skips NaNs and raises ValueError if every run is NaN
    best_idx = int(np.nanargmin(np.array(loss_)))
    K_best, lr_best = par_combo[best_idx]
    iter_best = iters_[best_idx]
    print('Best:', K_best, lr_best, iter_best)
    ## retrain the model over the training set
    # early stopping is disabled and max_iter is set to the iteration selected above;
    # the test fold is passed in the slots the inner search uses for validation data
    # NOTE(review): presumably it does not influence fitting when early_stop=False -- confirm in wp.WDL
    res = wp.WDL(X_train, Y_train, X_test, Y_test, q_vec=q_vec,
                 K=K_best, max_iter=iter_best, warm_up=1,
                 max_depth=1, early_stop=False, lr=lr_best, random_state=2022)

    # learning-rate vector: 1 for the first term, then lr_best for each of the
    # iter_best terms (presumably the first term is the initial fit -- confirm in wp.pred_boost)
    v_lr = np.array([1] + [lr_best] * iter_best)

    ## held-out predictions for this fold
    alpha_test, mu_test, sigma_test, pi_test = _predict_mixture(X_test, res, K_best, v_lr, iter_best)
    Q_test[loc_cv == i] = [wp.qgmm1d(q_vec, mu_test[j], sigma_test[j], pi_test[j]) for j in range(n_test)]

    ## in-sample predictions for this fold; rows held out in fold i are marked NaN
    alpha_train, mu_train, sigma_train, pi_train = _predict_mixture(X_train, res, K_best, v_lr, iter_best)
    Q_train[loc_cv != i, :, i] = [wp.qgmm1d(q_vec, mu_train[j], sigma_train[j], pi_train[j]) for j in range(n_train)]
    Q_train[loc_cv == i, :, i] = np.nan

print('Done!')
print('Time:', datetime.now() - time_start )

In [None]:
# evaluate the results
# NOTE: despite its name, RMSE holds the mean squared error between observed and
# predicted quantiles (no square root is taken); the name is kept unchanged.
resid = Q_mat - Q_test
RMSE = np.square(resid).mean()
# mean squared deviation of the observed quantiles from their per-level means
centered = Q_mat - Q_mat.mean(axis=0)
var_y = np.square(centered).mean()
# share of that variation explained by the held-out predictions
R_sq = 1 - RMSE / var_y
print('Test loss:', RMSE)
print('Test R-squared:', R_sq)

In [None]:
# save prediction results
# Both writers fail when the target folder is missing, which would discard the
# cross-validation results; create it first (no-op if it already exists).
os.makedirs('../predictions', exist_ok=True)
# held-out quantile predictions, one row per distribution (the DataFrame index is written too)
pd.DataFrame(Q_test).to_csv('../predictions/qt_test_WDL.csv')
# in-sample predictions per fold (last axis); rows held out in a fold are NaN
np.save('../predictions/qt_train_WDL.npy', Q_train)