# About

This notebook tests the CCA-Zoo `KCCA` module to check whether its results match those obtained by solving the KCCA problem manually.

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import scipy
import warnings
import time
from tqdm.notebook import tqdm

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import CCA, PLSCanonical
from sklearn.utils import Bunch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import rbf_kernel, euclidean_distances
from sklearn.metrics.pairwise import pairwise_kernels

from cca_zoo.models import CCA as cz_CCA
from cca_zoo.models import rCCA as cz_rCCA
from cca_zoo.models import KCCA
from cca_zoo.model_selection import GridSearchCV as cz_GridSearchCV
from cca_zoo.plotting import pairplot_train_test

# Seed shared by the NumPy generator below and by the `random_state` argument
# of the CCA-Zoo KCCA model fitted later in this notebook.
rs_num = 42
# NOTE(review): `rng` is not referenced by any cell visible in this notebook.
rng = np.random.default_rng(rs_num)

# Show scikit-learn estimators as diagrams when they are displayed in a cell.
sklearn.set_config(display="diagram")

# NOTE(review): presumably intended for RepeatedKFold cross-validation; neither
# value is used by the cells visible in this notebook - confirm before removing.
n_splits = 5
n_repeats = 20

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Read the two example views from comma-delimited CSV files.
# NOTE: the next cell rebuilds X_train / Y_train (and both dimension counts)
# from synthetic data, so the arrays loaded here are replaced before use.
data_path = 'data//'
X_train, Y_train = (
    np.genfromtxt(data_path + csv_name, delimiter=',')
    for csv_name in ('example_3_2_X_a.csv', 'example_3_2_X_b.csv')
)

# Number of columns (features) in each view.
X_dimension_num, Y_dimension_num = X_train.shape[1], Y_train.shape[1]

In [3]:
# Small synthetic two-view data set.
# NOTE: this replaces the X_train / Y_train loaded from CSV in the previous cell.
sample_num = 5
X_dimension_num = 3
Y_dimension_num = 2

# View X: a linear ramp, a linearly spaced column and a log-spaced column.
X_train = np.column_stack((
    np.arange(1, sample_num + 1),
    np.linspace(-0.25, 0.6, sample_num),
    np.logspace(1, 2, sample_num),
))
assert X_train.shape == (sample_num, X_dimension_num)

# View Y: derived from the raw (not yet standardized) columns of X.
Y_train = np.column_stack((
    0.25 * np.exp(X_train[:, 1]),
    -X_train[:, 0],
))
assert Y_train.shape == (sample_num, Y_dimension_num)

# Standardize each view with its own freshly fitted scaler.
X_train, Y_train = (
    StandardScaler().fit_transform(view) for view in (X_train, Y_train)
)

# Manual Computation

In [4]:
# for this method, we do not center the matrices
def rbf_kcca(X, Y, c1, c2, gamma1, gamma2, n_components):
    """Solve regularized RBF-kernel CCA as a generalized eigenvalue problem.

    The kernel matrices are used as computed (no centering is applied), and
    X and Y are assumed to have been standardized by the caller.

    Parameters
    ----------
    X, Y : 2-D arrays with the same number of rows (samples)
        The two views.
    c1, c2 : float
        Regularization added to the diagonal of Kx / Ky before squaring.
    gamma1, gamma2 : float
        RBF kernel gamma used for X / Y.
    n_components : int
        Number of leading components to keep.

    Returns
    -------
    list
        ``[eig_val, alpha, beta, Kx, Ky, zx, zy, cc_arr, A, B]`` where
        ``eig_val`` holds the kept non-negative eigenvalues in descending
        order, ``alpha`` / ``beta`` are the matching upper / lower halves of
        the eigenvectors, ``zx = Kx @ alpha`` and ``zy = Ky @ beta`` are the
        projections, ``cc_arr`` is the cosine similarity between matching
        columns of ``zx`` and ``zy``, and ``A`` / ``B`` are the two matrices
        of the eigenproblem ``A v = lambda B v``.
    """
    n = X.shape[0]
    first, second = slice(None, n), slice(n, None)

    # RBF Gram matrix of each view
    Kx = rbf_kernel(X, X, gamma1)
    Ky = rbf_kernel(Y, Y, gamma2)

    # left-hand side: cross-kernel products on the off-diagonal blocks
    A = np.zeros((2 * n, 2 * n))
    A[first, second] = Kx.dot(Ky)
    A[second, first] = Ky.dot(Kx)

    # right-hand side: squared regularized kernels on the diagonal blocks
    ridge = np.eye(n)
    B = np.zeros_like(A)
    B[first, first] = np.linalg.matrix_power(Kx + c1 * ridge, 2)
    B[second, second] = np.linalg.matrix_power(Ky + c2 * ridge, 2)

    # generalized symmetric eigenproblem A v = lambda B v
    eig_val, eig_vec = scipy.linalg.eigh(A, B)

    # keep only the non-negative eigenvalues and their eigenvectors
    keep = eig_val >= 0.0
    vals, vecs = eig_val[keep], eig_vec[:, keep]

    # order from largest to smallest eigenvalue
    order = np.argsort(vals)[::-1]
    vals, vecs = vals[order], vecs[:, order]

    # leading components; upper half of each eigenvector belongs to X,
    # lower half to Y
    top_vals = vals[:n_components]
    alpha = vecs[first, :n_components]
    beta = vecs[second, :n_components]

    # project the kernels onto the canonical directions
    zx = Kx.dot(alpha)
    zy = Ky.dot(beta)

    # canonical correlation taken as the (uncentered) cosine similarity
    # between matching projection columns
    cc_arr = np.diag(cosine_similarity(zx.T, zy.T))

    return [top_vals, alpha, beta, Kx, Ky, zx, zy, cc_arr, A, B]

In [5]:
# some parameters
# Hyper-parameters shared by the manual solver (`rbf_kcca`) and the CCA-Zoo model.
# c1 / c2: regularization for the X / Y view.
c1 = 0.2
c2 = 0.2
# gamma1 / gamma2: RBF kernel gamma for the X / Y view.
gamma1 = 0.03977
gamma2 = 0.03966
# number of canonical components to keep
n_components = 2

In [6]:
# container for every result of the manual KCCA computation
ma_kcca_bunch = Bunch(name='Manual KCCA')

In [7]:
print('For manually computed KCCA:')

# solve for kcca and store each returned quantity on the bunch,
# in the order in which `rbf_kcca` returns them
ma_kcca_bunch.update(zip(
    ('eig_val', 'alpha', 'beta', 'Kx', 'Ky', 'zx', 'zy', 'cc_arr', 'A', 'B'),
    rbf_kcca(X_train, Y_train, c1, c2, gamma1, gamma2, n_components),
))

print('Canonical Correlation for {} components:'.format(n_components))
print(ma_kcca_bunch.cc_arr)

For manually computed KCCA:
Canonical Correlation for 2 components:
[0.99977359 0.99947471]


# CCA-Zoo KCCA

In [8]:
# container for the fitted CCA-Zoo model and its results
cz_kcca_bunch = Bunch(name='CCA-Zoo KCCA')

In [9]:
print('For CCA-Zoo KCCA:')

# Fit CCA-Zoo's KCCA on the same two views, passing the same gamma / c values
# and component count that were given to the manual solver.
# NOTE(review): `eps` is presumably a numerical tolerance - confirm against the
# CCA-Zoo documentation.
cz_kcca_bunch.kcca = KCCA(
    latent_dims=n_components, 
    kernel=['rbf', 'rbf'], 
    gamma=[gamma1, gamma2], 
    c=[c1, c2], 
    eps=1e-9, 
    random_state=rs_num
).fit([X_train, Y_train])

# Score on the training views; the result is reported below as the
# canonical correlation of each component.
cz_kcca_bunch.cc_arr = cz_kcca_bunch.kcca.score([X_train, Y_train])

print('Canonical Correlation for {} components:'.format(n_components))
print(cz_kcca_bunch.cc_arr)

For CCA-Zoo KCCA:
Canonical Correlation for 2 components:
[0.99831147 0.99947935]


We can see that, for the CCA-Zoo module, the canonical correlations are not sorted in descending order, as they should be.

Some debugging tests revealed several differences:
1. CCA-Zoo performs standardization with `ddof=1`, whereas in our case `StandardScaler()` uses the biased `ddof=0` estimate.
2. When solving the generalized eigenvalue problem with `scipy.linalg.eigh(a, b)`, the CCA-Zoo package constructs the matrices `a` and `b` differently from our manual computation.