# 0. About

This notebook explores how the `score` function works in CCA.

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import scipy
import warnings

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import CCA, PLSCanonical
from sklearn.utils import Bunch
from sklearn.metrics.pairwise import cosine_similarity

from cca_zoo.models import CCA as cz_CCA
from cca_zoo.models import rCCA as cz_rCCA
from cca_zoo.model_selection import GridSearchCV as cz_GridSearchCV

rs_num = 14
rng = np.random.default_rng(rs_num)

sklearn.set_config(display="diagram")

%matplotlib inline

# 1. Dataset Construction

In [2]:
# Problem size: number of observations and the width of each view.
sample_num = 100
X_dimension_num = 4
Y_dimension_num = 3

# Four standalone standard-normal columns (loc=0.0, scale=1.0 are the defaults).
# Nothing in this cell reads x1..x4, but every draw advances `rng`, so they are
# kept in place to leave the values drawn for X, the noise and Y unchanged.
x1 = rng.normal(size=(sample_num, 1))
x2 = rng.normal(size=(sample_num, 1))
x3 = rng.normal(size=(sample_num, 1))
x4 = rng.normal(size=(sample_num, 1))

# View X: i.i.d. standard-normal features.
X = rng.normal(size=(sample_num, X_dimension_num))

# Independent zero-mean noise terms with variances 0.2, 0.4 and 0.3.
xi_1 = rng.normal(loc=0, scale=np.sqrt(0.2), size=sample_num)
xi_2 = rng.normal(loc=0, scale=np.sqrt(0.4), size=sample_num)
xi_3 = rng.normal(loc=0, scale=np.sqrt(0.3), size=sample_num)

# View Y: allocated with a random fill, then each of its three columns is
# overwritten with one (signed) column of X plus noise. X[:, 1] is not used.
Y = rng.normal(size=(sample_num, Y_dimension_num))
Y[:, 0] = X[:, 2] + xi_1
Y[:, 1] = X[:, 0] + xi_2
Y[:, 2] = xi_3 - X[:, 3]

# Hold out 20% of the samples as a test set; the split is seeded with rs_num.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=rs_num,
)

In [3]:
# Standardize both views. Each scaler is fitted on the training split only and
# then applied unchanged to the matching test split.
# NOTE(review): the CCA cells visible in this notebook fit on the raw
# X_train / Y_train, not on these standardized copies.
X_train_scaler = StandardScaler().fit(X_train)
Y_train_scaler = StandardScaler().fit(Y_train)

X_train_standardized, X_test_standardized = (
    X_train_scaler.transform(split) for split in (X_train, X_test)
)
Y_train_standardized, Y_test_standardized = (
    Y_train_scaler.transform(split) for split in (Y_train, Y_test)
)

# 1. Apply CCA via sklearn

In [4]:
print('For sklearn CCA:')

# Keep the settings, the fitted estimator and everything derived from it
# together in a single Bunch.
sklearn_cca_bunch = Bunch(
    name='sklearn cca',
    # one canonical pair per column of the narrower view
    n_components=np.min([X_dimension_num, Y_dimension_num]),
    # very large iteration cap combined with a very small tolerance
    max_iter=int(1e15),
    tol=1e-15,
)

# Fit on the (unscaled) training split.
sklearn_cca_bunch.cca = CCA(
    n_components=sklearn_cca_bunch.n_components,
    max_iter=sklearn_cca_bunch.max_iter,
    tol=sklearn_cca_bunch.tol,
).fit(X_train, Y_train)

# Project the training split onto the canonical directions.
sklearn_cca_bunch.Xc_train, sklearn_cca_bunch.Yc_train = (
    sklearn_cca_bunch.cca.transform(X_train, Y_train)
)

# Canonical correlations by hand: cosine similarity between the i-th X-score
# column and the i-th Y-score column.
# NOTE(review): cosine similarity equals Pearson correlation only for
# mean-centred vectors -- presumably the training scores are centred; the
# np.corrcoef cross-check below is the direct computation.
sklearn_cca_bunch.cc = np.array([
    cosine_similarity(
        sklearn_cca_bunch.Xc_train[:, component_i].reshape(1, -1),
        sklearn_cca_bunch.Yc_train[:, component_i].reshape(1, -1),
    )[0, 0]
    for component_i in range(sklearn_cca_bunch.n_components)
])
print('  The canonical correlations over training dataset are: ', sklearn_cca_bunch.cc)

# Cross-check: correlate all score columns of both views at once. The diagonal
# shifted by n_components pairs X-score i with Y-score i (assuming each score
# matrix has exactly n_components columns).
print(
    '  Also CC computed in a different way',
    np.corrcoef(
        sklearn_cca_bunch.Xc_train,
        sklearn_cca_bunch.Yc_train,
        rowvar=False,
    ).diagonal(offset=sklearn_cca_bunch.n_components),
)

# Score the very same training data with the estimator's own `score` method.
print(
    '  The scores over training dataset are: ',
    sklearn_cca_bunch.cca.score(X_train, Y_train),
)

# Interpretation of the number printed above.
print('The score function in sklearn is to try to predict Y and compute R^2 from there.')
print('R^2 can be seen as a standardized version of MSE (Mean-Squared-Error).')

For sklearn CCA:
  The canonical correlations over training dataset are:  [0.91298397 0.89672778 0.8824567 ]
  Also CC computed in a different way [0.91298397 0.89672778 0.8824567 ]
  The scores over training dataset are:  0.7933817650253393
The score function in sklearn is to try to predict Y and compute R^2 from there.
R^2 can be seen as a standardized version of MSE (Mean-Squared-Error).


# 2. Apply CCA via CCA-Zoo

In [5]:
print('For CCA-Zoo CCA:')

# Keep the settings, the fitted estimator and everything derived from it
# together in a single Bunch.
cz_cca_bunch = Bunch(
    name='cz cca',
    # one canonical pair per column of the narrower view
    n_components=np.min([X_dimension_num, Y_dimension_num]),
)

# Fit on the (unscaled) training split; CCA-Zoo takes the views as one tuple.
cz_cca_bunch.cca = cz_CCA(
    latent_dims=cz_cca_bunch.n_components,
    random_state=rs_num,
).fit((X_train, Y_train))

# Project the training split onto the canonical directions.
cz_cca_bunch.Xc_train, cz_cca_bunch.Yc_train = (
    cz_cca_bunch.cca.transform((X_train, Y_train))
)

# Canonical correlations by hand: cosine similarity between the i-th X-score
# column and the i-th Y-score column.
# NOTE(review): this equals Pearson correlation only if the score columns are
# mean-centred -- presumably true for the training split; confirm before
# reusing this on held-out data.
cz_cca_bunch.cc = np.array([
    cosine_similarity(
        cz_cca_bunch.Xc_train[:, component_i].reshape(1, -1),
        cz_cca_bunch.Yc_train[:, component_i].reshape(1, -1),
    )[0, 0]
    for component_i in range(cz_cca_bunch.n_components)
])
print('  The canonical correlations over training dataset are: ', cz_cca_bunch.cc)

# Score the very same training data with the estimator's own `score` method.
print(
    '  The scores over training dataset are: ',
    cz_cca_bunch.cca.score((X_train, Y_train)),
)

For CCA-Zoo CCA:
  The canonical correlations over training dataset are:  [0.91298397 0.89672778 0.8824567 ]
  The scores over training dataset are:  [0.91298397 0.89672778 0.8824567 ]
