In [None]:
library(ggplot2)
library(gridExtra)
library(grid)
library(matrixStats)
library(fda)

# Load data

Set the task and the adjustment label that determine which data files are read and written.

In [None]:
# Run configuration: both values are interpolated into the data paths used
# throughout the notebook.
task <- 'Rest'             # connectome sub-folder under Data/Connectome/
adjust <- 'gender_motion'  # suffix of the adjusted connectome files

Load questionnaires and connectome.

In [None]:
# Adjusted connectome for the selected task; the first CSV column supplies
# the row names.
# NOTE: the object is named `c`. Calls such as c(1, 2) keep working because
# R skips non-function bindings when it looks up a function.
connectome_path <- paste0(
  'Data/Connectome/', task, '/Adjust/c_adj_', adjust, '.csv'
)
c <- read.table(connectome_path, sep = ',', header = TRUE, row.names = 1)

In [None]:
# Adjusted questionnaire table; the first CSV column supplies the row names.
q <- read.table(
  'Data/Questionnaire/Adjust/q_adj_gender.csv',
  sep = ',',
  header = TRUE,
  row.names = 1
)

# Load train and test subjects

In [None]:
# Subject lists for the train and test sets. The `Subject` column (and, for
# the train list, the `Split` column) is read further down.
subject_train <- read.table(
  'Data/Questionnaire/subject_train.csv',
  sep = ',',
  header = TRUE
)
subject_test <- read.table(
  'Data/Questionnaire/subject_test.csv',
  sep = ',',
  header = TRUE
)

In [None]:
# Select connectome rows by subject ID. IDs are converted to character so
# that they index by row name rather than by position.
train_ids <- as.character(subject_train$Subject)
test_ids <- as.character(subject_test$Subject)
c_train <- as.matrix(c[train_ids, ])
c_test <- as.matrix(c[test_ids, ])

In [None]:
# Select questionnaire rows by subject ID (character IDs index by row name).
train_ids <- as.character(subject_train$Subject)
test_ids <- as.character(subject_test$Subject)
q_train <- as.matrix(q[train_ids, ])
q_test <- as.matrix(q[test_ids, ])

# Load K-fold cross-validation split

In [None]:
# Fold label of every training subject; the largest label is taken as the
# number of folds.
split_train <- subject_train$Split
K <- max(split_train)

# Cross-Validated RCCA

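For each of the K predefined folds, fits RCCA on the remaining folds and evaluates it on the held-out fold, returning the canonical correlations for the training and held-out data for every value of $\lambda_1$.
The per-fold correlations are used below to compute the mean and standard error (mean ± 1 SE) of the train and test canonical correlations.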

In [None]:
# Cross-validated regularised CCA (RCCA) over a grid of ridge penalties.
#
# X is rotated once through its SVD (X = R V'). For every penalty and every
# fold the penalised problem is solved on the training rows of R with
# geigen(), and the X coefficients are mapped back through V.
#
# Arguments:
#   X        numeric matrix, rows = observations.
#   Y        numeric matrix with the same rows as X.
#   lambda1s numeric vector of penalties added to the diagonal of var(R).
#   split    vector with one fold label (1..K) per row of X; K = max(split).
#
# Returns a data.frame that holds, for each lambda1 in turn, one 'train' row
# per component followed by one 'test' row per component. Columns
# split1..splitK contain the per-fold correlations; comp, set and lambda1
# identify the row. Returns NULL when lambda1s is empty.
RCCA_CV <- function(X, Y, lambda1s, split) {
  K <- max(split)
  fold_names <- paste0('split', seq_len(K))

  # Rotate X once; the rotation does not depend on the fold or the penalty.
  svd_x <- svd(X)
  # nrow = ... is required: diag() on a single number would otherwise build
  # an identity matrix of that size instead of a 1 x 1 matrix.
  R <- svd_x$u %*% diag(svd_x$d, nrow = length(svd_x$d))
  V <- svd_x$v
  rownames(V) <- colnames(X)

  # geigen() returns min(nrow(Crr), nrow(Cyy)) values.
  ncomp <- min(ncol(R), ncol(Y))

  per_lambda <- vector('list', length(lambda1s))
  for (j in seq_along(lambda1s)) {
    lambda1 <- lambda1s[j]
    cor_train <- matrix(0, ncomp, K, dimnames = list(NULL, fold_names))
    cor_test <- matrix(0, ncomp, K, dimnames = list(NULL, fold_names))

    for (k in seq_len(K)) {
      # Fold k is held out; every other fold is used for fitting.
      ind_test <- split == k
      ind_train <- !ind_test

      # drop = FALSE keeps one-column / one-row selections as matrices.
      R_train <- R[ind_train, , drop = FALSE]
      Y_train <- Y[ind_train, , drop = FALSE]

      # Penalised covariance blocks of the training fold.
      Crr <- var(R_train, use = 'pairwise.complete.obs') +
        diag(lambda1, ncol(R))
      Cyy <- var(Y_train, use = 'pairwise.complete.obs')
      Cry <- cov(R_train, Y_train, use = 'pairwise.complete.obs')

      fit <- geigen(Cry, Crr, Cyy)
      names(fit) <- c('cor', 'xcoef', 'ycoef')

      # Map the coefficients from the rotated space back to X's columns.
      alpha <- V %*% fit$xcoef
      beta <- fit$ycoef

      cor_train[, k] <- fit$cor
      x_scores <- X[ind_test, , drop = FALSE] %*% alpha
      y_scores <- Y[ind_test, , drop = FALSE] %*% beta
      cor_test[, k] <- diag(cor(x_scores, y_scores))
    }

    per_lambda[[j]] <- rbind(
      data.frame(cor_train, comp = seq_len(ncomp), set = 'train',
                 lambda1 = lambda1),
      data.frame(cor_test, comp = seq_len(ncomp), set = 'test',
                 lambda1 = lambda1)
    )
  }
  do.call(rbind, per_lambda)
}

# Run grid search for RCCA

Vary $\lambda_1$ and compute cross-validated RCCA.

In [None]:
# Penalty grid: powers of ten from 10^-2 to 10^6.
lambda1s <- 10^(-2:6)

# Cross-validate on the training subjects and store the per-fold
# correlations.
# NOTE(review): the file name below is spelled 'corelation'; it is kept
# unchanged because other code may read the file under that name.
cor_cv <- RCCA_CV(c_train, q_train, lambda1s, split_train)
cv_path <- paste0(
  'Data/Connectome/', task, '/RCCA/corelation_adj_', adjust, '_train_cv.csv'
)
write.csv(cor_cv, file = cv_path, row.names = FALSE)

Plot RCCA results for different values of $\lambda_1$.

In [None]:
# Summarise the cross-validation output: for every lambda1, set and
# component, the mean over folds and its standard error (sd / sqrt(folds)).
cor_plots <- list()  # NOTE(review): not used in the visible cells
ncomp <- min(ncol(c), ncol(q))
i <- 1               # NOTE(review): not used in the visible cells

# One summary per lambda1, bound once at the end instead of growing `df`
# inside the loop.
fold_summaries <- vector('list', length(lambda1s))
for (j in seq_along(lambda1s)) {
  lam1 <- lambda1s[j]

  # Keep only the per-fold columns of the matching rows.
  cor_train <- subset(cor_cv, lambda1 == lam1 & set == 'train',
                      select = -c(lambda1, set, comp))
  # The component index and the fold count come from the subset itself, so
  # the summary cannot drift from what cor_cv actually contains.
  df_train <- data.frame(
    component = seq_len(nrow(cor_train)),
    mean = apply(cor_train, 1, mean),
    se = apply(cor_train, 1, sd) / sqrt(ncol(cor_train)),
    set = 'train',
    lambda1 = lam1
  )

  cor_test <- subset(cor_cv, lambda1 == lam1 & set == 'test',
                     select = -c(lambda1, set, comp))
  df_test <- data.frame(
    component = seq_len(nrow(cor_test)),
    mean = apply(cor_test, 1, mean),
    se = apply(cor_test, 1, sd) / sqrt(ncol(cor_test)),
    set = 'test',
    lambda1 = lam1
  )

  fold_summaries[[j]] <- rbind(df_train, df_test)
}
df <- do.call(rbind, fold_summaries)

In [None]:
library(gridExtra)

# Mean +/- SE of every component, one panel per lambda1.
# Presumably 'test' is drawn red and 'train' blue, since the manual scales
# follow the order of the `set` values - confirm on the figure.
grid_plot_path <- paste0(
  'Data/Connectome/', task, '/RCCA/grid_search_adj_', adjust, '.png'
)
cv_grid_plot <- ggplot(
  df,
  aes(x = component, y = mean, fill = set, color = set, group = set)
) +
  geom_hline(yintercept = 0, size = 0.3, color = 'darkgreen') +
  geom_point(size = 1.5, shape = 23) +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se, group = set),
    width = 0.1, color = 'black', size = 0.7
  ) +
  facet_wrap(
    ~lambda1,
    labeller = label_bquote(paste(lambda[1], '=', .(lambda1)))
  ) +
  scale_fill_manual(values = c('red', 'blue')) +
  scale_color_manual(values = c('red', 'blue')) +
  ylim(-0.1, 1.1) +
  ylab('correlation')

# Display the figure in the notebook, then write the same object to disk.
cv_grid_plot
ggsave(filename = grid_plot_path, plot = cv_grid_plot, device = 'png',
       width = 6, height = 8)

Select the $\lambda_1$ that gives the highest mean first canonical correlation on the held-out folds.

In [None]:
# First component on the held-out folds, one row per lambda1.
df_first <- subset(df, component == 1 & set == 'test')

# Take the penalty and the correlation from the same row. which.max() skips
# NA means, so reading both values through its index keeps them consistent
# (max() alone would return NA as soon as one mean is NA).
best_row <- which.max(df_first$mean)
lambda1_opt <- df_first$lambda1[best_row]
cor_opt <- df_first$mean[best_row]

In [None]:
# First held-out canonical correlation (mean +/- SE) against log10(lambda1);
# the title reports the selected penalty and its mean correlation.
best_plot_path <- paste0(
  'Data/Connectome/', task, '/RCCA/best_lambda_adj_', adjust, '.png'
)
best_lambda_plot <- ggplot(df_first, aes(x = log(lambda1, 10), y = mean)) +
  geom_hline(yintercept = 0, size = 0.3, color = 'darkgreen') +
  geom_point(size = 1.5, shape = 23, color = 'blue', fill = 'blue') +
  geom_line(color = 'blue') +
  geom_errorbar(
    aes(ymin = mean - se, ymax = mean + se, group = set),
    width = 0.1, color = 'black', size = 0.7
  ) +
  ylab('correlation') +
  xlab(bquote(paste('log(', lambda[1], ')', sep = ''))) +
  ggtitle(bquote(paste(
    'max correlation = ', .(round(cor_opt, 3)),
    '  for  ', lambda[1], '=', .(lambda1_opt)
  )))

# Display the figure in the notebook, then write the same object to disk.
best_lambda_plot
ggsave(filename = best_plot_path, plot = best_lambda_plot, device = 'png',
       width = 7, height = 3)

# Calculate RCCA for the best lambda

In [None]:
# Fit regularised CCA (RCCA) on all supplied rows for one ridge penalty.
#
# X is rotated through its SVD (X = R V'), the penalised problem is solved
# on R with geigen(), and the X coefficients are mapped back through V.
#
# Arguments:
#   X       numeric matrix, rows = observations.
#   Y       numeric matrix with the same rows as X.
#   lambda1 penalty added to the diagonal of var(R).
#
# Returns a list with
#   cor    the values returned by geigen() (the canonical correlations),
#   xcoef  coefficients for the columns of X, rows named by colnames(X),
#   ycoef  coefficients for the columns of Y.
RCCA <- function(X, Y, lambda1) {
  svd_x <- svd(X)
  # nrow = ... is required: diag() on a single number would otherwise build
  # an identity matrix of that size instead of a 1 x 1 matrix.
  R <- svd_x$u %*% diag(svd_x$d, nrow = length(svd_x$d))
  V <- svd_x$v
  rownames(V) <- colnames(X)

  # Penalised covariance blocks in the rotated space.
  Crr <- var(R, use = 'pairwise.complete.obs') + diag(lambda1, ncol(R))
  Cyy <- var(Y, use = 'pairwise.complete.obs')
  Cry <- cov(R, Y, use = 'pairwise.complete.obs')

  fit <- geigen(Cry, Crr, Cyy)
  names(fit) <- c('cor', 'xcoef', 'ycoef')

  list(cor = fit$cor, xcoef = V %*% fit$xcoef, ycoef = fit$ycoef)
}

In [None]:
# Refit on all training subjects with the selected penalty and report it.
x_train <- as.matrix(c_train)
y_train <- as.matrix(q_train)
best <- RCCA(x_train, y_train, lambda1_opt)
cat('lambda optimal =', lambda1_opt)

Save the loadings. 

In [None]:
# Coefficient matrices of the final fit; columns are labelled RCCA1, RCCA2, ...
alpha <- best$xcoef
beta <- best$ycoef
colnames(alpha) <- paste0('RCCA', 1:ncol(alpha))
colnames(beta) <- paste0('RCCA', 1:ncol(beta))

# Write each matrix with its row names as the leading identifier column.
alpha_path <- paste0('Data/Connectome/', task, '/RCCA/alpha_adj_', adjust, '.csv')
beta_path <- paste0('Data/Connectome/', task, '/RCCA/beta_adj_', adjust, '.csv')
alpha_table <- data.frame(c_pair = rownames(alpha), alpha)
beta_table <- data.frame(q = rownames(beta), beta)
write.csv(alpha_table, file = alpha_path, row.names = FALSE)
write.csv(beta_table, file = beta_path, row.names = FALSE)

Save the scores and their correlations for the training subjects.

In [None]:
# Project the training subjects onto the fitted coefficients.
brain_scores <- as.matrix(c_train) %*% as.matrix(alpha)
questionnaire_scores <- as.matrix(q_train) %*% as.matrix(beta)
# Correlation of each brain score with the matching questionnaire score.
cors <- diag(cor(brain_scores, questionnaire_scores))

# Scores are written with the subject ID (row name) as the first column;
# the correlations are written as a single row.
brain_path <- paste0('Data/Connectome/', task, '/RCCA/brain_scores_adj_', adjust, '_train.csv')
write.csv(
  data.frame(Subject = rownames(brain_scores), brain_scores),
  file = brain_path, row.names = FALSE
)
questionnaire_path <- paste0('Data/Connectome/', task, '/RCCA/questionnaire_scores_adj_', adjust, '_train.csv')
write.csv(
  data.frame(Subject = rownames(questionnaire_scores), questionnaire_scores),
  file = questionnaire_path, row.names = FALSE
)
cors_path <- paste0('Data/Connectome/', task, '/RCCA/correlation_adj_', adjust, '_train.csv')
write.csv(data.frame(t(cors)), file = cors_path, row.names = FALSE)

Save the scores and their correlations for the test subjects.

In [None]:
# Project the test subjects onto the coefficients fitted on the training set.
brain_scores <- as.matrix(c_test) %*% as.matrix(alpha)
questionnaire_scores <- as.matrix(q_test) %*% as.matrix(beta)
# Correlation of each brain score with the matching questionnaire score.
cors <- diag(cor(brain_scores, questionnaire_scores))

# Scores are written with the subject ID (row name) as the first column;
# the correlations are written as a single row.
brain_path <- paste0('Data/Connectome/', task, '/RCCA/brain_scores_adj_', adjust, '_test.csv')
write.csv(
  data.frame(Subject = rownames(brain_scores), brain_scores),
  file = brain_path, row.names = FALSE
)
questionnaire_path <- paste0('Data/Connectome/', task, '/RCCA/questionnaire_scores_adj_', adjust, '_test.csv')
write.csv(
  data.frame(Subject = rownames(questionnaire_scores), questionnaire_scores),
  file = questionnaire_path, row.names = FALSE
)
cors_path <- paste0('Data/Connectome/', task, '/RCCA/correlation_adj_', adjust, '_test.csv')
write.csv(data.frame(t(cors)), file = cors_path, row.names = FALSE)