In [None]:
library(ggplot2)
library(gridExtra)
library(grid)
library(matrixStats)
library(fda)

# Load data

Set up the folders that contain the data.

In [None]:
# Sub-folders (below Data/Connectome/ and Data/Questionnaire/ respectively)
# that hold the connectome and questionnaire files used in this notebook.
c_folder_name <- 'NIH/Emotion'
q_folder_name <- 'NIH'

Load questionnaires and connectome.

In [None]:
# Connectome table: comma-separated with a header row; the first column
# supplies the row names.
c <- read.table(
    paste0('Data/Connectome/', c_folder_name, '/Adjust/c_adj.csv'),
    sep = ',',
    header = TRUE,
    row.names = 1
)

In [None]:
# Questionnaire table: comma-separated with a header row; the first column
# supplies the row names.
q <- read.table(
    paste0('Data/Questionnaire/', q_folder_name, '/Adjust/q_adj.csv'),
    sep = ',',
    header = TRUE,
    row.names = 1
)
# Reorder/select the questionnaire rows so they follow the row names of the
# connectome table.
q <- q[rownames(c), ]

Keep only negative questionnaires.

In [None]:
# Restrict the questionnaire table to its first six columns and display
# their names.
q <- q[, seq_len(6)]
colnames(q)

# Cross-Validated RCCA

Split the data at random into training (80%) and test (20%) sets and return the canonical correlations for the training and test data.
Calculate 1SD confidence intervals for train and test canonical correlations.

In [None]:
# Repeated random-split cross-validation for ridge-regularized CCA (RCCA).
#
# For every repetition the rows are split at random into a training part
# (`train_frac` of the rows) and a test part (the remaining rows). RCCA is
# fitted on the training rows for every penalty in `lambda1s`; the training
# canonical correlations and the correlations between the test-set scores
# are recorded.
#
# Arguments:
#   X          numeric matrix, rows = observations. The ridge penalty is
#              applied to the covariance of its SVD-rotated version.
#   Y          numeric matrix with the same number of rows as X.
#   lambda1s   numeric vector of ridge penalties.
#   nreps      number of random splits; split i is drawn after set.seed(i).
#   train_frac fraction of rows used for training (default 0.8).
#
# Returns:
#   A data frame with columns seed1..seed<nreps>, comp, set ('train'/'test')
#   and lambda1. Rows are ordered by lambda1, then set (train before test),
#   then component. NULL when `lambda1s` is empty.
#
# Notes:
#   * Calls set.seed(), so the global RNG state is modified.
#   * Needs geigen() (package fda, attached at the top of the file).
RCCA_CV <- function(X, Y, lambda1s, nreps = 100, train_frac = 0.8){
    stopifnot(nrow(X) == nrow(Y), nreps >= 1, train_frac > 0, train_frac < 1)
    n <- nrow(X)
    #compute R and V for matrix X
    SVD <- svd(X)
    # nrow is given explicitly: diag() of a length-one vector would otherwise
    # be read as "identity matrix of that size".
    R <- SVD$u %*% diag(SVD$d, nrow = length(SVD$d))
    V <- SVD$v
    rownames(V) <- colnames(X)
    # geigen() returns min(ncol(R), ncol(Y)) canonical correlations
    ncomp <- min(ncol(R), ncol(Y))
    seed_names <- paste0('seed', seq_len(nreps))
    blank <- matrix(0, ncomp, nreps, dimnames = list(NULL, seed_names))
    cor_train <- rep(list(blank), length(lambda1s))
    cor_test <- rep(list(blank), length(lambda1s))
    #repeat split nreps times and save train and test correlations
    for(i in seq_len(nreps)){
        set.seed(i)
        #split the data into train and test
        ind_train <- sample(n, size = round(n * train_frac))
        # drop = FALSE keeps single-column inputs as matrices
        R_train <- R[ind_train, , drop = FALSE]
        Y_train <- Y[ind_train, , drop = FALSE]
        X_test <- X[-ind_train, , drop = FALSE]
        Y_test <- Y[-ind_train, , drop = FALSE]
        # The covariances depend only on the split, not on the penalty, so
        # they are computed once per split and reused for every lambda1.
        Crr <- var(R_train, use = "pairwise.complete.obs")
        Cyy <- var(Y_train, use = "pairwise.complete.obs")
        Cry <- cov(R_train, Y_train, use = "pairwise.complete.obs")
        for(k in seq_along(lambda1s)){
            #run RCCA
            fit <- geigen(Cry, Crr + diag(lambda1s[k], ncol(R)), Cyy)
            names(fit) <- c("cor", "xcoef", "ycoef")
            #compute coefficients (rotate back to the columns of X)
            alpha <- V %*% fit$xcoef
            beta <- fit$ycoef
            #compute train and test correlation
            cor_train[[k]][, i] <- fit$cor
            cor_test[[k]][, i] <- diag(cor(X_test %*% alpha, Y_test %*% beta))
        }
    }
    # Assemble the result in one pass instead of growing it inside a loop.
    pieces <- lapply(seq_along(lambda1s), function(k){
        rbind(data.frame(cor_train[[k]], 'comp' = 1:ncomp, 'set' = 'train', 'lambda1' = lambda1s[k]),
              data.frame(cor_test[[k]], 'comp' = 1:ncomp, 'set' = 'test', 'lambda1' = lambda1s[k]))
    })
    do.call(rbind, pieces)
}

# Run grid search for RCCA

Vary $\lambda_1$ and compute RCCA.

In [None]:
# Penalty grid (powers of ten from 1e-2 to 1e6) and number of random splits.
nreps <- 100
lambda1s <- 10^seq(-2, 6)

# Cross-validated correlations for the whole grid, stored next to the
# connectome data.
cor_cv <- RCCA_CV(as.matrix(c), as.matrix(q), lambda1s, nreps)
write.csv(
    cor_cv,
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/rcca_cor.csv'),
    row.names = FALSE
)

Plot RCCA results for different values of $\lambda_1$.

In [None]:
# Grid of box plots: one panel per lambda1, showing for every component the
# spread of the train and test canonical correlations over the random splits.
# Red diamonds mark the train means, blue diamonds the test means.
ncomp <- ncol(q)
cor_plots <- list()

for (i in seq_along(lambda1s)) {
    lam1 <- lambda1s[i]
    # After transposing: rows = splits, columns = components.
    cor_train <- data.frame(t(subset(cor_cv, lambda1 == lam1 & set == 'train')[, 1:nreps]))
    cor_test <- data.frame(t(subset(cor_cv, lambda1 == lam1 & set == 'test')[, 1:nreps]))
    colnames(cor_train) <- 1:ncomp
    colnames(cor_test) <- 1:ncomp
    train_means <- data.frame('x' = 1:ncomp, 'y' = colMeans(cor_train))
    test_means <- data.frame('x' = 1:ncomp, 'y' = colMeans(cor_test))
    cor_plots[[i]] <- ggplot() +
        geom_hline(yintercept = 0, color = 'darkgreen') +
        geom_boxplot(data = stack(cor_train), mapping = aes(x = ind, y = values), outlier.alpha = 0, fatten = 0, fill = 'white') +
        geom_boxplot(data = stack(cor_test), mapping = aes(x = ind, y = values), outlier.alpha = 0, fatten = 0, fill = 'white') +
        geom_point(data = train_means, mapping = aes(x, y), size = 2, shape = 23, color = 'red', fill = 'red') +
        geom_point(data = test_means, mapping = aes(x, y), size = 2, shape = 23, color = 'blue', fill = 'blue') +
        ylim(-0.3, 1.1) +
        xlab('component') +
        ylab('correlation') +
        ggtitle(bquote(paste(lambda[1], '=', .(lam1))))
}
# Arrange the panels in three columns and save the figure.
cor_plots <- do.call("grid.arrange", c(cor_plots, ncol = 3))
ggsave(
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/rcca_grid_search(boxplot).png'),
    device = 'png',
    plot = cor_plots,
    width = 8,
    height = 10
)

Search for the best $\lambda_1$ with the highest value of the first test canonical correlation.

In [None]:
first_plots <- list()

# Test-set correlations of the first component:
# rows = splits, columns = values of lambda1.
first <- data.frame(t(subset(cor_cv, set == 'test' & comp == 1)[, 1:nreps]))
colnames(first) <- lambda1s

# The optimal lambda1 maximizes the mean first test correlation over splits.
center <- colMeans(first)
lambda1_opt <- lambda1s[which.max(center)]
center_df <- data.frame('lambda' = lambda1s, 'cor' = center)

# Box plot per lambda1 (x axis on a log10 scale), with the means overlaid as
# blue diamonds joined by a line.
first_plot <- ggplot() +
    geom_hline(yintercept = 0, color = 'darkgreen') +
    geom_boxplot(data = stack(first), mapping = aes(x = as.factor(log(as.numeric(paste(ind)), 10)), y = values), outlier.alpha = 0, fatten = 0, fill = 'white') +
    geom_point(data = center_df, mapping = aes(x = as.factor(log(lambda, 10)), y = cor), size = 2, shape = 23, color = 'blue', fill = 'blue') +
    geom_line(data = center_df, mapping = aes(x = as.factor(log(lambda, 10)), y = cor), color = 'blue', group = 1) +
    xlab(bquote(paste('log(', lambda[1], ')', sep = ''))) +
    ylab('first canonical correlation') +
    ggtitle(bquote(paste('max correlation = ', .(round(max(center), 3)), '     ', lambda[1], ' optimal = ', .(lambda1_opt), sep = ' ')))

first_plot
ggsave(
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/rcca_best_lambda(boxplot).png'),
    device = 'png',
    plot = first_plot,
    width = 7,
    height = 4
)

# Calculate RCCA for the best lambda

In [None]:
# Fit ridge-regularized CCA (RCCA) of X against Y for a single penalty.
#
# X is rotated with its SVD (R = U D, so that X = R V'), the CCA problem is
# solved for (R, Y) with `lambda1` added to the diagonal of the covariance of
# R, and the coefficients are rotated back to the columns of X with V.
#
# Arguments:
#   X        numeric matrix, rows = observations.
#   Y        numeric matrix with the same number of rows as X.
#   lambda1  ridge penalty (single number).
#
# Returns a list with
#   cor    canonical correlations as returned by geigen(),
#   xcoef  coefficients for the columns of X (row names = colnames(X)),
#   ycoef  coefficients for the columns of Y.
#
# Needs geigen() (package fda, attached at the top of the file).
RCCA <- function(X, Y, lambda1){
    stopifnot(nrow(X) == nrow(Y))
    SVD <- svd(X)
    # nrow is given explicitly: diag() of a length-one vector would otherwise
    # be read as "identity matrix of that size".
    R <- SVD$u %*% diag(SVD$d, nrow = length(SVD$d))
    V <- SVD$v
    rownames(V) <- colnames(X)
    Crr <- var(R, use = "pairwise.complete.obs") + diag(lambda1, ncol(R))
    Cyy <- var(Y, use = "pairwise.complete.obs")
    Cry <- cov(R, Y, use = "pairwise.complete.obs")
    # geigen() returns (values, Lmat, Mmat); rename by position.
    fit <- geigen(Cry, Crr, Cyy)
    names(fit) <- c("cor", "xcoef", "ycoef")
    list(cor = fit$cor, xcoef = V %*% fit$xcoef, ycoef = fit$ycoef)
}

In [None]:
# Optional manual overrides of the selected penalty (left disabled):
#   Emotion: lambda1_opt = 10
#   Rest:    lambda1_opt = 10
# Fit RCCA on the full data with the selected penalty.
best <- RCCA(as.matrix(c), as.matrix(q), lambda1_opt)

Save the loadings. 

In [None]:
# Connectome-side coefficients, one column per canonical component.
alpha <- best$xcoef
colnames(alpha) <- paste0('RCCA', 1:ncol(alpha))
# Questionnaire-side coefficients, one column per canonical component.
beta <- best$ycoef
colnames(beta) <- paste0('RCCA', 1:ncol(beta))

# Write both coefficient tables; the first column identifies the variable.
write.csv(
    data.frame('c_pair' = colnames(c), alpha),
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/alpha.csv'),
    row.names = FALSE
)
write.csv(
    data.frame('q' = rownames(beta), beta),
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/beta.csv'),
    row.names = FALSE
)

Save scores.

In [None]:
# Connectome scores: project the connectome data onto the alpha coefficients
# and write them with the row names in a 'Subject' column.
brain_scores <- as.matrix(c) %*% as.matrix(alpha)
write.csv(
    data.frame('Subject' = rownames(brain_scores), brain_scores),
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/brain_scores.csv'),
    row.names = FALSE
)

In [None]:
# Questionnaire scores: project the questionnaire data onto the beta
# coefficients and write them with the row names in a 'Subject' column.
questionnaire_scores <- as.matrix(q) %*% as.matrix(beta)
write.csv(
    data.frame('Subject' = rownames(questionnaire_scores), questionnaire_scores),
    file = paste0('Data/Connectome/', c_folder_name, '/RCCA/questionnaire_scores.csv'),
    row.names = FALSE
)