In [12]:
library(tidyverse)

# plots

In [13]:
# Run a PCA on the columns of `df` and save a PC1-vs-PC2 scatter plot.
#
# Arguments:
#   df         - table whose column names are matched against `batch_info$file`;
#                rows containing any NA are dropped before the PCA.
#   batch_info - data frame with at least the columns `file`, `condition`
#                (point colour) and `lab` (point shape).
#   title      - plot title.
#   path       - output file passed to ggsave().
#
# Returns the value of ggsave().
pca_plot <- function(df, batch_info, title, path) {
  # prcomp() cannot handle missing values, so only fully observed rows are used
  complete_rows <- na.omit(df)
  if (nrow(complete_rows) == 0) {
    stop("pca_plot(): no rows without missing values are left for the PCA",
         call. = FALSE)
  }

  # Transpose so that each column of `df` becomes one observation (one point)
  pca <- prcomp(t(complete_rows))

  # Scores plus the metadata needed for colouring and shaping the points
  pca_df <-
    pca$x %>%
    as.data.frame() %>%
    rownames_to_column("file") %>%
    left_join(batch_info, by = "file")

  # Fraction of the total variance explained by each component
  var_expl <- pca$sdev^2 / sum(pca$sdev^2)
  names(var_expl) <- paste0("PC", seq_along(var_expl))

  scatter <- pca_df %>%
    ggplot(aes(PC1, PC2)) +
    geom_point(aes(col = condition, shape = lab), size = 2) +
    theme_classic() +
    labs(title = title,
         x = glue::glue("PC1 [{round(var_expl['PC1']*100, 2)}%]"),
         y = glue::glue("PC2 [{round(var_expl['PC2']*100, 2)}%]"))

  # Label the single sample "Ref8537_S37" as 'S37' when it is present
  if ("Ref8537_S37" %in% pca_df$file) {
    scatter <- scatter +
      geom_text(data = pca_df[pca_df$file == "Ref8537_S37", ],
                aes(label = 'S37'), vjust = -1)
  }

  # No width/height given: ggsave() falls back to its default size
  ggsave(path, scatter)
}

In [14]:
# Save one boxplot per column of `protein_matrix` (intensity distributions).
#
# Arguments:
#   protein_matrix - data frame whose columns are plotted; column names become
#                    the x-axis labels.
#   title          - plot title.
#   path           - output file passed to ggsave() (saved as 6 x 6).
#
# Returns the value of ggsave().
boxplot_pg <- function(protein_matrix, title, path) {
  # Reshape to long format: one row per (column, value) pair.
  # pivot_longer() replaces the superseded gather(); row order differs, but the
  # plot only depends on the grouping by `file`.
  long_data <- tidyr::pivot_longer(
    protein_matrix,
    cols = tidyr::everything(),
    names_to = "file",
    values_to = "Intensity"
  )

  # Values are plotted as given; no transformation is applied here.
  # The red cross marks the mean of each column.
  intensity_boxplot <- ggplot(long_data, aes(x = file, y = Intensity)) +
    geom_boxplot() +
    stat_summary(fun = mean, geom = "point", shape = 4, size = 1.5, color = "red") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 5)) +
    labs(title = title)

  ggsave(path, intensity_boxplot, width = 6, height = 6)
}

In [15]:
# Draw a column-by-column correlation heatmap of `pg_matrix` and write it to a
# file via pheatmap.
#
# Arguments:
#   pg_matrix        - table whose columns are correlated with each other.
#   batch_info       - data frame providing the `condition` and `lab` columns
#                      used as column annotations.
#   name             - label used in the heatmap title ("<name> heatmap").
#   plot_name_prefix - despite its name, this is passed unchanged as the
#                      output `filename` of pheatmap.
heatmap_plot <- function(pg_matrix, batch_info, name, plot_name_prefix) {
  # Rows with any NA are removed first, so the pairwise option below has
  # nothing left to skip; both are kept exactly as in the original call.
  complete_rows <- na.omit(pg_matrix)
  sample_cor <- cor(complete_rows, use = "pairwise.complete.obs")

  # Only these two metadata columns are shown as annotation tracks
  sample_annotation <- batch_info %>% select(condition, lab)

  pheatmap::pheatmap(
    sample_cor,
    annotation_col = sample_annotation,
    treeheight_row = 0,
    treeheight_col = 0,
    fontsize_row = 5,
    fontsize_col = 5,
    width = 7,
    height = 7,
    main = paste0(name, ' heatmap'),
    filename = plot_name_prefix
  )
}


In [16]:
# Shorten raw file names to compact sample labels, using a lab-specific rule.
# Names are split on "_" and selected pieces are re-joined with "_".
.shorten_file_names <- function(file, lab) {
  parts <- str_split(file, "_")
  join_parts <- function(idx) {
    map_chr(parts, function(p) paste(p[idx], collapse = "_"))
  }

  case_when(
    # lab_A / lab_E: pieces 1, 2 and 4 for four-piece names, else pieces 1 and 2
    lab %in% c("lab_A", "lab_E") ~ map_chr(parts, function(p) {
      if (length(p) == 4) {
        paste(p[1], p[2], p[4], sep = "_")
      } else {
        paste(p[1], p[2], sep = "_")
      }
    }),
    lab == "lab_C" ~ join_parts(c(5, 6)),
    lab == "lab_D" ~ join_parts(c(6, 8)),
    TRUE ~ file
  )
}

# Produce the boxplot, PCA plot and correlation heatmap for one intensity table.
#
# Arguments:
#   pg_matrix        - intensity table with one column per sample.
#   batch_info       - sample metadata with columns `file`, `lab`, `condition`.
#   plot_name_prefix - common prefix (including directory) of the output files.
#   subname, number  - further pieces of the output file names; `subname` is
#                      also used in the plot titles.
#
# Output files are named "<prefix>_<subname>_<number>_<boxplot|pca|heatmap>.png".
# Returns the value of heatmap_plot().
plot_three_plots <- function(pg_matrix, batch_info, plot_name_prefix, subname, number) {
  sample_ids <- colnames(pg_matrix)

  # The columns are relabelled from the metadata below, so both must describe
  # the same samples in the same order. Align by name whenever every column is
  # listed in the metadata; otherwise fall back to positional matching, which
  # at least requires equal counts.
  if (!is.null(sample_ids) && all(sample_ids %in% batch_info$file)) {
    batch_info <- batch_info[match(sample_ids, batch_info$file), , drop = FALSE]
  } else if (nrow(batch_info) != ncol(pg_matrix)) {
    stop("plot_three_plots(): batch_info has ", nrow(batch_info),
         " rows but pg_matrix has ", ncol(pg_matrix), " columns",
         call. = FALSE)
  }

  batch_info$file <- .shorten_file_names(batch_info$file, batch_info$lab)

  # heatmap_plot() passes batch_info as a column annotation, so its row names
  # are set to the same labels as the matrix columns
  rownames(batch_info) <- batch_info$file
  colnames(pg_matrix) <- batch_info$file

  out_path <- function(kind) {
    paste0(plot_name_prefix, "_", subname, "_", number, "_", kind, ".png")
  }

  boxplot_pg(pg_matrix, paste0(subname, " boxplot"), out_path("boxplot"))
  pca_plot(pg_matrix, batch_info, paste0(subname, " pca"), out_path("pca"))
  heatmap_plot(pg_matrix, batch_info, subname, out_path("heatmap"))
}

In [18]:
# Sample metadata: one row per file, with `lab` and `condition` stored as
# factors and the file name doubling as the row name.
batch_info_ref <- read.csv("/home/yuliya/repos/cosybio/removeBatch/test_data/batch_info_all_ref.csv")
batch_info_ref$lab <- factor(batch_info_ref$lab)
batch_info_ref$condition <- factor(batch_info_ref$condition)
rownames(batch_info_ref) <- batch_info_ref$file

# Quick look at the size and the first rows
dim(batch_info_ref)
head(batch_info_ref)

Unnamed: 0_level_0,file,condition,lab
Unnamed: 0_level_1,<chr>,<fct>,<fct>
Ref8537_QC1_20230414_1,Ref8537_QC1_20230414_1,A_QC,lab_A
Ref8537_QC1_20230414_2,Ref8537_QC1_20230414_2,A_QC,lab_A
Ref8537_QC2_20230414_1,Ref8537_QC2_20230414_1,A_QC,lab_A
Ref8537_QC2_20230414_2,Ref8537_QC2_20230414_2,A_QC,lab_A
Ref8537_QC3_20230414_1,Ref8537_QC3_20230414_1,B_QC,lab_A
Ref8537_QC3_20230414_2,Ref8537_QC3_20230414_2,B_QC,lab_A


# data
## central

In [17]:
# Centrally corrected intensities: tab-separated, first column holds the row names
cured_central <- read.delim(
  "/home/yuliya/repos/cosybio/removeBatch/results/central_cured.csv",
  row.names = 1
)

# Quick look at the size and the first rows
dim(cured_central)
head(cured_central)

Unnamed: 0_level_0,Ref8537_QC1_20230414_1,Ref8537_QC1_20230414_2,Ref8537_QC2_20230414_1,Ref8537_QC2_20230414_2,Ref8537_QC3_20230414_1,Ref8537_QC3_20230414_2,Ref8537_QC4_20230414_1,Ref8537_QC4_20230414_2,Ref8537_S4_20230414,Ref8537_S8_20230414,⋯,CVT09_s55_X012,CVT09_s59_X013,CVT09_s61_X014,CVT09_s67_X015,CVT09_s73_X016,CVT09_s77_X017,CVT09_s84_X018,CVT09_s90_X019,CVT09_s93_X020,CVT09_s99_X021
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A5A614,19.59722,19.63527,19.29085,19.22962,19.172,18.82606,18.47446,18.69794,19.73327,19.80267,⋯,18.8963,,18.96889,18.88278,18.81896,18.44805,18.64991,,19.31842,18.45111
O32583,20.1394,20.32151,20.41263,20.32826,19.98819,19.88644,19.05726,20.19234,20.26307,19.80881,⋯,,,,19.15097,19.97514,,,,,
P00350,26.46399,26.5443,26.49451,26.53334,26.67013,26.66674,26.70791,26.7301,26.665,26.58467,⋯,26.74266,26.79849,26.74247,26.75681,26.77118,26.76576,26.71698,26.74635,26.71519,26.74117
P00363,24.18471,24.19133,24.26651,24.25851,24.26212,24.31023,24.11584,24.11355,24.16982,24.11679,⋯,23.93092,23.81155,23.64283,23.86221,24.01495,23.83545,23.83399,23.92455,23.9778,23.80401
P00370,25.57339,25.58858,25.49763,25.4736,26.18039,26.13536,26.02937,25.97825,25.47854,25.43161,⋯,25.93167,25.92334,25.86242,25.78598,25.97907,25.95999,25.89339,25.93304,26.03555,25.77288
P00393,23.93275,23.94527,23.98156,23.97987,23.86229,23.86701,23.96535,24.00174,23.70935,23.94746,⋯,23.89696,23.78159,23.9553,23.78495,23.80077,23.90782,23.9068,23.85752,23.8416,23.97591


In [19]:
# Boxplot, PCA and heatmap for the centrally corrected data
plot_name_prefix <- paste0(
  "/home/yuliya/repos/cosybio/removeBatch/plots/",
  "R_after_correction"
)
subname <- "A_B_conditions"
number <- "02"
plot_three_plots(
  pg_matrix = cured_central,
  batch_info = batch_info_ref,
  plot_name_prefix = plot_name_prefix,
  subname = subname,
  number = number
)


“[1m[22mRemoved 4470 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 4470 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


## federated

In [32]:
cohorts <- c('lab_A', 'lab_C', 'lab_D', 'lab_E')

# Read the corrected intensities of every cohort (tab-separated, first column
# holds the row names). A named function argument is used instead of a loop
# variable called `c`, which would mask base::c in the global environment.
cohort_tables <- lapply(cohorts, function(cohort) {
  file_path <- paste0("/home/yuliya/repos/cosybio/removeBatch/results/", cohort, "_intensities_corrected.tsv")
  read.csv(file_path, sep = '\t', row.names = 1)
})

# Combine all cohorts side by side in one step instead of growing the table
# inside a loop. cbind() pairs rows by position, so this assumes every cohort
# file lists the same rows in the same order - TODO confirm.
cured_federated <- do.call(cbind, cohort_tables)

# Use the same row and column order as the central result so that the two
# tables can be compared element by element
cured_federated <- cured_federated[rownames(cured_central), colnames(cured_central)]
dim(cured_federated)
head(cured_federated)

Unnamed: 0_level_0,Ref8537_QC1_20230414_1,Ref8537_QC1_20230414_2,Ref8537_QC2_20230414_1,Ref8537_QC2_20230414_2,Ref8537_QC3_20230414_1,Ref8537_QC3_20230414_2,Ref8537_QC4_20230414_1,Ref8537_QC4_20230414_2,Ref8537_S4_20230414,Ref8537_S8_20230414,⋯,CVT09_s55_X012,CVT09_s59_X013,CVT09_s61_X014,CVT09_s67_X015,CVT09_s73_X016,CVT09_s77_X017,CVT09_s84_X018,CVT09_s90_X019,CVT09_s93_X020,CVT09_s99_X021
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A5A614,19.59722,19.63527,19.29085,19.22962,19.172,18.82606,18.47446,18.69794,19.73327,19.80267,⋯,18.8963,,18.96889,18.88278,18.81896,18.44805,18.64991,,19.31842,18.45111
O32583,20.1394,20.32151,20.41263,20.32826,19.98819,19.88644,19.05726,20.19234,20.26307,19.80881,⋯,,,,19.15097,19.97514,,,,,
P00350,26.46399,26.5443,26.49451,26.53334,26.67013,26.66674,26.70791,26.7301,26.665,26.58467,⋯,26.74266,26.79849,26.74247,26.75681,26.77118,26.76576,26.71698,26.74635,26.71519,26.74117
P00363,24.18471,24.19133,24.26651,24.25851,24.26212,24.31023,24.11584,24.11355,24.16982,24.11679,⋯,23.93092,23.81155,23.64283,23.86221,24.01495,23.83545,23.83399,23.92455,23.9778,23.80401
P00370,25.57339,25.58858,25.49763,25.4736,26.18039,26.13536,26.02937,25.97825,25.47854,25.43161,⋯,25.93167,25.92334,25.86242,25.78598,25.97907,25.95999,25.89339,25.93304,26.03555,25.77288
P00393,23.93275,23.94527,23.98156,23.97987,23.86229,23.86701,23.96535,24.00174,23.70935,23.94746,⋯,23.89696,23.78159,23.9553,23.78495,23.80077,23.90782,23.9068,23.85752,23.8416,23.97591


In [33]:
# Boxplot, PCA and heatmap for the federated corrected data
plot_name_prefix <- paste0(
  "/home/yuliya/repos/cosybio/removeBatch/plots/",
  "FED_after_correction"
)
subname <- "A_B_conditions"
number <- "03"
plot_three_plots(
  pg_matrix = cured_federated,
  batch_info = batch_info_ref,
  plot_name_prefix = plot_name_prefix,
  subname = subname,
  number = number
)


“[1m[22mRemoved 4470 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 4470 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


## comparison 

In [49]:
# Are the two results exactly the same after rounding to 6 decimal places?
identical(
  x = round(cured_central, digits = 6),
  y = round(cured_federated, digits = 6)
)

In [50]:
# Tolerance-based comparison of the two tables: TRUE when they are nearly
# equal, otherwise a description of the differences
all.equal(target = cured_central, current = cured_federated)


In [68]:
# Mean absolute difference between the central and federated results,
# ignoring cells that are missing in either table
difference <- cured_central - cured_federated
abs_difference <- abs(difference)
# A single mean over all cells; equivalent to taking the mean of every cell
# separately and averaging those, but without one function call per cell
mean_abs_difference <- mean(as.matrix(abs_difference), na.rm = TRUE)
mean_abs_difference