In [124]:
library(tidyverse)

In [125]:
# MODE = 'Balanced'
MODE = 'Imalanced'

# plots

In [126]:
pca_plot <- function(df, batch_info, title, path) {
  pca <- prcomp(t(na.omit(df)))
  # Plot PCA
  pca_df <-
    pca$x %>%
    as.data.frame() %>%
    rownames_to_column("file") %>% 
    left_join(batch_info,  by = "file") 
  # add % of explained variance
  var_expl <- pca$sdev^2 / sum(pca$sdev^2)
  names(var_expl) <- paste0("PC", 1:length(var_expl))
  # Add the label for the specific point
  pca_plot <- pca_df %>%
    ggplot(aes(PC1, PC2)) +
    geom_point(aes(col=condition, shape=lab), size=2) +
    theme_classic() +
    labs(title = title,
         x = glue::glue("PC1 [{round(var_expl['PC1']*100, 2)}%]"),
         y = glue::glue("PC2 [{round(var_expl['PC2']*100, 2)}%]"))

  ggsave(path, pca_plot)
}

In [127]:
# boxplot
boxplot_pg <- function(protein_matrix, title, path) {
  # Reshape data into long format
  long_data <- tidyr::gather(protein_matrix, 
                             key = "file", value = "Intensity")
  # Log tranformed scale
  boxplot <- ggplot(long_data, aes(x = file, y = Intensity)) + 
    geom_boxplot() +
    stat_summary(fun = mean, geom = "point", shape = 4, size = 1.5, color = "red") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 5)) +
    labs(title = title) 

  ggsave(path, boxplot, width = 6, height = 6)
}

In [128]:
heatmap_plot <- function(pg_matrix, batch_info, name, plot_name_prefix){
    cor_matrix <- cor(na.omit(pg_matrix), use = "pairwise.complete.obs")
    pheatmap::pheatmap(cor_matrix, 
                        annotation_col = select(batch_info, c(condition, lab)),
                        treeheight_row = 0, treeheight_col = 0, 
                        fontsize_row = 5, fontsize_col = 5,
                        width = 7, height = 7,
                        main = paste0(name, ' heatmap'),
                        filename = plot_name_prefix)
}


In [129]:
plot_three_plots <- function(pg_matrix, batch_info, plot_name_prefix, subname, number){

        batch_info <- batch_info %>%
                mutate(file = case_when(
                lab %in% c('lab_A', 'lab_E') ~ str_split(file, "_") %>% 
                map_chr(~ if (length(.x) == 4) paste(.x[1], .x[2], .x[4], sep = "_") else paste(.x[1], .x[2], sep = "_")),
                
                lab == 'lab_C' ~ str_split(file, "_") %>% 
                map_chr(~ paste(.x[5], .x[6], sep = "_")),
                
                lab == 'lab_D' ~ str_split(file, "_") %>% 
                map_chr(~ paste(.x[6], .x[8], sep = "_")),
                
                TRUE ~ file
        ))

        rownames(batch_info) <- batch_info$file
        colnames(pg_matrix) <- batch_info$file


        boxplot_pg(pg_matrix, 
                paste0(subname, ' boxplot'), 
                paste0(plot_name_prefix, "_", subname, "_", number, "_boxplot.png")
        )

        pca_plot(pg_matrix,
                batch_info, 
                paste0(subname, ' pca'), 
                paste0(plot_name_prefix, "_", subname, "_", number, "_pca.png")
        )

        heatmap_plot(pg_matrix, 
                batch_info, 
                subname, 
                paste0(plot_name_prefix, "_", subname, "_", number, "_heatmap.png")
        )

}

In [130]:
batch_info_ref <- read.csv(paste0("/home/yuliya/repos/other/removeBatch/test_data/raw_files_first_", MODE, "/bath_info_all.tsv"), check.names = FALSE, sep="\t") %>%
  column_to_rownames('rowname') %>%
  mutate(lab = factor(lab), condition = factor(condition))

dim(batch_info_ref)
head(batch_info_ref)

Unnamed: 0_level_0,file,lab,condition
Unnamed: 0_level_1,<chr>,<fct>,<fct>
Ref8537_QC1_20230414_1,Ref8537_QC1_20230414_1,lab_A,Pyr
Ref8537_QC2_20230414_1,Ref8537_QC2_20230414_1,lab_A,Pyr
Ref8537_QC3_20230414_1,Ref8537_QC3_20230414_1,lab_A,Glu
Ref8537_QC4_20230414_1,Ref8537_QC4_20230414_1,lab_A,Glu
Ref8537_S4_20230414,Ref8537_S4_20230414,lab_A,Pyr
Ref8537_S8_20230414,Ref8537_S8_20230414,lab_A,Pyr


# Central analysis

In [131]:
# PG matrix
labs_list = c('lab_A', 'lab_B', 'lab_C', 'lab_D' , 'lab_E')  
pg_matrix <- NULL

for (name in labs_list) {
  file_name_prefix <- paste0('/home/yuliya/repos/other/removeBatch/test_data/raw_files_first_', MODE, '/', name)

  if(is.null(pg_matrix)){
    pg_matrix <- read.csv(paste0(file_name_prefix, '_protein_groups_matrix.tsv'), check.names = FALSE, sep="\t") 
  } else {
    pg_matrix <- inner_join(pg_matrix, 
                       read.csv(paste0(file_name_prefix, '_protein_groups_matrix.tsv'), check.names = FALSE, sep="\t"),
                       by = "rowname")
  }
}

pg_matrix <- pg_matrix %>% column_to_rownames('rowname')
pg_matrix <- log2(pg_matrix + 1)

temp_df <- read.csv( paste0("results/", MODE, '/lab_A', "_intensities_corrected.tsv"), sep='\t', row.names = 1,  check.names = FALSE)
pg_matrix <- pg_matrix[rownames(temp_df),  batch_info_ref$file]

dim(pg_matrix)

In [132]:
plot_name_prefix <- paste0("plots/", MODE, "/BEFORE_correction")
subname <- "A_B_conditions"
number <- "02"
plot_three_plots(pg_matrix, batch_info_ref, plot_name_prefix, subname, number)


“[1m[22mRemoved 2264 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 2264 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


In [133]:
library(limma)

design=model.matrix(~condition, data = batch_info_ref)
pg_matrix_cured <- removeBatchEffect(pg_matrix, batch=batch_info_ref$lab, design=design) %>% as.data.frame()


#write to file
pg_matrix_cured %>% 
  rownames_to_column("rowname") %>% 
  write.table(paste0('results/', MODE, '/central_cured.csv'),
              sep = "\t", quote = FALSE, row.names = FALSE)

# data
## central

In [134]:
cured_central <- read.csv(paste0('results/', MODE, '/central_cured.csv'), sep='\t', row.names = 1,  check.names = FALSE)
dim(cured_central)
head(cured_central)

Unnamed: 0_level_0,Ref8537_QC1_20230414_1,Ref8537_QC2_20230414_1,Ref8537_QC3_20230414_1,Ref8537_QC4_20230414_1,Ref8537_S4_20230414,Ref8537_S8_20230414,Ref8537_S11_20230414,Ref8537_S18_20230414,Ref8537_S21_20230414,Ref8537_S26_20230414,⋯,CVT09_s55_X012,CVT09_s59_X013,CVT09_s61_X014,CVT09_s67_X015,CVT09_s73_X016,CVT09_s77_X017,CVT09_s84_X018,CVT09_s90_X019,CVT09_s93_X020,CVT09_s99_X021
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A5A614,18.07099,17.76463,17.64577,16.94823,18.20704,18.27645,17.90497,17.33893,18.0048,17.77114,⋯,17.45591,16.93699,17.52851,17.44239,17.37857,17.00767,17.20953,17.3206,17.87804,17.01073
O32583,18.78144,19.05467,18.63023,17.69929,18.90511,18.45085,19.05067,19.22559,17.73292,18.85344,⋯,,,,18.011,18.83517,,,,,
P00350,25.19155,25.22207,25.39769,25.43547,25.39256,25.31223,25.223,25.24609,25.09993,25.26596,⋯,25.47001,25.52584,25.46983,25.48417,25.49854,25.49312,25.44433,25.4737,25.44254,25.46852
P00363,23.46546,23.54727,23.54287,23.39659,23.45058,23.39754,22.797,23.25775,23.56995,23.2567,⋯,23.17544,23.05606,22.88735,23.10673,23.25947,23.07996,23.0785,23.16906,23.22231,23.04852
P00370,24.55231,24.47656,25.15931,25.0083,24.45746,24.41053,24.29427,24.37286,24.45943,24.16747,⋯,24.89058,24.88225,24.82133,24.74489,24.93798,24.9189,24.8523,24.89195,24.99446,24.73179
P00393,22.70775,22.75656,22.63729,22.74035,22.48436,22.72246,22.80867,22.36757,21.78038,22.54088,⋯,22.64727,22.5319,22.70562,22.53526,22.55108,22.65813,22.65711,22.60783,22.59191,22.72622


In [135]:
plot_name_prefix <- paste0("plots/", MODE, "/R_after_correction")
subname <- "A_B_conditions"
number <- "02"
plot_three_plots(cured_central, batch_info_ref, plot_name_prefix, subname, number)


“[1m[22mRemoved 2264 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 2264 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


## federated

In [136]:
cohorts <- c('lab_A', 'lab_B', 'lab_C', 'lab_D', 'lab_E')

cured_federated <- NULL

for(c in cohorts) {
  file_path <- paste0("results/", MODE, "/", c, "_intensities_corrected.tsv")
  temp_df <- read.csv(file_path, sep='\t', row.names = 1,  check.names = FALSE)

  # Combine the dataframes
  if (is.null(cured_federated)) {
    cured_federated <- temp_df
  } else {
    cured_federated <- cbind(cured_federated, temp_df)
  }
}

cured_federated <- cured_federated[rownames(cured_central), batch_info_ref$file]
dim(cured_federated)
head(cured_federated)

Unnamed: 0_level_0,Ref8537_QC1_20230414_1,Ref8537_QC2_20230414_1,Ref8537_QC3_20230414_1,Ref8537_QC4_20230414_1,Ref8537_S4_20230414,Ref8537_S8_20230414,Ref8537_S11_20230414,Ref8537_S18_20230414,Ref8537_S21_20230414,Ref8537_S26_20230414,⋯,CVT09_s55_X012,CVT09_s59_X013,CVT09_s61_X014,CVT09_s67_X015,CVT09_s73_X016,CVT09_s77_X017,CVT09_s84_X018,CVT09_s90_X019,CVT09_s93_X020,CVT09_s99_X021
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A5A614,18.07099,17.76463,17.64577,16.94823,18.20704,18.27645,17.90497,17.33893,18.0048,17.77114,⋯,17.45591,16.93699,17.52851,17.44239,17.37857,17.00767,17.20953,17.3206,17.87804,17.01073
O32583,18.78144,19.05467,18.63023,17.69929,18.90511,18.45085,19.05067,19.22559,17.73292,18.85344,⋯,,,,18.011,18.83517,,,,,
P00350,25.19155,25.22207,25.39769,25.43547,25.39256,25.31223,25.223,25.24609,25.09993,25.26596,⋯,25.47001,25.52584,25.46983,25.48417,25.49854,25.49312,25.44433,25.4737,25.44254,25.46852
P00363,23.46546,23.54727,23.54287,23.39659,23.45058,23.39754,22.797,23.25775,23.56995,23.2567,⋯,23.17544,23.05606,22.88735,23.10673,23.25947,23.07996,23.0785,23.16906,23.22231,23.04852
P00370,24.55231,24.47656,25.15931,25.0083,24.45746,24.41053,24.29427,24.37286,24.45943,24.16747,⋯,24.89058,24.88225,24.82133,24.74489,24.93798,24.9189,24.8523,24.89195,24.99446,24.73179
P00393,22.70775,22.75656,22.63729,22.74035,22.48436,22.72246,22.80867,22.36757,21.78038,22.54088,⋯,22.64727,22.5319,22.70562,22.53526,22.55108,22.65813,22.65711,22.60783,22.59191,22.72622


In [137]:
plot_name_prefix <- paste0("plots/", MODE, "/FED_after_correction")
subname <- "A_B_conditions"
number <- "03"
plot_three_plots(cured_federated, batch_info_ref, plot_name_prefix, subname, number)


“[1m[22mRemoved 2264 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 2264 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


## comparison 

In [138]:
dim(cured_central)
dim(cured_federated)

In [139]:
# identical?
identical(round(cured_central, 6), round(cured_federated, 6))

In [140]:
# Check Row-by-Row and Column-by-Column Equality
all.equal(cured_central, cured_federated)


In [141]:
# Calculate the mean of the absolute differences, removing NA's
difference <- cured_central - cured_federated
abs_difference <- abs(difference)
mean_abs_difference <- mean(apply(abs_difference, c(1, 2), mean, na.rm = TRUE), na.rm = TRUE)
mean_abs_difference