In [6]:
library(tidyverse)

In [7]:
# Dataset variant; it selects the raw_files_first_<MODE> input directory.
MODE <- "Balanced"
# MODE <- "Imalanced"

# Name of the results/plots sub-directory: the "_nocov" variant is active,
# the plain MODE alternative is kept commented out.
MODE_TWO <- paste0(MODE, "_nocov")
# MODE_TWO <- MODE

# plots

In [8]:
#' Save a PC1 vs PC2 scatter plot of the samples.
#'
#' Rows containing any missing value are dropped, the table is transposed so
#' that the columns of `df` become the observations, and `prcomp()` is run on
#' the result. Points are coloured by `condition` and shaped by `lab`.
#'
#' @param df Data frame or matrix with one column per sample.
#' @param batch_info Data frame with a `file` column (joined against the
#'   column names of `df`) and `condition` / `lab` columns.
#' @param title Plot title.
#' @param path Output file handed to `ggsave()`.
#' @return Whatever `ggsave()` returns.
pca_plot <- function(df, batch_info, title, path) {
  complete_rows <- na.omit(df)
  if (nrow(complete_rows) == 0) {
    # Fail with a readable message instead of an obscure prcomp() error
    stop("pca_plot(): no rows without missing values are left for the PCA.",
         call. = FALSE)
  }
  pca <- prcomp(t(complete_rows))

  # Sample scores, annotated with the batch information of each sample
  pca_df <-
    pca$x %>%
    as.data.frame() %>%
    rownames_to_column("file") %>%
    left_join(batch_info, by = "file")

  # Fraction of the total variance explained by each component
  var_expl <- pca$sdev^2 / sum(pca$sdev^2)
  names(var_expl) <- paste0("PC", seq_along(var_expl))

  # A distinct local name keeps the function's own name unshadowed
  scatter <- pca_df %>%
    ggplot(aes(PC1, PC2)) +
    geom_point(aes(col=condition, shape=lab), size=2) +
    theme_classic() +
    labs(title = title,
         x = glue::glue("PC1 [{round(var_expl['PC1']*100, 2)}%]"),
         y = glue::glue("PC2 [{round(var_expl['PC2']*100, 2)}%]"))

  ggsave(path, scatter)
}

In [9]:
# boxplot
#' Save one boxplot per sample column of an intensity table.
#'
#' The values are plotted exactly as supplied; no transformation is applied
#' inside this function. A red cross marks the mean of every column.
#'
#' @param protein_matrix Data frame with one column per sample.
#' @param title Plot title.
#' @param path Output file handed to `ggsave()` (saved as 6 x 6).
#' @return Whatever `ggsave()` returns.
boxplot_pg <- function(protein_matrix, title, path) {
  # Reshape into long format: one row per (sample column, value) pair.
  # pivot_longer() is the maintained replacement for the superseded gather().
  long_data <- tidyr::pivot_longer(
    protein_matrix,
    cols = tidyr::everything(),
    names_to = "file",
    values_to = "Intensity"
  )

  # A distinct local name avoids shadowing graphics::boxplot()
  box_fig <- ggplot(long_data, aes(x = file, y = Intensity)) +
    geom_boxplot() +
    stat_summary(fun = mean, geom = "point", shape = 4, size = 1.5, color = "red") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 5)) +
    labs(title = title)

  ggsave(path, box_fig, width = 6, height = 6)
}

In [10]:
#' Save a heatmap of the sample-to-sample correlation matrix.
#'
#' @param pg_matrix Table with one column per sample.
#' @param batch_info Data frame whose `condition` and `lab` columns annotate
#'   the heatmap columns.
#' @param name Label used to build the plot title.
#' @param plot_name_prefix Passed to pheatmap as `filename`, i.e. it is used
#'   as the complete output path.
#' @return Whatever `pheatmap::pheatmap()` returns.
heatmap_plot <- function(pg_matrix, batch_info, name, plot_name_prefix){
    # Correlations are computed on the rows that have no missing value
    complete_rows <- na.omit(pg_matrix)
    sample_cor <- cor(complete_rows, use = "pairwise.complete.obs")

    # Column annotation: condition and lab of every sample
    sample_annotation <- batch_info %>% select(condition, lab)

    pheatmap::pheatmap(
        sample_cor,
        annotation_col = sample_annotation,
        # dendrograms are hidden by giving them zero height
        treeheight_row = 0,
        treeheight_col = 0,
        fontsize_row = 5,
        fontsize_col = 5,
        width = 7,
        height = 7,
        main = paste0(name, ' heatmap'),
        filename = plot_name_prefix
    )
}


In [11]:
#' Produce the boxplot, PCA plot and correlation heatmap for one matrix.
#'
#' Sample names are shortened per lab before plotting and are then used both
#' as row names of `batch_info` and as column names of `pg_matrix`.
#' Output files are named
#' `<plot_name_prefix>_<subname>_<number>_{boxplot,pca,heatmap}.png`.
#'
#' @param pg_matrix Table with one column per sample.
#' @param batch_info Data frame with `file`, `lab` and `condition` columns,
#'   one row per sample.
#' @param plot_name_prefix Leading part of every output path.
#' @param subname Label used in plot titles and file names.
#' @param number Identifier appended to the file names.
#' @return Whatever `heatmap_plot()` returns (it is the last call).
plot_three_plots <- function(pg_matrix, batch_info, plot_name_prefix, subname, number){

  # The columns are renamed positionally below, so a size mismatch would
  # silently mislabel samples; stop early instead.
  if (ncol(pg_matrix) != nrow(batch_info)) {
    stop("plot_three_plots(): pg_matrix has ", ncol(pg_matrix),
         " columns but batch_info has ", nrow(batch_info), " rows.",
         call. = FALSE)
  }

  # When the matrix still carries the original sample names, put its columns
  # in batch_info order so the positional renaming cannot mix samples up.
  original_files <- as.character(batch_info$file)
  if (!is.null(colnames(pg_matrix)) &&
      anyDuplicated(original_files) == 0 &&
      all(original_files %in% colnames(pg_matrix))) {
    pg_matrix <- pg_matrix[, original_files, drop = FALSE]
  }

  # Shorten the sample names; the "_"-separated pieces that are kept depend
  # on the lab, and every other lab keeps its names unchanged.
  batch_info <- batch_info %>%
    mutate(file = case_when(
      lab %in% c('lab_A', 'lab_E') ~ str_split(file, "_") %>%
        map_chr(~ if (length(.x) == 4) paste(.x[1], .x[2], .x[4], sep = "_") else paste(.x[1], .x[2], sep = "_")),

      lab == 'lab_C' ~ str_split(file, "_") %>%
        map_chr(~ paste(.x[5], .x[6], sep = "_")),

      lab == 'lab_D' ~ str_split(file, "_") %>%
        map_chr(~ paste(.x[6], .x[8], sep = "_")),

      TRUE ~ file
    ))

  rownames(batch_info) <- batch_info$file
  colnames(pg_matrix) <- batch_info$file

  # Common leading part of the three output paths
  output_stem <- paste0(plot_name_prefix, "_", subname, "_", number)

  boxplot_pg(pg_matrix,
             paste0(subname, ' boxplot'),
             paste0(output_stem, "_boxplot.png"))

  pca_plot(pg_matrix,
           batch_info,
           paste0(subname, ' pca'),
           paste0(output_stem, "_pca.png"))

  heatmap_plot(pg_matrix,
               batch_info,
               subname,
               paste0(output_stem, "_heatmap.png"))

}

In [12]:
# Sample annotation table of the selected MODE (tab-separated file)
batch_info_ref <- read.delim(
  paste0("/home/yuliya/repos/other/removeBatch/test_data/raw_files_first_", MODE, "/bath_info_all.tsv"),
  check.names = FALSE
)

# The "rowname" column becomes the row names; lab and condition become factors
batch_info_ref <- column_to_rownames(batch_info_ref, 'rowname')
batch_info_ref$lab <- factor(batch_info_ref$lab)
batch_info_ref$condition <- factor(batch_info_ref$condition)

dim(batch_info_ref)
head(batch_info_ref)

Unnamed: 0_level_0,file,lab,condition
Unnamed: 0_level_1,<chr>,<fct>,<fct>
Ref8537_QC1_20230414_1,Ref8537_QC1_20230414_1,lab_A,Pyr
Ref8537_QC2_20230414_1,Ref8537_QC2_20230414_1,lab_A,Pyr
Ref8537_QC3_20230414_1,Ref8537_QC3_20230414_1,lab_A,Glu
Ref8537_QC4_20230414_1,Ref8537_QC4_20230414_1,lab_A,Glu
Ref8537_S4_20230414,Ref8537_S4_20230414,lab_A,Pyr
Ref8537_S8_20230414,Ref8537_S8_20230414,lab_A,Pyr


# Central analysis

In [13]:
# PG matrix
labs_list <- c('lab_A', 'lab_B', 'lab_C', 'lab_D', 'lab_E')

# Read the protein-group table of every lab and chain inner joins on the
# "rowname" column, so only keys present in all tables are kept.
pg_matrix <- labs_list %>%
  purrr::map(function(lab_name) {
    read.csv(
      paste0('/home/yuliya/repos/other/removeBatch/test_data/raw_files_first_', MODE, '/', lab_name, '_protein_groups_matrix.tsv'),
      check.names = FALSE,
      sep = "\t"
    )
  }) %>%
  purrr::reduce(inner_join, by = "rowname")

# Move the key into the row names, then apply log2(x + 1) to all values
pg_matrix <- log2(column_to_rownames(pg_matrix, 'rowname') + 1)

# temp_df <- read.csv( paste0("results/", MODE, '/lab_A', "_intensities_corrected.tsv"), sep='\t', row.names = 1,  check.names = FALSE)
# pg_matrix <- pg_matrix[rownames(temp_df),  batch_info_ref$file]

# Put the sample columns in the order given by the annotation table
pg_matrix <- pg_matrix[, batch_info_ref$file]

dim(pg_matrix)

In [14]:
# Plots of the merged matrix before any batch-effect correction
plot_name_prefix <- paste0("plots/", MODE_TWO, "/BEFORE_correction")
subname <- "A_B_conditions"
number <- "02"
plot_three_plots(
  pg_matrix = pg_matrix,
  batch_info = batch_info_ref,
  plot_name_prefix = plot_name_prefix,
  subname = subname,
  number = number
)


“[1m[22mRemoved 1472 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 1472 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


In [16]:
library(limma)

# Alternative call that additionally passes a condition design matrix
# (kept for reference, not active):
# design=model.matrix(~condition, data = batch_info_ref)
# pg_matrix_cured <- removeBatchEffect(pg_matrix, batch=batch_info_ref$lab, design=design) %>% as.data.frame()

# Active call: lab is the batch variable and no design matrix is supplied
pg_matrix_cured <- as.data.frame(
  removeBatchEffect(pg_matrix, batch = batch_info_ref$lab)
)

# Store the corrected matrix as a tab-separated table; the row names are
# written as a leading "rowname" column.
write.table(
  rownames_to_column(pg_matrix_cured, "rowname"),
  file = paste0('results/', MODE_TWO, '/central_cured.csv'),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

dim(pg_matrix_cured)

# data
## central

In [17]:
# Reload the centrally corrected matrix (tab-separated despite the .csv name);
# the first column provides the row names.
cured_central <- read.delim(
  paste0('results/', MODE_TWO, '/central_cured.csv'),
  row.names = 1,
  check.names = FALSE
)

# Sample columns in the order of the annotation table
cured_central <- cured_central[, batch_info_ref$file]

dim(cured_central)
head(cured_central)

Unnamed: 0_level_0,Ref8537_QC1_20230414_1,Ref8537_QC2_20230414_1,Ref8537_QC3_20230414_1,Ref8537_QC4_20230414_1,Ref8537_S4_20230414,Ref8537_S8_20230414,Ref8537_S11_20230414,Ref8537_S18_20230414,Ref8537_S21_20230414,Ref8537_S26_20230414,⋯,CVT09_s25_X006,CVT09_s28_X007,CVT09_s31_X008,CVT09_s36_X009,CVT09_s41_X010,CVT09_s59_X013,CVT09_s61_X014,CVT09_s73_X016,CVT09_s84_X018,CVT09_s93_X020
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
P0A8D6,21.53906,21.47999,21.35298,21.42296,21.38708,21.42679,21.61248,21.31955,21.04806,21.35049,⋯,21.45216,21.49386,21.53507,21.2404,21.26634,21.18392,21.26768,21.16703,21.31542,21.13339
P0AAX3,19.74532,19.75818,20.67724,20.27436,19.83848,19.39265,18.96252,19.5697,20.53163,19.93006,⋯,19.4326,19.88589,19.8937,19.80521,19.86551,21.17522,20.85025,20.65165,21.04656,20.52732
P00959,23.35176,23.42021,23.61553,23.69256,23.3474,23.3979,23.28755,23.13318,22.92499,23.30205,⋯,23.22424,23.10805,23.09847,23.485,23.42763,23.68962,23.67627,23.7442,23.71391,23.69579
P0AEQ1,20.91692,20.80512,19.54067,19.61048,21.21183,21.07712,20.81611,20.91952,21.68973,21.30193,⋯,20.83636,20.8825,20.82481,20.68226,20.66431,19.78983,19.97801,19.77275,19.69974,19.62681
P25738,23.16835,22.57489,22.53451,22.30098,24.12289,23.87931,23.90141,23.58926,23.35995,23.86318,⋯,23.67415,23.63801,23.77404,23.22189,23.18048,22.63059,23.16712,22.90152,22.61635,22.80023
P64596,22.66786,22.42444,22.06219,22.16543,22.78653,22.74092,22.68579,22.63808,23.01979,22.63787,⋯,22.65854,22.53766,22.53421,22.77303,22.7168,22.45179,22.36571,22.43304,22.41174,22.48665


In [18]:
# Plots of the centrally corrected matrix
plot_name_prefix <- paste0("plots/", MODE_TWO, "/R_after_correction")
subname <- "A_B_conditions"
number <- "02"
plot_three_plots(
  pg_matrix = cured_central,
  batch_info = batch_info_ref,
  plot_name_prefix = plot_name_prefix,
  subname = subname,
  number = number
)


“[1m[22mRemoved 1472 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 1472 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


## federated

In [19]:
cohorts <- c('lab_A', 'lab_B', 'lab_C', 'lab_D', 'lab_E')

# Rows of the central result define the row set and order of the comparison
reference_rows <- rownames(cured_central)

# Read the corrected table of every cohort and align its rows BY NAME to the
# reference before binding. cbind() joins data frames positionally, so a file
# with a different row order would otherwise be combined with shifted values.
# Rows absent from a cohort file come out as all-NA rows.
corrected_by_cohort <- lapply(cohorts, function(cohort) {
  file_path <- paste0("results/", MODE_TWO, "/", cohort, "_intensities_corrected.tsv")
  cohort_df <- read.csv(file_path, sep = '\t', row.names = 1, check.names = FALSE)
  cohort_df[reference_rows, , drop = FALSE]
})

# Combine all cohorts side by side and restore the reference row names
cured_federated <- do.call(cbind, corrected_by_cohort)
rownames(cured_federated) <- reference_rows

# Sample columns in the order of the annotation table
cured_federated <- cured_federated[, batch_info_ref$file]
dim(cured_federated)
head(cured_federated)

Unnamed: 0_level_0,Ref8537_QC1_20230414_1,Ref8537_QC2_20230414_1,Ref8537_QC3_20230414_1,Ref8537_QC4_20230414_1,Ref8537_S4_20230414,Ref8537_S8_20230414,Ref8537_S11_20230414,Ref8537_S18_20230414,Ref8537_S21_20230414,Ref8537_S26_20230414,⋯,CVT09_s25_X006,CVT09_s28_X007,CVT09_s31_X008,CVT09_s36_X009,CVT09_s41_X010,CVT09_s59_X013,CVT09_s61_X014,CVT09_s73_X016,CVT09_s84_X018,CVT09_s93_X020
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
P0A8D6,21.53906,21.47999,21.35298,21.42296,21.38708,21.42679,21.61248,21.31955,21.04806,21.35049,⋯,21.45216,21.49386,21.53507,21.2404,21.26634,21.18392,21.26768,21.16703,21.31542,21.13339
P0AAX3,19.74532,19.75818,20.67724,20.27436,19.83848,19.39265,18.96252,19.5697,20.53163,19.93006,⋯,19.4326,19.88589,19.8937,19.80521,19.86551,21.17522,20.85025,20.65165,21.04656,20.52732
P00959,23.35176,23.42021,23.61553,23.69256,23.3474,23.3979,23.28755,23.13318,22.92499,23.30205,⋯,23.22424,23.10805,23.09847,23.485,23.42763,23.68962,23.67627,23.7442,23.71391,23.69579
P0AEQ1,20.91692,20.80512,19.54067,19.61048,21.21183,21.07712,20.81611,20.91952,21.68973,21.30193,⋯,20.83636,20.8825,20.82481,20.68226,20.66431,19.78983,19.97801,19.77275,19.69974,19.62681
P25738,23.16835,22.57489,22.53451,22.30098,24.12289,23.87931,23.90141,23.58926,23.35995,23.86318,⋯,23.67415,23.63801,23.77404,23.22189,23.18048,22.63059,23.16712,22.90152,22.61635,22.80023
P64596,22.66786,22.42444,22.06219,22.16543,22.78653,22.74092,22.68579,22.63808,23.01979,22.63787,⋯,22.65854,22.53766,22.53421,22.77303,22.7168,22.45179,22.36571,22.43304,22.41174,22.48665


In [20]:
# Plots of the federated corrected matrix
plot_name_prefix <- paste0("plots/", MODE_TWO, "/FED_after_correction")
subname <- "A_B_conditions"
number <- "03"
plot_three_plots(
  pg_matrix = cured_federated,
  batch_info = batch_info_ref,
  plot_name_prefix = plot_name_prefix,
  subname = subname,
  number = number
)


“[1m[22mRemoved 1538 rows containing non-finite values (`stat_boxplot()`).”
“[1m[22mRemoved 1538 rows containing non-finite values (`stat_summary()`).”
[1m[22mSaving 6.67 x 6.67 in image


## comparison 

In [21]:
# Show the dimensions of both corrected matrices before comparing them
dim(cured_central)
dim(cured_federated)

In [22]:
# Exact equality of the two matrices after rounding to 2 decimal places
identical(
  round(cured_central, digits = 2),
  round(cured_federated, digits = 2)
)

In [23]:
# all.equal() on the matrices rounded to 2 decimal places; unlike identical()
# it reports the differences it finds instead of a bare FALSE
all.equal(
  round(cured_central, digits = 2),
  round(cured_federated, digits = 2)
)


In [24]:
# Mean absolute difference between the two corrected matrices; cells that
# are NA in either matrix are excluded from the mean
difference <- cured_central - cured_federated
abs_difference <- abs(difference)
# A single vectorised mean over all cells replaces the per-cell apply() call
# and yields the same value
mean_abs_difference <- mean(as.matrix(abs_difference), na.rm = TRUE)
mean_abs_difference