In [1]:
library(DEqMS)
library(patchwork)
library(tidyverse)

source("../../evaluation_utils/evaluation/DE_analysis.R")
source("../../evaluation_utils/plots/DE_plots.R")
source("../../evaluation_utils/filtering/filtering_normalization.R")

library(jsonlite)

Loading required package: ggplot2



“package ‘ggplot2’ was built under R version 4.2.3”
Loading required package: matrixStats

“package ‘matrixStats’ was built under R version 4.2.3”
Loading required package: limma

“package ‘tidyverse’ was built under R version 4.2.2”
“package ‘tibble’ was built under R version 4.2.3”
“package ‘tidyr’ was built under R version 4.2.2”
“package ‘readr’ was built under R version 4.2.2”
“package ‘purrr’ was built under R version 4.2.3”
“package ‘dplyr’ was built under R version 4.2.3”
“package ‘stringr’ was built under R version 4.2.3”
“package ‘forcats’ was built under R version 4.2.2”
“package ‘lubridate’ was built under R version 4.2.2”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    

# Separate run for meta

In [2]:
datasets_list = c('balanced', 'imbalanced')
labs_list = c('lab_A', 'lab_B', 'lab_C', 'lab_D' , 'lab_E')

filter_list_META = list("balanced" = list(), "imbalanced" = list())

# empty plot
x <- ggplot() + theme_minimal()

for(dataset in datasets_list){
  plots_list = list()
  cat('\n\nDataset: ', dataset, "\n")

  path_to_reports = paste0('/home/yuliya/repos/cosybio/FedProt/data/bacterial_data/', dataset, '/')
  
  for (name in labs_list) {
    output_path = paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/', dataset, '/')
    cat('\nLab: ', name, "\n")

    batch_info = read_tsv(paste0(path_to_reports, name, '/metadata.tsv'), show_col_types = FALSE)
    intensities = read_tsv(paste0(path_to_reports, name, '/protein_groups_matrix.tsv'), show_col_types = FALSE)
    counts = read_tsv(paste0(path_to_reports, name, '/protein_counts.tsv'), show_col_types = FALSE)

    intensities <- intensities %>% column_to_rownames('rowname')
    counts <- counts %>% column_to_rownames('rowname')
    intensities <- intensities[, batch_info$file]
    
    intensities <- filter_by_condition(intensities, batch_info, 
        'file', c('Glu', 'Pyr'), 'condition')
    intensities <- log2(intensities + 1)

    # run DE
    design <- make_design(batch_info, 'condition')
    contrasts <- makeContrasts(Glu-Pyr, levels = colnames(design))
    de_results <- run_DE(intensities, counts, design, contrasts)
    de_results <- de_results %>% rownames_to_column('Protein')
    write.table(de_results, file = paste0(output_path, name, '_res_FULL.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)

    filter_list_META[[dataset]][[name]] <- de_results[['Protein']]

    # plot volcano plots
    if(name == 'lab_E'){
        plot_separate <- volcano_plot(
        de_results, paste(dataset, name, ", Glu/Pyr"),
        pval_threshold = 0.01, logfc_threshold = 0.58,
        show_names = FALSE
      )
    } else {
      plot_separate <- volcano_plot(
        de_results, paste(dataset, name, ", Glu/Pyr"),
        pval_threshold = 0.01, logfc_threshold = 0.58,
        show_names = FALSE, show_legend = FALSE
      )
    }
    plots_list[[name]] = plot_separate
  }

  layout <- (plots_list[['lab_A']] | plots_list[['lab_B']] | plots_list[['lab_C']]) /
            (plots_list[['lab_D']] | plots_list[['lab_E']] | x)
  # save plot
  ggsave(file = paste0(output_path, "volcano_plots.svg"), plot = layout, width = 15, height = 8)
}


write_json(filter_list_META, "/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/filter_list_META.json")



Dataset:  balanced 

Lab:  lab_A 
Filtering by condition - two not-NA per condition
	Before filtering: 2549 24 
	After filtering: 2525 24 

Lab:  lab_B 
Filtering by condition - two not-NA per condition
	Before filtering: 2846 23 
	After filtering: 2828 23 

Lab:  lab_C 
Filtering by condition - two not-NA per condition
	Before filtering: 2820 23 
	After filtering: 2775 23 

Lab:  lab_D 
Filtering by condition - two not-NA per condition
	Before filtering: 2813 24 
	After filtering: 2785 24 

Lab:  lab_E 
Filtering by condition - two not-NA per condition
	Before filtering: 2401 24 
	After filtering: 2363 24 


Dataset:  imbalanced 

Lab:  lab_A 
Filtering by condition - two not-NA per condition
	Before filtering: 2548 16 
	After filtering: 2510 16 

Lab:  lab_B 
Filtering by condition - two not-NA per condition
	Before filtering: 2845 13 
	After filtering: 2798 13 

Lab:  lab_C 
Filtering by condition - two not-NA per condition
	Before filtering: 2818 14 
	After filtering: 2712 14 

L

In [3]:
# prepare filter for meta-analyses
meta_filter <- list()

for(dataset in datasets_list){
  cat('\n\nDataset: ', dataset)
  for (name in labs_list) {
    if(is.null(meta_filter[[dataset]])){
      meta_filter[[dataset]] <- filter_list_META[[dataset]][[name]]
    } else {
      meta_filter[[dataset]] <- intersect(meta_filter[[dataset]], filter_list_META[[dataset]][[name]])
    }
  }
  cat("\n\tIntersection length:",  length(meta_filter[[dataset]]))
}



Dataset:  balanced
	Intersection length: 2242

Dataset:  imbalanced
	Intersection length: 2223

In [4]:
for(dataset in datasets_list){
  for (name in labs_list) {
    # reaed results
    de_results <- read_tsv(paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/', dataset, '/', name, '_res_FULL.tsv'), show_col_types = FALSE)
    de_results <- de_results %>% filter(Protein %in% meta_filter[[dataset]]) %>%
      arrange(desc(Protein))
    cat('\nDataset: ', dataset, 'Lab: ', name, 'Number of DE proteins: ', nrow(de_results))
    write.table(de_results, file = paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/', dataset, '/', name, '_res.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)
    }
}


Dataset:  balanced Lab:  lab_A Number of DE proteins:  2242
Dataset:  balanced Lab:  lab_B Number of DE proteins:  2242
Dataset:  balanced Lab:  lab_C Number of DE proteins:  2242
Dataset:  balanced Lab:  lab_D Number of DE proteins:  2242
Dataset:  balanced Lab:  lab_E Number of DE proteins:  2242
Dataset:  imbalanced Lab:  lab_A Number of DE proteins:  2223
Dataset:  imbalanced Lab:  lab_B Number of DE proteins:  2223
Dataset:  imbalanced Lab:  lab_C Number of DE proteins:  2223
Dataset:  imbalanced Lab:  lab_D Number of DE proteins:  2223
Dataset:  imbalanced Lab:  lab_E Number of DE proteins:  2223

run metaanalyses

In [None]:
# cd /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/balanced/

# Rscript /home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/run_MetaDE.R /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/balanced/ lab_A lab_B lab_C lab_D
# Rscript /home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/run_MetaVolcanoR.R /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/balanced/ lab_A lab_B lab_C lab_D
# Rscript /home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/run_RankProd.R /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/balanced/ lab_A lab_B lab_C lab_D

# cp /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/balanced/MA_* /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/balanced/results/

In [None]:
# cd /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/imbalanced/

# Rscript /home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/run_MetaDE.R /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/imbalanced/ lab_A lab_B lab_C lab_D
# Rscript /home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/run_MetaVolcanoR.R /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/imbalanced/ lab_A lab_B lab_C lab_D
# Rscript /home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/run_RankProd.R /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/imbalanced/ lab_A lab_B lab_C lab_D

# cp /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/imbalanced/MA_* /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/imbalanced/results/

# Centralized run

In [5]:
datasets_list = c('balanced', 'imbalanced')
labs_list = c('lab_A', 'lab_B', 'lab_C', 'lab_D' , 'lab_E')


for(dataset in datasets_list){

  path_to_reports = paste0('/home/yuliya/repos/cosybio/FedProt/data/bacterial_data/', dataset, '/')
  
 
  central_intensities = NULL
  central_counts = NULL
  central_batch_info = read_tsv(paste0(path_to_reports, 'bath_info_all.tsv'), show_col_types = FALSE)
  central_batch_info <- central_batch_info %>%
    mutate(lab = as.factor(lab), condition = as.factor(condition))

  for (name in labs_list) {
    
    intensities = read_tsv(paste0(path_to_reports, name, '/protein_groups_matrix.tsv'), show_col_types = FALSE)
    counts = read_tsv(paste0(path_to_reports, name, '/protein_counts.tsv'), show_col_types = FALSE)

    if(is.null(central_intensities)){
      central_intensities = intensities
      central_counts = counts
    } else {
      central_intensities = full_join(central_intensities, intensities, by = 'rowname')
      central_counts = full_join(central_counts, counts, by = 'rowname')
    }
  }

  cat('\n\nDataset: ', dataset, "\n")
  cat('\tNumber of proteins: ', nrow(central_intensities), '\n')
  cat('\tNumber of samples: ', ncol(central_intensities), '\n')

  central_intensities <- central_intensities %>% column_to_rownames('rowname')
  # save for RBE 
  # write.table(central_intensities, file = paste0('/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/before/', dataset, '_central_intensities.tsv'), sep = "\t", quote = FALSE, row.names = TRUE)
  # write.table(central_batch_info, file = paste0('/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/before/', dataset, '_central_batch_info.tsv'), sep = "\t", quote = FALSE, row.names = FALSE) 
  central_counts <- central_counts %>% column_to_rownames('rowname')
  central_intensities <- central_intensities[, central_batch_info$file]
    
  central_intensities <- filter_by_condition(central_intensities, central_batch_info, 
        'file', c('Glu', 'Pyr'), 'condition')
  central_intensities <- log2(central_intensities + 1)

  # select minimal count across column for each protein (with na.rm = TRUE)
  central_counts$count <- apply(central_counts, 1, min, na.rm = TRUE)
  central_counts <- central_counts %>% select(count)

  # run DE analysis
  design <- make_design(central_batch_info, 'condition', 'lab')
  contrasts <- makeContrasts(Glu-Pyr, levels = colnames(design))
  de_results <- run_DE(central_intensities, central_counts, design, contrasts)
  de_results <- de_results %>% rownames_to_column('Protein')
  write.table(de_results, file = paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/', dataset, '/results/central_res.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)

  # plot volcano plot
  plot_result <- volcano_plot(
    de_results, paste(dataset, "central", ", Glu/Pyr"),
    pval_threshold = 0.01, logfc_threshold = 0.58,
    show_names = FALSE
  )
  ggsave(file = paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/', dataset, '/central_volcano_plot.svg'), plot = plot_result, width = 8, height = 5)

  filter_list_META[[dataset]][['Central']] <- de_results[['Protein']]
}


write_json(filter_list_META, "/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/protein_lists.json")



Dataset:  balanced 
	Number of proteins:  3059 
	Number of samples:  119 
Filtering by condition - two not-NA per condition
	Before filtering: 3059 118 
	After filtering: 3034 118 


“Partial NA coefficients for 761 probe(s)”




Dataset:  imbalanced 
	Number of proteins:  3058 
	Number of samples:  74 
Filtering by condition - two not-NA per condition
	Before filtering: 3058 73 
	After filtering: 3014 73 


“Partial NA coefficients for 741 probe(s)”
