In [1]:
library(DEqMS)
library(patchwork)
library(tidyverse)

source("../../evaluation_utils/evaluation/DE_analysis.R")
source("../../evaluation_utils/plots/DE_plots.R")
source("../../evaluation_utils/filtering/filtering_normalization.R")

library(jsonlite)

Loading required package: ggplot2

“package ‘ggplot2’ was built under R version 4.2.3”
Loading required package: matrixStats

“package ‘matrixStats’ was built under R version 4.2.3”
Loading required package: limma

“package ‘tidyverse’ was built under R version 4.2.2”
“package ‘tibble’ was built under R version 4.2.3”
“package ‘tidyr’ was built under R version 4.2.2”
“package ‘readr’ was built under R version 4.2.2”
“package ‘purrr’ was built under R version 4.2.3”
“package ‘dplyr’ was built under R version 4.2.3”
“package ‘stringr’ was built under R version 4.2.3”
“package ‘forcats’ was built under R version 4.2.2”
“package ‘lubridate’ was built under R version 4.2.2”
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtibble   [39

# Separate run for meta

In [2]:
# Containers filled by the later cells.
# filter_list_META: one (initially empty) list per dataset variant; the later
# cells store the vectors of retained protein ids in it, keyed by lab name.
filter_list_META <- setNames(
  rep(list(list()), 3),
  c("balanced", "imbalanced", "downsampled")
)

# analyzed_proteins: per-dataset / per-lab protein ids, filled alongside
# filter_list_META in the per-lab DE loop.
analyzed_proteins <- list()

In [3]:
# Per-lab differential expression (DE) runs that feed the meta-analysis.
# For every dataset variant and every lab this cell:
#   1. reads the lab's metadata, intensity matrix and protein counts,
#   2. filters the proteins and records the retained protein ids,
#   3. fits the Glu - Pyr contrast and writes `<lab>_res.tsv`,
#   4. draws a volcano plot; the per-lab plots are saved as one grid per dataset.
# Finally the per-lab protein lists are written to `filter_list_META.json`.

# NOTE(review): this disables warnings for the rest of the session; the
# previous setting is never restored.
options(warn = -1)
datasets_list <- c('balanced', 'imbalanced', "downsampled")
labs_list <- c('lab_A', 'lab_B', 'lab_C', 'lab_D', 'lab_E')

# empty plot: pads the sixth slot of the 2 x 3 volcano grid (five labs)
x <- ggplot() + theme_minimal()

# Volcano plot with the thresholds shared by all labs. Extra arguments
# (e.g. show_legend) are forwarded unchanged to volcano_plot().
lab_volcano_plot <- function(de_table, plot_title, ...) {
  volcano_plot(
    de_table, plot_title,
    pval_threshold = 0.05, logfc_threshold = 0.5,
    show_names = FALSE, ...
  )
}

for (dataset in datasets_list) {
  plots_list <- list()
  cat('\n\nDataset: ', dataset, "\n")

  path_to_reports <- paste0('/home/yuliya/repos/cosybio/FedProt/data/bacterial_data/', dataset, '/')

  # Defined once per dataset (not per lab): the path is needed for the per-lab
  # result tables AND for the combined figure saved after the lab loop.
  output_path <- paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/', dataset, '/')
  # Make sure the target folder exists before anything is written into it.
  dir.create(output_path, recursive = TRUE, showWarnings = FALSE)

  for (name in labs_list) {
    cat('\nLab: ', name, "\n")

    batch_info <- read_tsv(paste0(path_to_reports, name, '/metadata.tsv'), show_col_types = FALSE)
    intensities <- read_tsv(paste0(path_to_reports, name, '/protein_groups_matrix.tsv'), show_col_types = FALSE)
    counts <- read_tsv(paste0(path_to_reports, name, '/protein_counts.tsv'), show_col_types = FALSE)

    # Protein ids move from the `rowname` column into the row names.
    intensities <- intensities %>% column_to_rownames('rowname')
    counts <- counts %>% column_to_rownames('rowname')
    # Keep only the samples listed in the metadata, in metadata order.
    intensities <- intensities[, batch_info$file]

    # Disabled option: filter out if count is 1 or 0
    # cat("Rows before filters:", nrow(intensities), "\n")
    # intensities <- intensities[counts$count > 1, ]

    # Project filtering helpers (sourced at the top of the notebook); they log
    # the matrix dimensions before and after each step.
    intensities <- filter_by_condition(
      intensities, batch_info,
      'file', c('Glu', 'Pyr'), 'condition'
    )
    intensities <- filter_na_proteins(intensities, batch_info, "file")

    # Remember which proteins survived filtering in this lab.
    kept_proteins <- rownames(intensities)
    filter_list_META[[dataset]][[name]] <- kept_proteins
    analyzed_proteins[[dataset]][[name]] <- kept_proteins

    cat("Rows after all filters:", nrow(intensities), "\n")
    # log2 transform with a pseudo-count of 1
    intensities <- log2(intensities + 1)

    # run DE
    design <- make_design(batch_info, 'condition')
    contrasts <- makeContrasts(Glu - Pyr, levels = colnames(design))
    de_results <- run_DE(intensities, counts, design, contrasts)
    de_results <- de_results %>% rownames_to_column('Protein')
    # write.table(de_results, file = paste0(output_path, name, '_res_FULL.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)
    write.table(de_results, file = paste0(output_path, name, '_res.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)

    # plot volcano plots: only the last panel (lab_E) keeps volcano_plot()'s
    # default legend setting; every other panel switches the legend off.
    plot_title <- paste(dataset, name, ", Glu/Pyr")
    plots_list[[name]] <- if (name == 'lab_E') {
      lab_volcano_plot(de_results, plot_title)
    } else {
      lab_volcano_plot(de_results, plot_title, show_legend = FALSE)
    }
  }

  # patchwork layout: three panels on top, two panels plus the blank one below.
  layout <- (plots_list[['lab_A']] | plots_list[['lab_B']] | plots_list[['lab_C']]) /
    (plots_list[['lab_D']] | plots_list[['lab_E']] | x)
  # save plot
  ggsave(file = paste0(output_path, "volcano_plots.svg"), plot = layout, width = 15, height = 8)
}


write_json(filter_list_META, "/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/filter_list_META.json")



Dataset:  balanced 

Lab:  lab_A 
Filtering by condition - two not-NA per condition
	Before filtering: 2549 24 
	After filtering: 2511 24 
Filtering out features that have NAs in all columns
	Before filtering: 2511 24 
	After filtering: 2511 24 
Rows after all filters: 2511 

Lab:  lab_B 
Filtering by condition - two not-NA per condition
	Before filtering: 2846 23 
	After filtering: 2819 23 
Filtering out features that have NAs in all columns
	Before filtering: 2819 23 
	After filtering: 2819 23 
Rows after all filters: 2819 

Lab:  lab_C 
Filtering by condition - two not-NA per condition
	Before filtering: 2820 23 
	After filtering: 2755 23 
Filtering out features that have NAs in all columns
	Before filtering: 2755 23 
	After filtering: 2755 23 
Rows after all filters: 2755 

Lab:  lab_D 
Filtering by condition - two not-NA per condition
	Before filtering: 2813 24 
	After filtering: 2776 24 
Filtering out features that have NAs in all columns
	Before filtering: 2776 24 
	After filt

In [4]:
# Protein sets for the meta-analyses, per dataset:
#   meta_filter - proteins retained in every lab (intersection)
#   meta_union  - proteins retained in at least one lab (union)
# The intersection is also stored as the 'Meta' entry of filter_list_META.
meta_filter <- list()
meta_union <- list()

# prepare filter for meta-analyses
for (dataset in datasets_list) {
  cat('\n\nDataset: ', dataset)

  # Retained protein ids of each lab, in labs_list order.
  proteins_by_lab <- lapply(
    labs_list,
    function(lab) filter_list_META[[dataset]][[lab]]
  )

  # Left fold over the labs: start from the first lab's ids and combine them
  # with each following lab in turn.
  meta_filter[[dataset]] <- Reduce(intersect, proteins_by_lab)
  meta_union[[dataset]] <- Reduce(union, proteins_by_lab)

  cat("\n\tIntersection length:", length(meta_filter[[dataset]]))
  cat("\n\tUnion length:", length(meta_union[[dataset]]))
  filter_list_META[[dataset]][['Meta']] <- meta_filter[[dataset]]
}



Dataset:  balanced
	Intersection length: 2232
	Union length: 3020

Dataset:  imbalanced
	Intersection length: 2229
	Union length: 3016

Dataset:  downsampled
	Intersection length: 2238
	Union length: 3027

In [5]:
# for(dataset in datasets_list){
#   for (name in labs_list) {
#     # read results
#     de_results <- read_tsv(paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/', dataset, '/', name, '_res_FULL.tsv'), show_col_types = FALSE)
#     de_results <- de_results %>% filter(Protein %in% meta_filter[[dataset]]) %>%
#       arrange(desc(Protein))
#     cat('\nDataset: ', dataset, 'Lab: ', name, 'Number of DE proteins: ', nrow(de_results))
#     write.table(de_results, file = paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/', dataset, '/', name, '_res.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)
#     }
# }

# Central run

In [6]:
# Central (pooled) analysis: for every dataset variant the per-lab tables are
# merged into one intensity matrix, one counts table and one metadata table,
# filtered, log2-transformed and tested for Glu - Pyr with the lab as an
# additional design term. Results and a volcano plot are written per dataset.
datasets_list <- c('balanced', 'imbalanced', "downsampled")
labs_list <- c('lab_A', 'lab_B', 'lab_C', 'lab_D', 'lab_E')

# Full outer join of two per-lab tables on the protein id column.
join_by_protein <- function(left, right) {
  full_join(left, right, by = 'rowname')
}

for (dataset in datasets_list) {

  path_to_reports <- paste0('/home/yuliya/repos/cosybio/FedProt/data/bacterial_data/', dataset, '/')

  # Collect the per-lab tables first, then combine them in lab order.
  n_labs <- length(labs_list)
  lab_intensities <- vector("list", n_labs)
  lab_counts <- vector("list", n_labs)
  lab_batch_info <- vector("list", n_labs)

  for (lab_idx in seq_len(n_labs)) {
    name <- labs_list[[lab_idx]]
    batch_info <- read_tsv(paste0(path_to_reports, name, '/metadata.tsv'), show_col_types = FALSE)
    intensities <- read_tsv(paste0(path_to_reports, name, '/protein_groups_matrix.tsv'), show_col_types = FALSE)
    counts <- read_tsv(paste0(path_to_reports, name, '/protein_counts.tsv'), show_col_types = FALSE)

    lab_intensities[[lab_idx]] <- intensities
    lab_counts[[lab_idx]] <- counts
    lab_batch_info[[lab_idx]] <- batch_info
  }

  # Left-to-right folds: proteins are matched across labs by `rowname`,
  # metadata rows are stacked.
  central_intensities <- Reduce(join_by_protein, lab_intensities)
  central_counts <- Reduce(join_by_protein, lab_counts)
  central_batch_info <- Reduce(rbind, lab_batch_info)

  central_batch_info <- mutate(
    central_batch_info,
    lab = as.factor(lab),
    condition = as.factor(condition)
  )

  cat('\n\nDataset: ', dataset, "\n")
  cat('\tNumber of proteins: ', nrow(central_intensities), '\n')
  # One column is the protein id (`rowname`), the rest are samples.
  cat('\tNumber of samples: ', ncol(central_intensities) - 1, '\n')

  central_intensities <- column_to_rownames(central_intensities, 'rowname')
  # save for RBE
  # write.table(central_intensities, file = paste0('/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/before/', dataset, '_central_intensities.tsv'), sep = "\t", quote = FALSE, row.names = TRUE)
  # write.table(central_batch_info, file = paste0('/home/yuliya/repos/other/removeBatch/evaluation_data/proteomics/before/', dataset, '_central_batch_info.tsv'), sep = "\t", quote = FALSE, row.names = FALSE)

  central_counts <- column_to_rownames(central_counts, 'rowname')
  # Keep only the samples listed in the pooled metadata, in metadata order.
  central_intensities <- central_intensities[, central_batch_info$file]

  central_intensities <- filter_by_condition(
    central_intensities, central_batch_info,
    'file', c('Glu', 'Pyr'), 'condition'
  )
  central_intensities <- filter_na_proteins(central_intensities, central_batch_info, "file")

  # select minimal count across column for each protein (with na.rm = TRUE);
  # the joined table holds one count column per lab, and the row-wise minimum
  # over all of them is stored in `count`, the only column kept afterwards.
  central_counts$count <- apply(central_counts, 1, min, na.rm = TRUE)
  central_counts <- central_counts %>% select(count) %>% as.data.frame()
  # central_intensities <- central_intensities[rownames(central_intensities) %in% rownames(central_counts %>% filter(count > 1)),]

  # Proteins that enter the central analysis.
  filter_list_META[[dataset]][['Central']] <- rownames(central_intensities)

  # central_intensities <- filter_per_center(central_intensities, central_batch_info, "file",
  #   unique(central_batch_info$lab), 'lab')

  # use the same proteins as in meta-analysis
  # central_intensities <- central_intensities[meta_filter[[dataset]],]

  cat("Rows after all filters:", nrow(central_intensities), "\n")

  # log2 transform with a pseudo-count of 1
  central_intensities <- log2(central_intensities + 1)

  # run DE analysis
  design <- make_design(central_batch_info, 'condition', 'lab')
  contrasts <- makeContrasts(Glu-Pyr, levels = colnames(design))
  de_results <- run_DE(central_intensities, central_counts, design, contrasts)
  de_results <- rownames_to_column(de_results, 'Protein')

  dataset_out_dir <- paste0('/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/', dataset)
  write.table(
    de_results,
    file = paste0(dataset_out_dir, '/results/central_res.tsv'),
    sep = "\t", quote = FALSE, row.names = FALSE
  )

  # plot volcano plot
  plot_result <- volcano_plot(
    de_results, paste(dataset, "central", ", Glu/Pyr"),
    pval_threshold = 0.05, logfc_threshold = 0.5,
    show_names = FALSE
  )
  ggsave(
    file = paste0(dataset_out_dir, '/central_volcano_plot.svg'),
    plot = plot_result, width = 8, height = 5
  )
}




Dataset:  balanced 
	Number of proteins:  3059 
	Number of samples:  118 
Filtering by condition - two not-NA per condition
	Before filtering: 3059 118 
	After filtering: 2862 118 
Filtering out features that have NAs in all columns
	Before filtering: 2862 118 
	After filtering: 2862 118 
Rows after all filters: 2862 


Dataset:  imbalanced 
	Number of proteins:  3058 
	Number of samples:  86 
Filtering by condition - two not-NA per condition
	Before filtering: 3058 86 
	After filtering: 2860 86 
Filtering out features that have NAs in all columns
	Before filtering: 2860 86 
	After filtering: 2860 86 
Rows after all filters: 2860 


Dataset:  downsampled 
	Number of proteins:  3059 
	Number of samples:  86 
Filtering by condition - two not-NA per condition
	Before filtering: 3059 86 
	After filtering: 2859 86 
Filtering out features that have NAs in all columns
	Before filtering: 2859 86 
	After filtering: 2859 86 
Rows after all filters: 2859 


In [7]:
# Export the protein ids that entered each analysis.
# Two differently spelled objects are written on purpose:
#   analysed_proteins - built here: per dataset, the 'Central' and 'Meta'
#                       entries of filter_list_META
#   analyzed_proteins - filled in the per-lab DE loop: per dataset and lab
analysed_proteins <- list()

for (dataset in datasets_list) {
  dataset_filters <- filter_list_META[[dataset]]

  proteins_for_dataset <- list()
  proteins_for_dataset[["central"]] <- dataset_filters[["Central"]]
  proteins_for_dataset[["meta"]] <- dataset_filters[["Meta"]]

  analysed_proteins[[dataset]] <- proteins_for_dataset
}

# write to json
bacterial_eval_dir <- "/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/"
write_json(analysed_proteins, paste0(bacterial_eval_dir, "analysed_proteins.json"))
write_json(analyzed_proteins, paste0(bacterial_eval_dir, "analysed_proteins_LABS.json"))

# Meta run

In [9]:
# Meta-analysis run: for every dataset variant the three meta-analysis scripts
# (MetaDE, MetaVolcanoR, RankProd) are executed via Rscript on the per-lab
# result folder, and the resulting `MA_*` files are copied to the dataset's
# results folder.

# Runs one shell command and reports a non-zero exit status instead of
# silently discarding it. Execution continues either way (best effort).
# message() is used rather than warning() because warnings are switched off
# earlier in this notebook via options(warn = -1).
run_shell_step <- function(command) {
  exit_status <- system(command)
  if (exit_status != 0) {
    message("Command exited with status ", exit_status, ": ", command)
  }
  invisible(exit_status)
}

meta_code_dir <- "/home/yuliya/repos/cosybio/FedProt/evaluation_utils/meta_code/"
meta_scripts <- c("run_MetaDE.R", "run_MetaVolcanoR.R", "run_RankProd.R")

for (dataset in datasets_list) {
    cat("Processing ", dataset, " dataset\n")

    # Folder with the per-lab `<lab>_res.tsv` tables; passed to every script as
    # its first argument.
    # NOTE: the former `system("cd <folder>")` call was removed. Each system()
    # call runs in its own shell, so that `cd` never affected the commands
    # below; they already receive absolute paths.
    meta_de_dir <- paste0("/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/Meta_DE/", dataset, "/")

    for (script in meta_scripts) {
        run_shell_step(paste0(
            "Rscript ", meta_code_dir, script, " ",
            meta_de_dir, " lab_A lab_B lab_C lab_D lab_E"
        ))
    }

    # Copy the resulting files to the desired directory
    run_shell_step(paste0(
        "cp ", meta_de_dir, "MA_* /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/",
        dataset, "/results/"
    ))
}

Processing  balanced  dataset
Processing  imbalanced  dataset
Processing  downsampled  dataset
