From 4298f5d10b404a028d2b045e9fe5ed934d11d6bd Mon Sep 17 00:00:00 2001 From: "I. I. Badshah" Date: Tue, 1 Jul 2025 10:40:50 +0100 Subject: [PATCH] [v0.2.16] `normalize_areas...` dynamic cut off column ('max_scr' default) --- DESCRIPTION | 2 +- R/script_helpers.R | 1001 ++++++++++++++++++++++++++++---------------- 2 files changed, 650 insertions(+), 353 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d6d53ba..b780bcf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: protools2 Type: Package Title: A Set Of Tools For Proteomics And Phosphoproteomics Data Analysis -Version: 0.2.15 +Version: 0.2.16 Date: 2025-04-15 Authors@R: c( person("Pedro", "Cutillas", , "p.cutillas@qmul.ac.uk", role = "aut", diff --git a/R/script_helpers.R b/R/script_helpers.R index 7aa8153..d57731d 100644 --- a/R/script_helpers.R +++ b/R/script_helpers.R @@ -19,11 +19,12 @@ #' #' @export fix_combiPeptData <- function( - df.combi = df.combi, - organism = organism_dbs, - pescal.xlsx = pescal.output.file, - fixed.xlsx = df.combi.fixed.xlsx, - save.xlsx = TRUE) { + df.combi = df.combi, + organism = organism_dbs, + pescal.xlsx = pescal.output.file, + fixed.xlsx = df.combi.fixed.xlsx, + save.xlsx = TRUE +) { require(tidyverse) # Name columns @@ -48,7 +49,9 @@ fix_combiPeptData <- function( df1 <- df.combi %>% dplyr::mutate(uniprot_split = stringr::str_split(uniprot, "; ")) %>% tidyr::unnest(uniprot_split) %>% - dplyr::mutate(uniprot_split = stringr::str_replace_all(uniprot_split, ";", "")) + dplyr::mutate( + uniprot_split = stringr::str_replace_all(uniprot_split, ";", "") + ) df2 <- df.uniprot %>% dplyr::filter(Entry %in% df1$uniprot_split) %>% @@ -57,7 +60,9 @@ fix_combiPeptData <- function( df3 <- df1 %>% dplyr::left_join(df2, by = c("uniprot_split" = "Entry")) %>% dplyr::group_by(db_id) %>% - dplyr::summarise(genes_new = paste(Gene.Names..primary., collapse = "; ")) %>% + dplyr::summarise( + genes_new = paste(Gene.Names..primary., collapse = "; ") + ) %>% dplyr::mutate(genes_new = paste0(genes_new, ";")) df4 <- df.combi %>% @@ -68,8 +73,15 @@ fix_combiPeptData <- function( dplyr::rowwise() %>% dplyr::mutate( genes_mod_vals = list(stringr::str_split_1(genes_mod, ";")), - genes_mod_keep = list(unlist(stringr::str_extract_all(genes_mod_vals, "\\(.*?\\)"))), - genes_new_split = list(stringr::str_replace_all(stringr::str_split_1(genes_new, "; "), ";", "")), + genes_mod_keep = list(unlist(stringr::str_extract_all( + genes_mod_vals, + "\\(.*?\\)" + ))), + genes_new_split = list(stringr::str_replace_all( + stringr::str_split_1(genes_new, "; "), + ";", + "" + )), genes_mod_reinsert = list(paste0(genes_new_split, genes_mod_keep)), genes_mod_recomb = paste0(genes_mod_reinsert, collapse = ";"), genes_mod_recomb = paste0(genes_mod_recomb, ";") @@ -95,7 +107,13 @@ fix_combiPeptData <- function( # Load existing Pescal Excel wb <- openxlsx::loadWorkbook(pescal.xlsx) # Overwrite combiPeptData sheet - openxlsx::writeData(wb, sheet = "combiPeptData", x = df5, colNames = TRUE, rowNames = FALSE) + openxlsx::writeData( + wb, + sheet = "combiPeptData", + x = df5, + colNames = TRUE, + rowNames = FALSE + ) # Save changes to new Excel file openxlsx::saveWorkbook(wb, fixed.xlsx, overwrite = TRUE) } @@ -239,20 +257,21 @@ impute_na <- function(df.design, df.areas) { cols <- df.design$heading[df.design$condition == condition] # Calculate the mean of each row for the current condition, ignoring NA values (`drop = FALSE` ensures that the result is still a data frame) - row_means <- apply(df.areas[, cols, drop = FALSE], 1, function(x) if (all(is.na(x))) NA else mean(x, na.rm = TRUE)) + row_means <- apply(df.areas[, cols, drop = FALSE], 1, function(x) { + if (all(is.na(x))) NA else mean(x, na.rm = TRUE) + }) # Replace NA values in df.areas for the current condition with the row means for (heading in cols) { is_na <- is.na(df.areas[, heading]) df.areas[is_na, heading] <- row_means[is_na] - # If all values in the row for the current condition are NA, replace NA with the minimum value of the dataset [new method] (column - 1 [old method]) + # If all values in the row for the current condition are NA, replace NA with the minimum value of the dataset - 1 [new method] (column - 1 [old method]) all_na <- is.na(row_means) # df.areas[all_na, heading] <- min(df.areas[, heading], na.rm = TRUE) - 1 # min of column (or min - 1) # Assign the minimum value minus 1 to the specified location df.areas[all_na, heading] <- min_value - 1 - } } @@ -288,11 +307,13 @@ impute_na_knn <- function(df.design, df.areas, k = 5) { # Original function mistakenly used object name not of parameter, but of object in # global environment. This meant that `df.combi` was using original from global scope not from argument normalize_areas_return_ppindex_edit <- function( - pescal_output_file, - delta_score_cut_off = 0, # if (fragpipe) {0} else {5} - k_NN = NULL + pescal_output_file, + score_column = "max_scr", + score_cut_off = 0, # if (fragpipe) {0} else {5} + k_NN = NULL ) { - # Set delta_score_cut_off to low (say 1) for proteomics data, + # delta_score_cut_off renamed `score_cut_off` as `score_column` is dynamically set (default 'max_scr') + # Set to low (say 1) for proteomics data, # High (say 15) for phosphoproteomics data library(foreach) library(doParallel) @@ -307,9 +328,15 @@ normalize_areas_return_ppindex_edit <- function( df.design <- readxl::read_excel(pescal_output_file, "design") ) - # select peptides above the delta_score_cut_off - df.combi <- subset(df.combi, df.combi$max_delta_score > delta_score_cut_off) - peptides <- unique(unlist(df.combi[, 25])) # 'sites' + # select peptides above the score_cut_off + # df.combi <- subset(df.combi, df.combi$max_delta_score > delta_score_cut_off) # Now column is dynamically set in line below + df.combi <- subset(df.combi, df.combi[[score_column]] > score_cut_off) + # peptides <- unique(unlist(df.combi[, 25])) # 25 = 'sites' (legacy combiPeptData); below is more robust + if ("sites" %in% colnames(df.combi)) { + peptides <- unique(unlist(df.combi[, "sites"])) + } else { + peptides <- unique(unlist(df.combi[, 25])) + } df.areas <- df.areas[df.areas$db_id %in% df.combi$db_id, ] cols <- colnames(dplyr::select_if(df.areas, is.numeric)) df.areas.n <- data.frame( @@ -321,10 +348,16 @@ normalize_areas_return_ppindex_edit <- function( cl <- makeCluster(cores[1] - 1) # not to overload your computer registerDoParallel(cl) t1 <- Sys.time() - df <- foreach(p = peptides, .combine = "rbind") %dopar% { - ids <- na.omit(df.combi[df.combi[, 25] == p, ]$db_id) - apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum) - } + df <- foreach(p = peptides, .combine = "rbind") %dopar% + { + # ids <- na.omit(df.combi[df.combi[, 25] == p, ]$db_id) # 25 = 'sites' (legacy combiPeptData); below is more robust + if ("sites" %in% colnames(df.combi)) { + ids <- na.omit(df.combi[df.combi[, "sites"] == p, ]$db_id) + } else { + ids <- na.omit(df.combi[df.combi[, 25] == p, ]$db_id) + } + apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum) + } stopCluster(cl) df.norm <- data.frame(sites = peptides, df * 1e+06) rownames(df.norm) <- df.norm$sites @@ -347,7 +380,8 @@ normalize_areas_return_ppindex_edit <- function( # Normalised NA imputation (no log2, no scale, no centre) ---- df.norm.na.imputed.no.log2.no.scale.no.centre <- df.norm df.norm.na.imputed.no.log2.no.scale.no.centre[cols] <- lapply( - df.norm.na.imputed.no.log2.no.scale.no.centre[cols], function(x) { + df.norm.na.imputed.no.log2.no.scale.no.centre[cols], + function(x) { replace(x, is.na(x), min(x, na.rm = TRUE) / 5) # Correct NA imputation for no log2 (legacy method) } ) @@ -370,17 +404,19 @@ normalize_areas_return_ppindex_edit <- function( df.norm.log2.centered.scaled.na.imputed2 <- df.norm.log2.centered.scaled # Correct NA imputation (legacy method) df.norm.log2.centered.scaled.na.imputed[cols] <- lapply( - df.norm.log2.centered.scaled.na.imputed[cols], function(x) { + df.norm.log2.centered.scaled.na.imputed[cols], + function(x) { replace(x, is.na(x), min(x, na.rm = TRUE) - 1) # Correct for log2 } ) # Previous NA imputation (renamed by appending 1) df.norm.log2.centered.scaled.na.imputed1[ is.na(df.norm.log2.centered.scaled.na.imputed1) - ] <- min(df.norm.log2.centered.scaled.na.imputed1[, cols], na.rm = T) / 5 # Previous incorrect method for log2 + ] <- min(df.norm.log2.centered.scaled.na.imputed1[, cols], na.rm = T) / 5 # Previous incorrect method for log2 # Alternate NA imputation (renamed by appending 2) df.norm.log2.centered.scaled.na.imputed2[cols] <- lapply( - df.norm.log2.centered.scaled.na.imputed2[cols], function(x) { + df.norm.log2.centered.scaled.na.imputed2[cols], + function(x) { replace(x, is.na(x), min(x, na.rm = TRUE)) # Alternate NA imputation (not used) } ) @@ -403,7 +439,8 @@ normalize_areas_return_ppindex_edit <- function( df.norm.log2.centered.na.imputed <- df.norm.log2.centered # Correct NA imputation (legacy method) df.norm.log2.centered.na.imputed[cols] <- lapply( - df.norm.log2.centered.na.imputed[cols], function(x) { + df.norm.log2.centered.na.imputed[cols], + function(x) { replace(x, is.na(x), min(x, na.rm = TRUE) - 1) # Correct for log2 } ) @@ -433,16 +470,16 @@ normalize_areas_return_ppindex_edit <- function( return(list( normalized.data = df.norm, - normalized.na.imputed = df.norm.na.imputed, # New NA imputation (no log2, no centre, no scale) + normalized.na.imputed = df.norm.na.imputed, # New NA imputation (no log2, no centre, no scale) df.norm.na.imputed.no.log2.no.scale.no.centre = df.norm.na.imputed.no.log2.no.scale.no.centre, # legacy NA imputation normalized.plus.log2.cent.data = df.norm.log2.centered, normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled, df.norm.log2.centered.na.imputed = df.norm.log2.centered.na.imputed, df.norm.log2.centered.na.imputed.new = df.norm.log2.centered.na.imputed.new, - df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed, # Main output legacy + df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed, # Main output legacy df.norm.log2.centered.scaled.na.imputed1 = df.norm.log2.centered.scaled.na.imputed1, df.norm.log2.centered.scaled.na.imputed2 = df.norm.log2.centered.scaled.na.imputed2, - df.norm.log2.centered.scaled.na.imputed.new = df.norm.log2.centered.scaled.na.imputed.new # Main output new (log2, centre, scale) + df.norm.log2.centered.scaled.na.imputed.new = df.norm.log2.centered.scaled.na.imputed.new # Main output new (log2, centre, scale) )) } # Alias for the new function @@ -454,17 +491,24 @@ normalize_areas_return_ppindex <- normalize_areas_return_ppindex_edit # Original function mistakenly used object name not of parameter, but of object in # global environment. This meant that `df.combi` was using original from global scope not from argument normalize_areas_return_protein_groups_edit <- function( - pescal_output_file, - mascot.score.cut.off = 50, # if (fragpipe) {0} else {40} - n.peptide.cut.off = 1, # if (fragpipe) {0} else {1} - k_NN = NULL + pescal_output_file, + score_column = "max_scr", + score_cut_off = 0, # if (fragpipe) {0} else {40} + n_peptide_cut_off = 0, # if (fragpipe) {0} else {1} + k_NN = NULL ) { suppressMessages( - df.areas <- data.frame(readxl::read_excel(pescal_output_file, "output_areas")) + df.areas <- data.frame(readxl::read_excel( + pescal_output_file, + "output_areas" + )) ) colnames(df.areas) <- gsub("-", ".", colnames(df.areas), fixed = T) suppressMessages( - df.combi <- data.frame(readxl::read_excel(pescal_output_file, "combiPeptData")) # Original `pescal.output.file` object is accessed from global scope + df.combi <- data.frame(readxl::read_excel( + pescal_output_file, + "combiPeptData" + )) # Original `pescal.output.file` object is accessed from global scope ) suppressMessages( df.design <- readxl::read_excel(pescal_output_file, "design") @@ -478,7 +522,7 @@ normalize_areas_return_protein_groups_edit <- function( ) # find protein groups - protein.groups <- na.omit(unique(unlist(df.combi[, 29]))) # 29 = genes + protein.groups <- na.omit(unique(unlist(df.combi[, 29]))) # 29 = 'gene' n.protein.groups <- length(protein.groups) # group peptides by protein group @@ -486,26 +530,27 @@ normalize_areas_return_protein_groups_edit <- function( cl <- makeCluster(cores[1] - 1) # not to overload your computer registerDoParallel(cl) t1 <- Sys.time() - df <- foreach(p = protein.groups, .combine = "rbind") %dopar% { - dfx <- df.combi[df.combi[, 29] == p, ] - ids <- na.omit(dfx$db_id) - best.mascot.score <- max(dfx$max_scr, na.rm = T) - protein.name <- dfx$protein[1] - acc <- na.omit(dfx$acc_no)[1] - uniprot.id <- na.omit(dfx[1, 30])[1] - n.peptides <- length(ids) - nPSMs <- na.omit(dfx[, "N_peptides"]) - c( - protein.group = p, - apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum), - best.mascot.score = best.mascot.score, - n.peptides = n.peptides, - n.psm = sum(nPSMs), - acc = acc, - uniprot.id = uniprot.id, - protein.name = protein.name - ) - } + df <- foreach(p = protein.groups, .combine = "rbind") %dopar% + { + dfx <- df.combi[df.combi[, 29] == p, ] # 29 = 'gene' + ids <- na.omit(dfx$db_id) + best.mascot.score <- max(dfx[[score_column]], na.rm = T) # Now dynamic instead of hard-coding `dfx$max_scr` + protein.name <- dfx$protein[1] + acc <- na.omit(dfx$acc_no)[1] + uniprot.id <- na.omit(dfx[1, 30])[1] # 30 = 'uniprot_acc' + n.peptides <- length(ids) + nPSMs <- na.omit(dfx[, "N_peptides"]) + c( + protein.group = p, + apply(df.areas.n[df.areas.n$ids %in% ids, cols], 2, sum), + best.mascot.score = best.mascot.score, + n.peptides = n.peptides, + n.psm = sum(nPSMs), + acc = acc, + uniprot.id = uniprot.id, + protein.name = protein.name + ) + } stopCluster(cl) write.csv(df, "temp.csv") @@ -530,7 +575,8 @@ normalize_areas_return_protein_groups_edit <- function( # Normalised NA imputation (no log2, no scale, no centre) ---- df.norm.na.imputed.no.log2.no.scale.no.centre <- df.norm df.norm.na.imputed.no.log2.no.scale.no.centre[cols] <- lapply( - df.norm.na.imputed.no.log2.no.scale.no.centre[cols], function(x) { + df.norm.na.imputed.no.log2.no.scale.no.centre[cols], + function(x) { replace(x, is.na(x), min(x, na.rm = TRUE) / 5) # Correct NA imputation for no log2 (legacy method) } ) @@ -570,7 +616,8 @@ normalize_areas_return_protein_groups_edit <- function( # Correct NA imputation (legacy method) ---- df.norm.log2.centered.scaled.na.imputed[cols] <- lapply( - df.norm.log2.centered.scaled.na.imputed[cols], function(x) { + df.norm.log2.centered.scaled.na.imputed[cols], + function(x) { replace(x, is.na(x), min(x, na.rm = TRUE) - 1) # Correct NA imputation for log2 } ) @@ -591,23 +638,39 @@ normalize_areas_return_protein_groups_edit <- function( rownames(df.norm) <- df.norm.log2.centered$protein.group rownames(df.norm.na.imputed) <- df.norm.log2.centered$protein.group - rownames(df.norm.na.imputed.no.log2.no.scale.no.centre) <- df.norm.na.imputed.no.log2.no.scale.no.centre$protein.group + rownames( + df.norm.na.imputed.no.log2.no.scale.no.centre + ) <- df.norm.na.imputed.no.log2.no.scale.no.centre$protein.group rownames(df.norm.log2.centered) <- df.norm.log2.centered$protein.group - rownames(df.norm.log2.centered.scaled) <- df.norm.log2.centered.scaled$protein.group - rownames(df.norm.log2.centered.na.imputed.new) <- df.norm.log2.centered.na.imputed.new$protein.group - rownames(df.norm.log2.centered.scaled.na.imputed) <- df.norm.log2.centered.scaled.na.imputed$protein.group - rownames(df.norm.log2.centered.scaled.na.imputed.new) <- df.norm.log2.centered.scaled.na.imputed.new$protein.group - + rownames( + df.norm.log2.centered.scaled + ) <- df.norm.log2.centered.scaled$protein.group + rownames( + df.norm.log2.centered.na.imputed.new + ) <- df.norm.log2.centered.na.imputed.new$protein.group + rownames( + df.norm.log2.centered.scaled.na.imputed + ) <- df.norm.log2.centered.scaled.na.imputed$protein.group + rownames( + df.norm.log2.centered.scaled.na.imputed.new + ) <- df.norm.log2.centered.scaled.na.imputed.new$protein.group + + # FILTERING xx <- x[ - x$best.mascot.score > mascot.score.cut.off & - x$n.peptides > n.peptide.cut.off, + x$best.mascot.score > score_cut_off & # 'best.mascot.score' created internally previously, i.e. not a df.combi column name + x$n.peptides > n_peptide_cut_off, ] selected.prot.groups <- xx$protein.group cc <- c( - "protein.group", "best.mascot.score", "n.peptides", - "n.psm", "acc", "uniprot.id", "protein.name" + "protein.group", + "best.mascot.score", + "n.peptides", + "n.psm", + "acc", + "uniprot.id", + "protein.name" ) df.norm <- merge.data.frame(df.norm, x[, cc], by = "protein.group") @@ -648,12 +711,17 @@ normalize_areas_return_protein_groups_edit <- function( ) return(list( - normalized.data = df.norm[df.norm$protein.group %in% selected.prot.groups, ], - normalized.na.imputed = df.norm.na.imputed[ # New NA imputation (no log2, no centre, no scale) + normalized.data = df.norm[ + df.norm$protein.group %in% selected.prot.groups, + ], + normalized.na.imputed = df.norm.na.imputed[ + # New NA imputation (no log2, no centre, no scale) df.norm.na.imputed$protein.group %in% selected.prot.groups, ], - df.norm.na.imputed.no.log2.no.scale.no.centre = df.norm.na.imputed.no.log2.no.scale.no.centre[ # Legacy NA imputation - df.norm.na.imputed.no.log2.no.scale.no.centre$protein.group %in% selected.prot.groups, + df.norm.na.imputed.no.log2.no.scale.no.centre = df.norm.na.imputed.no.log2.no.scale.no.centre[ + # Legacy NA imputation + df.norm.na.imputed.no.log2.no.scale.no.centre$protein.group %in% + selected.prot.groups, ], normalized.plus.log2.cent.data = df.norm.log2.centered[ df.norm.log2.centered$protein.group %in% selected.prot.groups, @@ -661,14 +729,19 @@ normalize_areas_return_protein_groups_edit <- function( normalized.plus.log2.cent.scaled.data = df.norm.log2.centered.scaled[ df.norm.log2.centered.scaled$protein.group %in% selected.prot.groups, ], - df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed[ # Legacy NA imputation - df.norm.log2.centered.scaled.na.imputed$protein.group %in% selected.prot.groups, + df.norm.log2.centered.scaled.na.imputed = df.norm.log2.centered.scaled.na.imputed[ + # Legacy NA imputation + df.norm.log2.centered.scaled.na.imputed$protein.group %in% + selected.prot.groups, ], df.norm.log2.centered.na.imputed.new = df.norm.log2.centered.na.imputed.new[ - df.norm.log2.centered.na.imputed.new$protein.group %in% selected.prot.groups, + df.norm.log2.centered.na.imputed.new$protein.group %in% + selected.prot.groups, ], - df.norm.log2.centered.scaled.na.imputed.new = df.norm.log2.centered.scaled.na.imputed.new[ # Main output, new NA imputation (with log2, centre, scale) - df.norm.log2.centered.scaled.na.imputed.new$protein.group %in% selected.prot.groups, + df.norm.log2.centered.scaled.na.imputed.new = df.norm.log2.centered.scaled.na.imputed.new[ + # Main output, new NA imputation (with log2, centre, scale) + df.norm.log2.centered.scaled.na.imputed.new$protein.group %in% + selected.prot.groups, ] )) } @@ -733,7 +806,10 @@ summary.qual.data_edit <- function(df.combi, df.design, fragpipe) { big.mark = "," ) if (fragpipe) { - df.mods.phospho.st <- df.mods[grepl("S\\(", df.mods$Var1, fixed = TRUE) | grepl("T\\(", df.mods$Var1, fixed = TRUE), ] + df.mods.phospho.st <- df.mods[ + grepl("S\\(", df.mods$Var1, fixed = TRUE) | + grepl("T\\(", df.mods$Var1, fixed = TRUE), + ] df.mods.phospho.y <- df.mods[grepl("Y\\(", df.mods$Var1, fixed = TRUE), ] } else { df.mods.phospho.st <- df.mods[grepl("ST", df.mods$Var1), ] @@ -746,18 +822,40 @@ summary.qual.data_edit <- function(df.combi, df.design, fragpipe) { nconditions <- length(unique(df.design$condition)) nruns <- nrow(df.design) df.phospho <- subset(df.combi, grepl("Phospho", df.combi$pep_mod, fixed = T)) - total.phosphoproteins <- format(length(unique(df.phospho$acc_no)), big.mark = ",") + total.phosphoproteins <- format( + length(unique(df.phospho$acc_no)), + big.mark = "," + ) total.proteins <- format(length(unique(df.combi$acc_no)), big.mark = ",") summary.text <- paste( - "The experiment identified", n.peptides.total, - "peptides, of which", n.peptides.unique, "were unique. There were", - phospho.s.unique, "unique phosphoserine,", phospho.t.unique, - "phosphothreonine and", phospho.y.unique, "phosphotyrosine sites identified and quantified in the experiment.", - "A total of", total.proteins, "proteins were identified in the experiment of which", - total.phosphoproteins, "were phosphorylated.", " \n", - "The experiment compared", nconditions, "conditions in", - if ((nruns / nconditions) %% 1 == 0) nruns / nconditions else paste("a total of", nruns), - "replicates, requiring", nruns, "LC-MS/MS runs." + "The experiment identified", + n.peptides.total, + "peptides, of which", + n.peptides.unique, + "were unique. There were", + phospho.s.unique, + "unique phosphoserine,", + phospho.t.unique, + "phosphothreonine and", + phospho.y.unique, + "phosphotyrosine sites identified and quantified in the experiment.", + "A total of", + total.proteins, + "proteins were identified in the experiment of which", + total.phosphoproteins, + "were phosphorylated.", + " \n", + "The experiment compared", + nconditions, + "conditions in", + if ((nruns / nconditions) %% 1 == 0) { + nruns / nconditions + } else { + paste("a total of", nruns) + }, + "replicates, requiring", + nruns, + "LC-MS/MS runs." ) } @@ -765,10 +863,11 @@ summary.qual.data_edit <- function(df.combi, df.design, fragpipe) { # Edited `protools2::summary.qual.data.proteomics()` ---- # Edited as `nruns` was defined after use summary.qual.data.proteomics_edit <- function( - df.norm, - df.design, - replicates = 0, - conditions = 0) { + df.norm, + df.design, + replicates = 0, + conditions = 0 +) { protein.groups <- unique(unlist(df.norm$protein.group)) n.protein.groups <- format(length(protein.groups), big.mark = ",") n.peptides.unique <- format(sum(df.norm$n.peptides), big.mark = ",") @@ -781,18 +880,36 @@ summary.qual.data.proteomics_edit <- function( } data.points <- format(nruns * length(protein.groups), big.mark = ",") summary.text <- paste( - "The experiment produced", n.peptides.total, - "peptides spectral matches, of which", n.peptides.unique, - "were unique peptides, and belonging to", n.protein.groups, - "protein groups.", " \n", "The experiment compared", - conditions, "conditions in", replicates, "replicates, requiring", - nruns, "LC-MS/MS runs and producing", data.points, "quantitative datapoints." + "The experiment produced", + n.peptides.total, + "peptides spectral matches, of which", + n.peptides.unique, + "were unique peptides, and belonging to", + n.protein.groups, + "protein groups.", + " \n", + "The experiment compared", + conditions, + "conditions in", + replicates, + "replicates, requiring", + nruns, + "LC-MS/MS runs and producing", + data.points, + "quantitative datapoints." ) } # Edited `protools2::pca.plot()` ---- -pca.plot_edit <- function(df, df.design, colorfactor = "", shapefactor = "", legend_position = NULL, title = "") { +pca.plot_edit <- function( + df, + df.design, + colorfactor = "", + shapefactor = "", + legend_position = NULL, + title = "" +) { df <- dplyr::select_if(df, is.numeric) res.pca <- prcomp(df, scale = F) df.pca <- data.frame(res.pca$rotation) @@ -819,13 +936,14 @@ pca.plot_edit <- function(df, df.design, colorfactor = "", shapefactor = "", leg y = paste("PC2,", round(pc2, digits = 2) * 100, "%") ) if (!is.null(legend_position)) { - plot1 <- plot1 + theme( - legend.position = legend_position, - legend.text = element_text( - size = 8, - lineheight = .8 + plot1 <- plot1 + + theme( + legend.position = legend_position, + legend.text = element_text( + size = 8, + lineheight = .8 + ) ) - ) legend1 <- cowplot::get_legend(plot1) plot1 <- plot1 + theme(legend.position = "none") } @@ -843,13 +961,14 @@ pca.plot_edit <- function(df, df.design, colorfactor = "", shapefactor = "", leg y = paste("PC3,", round(pc3, digits = 2) * 100, "%") ) if (!is.null(legend_position)) { - plot2 <- plot2 + theme( - legend.position = legend_position, - legend.text = element_text( - size = 8, - lineheight = .8 + plot2 <- plot2 + + theme( + legend.position = legend_position, + legend.text = element_text( + size = 8, + lineheight = .8 + ) ) - ) legend2 <- cowplot::get_legend(plot2) plot2 <- plot2 + theme(legend.position = "none") } @@ -866,7 +985,12 @@ pca.plot_edit <- function(df, df.design, colorfactor = "", shapefactor = "", leg # Edited `protools2::identify_differences_in_comparison_plus_volcano()` ---- -identify_differences_in_comparison_plus_volcano_edit <- function(df, fold.cutoff = 0.5, qval.cutoff = 0.05, graph.header = "") { +identify_differences_in_comparison_plus_volcano_edit <- function( + df, + fold.cutoff = 0.5, + qval.cutoff = 0.05, + graph.header = "" +) { colnames(df) <- c("sites", "fold", "pvalue", "qvalue") df.up <- subset(df, df$qvalue < qval.cutoff & df$fold > fold.cutoff) df.do <- subset(df, df$qvalue < qval.cutoff & df$fold < (-fold.cutoff)) @@ -895,10 +1019,19 @@ identify_differences_in_comparison_plus_volcano_edit <- function(df, fold.cutoff plot.volcano.1 <- ggplot(df, aes(x = fold, y = -log10(qvalue))) + geom_point(size = 1) + geom_point(data = df.up, aes(x = fold, y = -log10(qvalue)), color = "red") + - geom_point(data = df.do, aes(x = fold, y = -log10(qvalue)), color = "blue") + + geom_point( + data = df.do, + aes(x = fold, y = -log10(qvalue)), + color = "blue" + ) + labs( title = graph.header, - subtitle = paste0("Increased = ", nrow(df.up), "; Decreased = ", nrow(df.do)) + subtitle = paste0( + "Increased = ", + nrow(df.up), + "; Decreased = ", + nrow(df.do) + ) ) return( list( @@ -914,53 +1047,70 @@ identify_differences_in_comparison_plus_volcano_edit <- function(df, fold.cutoff # Edited `protools2::identify_differences_by_pvalue_in_comparison_plus_volcano()` ---- # Original incorrectly displays duplicated pvalue distribution, so removed repeated plot identify_differences_by_pvalue_in_comparison_plus_volcano_edit <- function( - df, - fold.cutoff = 0.5, - pval.cutoff = 0.05, - graph.header = "") { + df, + fold.cutoff = 0.5, + pval.cutoff = 0.05, + graph.header = "" +) { colnames(df) <- c("sites", "fold", "pvalue", "qvalue") - df.up <- subset(df, df$pvalue < pval.cutoff & df$fold > - fold.cutoff) - df.do <- subset(df, df$pvalue < pval.cutoff & df$fold < - (-fold.cutoff)) + df.up <- subset(df, df$pvalue < pval.cutoff & df$fold > fold.cutoff) + df.do <- subset(df, df$pvalue < pval.cutoff & df$fold < (-fold.cutoff)) df.up <- df.up[order(-df.up$fold), ] df.do <- df.do[order(df.do$fold), ] - suppressMessages(plot.pval.dist <- ggplot(df, aes(x = pvalue)) + - geom_histogram( - # fill = "white", - color = "black", - alpha = 0.7 - ) + - labs(title = "Distrubtion of pvalues") + - theme_bw()) - suppressMessages(plot.qval.dist <- ggplot(df, aes(x = qvalue)) + - geom_histogram( - # fill = "white", - color = "black", - alpha = 0.7 - ) + - labs(title = "Distrubtion of pvalues (BH adjusted pvalues)") + - theme_bw()) + suppressMessages( + plot.pval.dist <- ggplot(df, aes(x = pvalue)) + + geom_histogram( + # fill = "white", + color = "black", + alpha = 0.7 + ) + + labs(title = "Distrubtion of pvalues") + + theme_bw() + ) + suppressMessages( + plot.qval.dist <- ggplot(df, aes(x = qvalue)) + + geom_histogram( + # fill = "white", + color = "black", + alpha = 0.7 + ) + + labs(title = "Distrubtion of pvalues (BH adjusted pvalues)") + + theme_bw() + ) plot.volcano.1 <- ggplot(df, aes(x = fold, y = -log10(pvalue))) + geom_point(size = 1) + - geom_point(data = df.up, aes( - x = fold, - y = -log10(pvalue) - ), color = "red") + + geom_point( + data = df.up, + aes( + x = fold, + y = -log10(pvalue) + ), + color = "red" + ) + geom_point( data = df.do, - aes(x = fold, y = -log10(pvalue)), color = "blue" + aes(x = fold, y = -log10(pvalue)), + color = "blue" ) + - labs(title = graph.header, subtitle = paste0( - "Increased = ", - nrow(df.up), "; Decreased = ", nrow(df.do) - )) + labs( + title = graph.header, + subtitle = paste0( + "Increased = ", + nrow(df.up), + "; Decreased = ", + nrow(df.do) + ) + ) return( list( - df.increased = df.up, df.decreased = df.do, + df.increased = df.up, + df.decreased = df.do, volcanoplot = plot.volcano.1, - pvalue.distributions = cowplot::plot_grid(plot.pval.dist, ggplot() + - theme_void()) + pvalue.distributions = cowplot::plot_grid( + plot.pval.dist, + ggplot() + + theme_void() + ) # pvalue.distributions = cowplot::plot_grid(plot.pval.dist, plot.qval.dist) ) ) @@ -970,11 +1120,12 @@ identify_differences_by_pvalue_in_comparison_plus_volcano_edit <- function( # Edited `protools2::barplot.top.peptides()` ---- # Edited to change plot titles barplot.top.peptides_edit <- function( - df.up, - df.do, - graph.heading = "", - context = "peptides", - subtitle = "") { + df.up, + df.do, + graph.heading = "", + context = "peptides", + subtitle = "" +) { df.up$effect <- "Increased" df.do$effect <- "Decreased" df.up.down <- na.omit( @@ -983,7 +1134,11 @@ barplot.top.peptides_edit <- function( # a <- df.up.down$sites # The following 3 lines of code are replaced with `stringr` method below # a <- ifelse(nchar(a) > 20, paste0(strtrim(a, 20), "..."), a) # df.up.down$sites <- a - df.up.down$sites <- stringr::str_trunc(df.up.down$sites, width = 30, side = "right") + df.up.down$sites <- stringr::str_trunc( + df.up.down$sites, + width = 30, + side = "right" + ) barplot.top.up <- ggplot( df.up.down, aes(x = reorder(sites, fold), y = fold) @@ -996,7 +1151,12 @@ barplot.top.peptides_edit <- function( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) + labs( - title = paste0("Top increased / decreased ", context, " in ", graph.heading), + title = paste0( + "Top increased / decreased ", + context, + " in ", + graph.heading + ), y = "log2(fold)", x = "", subtitle = subtitle @@ -1006,7 +1166,11 @@ barplot.top.peptides_edit <- function( # Edited `protools2::plot.kinase.relationships()` ---- # `reshape2` is deprecated -plot.kinase.relationships_edit <- function(ksea.data, graph.title, pval.cutoff) { +plot.kinase.relationships_edit <- function( + ksea.data, + graph.title, + pval.cutoff +) { library(ggrepel) library(igraph) @@ -1029,7 +1193,8 @@ plot.kinase.relationships_edit <- function(ksea.data, graph.title, pval.cutoff) pvalue <- ksea.data[ksea.data$kinase_dataset == k1, "pvalues"] zscore <- ksea.data[ksea.data$kinase_dataset == k1, "zscores"] genes.1 <- unlist(strsplit( - ksea.data[ksea.data$kinase_dataset == k1, "sites"], ";" + ksea.data[ksea.data$kinase_dataset == k1, "sites"], + ";" )) for (k2 in kinases) { genes.2 <- unlist( @@ -1050,8 +1215,12 @@ plot.kinase.relationships_edit <- function(ksea.data, graph.title, pval.cutoff) } } df.network <- data.frame( - node.1 = pth1, node.2 = pth2, weight = w, - pvalue.1 = pval1, pvalue.2 = pval2, zscores = zscores, + node.1 = pth1, + node.2 = pth2, + weight = w, + pvalue.1 = pval1, + pvalue.2 = pval2, + zscores = zscores, counts = counts ) # POSSIBLY CHECK `if (nrow(df.network) < 2) { return(0) }` df.network <- df.network[order(df.network$pvalue.1), ] @@ -1068,7 +1237,8 @@ plot.kinase.relationships_edit <- function(ksea.data, graph.title, pval.cutoff) fr.all <- layout_with_graphopt(obs.spp.all) fr.all.df <- as.data.frame(fr.all) fr.all.df$species <- colnames(xx[, 2:ncol(xx)]) # RESULT WAS `NULL` (Now works ? after `ksea_edit()`) - if (is.null(colnames(xx[, 2:ncol(xx)]))) { # possibly `NULL` if nrow() < 2 ? + if (is.null(colnames(xx[, 2:ncol(xx)]))) { + # possibly `NULL` if nrow() < 2 ? fr.all.df$species <- colnames(xx[2:ncol(xx)]) # As above now works, this is no longer needed (still is needed) } # EMPTY DATAFRAME `g` (0 rows, columns = "from", "to"): @@ -1086,45 +1256,64 @@ plot.kinase.relationships_edit <- function(ksea.data, graph.title, pval.cutoff) g$pvalue.2 <- df.network$pvalue.2[match(g$from, df.network$node.2)] g$weight <- df.network$weight[match(g$from, df.network$node.1)] fr.all.df$pvalue.1 <- df.network$pvalue.1[match( - fr.all.df$species, df.network$node.1 + fr.all.df$species, + df.network$node.1 )] fr.all.df$pvalue.2 <- df.network$pvalue.2[match( - fr.all.df$species, df.network$node.2 + fr.all.df$species, + df.network$node.2 )] fr.all.df$E <- df.network$zscores[match( - fr.all.df$species, df.network$node.1 # + fr.all.df$species, + df.network$node.1 # )] fr.all.df$counts <- df.network$counts[match( - fr.all.df$species, df.network$node.2 + fr.all.df$species, + df.network$node.2 )] pplot <- ggplot() + geom_segment( - data = g, aes( - x = from.x, xend = to.x, y = from.y, yend = to.y, size = weight / 10 + data = g, + aes( + x = from.x, + xend = to.x, + y = from.y, + yend = to.y, + size = weight / 10 ), - colour = "black", linetype = 2 + colour = "black", + linetype = 2 ) + geom_point( data = fr.all.df, - aes(x = V1, y = V2, fill = E, size = counts), shape = 21 + aes(x = V1, y = V2, fill = E, size = counts), + shape = 21 ) + geom_label_repel( data = fr.all.df, aes(x = V1, y = V2, label = species) ) + scale_fill_gradient2( - low = "blue", high = "red", mid = "white", midpoint = 0 + low = "blue", + high = "red", + mid = "white", + midpoint = 0 ) + scale_x_continuous(expand = expansion(add = 10)) + # c(0, 1)) + # `expansion()` to provide padding scale_y_continuous(expand = expansion(add = 10)) + # c(0, 1)) + labs(y = "") + # To prevent overlapping of labels and title theme_bw() + theme( - axis.text.x = element_blank(), axis.text.y = element_blank(), - axis.ticks = element_blank(), axis.title.x = element_blank(), - axis.title.y = element_text(size = 30), panel.background = element_blank(), # axis.title.y modified - panel.border = element_blank(), panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), plot.background = element_blank() + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.ticks = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_text(size = 30), + panel.background = element_blank(), # axis.title.y modified + panel.border = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + plot.background = element_blank() ) + ggtitle(graph.title) + # labs(caption = stringr::str_wrap( @@ -1248,17 +1437,18 @@ plot.kinase.relationships_edit <- function(ksea.data, graph.title, pval.cutoff) # return(results) # } - # Edited `protools2::pathway_enrichment()` ---- # Edited to adjust colour scales pathway_enrichment_edit <- function( - increased.peptides, - decreased.peptides, - background.data, - prot_dbs = c("kegg", "hallmark.genes", "nci", "process"), - graph.heading = "", - is.ksea = FALSE, - trim = 75) { # 1000 + increased.peptides, + decreased.peptides, + background.data, + prot_dbs = c("kegg", "hallmark.genes", "nci", "process"), + graph.heading = "", + is.ksea = FALSE, + trim = 75 +) { + # 1000 library(foreach) library(doParallel) background.list <- background.data$protein @@ -1266,45 +1456,56 @@ pathway_enrichment_edit <- function( cl <- makeCluster(cores[1] - 1) registerDoParallel(cl) t1 <- Sys.time() - enrich.up <- foreach(db = prot_dbs, .combine = rbind, .packages = "protools2") %dopar% { - e <- protools2::enrichment.from.list( # To fix "cannot xtfrm data frames" error, function has been overwritten in `protools2` - list.of.peptides = increased.peptides, - background.list = background.list, - prot_db = db, - is.ksea = is.ksea - ) - if (nrow(e) > 0) { - e$prot_db <- db - if (is.ksea) { - e$pathway <- paste0(e$pathway, "_", db) + enrich.up <- foreach( + db = prot_dbs, + .combine = rbind, + .packages = "protools2" + ) %dopar% + { + e <- protools2::enrichment.from.list( + # To fix "cannot xtfrm data frames" error, function has been overwritten in `protools2` + list.of.peptides = increased.peptides, + background.list = background.list, + prot_db = db, + is.ksea = is.ksea + ) + if (nrow(e) > 0) { + e$prot_db <- db + if (is.ksea) { + e$pathway <- paste0(e$pathway, "_", db) + } + return(e) + } else { + return(data.frame()) } - return(e) - } else { - return(data.frame()) } - } stopCluster(cl) t2 <- Sys.time() cl <- makeCluster(cores[1] - 1) registerDoParallel(cl) t1 <- Sys.time() - enrich.do <- foreach(db = prot_dbs, .combine = rbind, .packages = "protools2") %dopar% { - e <- protools2::enrichment.from.list( - list.of.peptides = decreased.peptides, - background.list = background.list, - prot_db = db, - is.ksea = is.ksea - ) - if (nrow(e) > 0) { - e$prot_db <- db - if (is.ksea) { - e$pathway <- paste0(e$pathway, "_", db) + enrich.do <- foreach( + db = prot_dbs, + .combine = rbind, + .packages = "protools2" + ) %dopar% + { + e <- protools2::enrichment.from.list( + list.of.peptides = decreased.peptides, + background.list = background.list, + prot_db = db, + is.ksea = is.ksea + ) + if (nrow(e) > 0) { + e$prot_db <- db + if (is.ksea) { + e$pathway <- paste0(e$pathway, "_", db) + } + return(e) + } else { + return(data.frame()) } - return(e) - } else { - return(data.frame()) } - } stopCluster(cl) t2 <- Sys.time() if (length(enrich.do) > 0 & length(enrich.up) > 0) { @@ -1319,11 +1520,14 @@ pathway_enrichment_edit <- function( enrich.do$effect <- "Decreased" enrich.up$effect <- "Increased" enrich.up.do <- rbind.data.frame( - enrich.do[1:25, ], enrich.up[1:25, ] + enrich.do[1:25, ], + enrich.up[1:25, ] ) diff.enrich <- merge.data.frame( - enrich.do, enrich.up, - by = "pathway", all = T + enrich.do, + enrich.up, + by = "pathway", + all = T ) diff.enrich$enrichment.x[is.na(diff.enrich$enrichment.x)] <- 0 diff.enrich$enrichment.y[is.na(diff.enrich$enrichment.y)] <- 0 @@ -1331,13 +1535,14 @@ pathway_enrichment_edit <- function( diff.enrich$pvalues.y[is.na(diff.enrich$pvalues.y)] <- 1 diff.enrich$counts.x[is.na(diff.enrich$counts.x)] <- 0 diff.enrich$counts.y[is.na(diff.enrich$counts.y)] <- 0 - diff.enrich$delta.enrichment.up.vs.down <- ( - diff.enrich$enrichment.y - diff.enrich$enrichment.x - ) + diff.enrich$delta.enrichment.up.vs.down <- (diff.enrich$enrichment.y - + diff.enrich$enrichment.x) diff.enrich$delta.pval.up.vs.down <- -log10(diff.enrich$pvalues.y) - (-log10(diff.enrich$pvalues.x)) diff.enrich$delta.counts <- diff.enrich$counts.y - diff.enrich$counts.x - diff.enrich <- diff.enrich[order(-diff.enrich$delta.enrichment.up.vs.down), ] + diff.enrich <- diff.enrich[ + order(-diff.enrich$delta.enrichment.up.vs.down), + ] dif.e.up <- diff.enrich[1:15, ] diff.enrich <- diff.enrich[order(diff.enrich$delta.enrichment.up.vs.down), ] dif.e.do <- diff.enrich[1:15, ] @@ -1347,20 +1552,25 @@ pathway_enrichment_edit <- function( cl <- makeCluster(cores[1] - 1) registerDoParallel(cl) t1 <- Sys.time() - enrich.combined <- foreach(db = prot_dbs, .combine = rbind, .packages = "protools2") %dopar% { - e <- protools2::enrichment.from.list( - list.of.peptides = c(increased.peptides, decreased.peptides), - background.list, - prot_db = db, - is.ksea = is.ksea - ) - if (nrow(e) > 0) { - e$prot_db <- db - return(e) - # } else { # COMMENT OUT - # return(data.frame()) # COMMENT OUT + enrich.combined <- foreach( + db = prot_dbs, + .combine = rbind, + .packages = "protools2" + ) %dopar% + { + e <- protools2::enrichment.from.list( + list.of.peptides = c(increased.peptides, decreased.peptides), + background.list, + prot_db = db, + is.ksea = is.ksea + ) + if (nrow(e) > 0) { + e$prot_db <- db + return(e) + # } else { # COMMENT OUT + # return(data.frame()) # COMMENT OUT + } } - } stopCluster(cl) t2 <- Sys.time() enrich.combined <- enrich.combined[order(enrich.combined$pvalues), ] @@ -1376,11 +1586,14 @@ pathway_enrichment_edit <- function( labs( title = "Pathway Enrichement Analysis", subtitle = "Combined analysis using increased and decreased peptides", - size = "log2(E)", color = "-log10(q)", - x = "# Proteins", y = "" + size = "log2(E)", + color = "-log10(q)", + x = "# Proteins", + y = "" ) + scale_color_gradient( - low = "orange", high = "red", + low = "orange", + high = "red", oob = scales::squish_infinite ) + # na.value = "red") + facet_wrap(~effect, scales = "free") + @@ -1398,10 +1611,14 @@ pathway_enrichment_edit <- function( labs( title = "Pathway Enrichement Analysis", subtitle = paste0("Changed in ", graph.heading), - y = "", size = "delta \ncounts", color = "delta \npvalue" + y = "", + size = "delta \ncounts", + color = "delta \npvalue" ) + scale_color_gradient2( - low = "blue", mid = "yellow", high = "red", + low = "blue", + mid = "yellow", + high = "red", oob = scales::squish_infinite ) + # na.value = "red") + # scale_colour_viridis(option = "turbo") + @@ -1416,11 +1633,14 @@ pathway_enrichment_edit <- function( labs( title = "Pathway Enrichement Analysis", subtitle = paste0("Changed in ", graph.heading), - size = "log2(E)", color = "-log10(q)", - x = "# Proteins", y = "" + size = "log2(E)", + color = "-log10(q)", + x = "# Proteins", + y = "" ) + scale_color_gradient( - low = "orange", high = "red", + low = "orange", + high = "red", oob = scales::squish_infinite ) + # na.value = "red") + # scale_colour_viridis(option = "plasma") + @@ -1434,10 +1654,12 @@ pathway_enrichment_edit <- function( labs( title = "Pathway Enrichement Analysis", subtitle = paste0("Changed in ", graph.heading), - color = "-log10(q)", y = "" + color = "-log10(q)", + y = "" ) + scale_color_gradient( - low = "orange", high = "red", + low = "orange", + high = "red", oob = scales::squish_infinite ) + # na.value = "red") + facet_wrap(~effect, scales = "free") + @@ -1447,40 +1669,54 @@ pathway_enrichment_edit <- function( .look.at.proteins.in.pathways <- function(enrich.data) { mypathways <- list() j <- 1 - mypathways <- foreach(j = 1:10, .combine = rbind, .packages = "foreach") %do% { - ontology <- enrich.data[j, ]$pathway - prots <- unlist(strsplit(enrich.data[j, "proteins"], ";")) - prot <- prots[1] - dfxx <- foreach(prot = prots, .combine = rbind) %do% { - vv <- background.data[grepl(prot, background.data$protein, fixed = T), ] - vv <- vv[order(vv$pvalues), ] - vv[1, ] - } - prots <- unique(unlist(strsplit(dfxx$protein, ";"))) - df.names <- foreach( - ppp = unlist(dfxx$protein), .combine = rbind - ) %do% { - prot <- unlist(strsplit(ppp, ";")) - fff <- foreach(pp = prot, .combine = rbind) %do% { - p <- strsplit(pp, "(", fixed = T)[[1]][1] - nn <- uniprot.data[grepl(p, uniprot.data$gene.name, fixed = T), ] - accs <- paste0(nn$acc, collapse = ";") - names <- paste0(nn$protein, collapse = ";") - data.frame(protein = ppp, accs, names) - } - data.frame( - protein = ppp, - acc = paste0(fff$accs, collapse = ";"), - name = paste0(fff$names, collapse = ";") - ) + mypathways <- foreach( + j = 1:10, + .combine = rbind, + .packages = "foreach" + ) %do% + { + ontology <- enrich.data[j, ]$pathway + prots <- unlist(strsplit(enrich.data[j, "proteins"], ";")) + prot <- prots[1] + dfxx <- foreach(prot = prots, .combine = rbind) %do% + { + vv <- background.data[ + grepl(prot, background.data$protein, fixed = T), + ] + vv <- vv[order(vv$pvalues), ] + vv[1, ] + } + prots <- unique(unlist(strsplit(dfxx$protein, ";"))) + df.names <- foreach( + ppp = unlist(dfxx$protein), + .combine = rbind + ) %do% + { + prot <- unlist(strsplit(ppp, ";")) + fff <- foreach(pp = prot, .combine = rbind) %do% + { + p <- strsplit(pp, "(", fixed = T)[[1]][1] + nn <- uniprot.data[ + grepl(p, uniprot.data$gene.name, fixed = T), + ] + accs <- paste0(nn$acc, collapse = ";") + names <- paste0(nn$protein, collapse = ";") + data.frame(protein = ppp, accs, names) + } + data.frame( + protein = ppp, + acc = paste0(fff$accs, collapse = ";"), + name = paste0(fff$names, collapse = ";") + ) + } + df.comb <- merge.data.frame(dfxx, df.names, by = "protein") + df.comb$ontology <- ontology + df.comb } - df.comb <- merge.data.frame(dfxx, df.names, by = "protein") - df.comb$ontology <- ontology - df.comb - } colnames(mypathways)[2] <- "fold" mypathways$name.short <- paste( - stringr::str_trunc(mypathways$protein, trim), stringr::str_trunc(mypathways$name, trim) # 15 -> `trim`; 40 -> `trim` + stringr::str_trunc(mypathways$protein, trim), + stringr::str_trunc(mypathways$name, trim) # 15 -> `trim`; 40 -> `trim` ) p <- strsplit(pp, "(", fixed = T)[[1]][1] ff <- data.frame(table(mypathways$ontology)) @@ -1499,9 +1735,10 @@ pathway_enrichment_edit <- function( labs( title = ont, color = "-log10(p)", - size = "", y = "" + size = "", + y = "" ) + - scale_color_gradient2(low = "seagreen", high = "purple") + # Replaced by below for clarity + scale_color_gradient2(low = "seagreen", high = "purple") + # Replaced by below for clarity # scale_color_gradient2(low = "blue", high = "red") + theme_bw() + theme(plot.title = element_text(hjust = 1)) @@ -1530,7 +1767,11 @@ pathway_enrichment_edit <- function( # Edited `protools2::plot.ont.vs.prot.relationships()` ---- # Modified to adjust y axis label padding to prevent overlapping of `cowplot` figure labels -plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pathways = 12, graph.title = "") { +plot.ont.vs.prot.relationships_edit <- function( + results.ontology.analysis, + n.pathways = 12, + graph.title = "" +) { library(ggrepel) library(igraph) library(ggplot2) @@ -1546,22 +1787,26 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat g <- character() counts <- numeric() i <- 1 - df.network <- foreach(pathway = pathways, .combine = "rbind") %do% { - count <- rresults[rresults$pathway == pathway, "counts"] - pvalue <- rresults[rresults$pathway == pathway, "pvalues"] - genes <- unlist(strsplit(rresults[rresults$pathway == pathway, "genes"], ";")) - data.frame( - pathway = rep(pathway, length(genes)), - protein = genes - ) - } + df.network <- foreach(pathway = pathways, .combine = "rbind") %do% + { + count <- rresults[rresults$pathway == pathway, "counts"] + pvalue <- rresults[rresults$pathway == pathway, "pvalues"] + genes <- unlist(strsplit( + rresults[rresults$pathway == pathway, "genes"], + ";" + )) + data.frame( + pathway = rep(pathway, length(genes)), + protein = genes + ) + } g <- graph.data.frame(df.network, directed = T) V(g)$type <- bipartite.mapping(g)$type # Occasionally produces error when `.network.plot()` called for "Decreased" (see below, line 458) # "Error in i_set_vertex_attr(x, attr(value, "name"), index = value, value = attr(value, : Length of new attribute value must be 1 or 8, the number of target vertices, not 0" V(g)$color <- ifelse(V(g)$type, "lightblue", "salmon") V(g)$shape <- ifelse(V(g)$type, "circle", "square") E(g)$color <- "lightgray" - V(g)$label.color <- ifelse(V(g)$type, "black", "purple") # "purple" <- "purple4" + V(g)$label.color <- ifelse(V(g)$type, "black", "purple") # "purple" <- "purple4" V(g)$label.cex <- ifelse(V(g)$type, 0.7, 1.2) V(g)$frame.color <- "gray" V(g)$size <- ifelse(V(g)$type, 3, 8) @@ -1569,8 +1814,10 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat fr.all <- layout.fruchterman.reingold(g) fr.all <- layout_with_graphopt(g) df <- data.frame( - name = V(g)$name, color = V(g)$label.color, - type = ifelse(V(g)$type, 1, 2), font.size = V(g)$label.cex, + name = V(g)$name, + color = V(g)$label.color, + type = ifelse(V(g)$type, 1, 2), + font.size = V(g)$label.cex, degree = degree(g, mode = "all") ) fr.all.df <- as.data.frame(fr.all) @@ -1584,26 +1831,39 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat xxx <- merge.data.frame(fr.all.df, df, by = "name") pplot <- ggplot() + geom_segment( - data = gx, aes( - x = from.x, xend = to.x, y = from.y, yend = to.y + data = gx, + aes( + x = from.x, + xend = to.x, + y = from.y, + yend = to.y ), colour = "grey4", - linetype = 3, alpha = 0.5 + linetype = 3, + alpha = 0.5 ) + geom_point( - data = xxx, aes( - x = V1, y = V2, shape = as.character(type), - color = as.character(type), size = as.character(type) + data = xxx, + aes( + x = V1, + y = V2, + shape = as.character(type), + color = as.character(type), + size = as.character(type) ) ) + geom_text_repel( - data = xxx, aes( - x = V1, y = V2, - label = name, colour = as.character(type), size = as.character(type) + data = xxx, + aes( + x = V1, + y = V2, + label = name, + colour = as.character(type), + size = as.character(type) ), alpha = 1 ) + - scale_colour_manual(values = c(`2` = "purple", `1` = "black")) + # "purple" <- "purple4" + scale_colour_manual(values = c(`2` = "purple", `1` = "black")) + # "purple" <- "purple4" scale_size_manual(values = c(3, 5)) + scale_x_continuous(expand = c(0, 1)) + scale_y_continuous(expand = c(0, 1)) + @@ -1611,18 +1871,25 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat theme_bw() + theme( axis.text.x = element_blank(), - axis.text.y = element_blank(), axis.ticks = element_blank(), - axis.title.x = element_blank(), axis.title.y = element_text(size = 30), # Modified axis.title.y - panel.background = element_blank(), panel.border = element_blank(), - panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - plot.background = element_blank(), legend.position = "none" + axis.text.y = element_blank(), + axis.ticks = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_text(size = 30), # Modified axis.title.y + panel.background = element_blank(), + panel.border = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + plot.background = element_blank(), + legend.position = "none" ) + ggtitle(my.title) pplot return(pplot) } - x.inc <- subset(results.ontology.analysis, results.ontology.analysis$effect == - "Increased") + x.inc <- subset( + results.ontology.analysis, + results.ontology.analysis$effect == "Increased" + ) rresults <- x.inc[order(-x.inc$counts), ] my.title <- paste(graph.title, "Increased") plot.inc <- try( @@ -1630,8 +1897,10 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat .network.plot(), silent = TRUE ) - x.dec <- subset(results.ontology.analysis, results.ontology.analysis$effect == - "Decreased") + x.dec <- subset( + results.ontology.analysis, + results.ontology.analysis$effect == "Decreased" + ) rresults <- x.dec[order(-x.dec$counts), ] my.title <- paste(graph.title, "Decreased") plot.dec <- try( @@ -1642,15 +1911,20 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat # plots_list <- list(plot.inc, plot.dec) if (!inherits(plot.inc, "try-error") & !inherits(plot.dec, "try-error")) { return(list( - plot.increased = plot.inc, plot.decreased = plot.dec, + plot.increased = plot.inc, + plot.decreased = plot.dec, combined.plot = cowplot::plot_grid(plot.inc, plot.dec, nrow = 2) )) - } else if (!inherits(plot.inc, "try-error") & inherits(plot.dec, "try-error")) { + } else if ( + !inherits(plot.inc, "try-error") & inherits(plot.dec, "try-error") + ) { return(list( plot.increased = plot.inc, combined.plot = cowplot::plot_grid(plot.inc) )) - } else if (inherits(plot.inc, "try-error") & !inherits(plot.dec, "try-error")) { + } else if ( + inherits(plot.inc, "try-error") & !inherits(plot.dec, "try-error") + ) { return(list( plot.decreased = plot.dec, combined.plot = cowplot::plot_grid(plot.dec) @@ -1668,13 +1942,15 @@ plot.ont.vs.prot.relationships_edit <- function(results.ontology.analysis, n.pat kinase.substrate.enrichment_edit <- function(dfx, ks_db, is.ksea = TRUE) { dfx <- data.frame(dfx) # rownames(dfx) <- make.names(dfx[, 1], unique = T) # No longer necessary - if (is.ksea) { # Original: `is.ksea == TRUE` + if (is.ksea) { + # Original: `is.ksea == TRUE` column.with.priors <- 3 } else { x1 <- grepl("(", as.character(dfx[, 1]), fixed = T) if (TRUE %in% x1) { dfx[, 1] <- unlist(lapply( - dfx[, 1], function(x) strsplit(as.character(x), "(", fixed = TRUE)[[1]][1] + dfx[, 1], + function(x) strsplit(as.character(x), "(", fixed = TRUE)[[1]][1] )) column.with.priors <- "genes" } @@ -1695,8 +1971,10 @@ kinase.substrate.enrichment_edit <- function(dfx, ks_db, is.ksea = TRUE) { for (r in 1:nr) { mym <- df.ks[r, 2] kinase <- df.ks[r, 1] - if (!is.na(mym)) { # Changed from `is.na(mym) == F` to `!is.na(mym)` - if (mym >= 2) { # Changed from original `> 2` to `>= 2` + if (!is.na(mym)) { + # Changed from `is.na(mym) == F` to `!is.na(mym)` + if (mym >= 2) { + # Changed from original `> 2` to `>= 2` substrates <- as.character(df.ks[r, column.with.priors]) ss <- c(unlist(strsplit(substrates, ";"))) start.time <- Sys.time() @@ -1704,7 +1982,8 @@ kinase.substrate.enrichment_edit <- function(dfx, ks_db, is.ksea = TRUE) { df.xx <- subset(dfx, dfx[, 1] %in% ss) # Changed from above sites <- paste(unlist(rownames(df.xx)), collapse = ";") sites <- gsub(";;", ";", sites) # Added - if (nrow(df.xx) >= 2) { # Changed from original `> 2` to `>= 2` + if (nrow(df.xx) >= 2) { + # Changed from original `> 2` to `>= 2` df.xx$prot.group <- rep(kinase, nrow(df.xx)) # Edited to correct error, original: `<- kinase` sites.x <- data.frame(site = df.xx[, 1]) sites.x$kinase <- rep(kinase, nrow(df.xx)) # Edited to correct error, original: `<- kinase` @@ -1743,8 +2022,11 @@ kinase.substrate.enrichment_edit <- function(dfx, ks_db, is.ksea = TRUE) { } xx <- data.frame( kinases, - zscores = results.zscores, pvalues = results.pvalues, - m = results.m, distance = results.distance, sites = results.sites, + zscores = results.zscores, + pvalues = results.pvalues, + m = results.m, + distance = results.distance, + sites = results.sites, kinase_dataset = paste0(kinases, "_", ks_db) ) xx$kinase_group <- unlist(kinases) @@ -1759,20 +2041,22 @@ kinase.substrate.enrichment_edit <- function(dfx, ks_db, is.ksea = TRUE) { # To fix error in nested function: # `protools2::kinase.substrate.enrichment()` (see above this function) ksea_edit <- function( - df.fold, - ks_db = c("pdts", "psite"), - graph.heading = "", - pval.cut.off = 0.05) { + df.fold, + ks_db = c("pdts", "psite"), + graph.heading = "", + pval.cut.off = 0.05 +) { library(ggrepel) yy <- protools2::expand.phosphopeptide.dataset(df.fold) cores <- detectCores() cl <- makeCluster(cores[1] - 1) registerDoParallel(cl) - xx <- foreach(db = ks_db, .combine = "rbind") %do% { - # x <- protools2::kinase.substrate.enrichment(dfx = yy, ks_db = db) # Edited, eventually build package to correct actual function - x <- kinase.substrate.enrichment_edit(dfx = yy, ks_db = db) - x - } + xx <- foreach(db = ks_db, .combine = "rbind") %do% + { + # x <- protools2::kinase.substrate.enrichment(dfx = yy, ks_db = db) # Edited, eventually build package to correct actual function + x <- kinase.substrate.enrichment_edit(dfx = yy, ks_db = db) + x + } stopCluster(cl) # Labels if (ks_db == "ctams") { @@ -1872,13 +2156,16 @@ ksea_edit <- function( #' @return A \code{\link{gplots::heatmap.2}} heatmap. #' @export myHeatMap_With_PValues <- function( - df.zcr, - df.pval, - mytitle = "", - key.title = "log2Fold") { + df.zcr, + df.pval, + mytitle = "", + key.title = "log2Fold" +) { # Colours palette.breaks <- seq(-1.5, 1.5, 0.1) # these values can be changed if needed - color.palette <- colorRampPalette(c("blue", "white", "red"))(length(palette.breaks) - 1) + color.palette <- colorRampPalette(c("blue", "white", "red"))( + length(palette.breaks) - 1 + ) # P-values df.pp <- data.frame(matrix(nrow = nrow(df.zcr), ncol = ncol(df.zcr))) @@ -1959,13 +2246,14 @@ myHeatMap_With_PValues <- function( #' @return `Large Heatmap`. See \code{\link{ComplexHeatmap::Heatmap}} #' @export complex_heatmap <- function( - zscores_df, - pvalues_df, - row_title = "", - column_title = "", - legend_title = "", - fig_width = 10, - fig_height = NULL) { + zscores_df, + pvalues_df, + row_title = "", + column_title = "", + legend_title = "", + fig_width = 10, + fig_height = NULL +) { # Calculate dynamic figure height if (is.null(fig_height)) { base_height <- 4 # base height when there are no rows @@ -2066,7 +2354,9 @@ complex_heatmap <- function( # Cell annotation cell_fun = function(j, i, x, y, width, height, fill) { grid.text( - df.pp[i, j], x, y, + df.pp[i, j], + x, + y, gp = gpar(fontsize = 4) ) } @@ -2121,12 +2411,13 @@ complex_heatmap <- function( #' #' @examples umap_and_plot <- function( - input, - label = NULL, - size = 3, - max.overlaps = getOption("ggrepel.max.overlaps", default = 10), - title = "UMAP", - guide = FALSE) { + input, + label = NULL, + size = 3, + max.overlaps = getOption("ggrepel.max.overlaps", default = 10), + title = "UMAP", + guide = FALSE +) { # Packages require(umap) require(tidyverse) @@ -2150,12 +2441,18 @@ umap_and_plot <- function( } # Projection - message(paste("UMAP computation start:", format(Sys.time(), "%d-%m-%Y %H:%M:%S"))) + message(paste( + "UMAP computation start:", + format(Sys.time(), "%d-%m-%Y %H:%M:%S") + )) umap_projection <- umap::umap( matrix_data, n_neighbors = min(nrow(matrix_data) - 1, umap::umap.defaults$n_neighbors) ) - message(paste("UMAP computation end:", format(Sys.time(), "%d-%m-%Y %H:%M:%S"))) + message(paste( + "UMAP computation end:", + format(Sys.time(), "%d-%m-%Y %H:%M:%S") + )) umap_layout <- as.data.frame(umap_projection$layout) umap_layout <- umap_layout %>%