Bioc revision2 - cont. (function lengths) (#103)

* reduced code length * edited NEWS file & version bump
BIONF · Aug 8, 2019 · 884913e · 884913e
1 parent f2c3b22
commit 884913e
Show file tree

Hide file tree

Showing 23 changed files with 457 additions and 1,264 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: PhyloProfile
-Version: 0.99.20
+Version: 0.99.21
 Date: 2019-08-06
 Title: PhyloProfile
 Authors@R: c(

diff --git a/R/analyzeDistribution.R b/R/analyzeDistribution.R
@@ -18,36 +18,29 @@ createPercentageDistributionData <- function(inputData, rankName = NULL) {
         stop("Input data or rank name cannot be NULL!")
     allMainRanks <- getTaxonomyRanks()
     if (!(rankName[1] %in% allMainRanks)) stop("Invalid taxonomy rank given!")
-
     if (ncol(inputData) < 4) {
         colnames(inputData) <- c("geneID", "ncbiID", "orthoID")
     } else if (ncol(inputData) < 5) {
         colnames(inputData) <- c("geneID", "ncbiID", "orthoID", "var1")
     } else {
         colnames(inputData) <- c("geneID", "ncbiID", "orthoID", "var1", "var2")
     }
-
     # count number of inparalogs
     paralogCount <- plyr::count(inputData, c("geneID", "ncbiID"))
     inputData <- merge(inputData, paralogCount, by = c("geneID", "ncbiID"))
     colnames(inputData)[ncol(inputData)] <- "paralog"
-
     # get sorted taxonomy list
     inputTaxonID <- getInputTaxaID(inputData)
     inputTaxonName <- getInputTaxaName(rankName, inputTaxonID)
     refTaxon <- inputTaxonName$fullName[1]
     taxaTree <- NULL
     taxaList <- sortInputTaxa(inputTaxonID, rankName, refTaxon, taxaTree)
-
     # calculate frequency of all supertaxa
     taxaCount <- plyr::count(taxaList, "supertaxon")
-
     # merge inputData, inputDatavar2 and taxaList to get taxonomy info
     taxaMdData <- merge(inputData, taxaList, by = "ncbiID")
-
     # calculate % present species
     finalPresSpecDt <- calcPresSpec(taxaMdData, taxaCount)
-
     finalPresSpecDt[!is.na(finalPresSpecDt$geneID),]
     return(finalPresSpecDt)
 }
@@ -84,28 +77,19 @@ createVariableDistributionData <- function(
         colnames(inputData) <- c("geneID", "ncbiID", "orthoID", "var1", "var2")
         splitDt <- inputData[, c("orthoID", "var1", "var2")]
     }
-
     splitDt$orthoID[splitDt$orthoID == "NA" | is.na(splitDt$orthoID)] <- NA
     splitDt <- splitDt[complete.cases(splitDt), ]
 
-    if (length(levels(as.factor(splitDt$var2))) == 1) {
+    if (length(levels(as.factor(splitDt$var2))) == 1)
         if (levels(as.factor(splitDt$var2)) == "") splitDt$var2 <- 0
-    }
 
     # Filter based on variable cutoffs
-    if ("var1" %in% colnames(splitDt)) {
-        # filter splitDt based on selected var1 cutoff
+    if ("var1" %in% colnames(splitDt))
         splitDt <- splitDt[
-            splitDt$var1 >= var1Cutoff[1] & splitDt$var1 <= var1Cutoff[2],
-            ]
-    }
-    if ("var2" %in% colnames(splitDt)) {
-        # filter splitDt based on selected var2 cutoff
+            splitDt$var1 >= var1Cutoff[1] & splitDt$var1 <= var1Cutoff[2],]
+    if ("var2" %in% colnames(splitDt))
         splitDt <- splitDt[
-            splitDt$var2 >= var2Cutoff[1] & splitDt$var2 <= var2Cutoff[2],
-            ]
-    }
-
+            splitDt$var2 >= var2Cutoff[1] & splitDt$var2 <= var2Cutoff[2],]
     return(splitDt)
 }
 
@@ -145,16 +129,10 @@ createVariableDistributionDataSubset <- function(
     fullProfileData, distributionData,
     selectedGenes = "all", selectedTaxa = "all"
 ) {
-    geneID <- NULL
-    orthoID <- NULL
-    var1.x <- NULL
-    var2.y <- NULL
-    supertaxonMod <- NULL
-
+    geneID <- orthoID <- var1.x <- var2.y <- supertaxonMod <- NULL
     # check parameters
     if (is.null(fullProfileData) | is.null(distributionData)) 
         stop("Full processed profiles or distribution data cannot be NULL!")
-
     # get geneID and supertaxon name for distributionData
     distributionDataName <- merge(
         distributionData, fullProfileData, by = "orthoID", all.x = TRUE
@@ -170,8 +148,7 @@ createVariableDistributionDataSubset <- function(
     colnames(distributionDataName) <- c(
         "orthoID", "var1", "var2", "supertaxonMod", "geneID"
     )
-
-    # filter
+    # filter data
     if (selectedTaxa[1] == "all" & selectedGenes[1] != "all") {
         # select data from dataHeat for selected sequences only
         distributionData <- subset(
@@ -182,7 +159,7 @@ createVariableDistributionDataSubset <- function(
         distributionData <- subset(
             distributionDataName, supertaxonMod %in% selectedTaxa
         )
-    }  else if (selectedGenes[1] == "all" & selectedTaxa[1] == "all") {
+    } else if (selectedGenes[1] == "all" & selectedTaxa[1] == "all") {
         return(distributionData)
     } else {
         # select data from dataHeat for selected sequences and taxa
@@ -198,7 +175,7 @@ createVariableDistributionDataSubset <- function(
 #' @description Create distribution plot for one of the additional variable or
 #' the percentage of the species present in the supertaxa.
 #' @usage createVarDistPlot(data, varName = "var", varType = "var1",
-#'     percent = c(0, 1), distTextSize = 12)
+#'     percent = c(0, 1), textSize = 12)
 #' @param data dataframe contains data for plotting (see
 #' ?createVariableDistributionData, ?createVariableDistributionDataSubset or
 #' ?createPercentageDistributionData)
@@ -207,7 +184,7 @@ createVariableDistributionDataSubset <- function(
 #' @param varType type of variable (either "var1", "var2" or "presSpec").
 #' Default = "var1".
 #' @param percent range of percentage cutoff (between 0 and 1). Default = c(0,1)
-#' @param distTextSize text size of the distribution plot (in px). Default = 12.
+#' @param textSize text size of the distribution plot (in px). Default = 12.
 #' @return A distribution plot for the selected variable as a ggplot object
 #' @import ggplot2
 #' @author Vinh Tran {tran@bio.uni-frankfurt.de}
@@ -224,18 +201,17 @@ createVariableDistributionDataSubset <- function(
 #' varName <- "Variable abc"
 #' varType <- "var1"
 #' percent <- c(0,1)
-#' distTextSize <- 12
+#' textSize <- 12
 #' createVarDistPlot(
 #'     data,
 #'     varName,
 #'     varType,
 #'     percent,
-#'     distTextSize
+#'     textSize
 #' )
 
 createVarDistPlot <- function(
-    data, varName = "var", varType = "var1", percent = c(0, 1),
-    distTextSize = 12
+    data, varName = "var", varType = "var1", percent = c(0, 1), textSize = 12
 ) {
     if (is.null(data)) stop("Input data cannot be NULL!")
     if (varType == "presSpec") {
@@ -254,22 +230,17 @@ createVarDistPlot <- function(
     } else data <- data[!is.na(data[,varType]), ]
 
     data.mean <- mean(data[,varType])
-
     p <- ggplot(data, aes(x = data[,varType])) +
         geom_histogram(binwidth = .01, alpha = .5, position = "identity") +
         geom_vline(
-            data = data,
-            aes(xintercept = data.mean, colour = "red"),
-            linetype = "dashed",
-            size = 1
-        ) +
-        theme_minimal()
+            data = data, aes(xintercept = data.mean, colour = "red"),
+            linetype = "dashed", size = 1
+        ) + theme_minimal()
     p <- p + theme(
             legend.position = "none",
-            axis.title = element_text(size = distTextSize),
-            axis.text = element_text(size = distTextSize)
-        ) +
-        labs(
+            axis.title = element_text(size = textSize),
+            axis.text = element_text(size = textSize)
+        ) + labs(
             x = paste0(varName," (mean = ", round(mean(data[,varType]), 3),")"),
             y = "Frequency"
         )

diff --git a/R/clusterProfile.R b/R/clusterProfile.R
@@ -1,16 +1,16 @@
 #' Get data for calculating distance matrix from phylogenetic profiles
 #' @export
-#' @usage getDataClustering(data, profileType = "binary",
-#'     var1AggregateBy = "max", var2AggregateBy = "max")
+#' @usage getDataClustering(data, profileType = "binary", var1AggBy = "max", 
+#'     var2AggBy = "max")
 #' @param data a data frame contains processed and filtered profiles (see
 #' ?fullProcessedProfile and ?filterProfileData, ?fromInputToProfile)
 #' @param profileType type of data used for calculating the distance matrix.
 #' Either "binary" (consider only the presence/absence status of orthlogs), or
 #' "var1"/"var2" for taking values of the additional variables into account.
 #' Default = "binary".
-#' @param var1AggregateBy aggregate method for VAR1 (min, max, mean or median).
+#' @param var1AggBy aggregate method for VAR1 (min, max, mean or median).
 #' Default = "max".
-#' @param var2AggregateBy aggregate method for VAR2 (min, max, mean or median).
+#' @param var2AggBy aggregate method for VAR2 (min, max, mean or median).
 #' Default = "max".
 #' @return A wide dataframe contains values for calculating distance matrix.
 #' @author Carla Mölbert (carla.moelbert@gmx.de), Vinh Tran
@@ -25,51 +25,39 @@
 #' getDataClustering(data, profileType, var1AggregateBy, var2AggregateBy)
 
 getDataClustering <- function(
-    data = NULL, profileType = "binary",
-    var1AggregateBy = "max", var2AggregateBy = "max"
+    data = NULL, profileType = "binary", var1AggBy = "max", var2AggBy = "max"
 ) {
     if (is.null(data)) stop("Input data cannot be NULL!")
-    supertaxon <- NULL
-    presSpec <- NULL
-
+    supertaxon <- presSpec <- NULL
     # remove lines where there is no found ortholog
     subDataHeat <- subset(data, data$presSpec > 0)
-
     # transform data into wide matrix
     if (profileType == "binary") {
         subDataHeat <- subDataHeat[, c("geneID", "supertaxon", "presSpec")]
         subDataHeat$presSpec[subDataHeat$presSpec > 0] <- 1
         subDataHeat <- subDataHeat[!duplicated(subDataHeat), ]
         wideData <- data.table::dcast(
-            subDataHeat, geneID ~ supertaxon, value.var = "presSpec"
-        )
+            subDataHeat, geneID ~ supertaxon, value.var = "presSpec")
     } else {
         var <- profileType
-
         subDataHeat <- subDataHeat[, c("geneID", "supertaxon", var)]
         subDataHeat <- subDataHeat[!duplicated(subDataHeat), ]
-
         # aggreagte the values by the selected method
-        if (var == "var1") aggregateBy <- var1AggregateBy
-        else aggregateBy <- var2AggregateBy
-
+        if (var == "var1") aggregateBy <- var1AggBy
+        else aggregateBy <- var2AggBy
         subDataHeat <- aggregate(
             subDataHeat[, var],
             list(subDataHeat$geneID, subDataHeat$supertaxon),
             FUN = aggregateBy
         )
-
         colnames(subDataHeat) <- c("geneID", "supertaxon", var)
         wideData <- data.table::dcast(
-            subDataHeat, geneID ~ supertaxon, value.var = var
-        )
+            subDataHeat, geneID ~ supertaxon, value.var = var)
     }
-
     # set name for wide matrix as gene IDs
     dat <- wideData[, 2:ncol(wideData)]
     rownames(dat) <- wideData[, 1]
     dat[is.na(dat)] <- 0
-
     return(dat)
 }
 
@@ -100,7 +88,6 @@ getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
     if (is.null(profiles)) stop("Profile data cannot be NULL!")
     profiles <-  profiles[, colSums(profiles != 0) > 0]
     profiles <-  profiles[rowSums(profiles != 0) > 0, ]
-
     distMethods <- c("euclidean", "maximum", "manhattan", "canberra", "binary")
     if (method %in% distMethods) {
         distanceMatrix <- stats::dist(profiles, method = method)
@@ -118,7 +105,6 @@ getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
         # are clustered together
         matrix <- 1 - matrix
         matrix <- as.data.frame(matrix)
-
         profileNames <- rownames(profiles)
         colnames(matrix) <- profileNames[seq_len(length(profileNames)) - 1]
         rownames(matrix) <- profileNames
@@ -129,7 +115,6 @@ getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
     } else if (method == "pearson") {
         distanceMatrix <-  bioDist::cor.dist(as.matrix(profiles))
     }
-
     return(distanceMatrix)
 }