Skip to content

Commit

Permalink
Bioc revision2 - cont. (function lengths) (#103)
Browse files Browse the repository at this point in the history
* reduced code length

* edited NEWS file & version bump
  • Loading branch information
trvinh authored Aug 8, 2019
1 parent f2c3b22 commit 884913e
Show file tree
Hide file tree
Showing 23 changed files with 457 additions and 1,264 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: PhyloProfile
Version: 0.99.20
Version: 0.99.21
Date: 2019-08-06
Title: PhyloProfile
Authors@R: c(
Expand Down
67 changes: 19 additions & 48 deletions R/analyzeDistribution.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,29 @@ createPercentageDistributionData <- function(inputData, rankName = NULL) {
stop("Input data or rank name cannot be NULL!")
allMainRanks <- getTaxonomyRanks()
if (!(rankName[1] %in% allMainRanks)) stop("Invalid taxonomy rank given!")

if (ncol(inputData) < 4) {
colnames(inputData) <- c("geneID", "ncbiID", "orthoID")
} else if (ncol(inputData) < 5) {
colnames(inputData) <- c("geneID", "ncbiID", "orthoID", "var1")
} else {
colnames(inputData) <- c("geneID", "ncbiID", "orthoID", "var1", "var2")
}

# count number of inparalogs
paralogCount <- plyr::count(inputData, c("geneID", "ncbiID"))
inputData <- merge(inputData, paralogCount, by = c("geneID", "ncbiID"))
colnames(inputData)[ncol(inputData)] <- "paralog"

# get sorted taxonomy list
inputTaxonID <- getInputTaxaID(inputData)
inputTaxonName <- getInputTaxaName(rankName, inputTaxonID)
refTaxon <- inputTaxonName$fullName[1]
taxaTree <- NULL
taxaList <- sortInputTaxa(inputTaxonID, rankName, refTaxon, taxaTree)

# calculate frequency of all supertaxa
taxaCount <- plyr::count(taxaList, "supertaxon")

# merge inputData, inputDatavar2 and taxaList to get taxonomy info
taxaMdData <- merge(inputData, taxaList, by = "ncbiID")

# calculate % present species
finalPresSpecDt <- calcPresSpec(taxaMdData, taxaCount)

finalPresSpecDt[!is.na(finalPresSpecDt$geneID),]
return(finalPresSpecDt)
}
Expand Down Expand Up @@ -84,28 +77,19 @@ createVariableDistributionData <- function(
colnames(inputData) <- c("geneID", "ncbiID", "orthoID", "var1", "var2")
splitDt <- inputData[, c("orthoID", "var1", "var2")]
}

splitDt$orthoID[splitDt$orthoID == "NA" | is.na(splitDt$orthoID)] <- NA
splitDt <- splitDt[complete.cases(splitDt), ]

if (length(levels(as.factor(splitDt$var2))) == 1) {
if (length(levels(as.factor(splitDt$var2))) == 1)
if (levels(as.factor(splitDt$var2)) == "") splitDt$var2 <- 0
}

# Filter based on variable cutoffs
if ("var1" %in% colnames(splitDt)) {
# filter splitDt based on selected var1 cutoff
if ("var1" %in% colnames(splitDt))
splitDt <- splitDt[
splitDt$var1 >= var1Cutoff[1] & splitDt$var1 <= var1Cutoff[2],
]
}
if ("var2" %in% colnames(splitDt)) {
# filter splitDt based on selected var2 cutoff
splitDt$var1 >= var1Cutoff[1] & splitDt$var1 <= var1Cutoff[2],]
if ("var2" %in% colnames(splitDt))
splitDt <- splitDt[
splitDt$var2 >= var2Cutoff[1] & splitDt$var2 <= var2Cutoff[2],
]
}

splitDt$var2 >= var2Cutoff[1] & splitDt$var2 <= var2Cutoff[2],]
return(splitDt)
}

Expand Down Expand Up @@ -145,16 +129,10 @@ createVariableDistributionDataSubset <- function(
fullProfileData, distributionData,
selectedGenes = "all", selectedTaxa = "all"
) {
geneID <- NULL
orthoID <- NULL
var1.x <- NULL
var2.y <- NULL
supertaxonMod <- NULL

geneID <- orthoID <- var1.x <- var2.y <- supertaxonMod <- NULL
# check parameters
if (is.null(fullProfileData) | is.null(distributionData))
stop("Full processed profiles or distribution data cannot be NULL!")

# get geneID and supertaxon name for distributionData
distributionDataName <- merge(
distributionData, fullProfileData, by = "orthoID", all.x = TRUE
Expand All @@ -170,8 +148,7 @@ createVariableDistributionDataSubset <- function(
colnames(distributionDataName) <- c(
"orthoID", "var1", "var2", "supertaxonMod", "geneID"
)

# filter
# filter data
if (selectedTaxa[1] == "all" & selectedGenes[1] != "all") {
# select data from dataHeat for selected sequences only
distributionData <- subset(
Expand All @@ -182,7 +159,7 @@ createVariableDistributionDataSubset <- function(
distributionData <- subset(
distributionDataName, supertaxonMod %in% selectedTaxa
)
} else if (selectedGenes[1] == "all" & selectedTaxa[1] == "all") {
} else if (selectedGenes[1] == "all" & selectedTaxa[1] == "all") {
return(distributionData)
} else {
# select data from dataHeat for selected sequences and taxa
Expand All @@ -198,7 +175,7 @@ createVariableDistributionDataSubset <- function(
#' @description Create distribution plot for one of the additional variable or
#' the percentage of the species present in the supertaxa.
#' @usage createVarDistPlot(data, varName = "var", varType = "var1",
#' percent = c(0, 1), distTextSize = 12)
#' percent = c(0, 1), textSize = 12)
#' @param data dataframe contains data for plotting (see
#' ?createVariableDistributionData, ?createVariableDistributionDataSubset or
#' ?createPercentageDistributionData)
Expand All @@ -207,7 +184,7 @@ createVariableDistributionDataSubset <- function(
#' @param varType type of variable (either "var1", "var2" or "presSpec").
#' Default = "var1".
#' @param percent range of percentage cutoff (between 0 and 1). Default = c(0,1)
#' @param distTextSize text size of the distribution plot (in px). Default = 12.
#' @param textSize text size of the distribution plot (in px). Default = 12.
#' @return A distribution plot for the selected variable as a ggplot object
#' @import ggplot2
#' @author Vinh Tran {tran@bio.uni-frankfurt.de}
Expand All @@ -224,18 +201,17 @@ createVariableDistributionDataSubset <- function(
#' varName <- "Variable abc"
#' varType <- "var1"
#' percent <- c(0,1)
#' distTextSize <- 12
#' textSize <- 12
#' createVarDistPlot(
#' data,
#' varName,
#' varType,
#' percent,
#' distTextSize
#' textSize
#' )

createVarDistPlot <- function(
data, varName = "var", varType = "var1", percent = c(0, 1),
distTextSize = 12
data, varName = "var", varType = "var1", percent = c(0, 1), textSize = 12
) {
if (is.null(data)) stop("Input data cannot be NULL!")
if (varType == "presSpec") {
Expand All @@ -254,22 +230,17 @@ createVarDistPlot <- function(
} else data <- data[!is.na(data[,varType]), ]

data.mean <- mean(data[,varType])

p <- ggplot(data, aes(x = data[,varType])) +
geom_histogram(binwidth = .01, alpha = .5, position = "identity") +
geom_vline(
data = data,
aes(xintercept = data.mean, colour = "red"),
linetype = "dashed",
size = 1
) +
theme_minimal()
data = data, aes(xintercept = data.mean, colour = "red"),
linetype = "dashed", size = 1
) + theme_minimal()
p <- p + theme(
legend.position = "none",
axis.title = element_text(size = distTextSize),
axis.text = element_text(size = distTextSize)
) +
labs(
axis.title = element_text(size = textSize),
axis.text = element_text(size = textSize)
) + labs(
x = paste0(varName," (mean = ", round(mean(data[,varType]), 3),")"),
y = "Frequency"
)
Expand Down
35 changes: 10 additions & 25 deletions R/clusterProfile.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#' Get data for calculating distance matrix from phylogenetic profiles
#' @export
#' @usage getDataClustering(data, profileType = "binary",
#' var1AggregateBy = "max", var2AggregateBy = "max")
#' @usage getDataClustering(data, profileType = "binary", var1AggBy = "max",
#' var2AggBy = "max")
#' @param data a data frame contains processed and filtered profiles (see
#' ?fullProcessedProfile and ?filterProfileData, ?fromInputToProfile)
#' @param profileType type of data used for calculating the distance matrix.
#' Either "binary" (consider only the presence/absence status of orthlogs), or
#' "var1"/"var2" for taking values of the additional variables into account.
#' Default = "binary".
#' @param var1AggregateBy aggregate method for VAR1 (min, max, mean or median).
#' @param var1AggBy aggregate method for VAR1 (min, max, mean or median).
#' Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (min, max, mean or median).
#' @param var2AggBy aggregate method for VAR2 (min, max, mean or median).
#' Default = "max".
#' @return A wide dataframe contains values for calculating distance matrix.
#' @author Carla Mölbert (carla.moelbert@gmx.de), Vinh Tran
Expand All @@ -25,51 +25,39 @@
#' getDataClustering(data, profileType, var1AggregateBy, var2AggregateBy)

getDataClustering <- function(
data = NULL, profileType = "binary",
var1AggregateBy = "max", var2AggregateBy = "max"
data = NULL, profileType = "binary", var1AggBy = "max", var2AggBy = "max"
) {
if (is.null(data)) stop("Input data cannot be NULL!")
supertaxon <- NULL
presSpec <- NULL

supertaxon <- presSpec <- NULL
# remove lines where there is no found ortholog
subDataHeat <- subset(data, data$presSpec > 0)

# transform data into wide matrix
if (profileType == "binary") {
subDataHeat <- subDataHeat[, c("geneID", "supertaxon", "presSpec")]
subDataHeat$presSpec[subDataHeat$presSpec > 0] <- 1
subDataHeat <- subDataHeat[!duplicated(subDataHeat), ]
wideData <- data.table::dcast(
subDataHeat, geneID ~ supertaxon, value.var = "presSpec"
)
subDataHeat, geneID ~ supertaxon, value.var = "presSpec")
} else {
var <- profileType

subDataHeat <- subDataHeat[, c("geneID", "supertaxon", var)]
subDataHeat <- subDataHeat[!duplicated(subDataHeat), ]

# aggreagte the values by the selected method
if (var == "var1") aggregateBy <- var1AggregateBy
else aggregateBy <- var2AggregateBy

if (var == "var1") aggregateBy <- var1AggBy
else aggregateBy <- var2AggBy
subDataHeat <- aggregate(
subDataHeat[, var],
list(subDataHeat$geneID, subDataHeat$supertaxon),
FUN = aggregateBy
)

colnames(subDataHeat) <- c("geneID", "supertaxon", var)
wideData <- data.table::dcast(
subDataHeat, geneID ~ supertaxon, value.var = var
)
subDataHeat, geneID ~ supertaxon, value.var = var)
}

# set name for wide matrix as gene IDs
dat <- wideData[, 2:ncol(wideData)]
rownames(dat) <- wideData[, 1]
dat[is.na(dat)] <- 0

return(dat)
}

Expand Down Expand Up @@ -100,7 +88,6 @@ getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
if (is.null(profiles)) stop("Profile data cannot be NULL!")
profiles <- profiles[, colSums(profiles != 0) > 0]
profiles <- profiles[rowSums(profiles != 0) > 0, ]

distMethods <- c("euclidean", "maximum", "manhattan", "canberra", "binary")
if (method %in% distMethods) {
distanceMatrix <- stats::dist(profiles, method = method)
Expand All @@ -118,7 +105,6 @@ getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
# are clustered together
matrix <- 1 - matrix
matrix <- as.data.frame(matrix)

profileNames <- rownames(profiles)
colnames(matrix) <- profileNames[seq_len(length(profileNames)) - 1]
rownames(matrix) <- profileNames
Expand All @@ -129,7 +115,6 @@ getDistanceMatrix <- function(profiles = NULL, method = "mutualInformation") {
} else if (method == "pearson") {
distanceMatrix <- bioDist::cor.dist(as.matrix(profiles))
}

return(distanceMatrix)
}

Expand Down
Loading

0 comments on commit 884913e

Please sign in to comment.