Skip to content

Commit

Permalink
Epsilon parameter in topicCoherence
Browse files Browse the repository at this point in the history
  • Loading branch information
lkoppers committed May 3, 2019
1 parent 4e03030 commit 5b84709
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 5 deletions.
9 changes: 5 additions & 4 deletions R/topicCoherence.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
#' @param num.words Integer: Number of topwords used for calculating topic coherence (default: \code{10}).
#' @param by.score Logical: Should the Score from \code{\link[lda]{top.topic.words}} be used (default: \code{TRUE})?
#' @param sym.coherence Logical: Should a symmetric version of the topic coherence be used for the calculations? If TRUE, the denominator of the topic coherence uses both wordcounts and not just one.
#' @param epsilon Numeric: Smoothing factor to avoid log(0). Default is 1. Stevens et al. recommend a smaller value.
#' @return A vector of topic coherences. The length of the vector corresponds to the number of topics in the model.
#' @references Mimno, David and Wallach, Hannah M. and Talley, Edmund and Leenders, Miriam and McCallum, Andrew. Optimizing semantic coherence in topic models. EMNLP '11 Proceedings of the Conference on Empirical Methods in Natural Language Processing, 2011.
#'
#' Stevens, Keith and Andrzejewski, David and Buttler, David. Exploring topic coherence over many models and many topics. EMNLP-CoNLL '12 Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning, 2012.
#' @examples
#' texts <- list(A="Give a Man a Fish, and You Feed Him for a Day.
#' Teach a Man To Fish, and You Feed Him for a Lifetime",
Expand All @@ -29,7 +30,7 @@
#' \donttest{result <- LDAgen(documents=ldaPrep, K = 3L, vocab=wordlist$words, num.words=3)}
#' \donttest{topicCoherence(ldaresult=result, documents=ldaPrep, num.words=5, by.score=TRUE)}
#' @export topicCoherence
topicCoherence <- function(ldaresult, documents, num.words=10, by.score=TRUE, sym.coherence = FALSE){
topicCoherence <- function(ldaresult, documents, num.words=10, by.score=TRUE, sym.coherence = FALSE, epsilon=1){
stopifnot(is.list(ldaresult), is.list(ldaresult$assignments), length(ldaresult$assignments) == length(documents), is.matrix(ldaresult$topics), is.list(documents),
as.integer(num.words) == num.words, length(num.words) == 1,
is.logical(by.score), length(by.score) == 1)
Expand All @@ -47,8 +48,8 @@ topicCoherence <- function(ldaresult, documents, num.words=10, by.score=TRUE, sy

grid <- cbind(rep(2:num.words, 1:(num.words-1)), unlist(sapply(1:(num.words-1), function(x)1:x)))

if(!sym.coherence){tc <- apply(grid, 1, function(x) (sum(D[,x[1]]*D[,x[2]]) +1) / Dsum[x[2]])}else{
tc <- apply(grid, 1, function(x) (sum(D[,x[1]]*D[,x[2]]) +1) / ((Dsum[x[2]] + Dsum[x[1]])/2))}
if(!sym.coherence){tc <- apply(grid, 1, function(x) (sum(D[,x[1]]*D[,x[2]]) +epsilon) / Dsum[x[2]])}else{
tc <- apply(grid, 1, function(x) (sum(D[,x[1]]*D[,x[2]]) +epsilon) / ((Dsum[x[2]] + Dsum[x[1]])/2))}
topicCoherence[i] <- sum(log(tc))
}
if(any(is.infinite(topicCoherence))){warning("Some Topics have less then num.words allocated words. Return Inf")}
Expand Down
5 changes: 4 additions & 1 deletion man/topicCoherence.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 5b84709

Please sign in to comment.