Merge pull request #14 from Docma-TU/topWords

Top Words per Topic
Docma-TU · Aug 27, 2019 · 25e09b8 · 25e09b8
2 parents f51704f + 2e5ff52
commit 25e09b8
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 1 deletion.
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ export(filterCount)
 export(filterDate)
 export(filterID)
 export(filterWord)
+export(importance)
 export(intruderTopics)
 export(intruderWords)
 export(is.duplist)
@@ -62,6 +63,7 @@ export(showTexts)
 export(textmeta)
 export(tidy.textmeta)
 export(topTexts)
+export(topWords)
 export(topicCoherence)
 export(topicsInText)
 export(vprecision)

diff --git a/R/topWords.R b/R/topWords.R
@@ -0,0 +1,74 @@
+#' Top Words per Topic
+#'
+#' Determines the top words per topic as \code{\link[lda]{top.topic.words}} do.
+#' In addition, it is possible to request the values that are taken for 
+#' determining the top words per topic. Therefore, the function \code{importance}
+#' is used, which also can be called independently.
+#'
+#' @param topics \code{named matrix}: 
+#' The counts of vocabularies (column wise) in topics (row wise).
+#' @param numWords \code{integer(1)}: 
+#' The number of requested top words per topic.
+#' @param byScore \code{logical(1)}: 
+#' Should the values that are taken for determining the top words per topic be
+#' calculated by the function \code{importance} (\code{TRUE}) or should
+#' the absolute counts be considered (\code{FALSE})?
+#' @param epsilon \code{numeric(1)}: 
+#' Small number to add to logarithmic calculations to overcome the issue of
+#' determining \code{log(0)}.
+#' @param values \code{logical(1)}: 
+#' Should the values that are taken for determining the top words per topic be
+#' returned?
+#' @return Matrix of top words or, if \code{value} is \code{TRUE} a list of
+#' matrices with entries \code{word} and \code{val}.
+#' @examples
+#' texts <- list(A="Give a Man a Fish, and You Feed Him for a Day.
+#' Teach a Man To Fish, and You Feed Him for a Lifetime",
+#' B <- "So Long, and Thanks for All the Fish",
+#' C <- "A very able manipulative mathematician, Fisher enjoys a real mastery
+#' in evaluating complicated multiple integrals.")
+#'
+#' corpus <- textmeta(meta = data.frame(id = c("A", "B", "C", "D"),
+#'   title = c("Fishing", "Don't panic!", "Sir Ronald", "Berlin"),
+#'   date = c("1885-01-02", "1979-03-04", "1951-05-06", "1967-06-02"),
+#'   additionalVariable = 1:4, stringsAsFactors = FALSE), text = texts)
+#'
+#' corpus <- cleanTexts(corpus)
+#' wordlist <- makeWordlist(corpus$text)
+#' ldaPrep <- LDAprep(text = corpus$text, vocab = wordlist$words)
+#'
+#' \donttest{LDA <- LDAgen(documents = ldaPrep, K = 3L, vocab = wordlist$words, num.words = 3)}
+#' \donttest{topWords(LDA$topics)}
+#' 
+#' \donttest{importance(LDA$topics)}
+
+#' @export topWords
+topWords <- function(topics, numWords = 1, byScore = TRUE, epsilon = 1e-5, values = FALSE){
+
+  stopifnot(is.matrix(topics), all(topics >= 0), !is.null(colnames(topics)),
+    length(numWords) == 1, length(byScore) == 1, length(values) == 1,
+    as.integer(numWords) == numWords, is.logical(byScore), is.logical(values))
+
+  if (byScore){
+    topics <- importance(topics, epsilon = epsilon)
+  }
+  words <- apply(topics, 1, function(y) colnames(topics)[head(order(y, decreasing = TRUE), numWords)])
+  if (values){
+    vals <- drop(head(apply(topics, 1, function(y) -sort(-y, partial = seq_len(numWords))), numWords))
+    return(list(word = words, val = vals))
+  }else{
+    return(words)
+  }
+}
+
+#' @rdname topWords
+#' @export
+importance <- function(topics, epsilon = 1e-5){
+
+  stopifnot(is.matrix(topics), all(topics >= 0), length(epsilon) == 1,
+    is.numeric(epsilon), epsilon > 0, epsilon < 2)
+
+  rel <- topics/rowSums(topics)
+  logs <- log(rel + epsilon)
+  rel * (logs - rep(colMeans(logs), each = nrow(topics)))
+}
diff --git a/man/topWords.Rd b/man/topWords.Rd
diff --git a/tests/testthat/test_topTexts.R b/tests/testthat/test_topTexts.R
@@ -1,4 +1,4 @@
-context("topArticles")
+context("topTexts")
 
 test_that("topTexts", {
 

diff --git a/tests/testthat/test_topWords.R b/tests/testthat/test_topWords.R
@@ -0,0 +1,22 @@
+context("topWords")
+
+test_that("topWords", {
+
+  load("data/test-k3i20b70s24601alpha0.33eta0.33.RData")
+
+  topics <- result$topics
+
+  expect_equal(topWords(topics = topics),
+    lda::top.topic.words(topics = topics, num.words = 1, by.score = TRUE))
+  expect_equal(topWords(topics = topics, numWords = 20, byScore = FALSE),
+    lda::top.topic.words(topics = topics))
+  expect_error(topWords(topics = topics, epsilon = 0))
+
+  tw <- topWords(topics = topics, values = TRUE)
+  expect_true(is.list(tw))
+  expect_true(all(names(tw) == c("word", "val")))
+
+  imp <- importance(topics = topics)
+  expect_equal(diag(imp[1:3, tw$word]), tw$val)
+
+})