Skip to content

Commit

Permalink
Merge pull request #14 from Docma-TU/topWords
Browse files Browse the repository at this point in the history
Top Words per Topic
  • Loading branch information
lkoppers committed Aug 27, 2019
2 parents f51704f + 2e5ff52 commit 25e09b8
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 1 deletion.
2 changes: 2 additions & 0 deletions NAMESPACE
Expand Up @@ -31,6 +31,7 @@ export(filterCount)
export(filterDate)
export(filterID)
export(filterWord)
export(importance)
export(intruderTopics)
export(intruderWords)
export(is.duplist)
Expand Down Expand Up @@ -62,6 +63,7 @@ export(showTexts)
export(textmeta)
export(tidy.textmeta)
export(topTexts)
export(topWords)
export(topicCoherence)
export(topicsInText)
export(vprecision)
Expand Down
74 changes: 74 additions & 0 deletions R/topWords.R
@@ -0,0 +1,74 @@
#' Top Words per Topic
#'
#' Determines the top words per topic as \code{\link[lda]{top.topic.words}} do.
#' In addition, it is possible to request the values that are taken for
#' determining the top words per topic. Therefore, the function \code{importance}
#' is used, which also can be called independently.
#'
#' @param topics \code{named matrix}:
#' The counts of vocabularies (column wise) in topics (row wise).
#' @param numWords \code{integer(1)}:
#' The number of requested top words per topic.
#' @param byScore \code{logical(1)}:
#' Should the values that are taken for determining the top words per topic be
#' calculated by the function \code{importance} (\code{TRUE}) or should
#' the absolute counts be considered (\code{FALSE})?
#' @param epsilon \code{numeric(1)}:
#' Small number to add to logarithmic calculations to overcome the issue of
#' determining \code{log(0)}.
#' @param values \code{logical(1)}:
#' Should the values that are taken for determining the top words per topic be
#' returned?
#' @return Matrix of top words or, if \code{value} is \code{TRUE} a list of
#' matrices with entries \code{word} and \code{val}.
#' @examples
#' texts <- list(A="Give a Man a Fish, and You Feed Him for a Day.
#' Teach a Man To Fish, and You Feed Him for a Lifetime",
#' B <- "So Long, and Thanks for All the Fish",
#' C <- "A very able manipulative mathematician, Fisher enjoys a real mastery
#' in evaluating complicated multiple integrals.")
#'
#' corpus <- textmeta(meta = data.frame(id = c("A", "B", "C", "D"),
#' title = c("Fishing", "Don't panic!", "Sir Ronald", "Berlin"),
#' date = c("1885-01-02", "1979-03-04", "1951-05-06", "1967-06-02"),
#' additionalVariable = 1:4, stringsAsFactors = FALSE), text = texts)
#'
#' corpus <- cleanTexts(corpus)
#' wordlist <- makeWordlist(corpus$text)
#' ldaPrep <- LDAprep(text = corpus$text, vocab = wordlist$words)
#'
#' \donttest{LDA <- LDAgen(documents = ldaPrep, K = 3L, vocab = wordlist$words, num.words = 3)}
#' \donttest{topWords(LDA$topics)}
#'
#' \donttest{importance(LDA$topics)}

#' @export topWords
topWords <- function(topics, numWords = 1, byScore = TRUE, epsilon = 1e-5, values = FALSE){

stopifnot(is.matrix(topics), all(topics >= 0), !is.null(colnames(topics)),
length(numWords) == 1, length(byScore) == 1, length(values) == 1,
as.integer(numWords) == numWords, is.logical(byScore), is.logical(values))

if (byScore){
topics <- importance(topics, epsilon = epsilon)
}
words <- apply(topics, 1, function(y) colnames(topics)[head(order(y, decreasing = TRUE), numWords)])
if (values){
vals <- drop(head(apply(topics, 1, function(y) -sort(-y, partial = seq_len(numWords))), numWords))
return(list(word = words, val = vals))
}else{
return(words)
}
}

#' @rdname topWords
#' @export
importance <- function(topics, epsilon = 1e-5){

stopifnot(is.matrix(topics), all(topics >= 0), length(epsilon) == 1,
is.numeric(epsilon), epsilon > 0, epsilon < 2)

rel <- topics/rowSums(topics)
logs <- log(rel + epsilon)
rel * (logs - rep(colMeans(logs), each = nrow(topics)))
}
63 changes: 63 additions & 0 deletions man/topWords.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test_topTexts.R
@@ -1,4 +1,4 @@
context("topArticles")
context("topTexts")

test_that("topTexts", {

Expand Down
22 changes: 22 additions & 0 deletions tests/testthat/test_topWords.R
@@ -0,0 +1,22 @@
context("topWords")

test_that("topWords", {

load("data/test-k3i20b70s24601alpha0.33eta0.33.RData")

topics <- result$topics

expect_equal(topWords(topics = topics),
lda::top.topic.words(topics = topics, num.words = 1, by.score = TRUE))
expect_equal(topWords(topics = topics, numWords = 20, byScore = FALSE),
lda::top.topic.words(topics = topics))
expect_error(topWords(topics = topics, epsilon = 0))

tw <- topWords(topics = topics, values = TRUE)
expect_true(is.list(tw))
expect_true(all(names(tw) == c("word", "val")))

imp <- importance(topics = topics)
expect_equal(diag(imp[1:3, tw$word]), tw$val)

})

0 comments on commit 25e09b8

Please sign in to comment.