Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from Docma-TU/topWords
Top Words per Topic
- Loading branch information
Showing
5 changed files
with
162 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#' Top Words per Topic | ||
#' | ||
#' Determines the top words per topic as \code{\link[lda]{top.topic.words}} do. | ||
#' In addition, it is possible to request the values that are taken for | ||
#' determining the top words per topic. Therefore, the function \code{importance} | ||
#' is used, which also can be called independently. | ||
#' | ||
#' @param topics \code{named matrix}: | ||
#' The counts of vocabularies (column wise) in topics (row wise). | ||
#' @param numWords \code{integer(1)}: | ||
#' The number of requested top words per topic. | ||
#' @param byScore \code{logical(1)}: | ||
#' Should the values that are taken for determining the top words per topic be | ||
#' calculated by the function \code{importance} (\code{TRUE}) or should | ||
#' the absolute counts be considered (\code{FALSE})? | ||
#' @param epsilon \code{numeric(1)}: | ||
#' Small number to add to logarithmic calculations to overcome the issue of | ||
#' determining \code{log(0)}. | ||
#' @param values \code{logical(1)}: | ||
#' Should the values that are taken for determining the top words per topic be | ||
#' returned? | ||
#' @return Matrix of top words or, if \code{value} is \code{TRUE} a list of | ||
#' matrices with entries \code{word} and \code{val}. | ||
#' @examples | ||
#' texts <- list(A="Give a Man a Fish, and You Feed Him for a Day. | ||
#' Teach a Man To Fish, and You Feed Him for a Lifetime", | ||
#' B <- "So Long, and Thanks for All the Fish", | ||
#' C <- "A very able manipulative mathematician, Fisher enjoys a real mastery | ||
#' in evaluating complicated multiple integrals.") | ||
#' | ||
#' corpus <- textmeta(meta = data.frame(id = c("A", "B", "C", "D"), | ||
#' title = c("Fishing", "Don't panic!", "Sir Ronald", "Berlin"), | ||
#' date = c("1885-01-02", "1979-03-04", "1951-05-06", "1967-06-02"), | ||
#' additionalVariable = 1:4, stringsAsFactors = FALSE), text = texts) | ||
#' | ||
#' corpus <- cleanTexts(corpus) | ||
#' wordlist <- makeWordlist(corpus$text) | ||
#' ldaPrep <- LDAprep(text = corpus$text, vocab = wordlist$words) | ||
#' | ||
#' \donttest{LDA <- LDAgen(documents = ldaPrep, K = 3L, vocab = wordlist$words, num.words = 3)} | ||
#' \donttest{topWords(LDA$topics)} | ||
#' | ||
#' \donttest{importance(LDA$topics)} | ||
|
||
#' @export topWords | ||
topWords <- function(topics, numWords = 1, byScore = TRUE, epsilon = 1e-5, values = FALSE){ | ||
|
||
stopifnot(is.matrix(topics), all(topics >= 0), !is.null(colnames(topics)), | ||
length(numWords) == 1, length(byScore) == 1, length(values) == 1, | ||
as.integer(numWords) == numWords, is.logical(byScore), is.logical(values)) | ||
|
||
if (byScore){ | ||
topics <- importance(topics, epsilon = epsilon) | ||
} | ||
words <- apply(topics, 1, function(y) colnames(topics)[head(order(y, decreasing = TRUE), numWords)]) | ||
if (values){ | ||
vals <- drop(head(apply(topics, 1, function(y) -sort(-y, partial = seq_len(numWords))), numWords)) | ||
return(list(word = words, val = vals)) | ||
}else{ | ||
return(words) | ||
} | ||
} | ||
|
||
#' @rdname topWords | ||
#' @export | ||
importance <- function(topics, epsilon = 1e-5){ | ||
|
||
stopifnot(is.matrix(topics), all(topics >= 0), length(epsilon) == 1, | ||
is.numeric(epsilon), epsilon > 0, epsilon < 2) | ||
|
||
rel <- topics/rowSums(topics) | ||
logs <- log(rel + epsilon) | ||
rel * (logs - rep(colMeans(logs), each = nrow(topics))) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
context("topArticles") | ||
context("topTexts") | ||
|
||
test_that("topTexts", { | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
context("topWords") | ||
|
||
test_that("topWords", { | ||
|
||
load("data/test-k3i20b70s24601alpha0.33eta0.33.RData") | ||
|
||
topics <- result$topics | ||
|
||
expect_equal(topWords(topics = topics), | ||
lda::top.topic.words(topics = topics, num.words = 1, by.score = TRUE)) | ||
expect_equal(topWords(topics = topics, numWords = 20, byScore = FALSE), | ||
lda::top.topic.words(topics = topics)) | ||
expect_error(topWords(topics = topics, epsilon = 0)) | ||
|
||
tw <- topWords(topics = topics, values = TRUE) | ||
expect_true(is.list(tw)) | ||
expect_true(all(names(tw) == c("word", "val"))) | ||
|
||
imp <- importance(topics = topics) | ||
expect_equal(diag(imp[1:3, tw$word]), tw$val) | ||
|
||
}) |