# Comprehensive STM Workflow
1. STM run with k=0
1. NbClust run over the preliminary STMs' theta to find a better number of topics
1. STM JSON data output for the STM viewer webapp
1. Manual STM refinement / tweaking

In [None]:
library(tidyverse)
library(magrittr)
library(stm)
library(jsonlite)
library(doMC)
library(foreach)
library(NbClust)
library(cluster)

In [None]:
# Restore the saved workspace objects from the tidy questions file (presumably
# this provides the `questions` list used below -- confirm) and run the
# helper script (presumably the one defining create_json() -- confirm).
load(file = '../data_processing/tidy_questions.Rda')
source(file = 'stmjson.R')

In [None]:
# Register the doMC parallel backend with 3 cores; the %dopar% loops in the
# following cells use this backend.
registerDoMC(cores = 3)

## Preliminary STM
Used to find baseline topic number using STM library methods (K=0)

In [None]:
# STM does not produce meaningful clusters for questions 4-6, so they are
# removed. Their names are displayed first as a record of what gets dropped.
names(questions) %>% as.data.frame() %>% slice(4:6)
questions <- questions[-(4:6)]

In [None]:
start <- Sys.time()
verbosity <- FALSE

# Step 1: text-process every question. The first column of each question is
# passed as the documents and the second column is carried along as metadata.
# seq_along() is used so that an empty `questions` list yields no iterations
# (seq(length(x)) would produce c(1, 0)).
procs <- foreach(n = seq_along(questions)) %dopar% {
    textProcessor(documents = questions[[n]][[1]],
                  metadata = questions[[n]][2],
                  customstopwords = c('art', 'arts'),
                  verbose = verbosity)
}

# Step 2: prepare the documents. Questions with more than 1000 processed
# documents use lower.thresh = 4, all others use 3.
docs <- foreach(n = seq_along(questions)) %dopar% {
    # The condition is a single value, so a plain if/else is used instead of
    # the vectorised ifelse().
    lower_thresh <- if (length(procs[[n]]$documents) > 1000) 4 else 3
    prepDocuments(documents = procs[[n]]$documents,
                  vocab = procs[[n]]$vocab,
                  meta = procs[[n]]$meta,
                  lower.thresh = lower_thresh,
                  verbose = verbosity)
}

# Step 3: fit the preliminary models with K = 0 to obtain a baseline number
# of topics for each question.
prelim_stms <- foreach(n = seq_along(questions)) %dopar% {
    stm(documents = docs[[n]]$documents,
        vocab = docs[[n]]$vocab,
        K = 0,
        data = docs[[n]]$meta,
        verbose = verbosity)
}

# Report how long the three steps took.
time_taken <- Sys.time() - start
time_taken

## NbClust Cluster Analysis
Used to find very close estimates of the best number of topics for each question

In [None]:
# Principal component analysis of each preliminary model's theta matrix.
pcas <- foreach(n = seq_along(prelim_stms)) %dopar% {
    prcomp(x = prelim_stms[[n]]$theta, scale. = TRUE)
}

# Run NbClust once per question. Each iteration must read ITS OWN PCA scores
# and ITS OWN preliminary topic count; the previous code referenced the
# free-standing objects `pca` and `stmobj`, which are not loop variables.
nbcs <- foreach(n = seq_along(pcas)) %dopar% {
    scores <- pcas[[n]]$x
    n_topics <- prelim_stms[[n]]$settings$dim$K

    # Cluster on all but the last five principal components.
    # NOTE(review): this assumes the preliminary model found more than five
    # topics -- confirm for new data sets.
    NbClust(data = data.frame(scores)[, 1:(n_topics - 5), drop = FALSE],
            diss = daisy(scores),
            distance = NULL,
            min.nc = 3,
            max.nc = 27,
            method = 'complete',
            index = 'all')
}

In [None]:
# The methods filtered out seem to always choose the lowest number of clusters considered every time.
excluded_methods <- c('Cindex', 'DB', 'Silhouette', 'Duda', 'PseudoT2', 'Beale',
                      'McClain', 'Hubert', 'Dindex')

# For every question, pick the number of clusters recommended by the most
# remaining NbClust indices (ties go to the smallest number).
#
# The winner is converted back from the table's label to an integer. The
# previous version appended a factor to the result vector with c(), which
# stores the factor's internal level code instead of the cluster count.
k_canidates <- vapply(nbcs, function(nbc) {
    best_nc <- nbc$Best.nc
    # First row of Best.nc: the recommended number of clusters per index.
    votes <- best_nc[1, !(colnames(best_nc) %in% excluded_methods)]
    tally <- table(votes)
    as.integer(names(tally)[which.max(tally)])
}, integer(1))

## Improved STMs
Using NbClust recommended numbers of topics

In [None]:
start <- Sys.time()
verbosity <- FALSE

# Refit every question with the number of topics chosen for it in
# k_canidates. seq_along() keeps the loop empty when there are no questions
# (seq(length(x)) would produce c(1, 0)).
improved_stms <- foreach(n = seq_along(questions)) %dopar% {
    stm(documents = docs[[n]]$documents,
        vocab = docs[[n]]$vocab,
        K = k_canidates[n],
        data = docs[[n]]$meta,
        verbose = verbosity)
}

# Report how long the fits took.
time_taken <- Sys.time() - start
time_taken

### Outputting STM Data
To be used with the webapp

In [None]:
# Collect the name of the first column of every question, in order.
question_names <- unlist(
    lapply(questions, function(question) names(question[1])),
    use.names = FALSE
)

In [None]:
directory <- './'

# Drop the elements of `x` at positions `removed`. When nothing was removed
# `x` is returned unchanged: negative indexing with an empty vector (which is
# what slice(-removed) did before) selects NOTHING instead of everything.
drop_removed <- function(x, removed) {
    if (length(removed) > 0) x[-removed] else x
}

# Write one JSON file per question for the STM viewer webapp.
foreach(n = seq_along(questions)) %dopar% {
    # Raw answers, minus the documents dropped first by textProcessor() and
    # then by prepDocuments(), so they line up with docs[[n]]$documents.
    raw_docs <- questions[[n]][[question_names[n]]]
    raw_docs <- drop_removed(raw_docs, procs[[n]]$docs.removed)
    raw_docs <- drop_removed(raw_docs, docs[[n]]$docs.removed)

    create_json(
        stm = improved_stms[[n]],
        documents_raw = raw_docs,
        documents_matrix = docs[[n]]$documents,
        column_name = question_names[[n]],
        title = names(questions[n]),
        clustering_thresh = 1.4, # should be as low as possible without errors (raise in 0.1 steps if errors)
        verbose = TRUE,
        directory = directory
    )
}

## STM Refinement
Use this space to change the number of topics, lower.thresh, and stop words of a question to try to make a qualitatively better model after inspecting/comparing the model in the STM viewer webapp. A good place to start is looking at how well defined the no/none topic is.

#/ use this space to write down the question number, best number of topics, and custom stop words 
c(15, 9, )
c(16, 11, c('art','arts','grow','growth','develop','development','way'))
c(17, 13, c('art','arts','positive','negative','helped','major','really',
              'much','made','think','dont','don\'t','experience','experiences',
              'college','most','life','role','provided'))

In [None]:
# Index of the question being refined and the number of topics to try.
i <- 19
ntopics <- 11

# Re-run the text processing for this question only, with its own stop words.
procs[[i]] <- textProcessor(
    documents = questions[[i]][[1]],
    metadata = questions[[i]][2],
    customstopwords = c('art', 'arts', 'grow', 'growth', 'develop', 'development', 'way')
)
# Alternative stop word lists that can be swapped in above:
#   customstopwords = c('art','arts','positive','negative','helped','major','really',
#                       'much','made','think','dont','don\'t','experience','experiences',
#                       'college','most','life','role','provided')
#   customstopwords = c('art','arts')

# Re-prepare the documents for this question with a fixed lower threshold.
docs[[i]] <- prepDocuments(
    documents = procs[[i]]$documents,
    vocab = procs[[i]]$vocab,
    meta = procs[[i]]$meta,
    lower.thresh = 3
)
# Size-based alternative for the threshold:
#   lower.thresh = ifelse(procs[[i]]$documents %>% length > 1000, 4, 3)

In [None]:
# Fit the refined model for question `i` with `ntopics` topics and print the
# elapsed time.
start <- Sys.time()
stmobj <- stm(
    documents = docs[[i]]$documents,
    vocab = docs[[i]]$vocab,
    K = ntopics,
    data = docs[[i]]$meta,
    verbose = FALSE
)
print(Sys.time() - start)

In [None]:
# comment out one of the following lines
labels <- read_json('labels/sr_othergrowth_labels.json')
# labels <- NULL

if (is.null(labels)) {
    labels$topics <- NULL
    labels$clusters <- NULL
}

create_json(
    stm = stmobj,
    documents_raw = questions[[i]][question_names[i]] %>% slice(-procs[[i]]$docs.removed) %>% 
                                                                       slice(-docs[[i]]$docs.removed) %>% 
                                                                       pull,
    documents_matrix = docs[[i]]$documents,
    column_name = question_names[[i]],
    title = names(questions[i]),
    clustering_thresh = 1.4, # should be as low as possible w/o errors
    instant = T, # names the json data.json
    topic_labels = labels$topics,
    cluster_labels = labels$clusters,
    directory = './'
)