# LDA topic modeling, following this tutorial:
- https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

In [17]:
### COMMON
import pandas as pd


### LDA
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim, logging, cython, os


In [64]:
## SETTINGS

# Language name handed to the NLTK stop-word list and the Snowball stemmer.
lang = "english"

# Input CSV path; only needed when the read_csv line in the load cell is enabled.
inpath = ""

# Writing the dictionary / corpus / model to disk: switch and path prefix.
save, outpath = False, "../out/"

# Writing the pyLDAvis HTML page to disk: switch and path prefix.
save_vis, outpath_vis = True, "../plots/"


# Load data

In [28]:
### LOAD DATA

## DUMMY DATA
# Five toy documents, available both as a list and under their individual names.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

# one row per document, in a single "text" column
df_raw = pd.DataFrame({"text": doc_set})

## READ CSV
# Alternative source: uncomment to read a ';'-separated file from `inpath` instead.
#df_raw = pd.read_csv(inpath,sep=";")


### quick view
# first rows as a rich table, then the row count as the cell's output
display(df_raw.head())
len(df_raw)

Unnamed: 0,text
0,Brocolli is good to eat. My brother likes to e...
1,My mother spends a lot of time driving my brot...
2,Some health experts suggest that driving may c...
3,I often feel pressure to perform well at schoo...
4,Health professionals say that brocolli is good...


5

In [13]:
# Data frame cleaning
# TODO any other cleaning <--

# Pull the "text" column out of the raw frame as a plain Python list.
txt = df_raw.loc[:, "text"].tolist()


# LDA model

## Prep data

In [19]:

# Token pattern: runs of word characters excluding decimal digits
# (i.e. mostly letters, plus "_"). Use r'\w+' instead to keep digits as well.
tokenizer = RegexpTokenizer(r'[^\W\d]+')

# Stemmer and stop-word set for the language chosen in the settings cell.
stemmer = SnowballStemmer(lang)
stop = set(stopwords.words(lang))


In [22]:
def _prepare_document(document):
    """Turn one raw document string into a list of stemmed tokens.

    Lower-cases the text, tokenizes it with `tokenizer`, drops tokens that are
    stop words or have two characters or fewer, and stems what is left.
    """
    words = tokenizer.tokenize(document.lower())
    kept = (word for word in words if word not in stop and len(word) > 2)
    return [stemmer.stem(word) for word in kept]


# one list of stemmed tokens per input document, in the original order
texts = [_prepare_document(document) for document in txt]
    

In [24]:
# Build the gensim id <-> term dictionary from the stemmed token lists.
dictionary = corpora.Dictionary(documents=texts)

In [25]:
# Convert every tokenized document to its bag-of-words form via doc2bow,
# keeping the documents in their original order.
corpus = list(map(dictionary.doc2bow, texts))

## model

In [27]:
# generate LDA model
# LDA training is stochastic: without a fixed seed every run of this cell can
# yield different topics, so a fresh "Restart & Run All" would not reproduce
# the results. Change RANDOM_SEED to explore other initialisations.
RANDOM_SEED = 42

# Trains on the bag-of-words corpus with 20 topics and 20 passes; `dictionary`
# is supplied as the id -> word mapping.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus
    , num_topics=20
    , id2word = dictionary
    , passes=20
    , random_state=RANDOM_SEED)

# ngrams / bigrams...
# http://www.mimno.org/articles/phrases/

## Save model

In [32]:
# Save dict, corpus and model
# Only runs when `save` is enabled in the settings cell.
if save:
    # `outpath` is used as a plain string prefix, so create the directory part
    # of it first; otherwise the writes below fail when the folder is missing.
    save_dir = os.path.dirname(outpath)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    dictionary.save(outpath+"v1.dict")
    corpora.MmCorpus.serialize(outpath+"v1.mm",corpus)
    ldamodel.save(outpath+"v1.model")

# Visualize

In [61]:
## VISUALIZE
# https://github.com/bmabey/pyLDAvis
import pyLDAvis

# pyLDAvis 3.x renamed the gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as pyldavis_gensim
except ImportError:
    import pyLDAvis.gensim as pyldavis_gensim

# Prepare the visualization data from the trained model, the bag-of-words
# corpus and the dictionary; `mds` selects the scaling method used for the
# topic layout (see the pyLDAvis documentation for the available values).
model_prep = pyldavis_gensim.prepare(
    ldamodel
    , corpus
    , dictionary
    , mds='mmds')


In [63]:
# Render the prepared visualization as this cell's output.
pyLDAvis.display(model_prep)
# Alternative call, left disabled:
#pyLDAvis.show(model_prep)

## Save visualization

In [66]:
# Export the prepared visualization as an HTML file
# (only when `save_vis` is enabled in the settings cell).
if save_vis:
    # `outpath_vis` is used as a plain string prefix; make sure its directory
    # part exists so save_html does not fail on a missing folder.
    vis_dir = os.path.dirname(outpath_vis)
    if vis_dir:
        os.makedirs(vis_dir, exist_ok=True)

    pyLDAvis.save_html(model_prep,outpath_vis+"model_vis.html")


# Appendix

## Practical notes

- LDA in sklearn: 
    - https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
- Tips and tricks: 
    - https://www.researchgate.net/publication/307303102_What_is_Wrong_with_Topic_Modeling_and_How_to_Fix_it_Using_Search-based_SE
- Determine number of topics:
    - https://github.com/scikit-learn/scikit-learn/issues/9134
    - https://www.quora.com/Latent-Dirichlet-Allocation-LDA-What-is-the-best-way-to-determine-k-number-of-topics-in-topic-modeling


## Theory

- Explaining LDA:
    - Paper: http://ai.stanford.edu/~ang/papers/jair03-lda.pdf
    - Friendly explanation: http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/

- Coherence metric:
    - http://nbviewer.jupyter.org/github/dsquareindia/gensim/blob/a4b2629c0fdb0a7932db24dfcf06699c928d112f/docs/notebooks/topic_coherence_tutorial.ipynb
    - Explained: http://qpleple.com/topic-coherence-to-evaluate-topic-models/
    - https://rare-technologies.com/what-is-topic-coherence/