# Latent Dirichlet Allocation

This is the takehome notebook for the NLP engineer position at Contenda. 



In [22]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from itertools import chain
import numpy as np

from gensim.models import Phrases, CoherenceModel
from gensim import corpora, models, parsing

import nltk

import os
from typing import List, Text

In [23]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samanthawilcoxson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samanthawilcoxson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samanthawilcoxson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/samanthawilcoxson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/samanthawilcoxson/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [24]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [56]:
# Preprocessing Functions

custom_stopwords = set(['gon', 'na', 'yep'])
stopwords = set(parsing.preprocessing.STOPWORDS).union(custom_stopwords)

def remove_stopwords(list_tokens: List[Text]):
    return [w for w in list_tokens if w not in stopwords]

def preprocess(list_text: List[Text]) -> List:
    df = pd.DataFrame(list_text)
    df.columns = ["documents"]
    df['sentences'] = df.documents.map(sent_tokenize)
    df['tokens_sentences'] = df['sentences'].map(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
    df['POS_tokens'] = df['tokens_sentences'].map(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])
    df['tokens_sentences_lemmatized'] = df['POS_tokens'].map(
        lambda list_tokens_POS: [
            [
                lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
                if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
            ] 
            for tokens_POS in list_tokens_POS
        ]
    )
    
    df['tokens'] = df['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))
    df['tokens'] = df['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha()])
    
    # remove stopwords
    df['tokens'] = df['tokens'].map(lambda tokens: remove_stopwords(tokens))

    # obtain unigrams, bigrams, trigrams
    tokens = df['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    return tokens

In [191]:
def build_lda_model(list_text, num_topics):
    # preprocess
    tokens = preprocess(list_text)

    # build dictionary
    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=0)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
    
    # build model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      passes=4, alpha=[0.01]*num_topics, \
                                      eta=[0.01]*len(dictionary_LDA.keys()),
                                      random_state=10)
    
    return lda_model, dictionary_LDA

def eval_model(model, test_corpus):
    results = {
        'perplexity': model.log_perplexity(test_corpus),
        'coherence':  CoherenceModel(model=model, corpus=test_corpus, coherence='u_mass').get_coherence()
    }
    return results

In [192]:
# load data

train = [open(f"./training_transcriptions/{fname}").read() for fname in os.listdir("./training_transcriptions")]
test = [open(f"./testing_transcriptions/{fname}").read() for fname in os.listdir("./testing_transcriptions")]


In [214]:
model, dictionary_LDA = build_lda_model(train, num_topics=3)
model.print_topics()

[(0,
  '0.003*"corgi" + 0.003*"cli" + 0.003*"laughter" + 0.002*"chat" + 0.002*"link" + 0.002*"application" + 0.002*"rust" + 0.002*"github" + 0.002*"function" + 0.002*"javascript"'),
 (1,
  '0.006*"design" + 0.005*"button" + 0.005*"database" + 0.004*"mongodb" + 0.004*"target" + 0.004*"engineer" + 0.003*"mmhmm" + 0.003*"image" + 0.003*"color" + 0.003*"search"'),
 (2,
  '0.004*"snake" + 0.003*"image" + 0.003*"game" + 0.003*"typescript" + 0.003*"site" + 0.003*"laughter" + 0.002*"react" + 0.002*"twilioquest" + 0.002*"chat" + 0.002*"request"')]

In [215]:
# preprocess data in model input format
train_corpus = [dictionary_LDA.doc2bow(doc) for doc in preprocess(train)]  # we'll use this for evaluation later
test_corpus = [dictionary_LDA.doc2bow(doc) for doc in preprocess(test)]

preds = [model[doc] for doc in test_corpus]
print(preds[0])


[(0, 0.3078507), (1, 0.23757707), (2, 0.45457223)]


In [217]:
# get preds in df format
preds_dicts = [dict(x) for x in preds]
preds_df = pd.DataFrame(preds_dicts)


df = pd.DataFrame({
    'documents': test,
    'predictions': preds
})

for i in range(len(model.get_topics())):
    df[i] = preds_df[i] if i in preds_df else np.nan

df = df.fillna(0)

df.to_csv("./results.csv", index=False)

df.head()

Unnamed: 0,documents,predictions,0,1,2
0,Why we build. We build because we see potentia...,"[(0, 0.3078507), (1, 0.23757707), (2, 0.454572...",0.307851,0.237577,0.454572
1,"Hello, everyone, and welcome to another episod...","[(0, 0.5027197), (1, 0.033822425), (2, 0.46345...",0.50272,0.033822,0.463458
2,"Tamao: Leandro here has fantastic story, you’r...","[(0, 0.39150605), (1, 0.19409886), (2, 0.41439...",0.391506,0.194099,0.414395
3,"Hello, everyone, and welcome to another episod...","[(0, 0.39435107), (1, 0.073617354), (2, 0.5320...",0.394351,0.073617,0.532032
4,So who remembers GeoCities? Thank you. Yeah. O...,"[(0, 0.30412647), (1, 0.2323362), (2, 0.463537...",0.304126,0.232336,0.463537


In [199]:
results_on_train = eval_model(model, [dictionary_LDA.doc2bow(tok) for tok in preprocess(train)])
results_on_test = eval_model(model, test_corpus)
print(results_on_train)
print(results_on_test)

{'perplexity': -9.737015962832338, 'coherence': -1.6580351979459576}
{'perplexity': -11.807010213357163, 'coherence': -3.3234591095030943}


# Results

To determine the number of topics to extract, I tried running the model on `num_topics={2,3,5,10}` and eyeballing the topic keywords (`model.print_topics()`) to see if they made sense. 3 seemed to be the most interpretable. 2 just grouped the documents into devrel vs. Learn With Jason, which didn't provide much new information on the test documents. The topics in 5 were too similar to each other, and 10 was too many to identify clear stratification.

Looking strictly at the keywords, our 3 topics can be described by the following categories:

0. general tech terminology (seems to come from the tutorials)
1. application development and design
2. games (and application development)

This seems to represent our data with rough accuracy; if we take a probability of 0.1 as the classification threshold, the Facebook talk (`test[0]`) covers all 3 topics, the Brandon Roberts episode of Learn With Jason (`test[1]`) covers topics 0 and 2, and the devrel talk starting with Tamao (`test[2]`) covers all 3 as well. Except `test[2]` doesn't really talk about gaming! It seems like 3 isn't really the magic number of topics, here, but within the allotted time, it provides the most "explainable" categories.

I used log perplexity and coherence as evaluation metrics as outlined in this tutorial, which seems consistent with the industry standard:
https://www.tutorialspoint.com/gensim/gensim_using_lda_topic_model.htm

Here's a good breakdown of why perplexity is useful for this task:
https://cfss.uchicago.edu/notes/topic-modeling/#:~:text=Perplexity%20is%20a%20statistical%20measure,of%20words%20in%20your%20documents

These are the sources I used to interpret the coherence score, since that tutorial doesn't provide a great explanation:
https://www.baeldung.com/cs/topic-modeling-coherence-score (good breakdown of types of coherence, but their interpretation of what a greater number means is incorrect according to other sources)
https://aclanthology.org/D12-1087.pdf (nice visual representation of UMass vs UCI coherence)
https://ciir-publications.cs.umass.edu/getpdf.php?id=956 (see 4.2 - Topic Coherence)

According to these metrics, a score closer to 0 means that the document is most similar to the training set, or documents that the model has seen before. Our perplexity on our testing set is `-11.807010213357163`, and coherence on our testing set is `-3.3234591095030943`. While these numbers don't mean much on their own, they seem similar enough to the training data (`perplexity=-9.737015962832338, coherence=-1.6580351979459576`). This implies the model is relatively confident in its ability to predict topics from the test set.

If I had more time, there are 2 clear improvements that would make the results more interpretable: fine tuning num_topics with a grid search or similar, allowing us to compare the coherence and perplexity scores to find the best n to stratify with, and better stopword pruning to filter out noise.

I learned a lot about LDA and evaluation as I went, and ran out of time before implementing some (now obvious) changes that would make the results more useful in a production environment. I hope that this writeup provides some insight into how I handle unfamiliar problems, interpret results, and explain my interpretations.

How did you allocate your time?
* ~20 mins looking at data and familiarizing self with task
* ~20 mins researching LDA architecture and how it works
* ~20 mins researching LDA evaluation
* ~90 mins implementing and experimenting
* ~30 mins writeup/reflections


What are the tradeoffs between training a model on transcripts as a corpus versus written articles?
* Transcripts are noisier, as they are a representation of speech. Transcripts often contain more filler words, self-corrections, backtracking, and other quirks that come with spontaneous language production. As a result, parsing may be more of a challenge, but a model trained on transcripts will do a lot better in the spoken language domain than one trained on newswire data.


What is something you'd like to try if you had another 8 hours? 3 days? 1 week?
* in order of least to most time consuming:
    - output top 3 topics for each doc to a separate column for easy viewing
    - add to stopword list (names, adjectives, filler words like 'mhmm', etc)
    - experiment with num_topics
    - tune other hyperparameters
    - try a spacy model for parsing: https://spacy.io/models/en
    - try a BERT embedding-based topic model: https://github.com/MaartenGr/KeyBERT
    - manually annotate a subset of train/test data for more interpetable evaluation

What are the tradeoffs between using LDA for topic modeling vs other methods?
* LDA doesn't take word context into account since it uses a bag-of-words method. That makes it a lot simpler and faster to run than a context based model, but your topics are limited. Evaluation is also difficult because LDA is unsupervised. Supervised methods are easier to evaluate and very interpretable, but require time and resources to annotate, topics must be selected beforehand, and might be very narrow or very broad. Any keyword-based topic model is subsequently going to be limited by the words that occur in the document, but might generalize better.

Machine transcriptions have more mistakes than human transcriptions. If we only had access to a large amount of machine transcriptions, what are some strategies we could try to still have decent topic modeling?
- run a parser over sentences in each document to make sure sentences are grammatical
- manually check a subsection and calculate WER
- identify points of confusion using a masked/warped language model: https://www.amazon.science/blog/using-warped-language-models-to-correct-speech-recognition-errors
