# Topic Modeling
In this Python notebook, we perform topic modeling using Python packages that are commonly used in this research field.

**IMPORTANT**: this notebook requires the `java` module. If it was not loaded in JupyterLab, you must close this notebook, stop the kernel, load the `java` module and re-open the current notebook.

In [None]:
# Shell escape: show which `java` executable, if any, is found on the PATH
!which java

## Loading required Python packages

In [None]:
# Standard and scientific packages
print('- Loading standard modules...')
import os
import re
import numpy as np
import pandas as pd
from pprint import pprint
from pathlib import Path
import json

# NLTK - Natural Language Toolkit
print('- Loading NLTK...')
import nltk
nltk.download('stopwords')  # Only required on the first execution

# Gensim
print('- Loading Gensim...')
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LdaMulticore

# spaCy for lemmatization
print('- Loading spaCy...')
import spacy

# Plotting tools
print('- Loading visualization tools...')
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt

# Enable logging for gensim - optional
# The root logger is configured to show only messages of level ERROR or above,
# each one prefixed with a timestamp and its level name.
print('- End of configuration...')
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.ERROR)

# Silence deprecation and future-change warnings for the rest of the session
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

print('Done.')

## Loading the data
* Load stop words from NLTK

In [None]:
# Build the stop-word list used later by remove_stopwords()
from nltk.corpus import stopwords

# Start from the English list shipped with NLTK and show it
stop_words = stopwords.words('english')
print('Default list:\n', stop_words)

# Put your own stop words in this list; they are appended to the NLTK ones
custom_stop_words = []
stop_words += custom_stop_words

# Show the list that will actually be applied
print('\nFinal list:\n', stop_words)

* Get the list of filenames

In [None]:
# Lazily walk data/ (sub-folders included) looking for *.txt entries
txt_folder = Path('data/').rglob('*.txt')

# Materialise the matches as a sorted list of paths
files = sorted(txt_folder)

# Preview: first 3 and last 3 paths, followed by the total count
print(files[:3], '...', files[-3:])
print(' => total {} files'.format(len(files)))

* Create a dictionary that will populate a Pandas DataFrame with two columns:
  * `target_names`: the filename without its path
  * `content`: the full original text of the file, stored as a single string

In [None]:
# Column-oriented container: one entry per file in each list
text_dict = {'target_names': [], 'content': []}

# For each text file
for index, name in enumerate(files):
    basename = os.path.basename(name)

    # Print the progression at every 10 filenames (1st, 11th, 21st, ...).
    # Using the loop index avoids rebuilding and scanning files[::10] each time.
    if index % 10 == 0:
        print(f'Reading {basename} ...')

    # The context manager closes the file even if reading or decoding fails
    with open(name, 'r', encoding='utf-8') as f:
        # Lines keep their trailing newline and are joined with a space
        content = ' '.join(f.readlines())

    # Save the filename and the file content only once the read has succeeded,
    # so both lists always have the same length
    text_dict['target_names'].append(basename)
    text_dict['content'].append(content)

# Convert the dictionary to a pandas data frame
df = pd.DataFrame.from_dict(text_dict)
print(f'Total: {len(df)} rows. Here are the first five:')
df.head()

## Cleaning the text data
* Remove Roman numerals and collapse runs of whitespace into a single space

In [None]:
# Select all file contents
data = text_dict['content']

# The patterns are raw strings so that the backslashes reach the regex engine
# untouched: in a normal string literal '\b' is a backspace character and
# '\.' / '\s' are invalid escape sequences that raise a warning.
# NOTE(review): the second alternative (\b\w\n) cannot match directly after a
# run of Roman-numeral letters (there is no word boundary between two word
# characters), so in practice only numerals followed by '.' are removed -
# confirm the intended pattern.
roman_numeral_re = re.compile(r'[MDCLXVI]+(\.|\b\w\n)')
whitespace_re = re.compile(r'\s+')

# Remove roman numerals
data = [roman_numeral_re.sub(' ', sentence) for sentence in data]

# Replace new line characters and multiple spaces by a single space
data = [whitespace_re.sub(' ', sentence) for sentence in data]

# Remove distracting quotes (optional step, disabled)
#data = [re.sub(r"\'", "", sentence) for sentence in data]

print('First cleaned sentence:\n', data[0])
print('\nLast cleaned sentence:\n', data[-1])

* Remove punctuation symbols and transform each text into a list of words

In [None]:
def sentences_to_words(sentences):
    """
    Generator - Lazily turn each sentence into a list of words

    Parameters:
    ----------
    sentences : iterable of str
        The texts to tokenize, consumed one at a time.

    Yields:
    ------
    For every sentence, in order, the list of tokens returned by
    gensim.utils.simple_preprocess(sentence, deacc=True). The tokenization
    itself is what discards punctuation; deacc=True additionally strips
    accent marks from the tokens.
    """
    tokenized = (simple_preprocess(text, deacc=True) for text in sentences)
    yield from tokenized

# Tokenize every cleaned text: data_words[i] is the word list built from data[i]
data_words = [words for words in sentences_to_words(data)]

# Preview both ends of the result
print('First list of words:', data_words[0])
print('\nLast list of words:', data_words[-1])

## Topic Modeling
We will start by using:
* Gensim's [Phrases class](https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases) - an instance of it "detects phrases based on collocation counts"
* Gensim's [Phraser class](https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phraser) - an alias of [FrozenPhrases](https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.FrozenPhrases) which "cuts down memory consumption of Phrases, by discarding model state not strictly needed for the phrase detection task".

In [None]:
# Build the bigram and trigram models - higher threshold => fewer phrases.
# The trigram model is trained on the output of the bigram model, so that an
# already merged pair can be merged with a third word.
bigram = gensim.models.phrases.Phrases(data_words, min_count=4, threshold=8)
trigram = gensim.models.phrases.Phrases(bigram[data_words], threshold=8)

# Faster way to get a sentence identified as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See a trigram example. Document 90 is used when it exists; for smaller
# corpora the last document is used instead of raising an IndexError.
sample_index = min(90, len(data_words) - 1)
if sample_index >= 0:
    for word in trigram_mod[bigram_mod[data_words[sample_index]]]:
        # A trigram is three parts joined by underscores
        if len(word.split('_')) == 3:
            print(word)

* Define functions for stopwords, bigrams, trigrams and lemmatization

In [None]:
def remove_stopwords(texts):
    """Drop stop words from every document.

    Parameters
    ----------
    texts : iterable
        The documents; each one is converted with str() and re-tokenized by
        simple_preprocess() before filtering.

    Returns
    -------
    list of list of str
        For each document, the tokens that are not in the module-level
        `stop_words` list, in their original order.
    """
    # A set gives constant-time membership tests; checking against the list
    # would rescan all stop words for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts]

def make_bigrams(texts):
    """Run every tokenized document through the module-level `bigram_mod`.

    Returns a list with one transformed document per input document, in the
    same order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply `bigram_mod`, then `trigram_mod`, to every tokenized document.

    Returns a list with one transformed document per input document, in the
    same order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping selected parts of speech only.

    Each document (a list of tokens) is joined with single spaces and passed
    to the module-level spaCy pipeline `nlp`; the lemma of every resulting
    token whose `pos_` tag is in `allowed_postags` is kept.

    Parameters
    ----------
    texts : iterable of list of str
        The tokenized documents.
    allowed_postags : list of str
        Part-of-speech tags to keep, see https://spacy.io/api/annotation

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in the same order.
    """
    def keep_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keep_lemmas(tokens) for tokens in texts]

* Complete the cleanup of word lists

In [None]:
print('- Removing Stop Words...')
data_words_nostops = remove_stopwords(data_words)

print('- Forming bigrams...')
data_words_bigrams = make_bigrams(data_words_nostops)

print('- Forming trigrams...')
data_words_trigrams = make_trigrams(data_words_bigrams)

# Initialize the spaCy English model with the parser and NER components
# disabled (for efficiency). It must be loaded before lemmatization() is
# called, because that function reads the module-level name `nlp`.
print('- Initializing the spaCy model...')
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
print('- Lemmatisation...')
data_lemmatized = lemmatization(data_words_trigrams,
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview one document. Document 90 is used when it exists; for smaller
# corpora the last document is shown instead of raising an IndexError.
sample_index = min(90, len(data_lemmatized) - 1)
if sample_index >= 0:
    print(data_lemmatized[sample_index])

* Create the dictionary and corpus

In [None]:
# Dictionary: maps every distinct lemma of the corpus to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# Bag-of-words corpus: one list of (token id, count) pairs per document
corpus = list(map(id2word.doc2bow, data_lemmatized))

# Readable preview: first 10 (token, count) pairs of the first 4 documents
[[(id2word[token_id], count) for token_id, count in bow[:10]]
 for bow in corpus[:4]]

In [None]:
start = 2   # Set the minimum number of topics your model will run
limit = 10  # Choose the max ceiling for number of topics (inclusive)
step = 2    # Set the step width for number of topics per model
multiple_num_topics = range(start, limit + 1, step)

# model_list[i] and coherence_values[i] both belong to multiple_num_topics[i]
model_list = []
coherence_values = []

for num_topics in multiple_num_topics:
    print(f'With {num_topics} topics...')

    # LDA training is stochastic: a fixed random_state makes the models, and
    # therefore the coherence scores compared below, repeatable across runs.
    model = LdaMulticore(
        corpus=corpus,
        num_topics=num_topics,
        id2word=id2word,
        workers=1,
        random_state=42)
    model_list.append(model)

    # Score the model with the 'c_v' coherence measure
    coherencemodel = CoherenceModel(
        model=model,
        texts=data_lemmatized,
        dictionary=id2word,
        coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

print('Done.')

In [None]:
# Show graph: coherence score as a function of the number of topics.
# The label is attached to the line itself; passing ("coherence_values") to
# legend() passes a plain string (the parentheses do not make a tuple), which
# is consumed character by character and labels the line "c".
plt.plot(multiple_num_topics, coherence_values, label="coherence_values")

plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')

plt.show()

In [None]:
# List each tested number of topics next to its coherence score (4 decimals)
for topic_count, score in zip(multiple_num_topics, coherence_values):
    rounded_score = round(score, 4)
    print(f'Num Topics = {topic_count:2d}, has Coherence Value of {rounded_score}')

In [None]:
# Pick the model you consider the best by its position in model_list.
# Positions are 0-based, so index 3 is the fourth model trained above.
optimal_model = model_list[3]

# Topics of the selected model: raw (word, weight) form, then a printed
# summary limited to the 10 top words of each topic
model_topics = optimal_model.show_topics(formatted=False)
topic_summaries = optimal_model.print_topics(num_words=10)
pprint(topic_summaries)

In [None]:
# Now run just that model with the exact number of topics you want.
# Despite its name, `ldamallet` is a gensim LdaMulticore model; the name is
# kept because the following cells refer to it.
# A fixed random_state makes this final model repeatable across runs.
ldamallet = LdaMulticore(
    corpus=corpus,
    num_topics=8,
    id2word=id2word,
    workers=1,
    random_state=42)

In [None]:
# Topics of the final model as (topic id, [(word, weight), ...]) entries
final_topics = ldamallet.show_topics(formatted=False)
pprint(final_topics)

# Coherence ('c_v' measure) of the final model on the lemmatized texts
coherence_model_ldamallet = CoherenceModel(
    model=ldamallet,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
def format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=df):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel : topic model
        Must support `ldamodel[corpus]` (one list of (topic id, proportion)
        pairs per document) and `ldamodel.show_topic(topic id)`.
    corpus : list
        The bag-of-words documents to classify.
    texts : pandas.DataFrame
        Columns appended to the right of the result; rows are matched by
        index with the documents of `corpus`.

    The defaults are the objects bound to `ldamallet`, `corpus` and `df` at
    the moment this function is defined.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals)
        and 'Topic_Keywords' (comma-separated words of the dominant topic),
        followed by the columns of `texts`.
    """
    # Rows are collected in a plain list and the frame is built once:
    # DataFrame.append() no longer exists in pandas 2.x, and growing a frame
    # row by row copies it at every step.
    records = []
    keywords_by_topic = {}  # topic id -> keyword string, computed once each

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep a placeholder row so that the rows stay aligned with `texts`
            records.append((None, None, ''))
            continue

        # Dominant topic = highest proportion (the first one wins on ties)
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))

        records.append(
            (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(
        records,
        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    return pd.concat([sent_topics_df, texts], axis=1)

In [None]:
# Prepare final results: dominant topic of each document plus the file data
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=ldamallet, corpus=corpus, texts=df)

# Final headers, in column order; the first one names the column created by
# reset_index() from the row index
final_column_names = [
    'Document number',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'file_name',
    'Text',
]

df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = final_column_names

In [None]:
# Show the final table: one row per document with its dominant topic,
# that topic's contribution and keywords, the file name and the text
df_dominant_topic