# Latent Dirichlet Allocation (LDA) Pipeline Example in Python

Below is a tutorial on how to preprocess text data and train an LDA model on it.

LDA performs better when stop words are removed and words are lemmatized, but its results are then often ugly, because the topic words come out in their lemmatized form. To fix that, it is useful to be able to apply an inverse lemmatization to the topic words (or topic n-grams) yielded by the LDA once it has been trained on the data.

So here we are: let's build a pipeline that looks like this:

1. Load a dataset of many comments (or documents)
2. Transform comments to remove stop words
3. Lemmatize the comments without stop words for a better LDA
4. Perform LDA topic modeling
5. Recover words from inverse (backwards) lemmatization on topic words 
6. Clean topic words are then available

Note: The classes imported are clean and have unit tests. Don't hesitate to dive in and to check what's under the hood!


In [None]:

from app.data.load_sample_data import load_sample_data
from app.logic.stop_words_remover import StopWordsRemover
from app.logic.stemmer import Stemmer
from app.logic.lda import LDA
from app.logic.count_vectorizer import CountVectorizer


## Load a dataset of many comments (or documents)

In [None]:
# Load the sample dataset. The loader returns two values, bound here as
# `messages` and `comments`; later cells index `comments` (e.g. comments[-2]),
# presumably one entry per discussion thread — verify in load_sample_data.
messages, comments = load_sample_data()

## Transform comments to remove stop words

In [None]:
# Build the stop-word remover with its default settings (the variable name
# suggests French + English stop words — confirm in StopWordsRemover).
fr_en_stopwords = StopWordsRemover()
# Apply it to the loaded comments; the result is the input of the stemming cell.
comments_without_stopwords = fr_en_stopwords.transform(comments)

## Lemmatize the comments without stop words for clean texts

In [None]:
# Create the French stemmer. fit() is called without data and its return value
# is what gets stored, so it is expected to return the stemmer itself.
french_stemmer = Stemmer(language='french').fit()
# Stem the stop-word-free comments, one transform() call per thread.
stemmed_comments = list(map(french_stemmer.transform, comments_without_stopwords))

## Perform LDA topic modeling

In [None]:
# lda = LDA(n_components=2, max_iter=5, learning_method='online', learning_offset=50.)
# lda_sklearn, feature_names = lda.fit(stemmed_comments[-4])

## Recover words from inverse (backwards) lemmatization on topic words 

## Some clean topic words (or expressions) are then available!

## TODO: clean below

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# End-to-end topic-modelling pipeline: stop-word removal -> stemming ->
# bag-of-words counts -> LDA. Every step is configured through the grid below.
lda_pipeline = Pipeline([
    ('stopwords', StopWordsRemover()),
    ('stemmer', Stemmer()),
    ('count_vect', CountVectorizer()),
    ('lda', LDA()),
])

# Each key is `<step name>__<parameter>` and each value lists the candidates
# the grid search will try; a single-element list pins that parameter.
lda_pipeline_params = {
    'stemmer__language': ['french'],
    'count_vect__max_df': [0.98, 0.95],
    'count_vect__min_df': [2, 3],
    'count_vect__max_features': [10000],
    'count_vect__ngram_range': [(1, 1), (1, 2)],
    'count_vect__strip_accents': ['ascii', 'unicode', None],
    'lda__n_components': [2],
    'lda__max_iter': [100],  # TODO: find good balance here.
    'lda__learning_decay': [0.5, 0.7, 1.0],
    'lda__learning_method': ['online'],
    'lda__learning_offset': [5, 10],
    'lda__batch_size': [1, 10, 25],
    # 'lda__n_jobs': [-1],  # Use all CPUs (grid values must be lists)
}

# Alternative single-candidate grid (one fit per CV fold) with more LDA
# iterations, kept for quick runs; swap it in for the dict above when needed:
# lda_pipeline_params = {
#     'stemmer__language': ['french'],
#     'count_vect__max_df': [0.98],
#     'count_vect__min_df': [2],
#     'count_vect__max_features': [10000],
#     'count_vect__ngram_range': [(1, 1)],
#     'count_vect__strip_accents': [None],
#     'lda__n_components': [2],
#     'lda__max_iter': [1000],  # TODO: find good balance here.
#     'lda__learning_decay': [0.5],
#     'lda__learning_method': ['online'],
#     'lda__learning_offset': [10],
#     'lda__batch_size': [25],
#     # 'lda__n_jobs': [-1],  # Use all CPUs
# }


# TODO: Might need to code a custom parameter search
# No `scoring` is given, so candidates are ranked with the pipeline's own
# score() method; no targets are passed, so the search is unsupervised.
gs = GridSearchCV(lda_pipeline, lda_pipeline_params, n_jobs=-1, verbose=1, cv=2)
gs.fit(comments[-2])
print("Best score: {}".format(gs.best_score_))

# With the default refit=True, GridSearchCV has already re-trained the best
# parameter combination on all the data passed to fit(). Reuse that fitted
# pipeline directly instead of rebuilding it from its steps and paying for a
# second, redundant full training run.
best_pipeline = gs.best_estimator_
best_params = best_pipeline.get_params()
# To inspect the selected parameters: import pprint; pprint.pprint(best_params)
print("score:", best_pipeline.score(comments[-2]))

# Kept as notebook globals for the cells below.
features = best_pipeline.named_steps['count_vect'].get_feature_names()
lda = best_pipeline.named_steps['lda']



In [None]:
# Fetch the fitted LDA step and show its raw components matrix, followed by
# an empty line.
lda = best_pipeline.named_steps['lda']
topics = lda.components_
print(topics, end="\n\n")
# Call the pipeline's inverse_transform with no input, then print the first
# two entries of what it returns.
ans = best_pipeline.inverse_transform(X=None)
print()
print(ans[0], ans[1], sep="\n")

In [None]:
# messages[-2], comments[-2]
# # features = best_pipeline.named_steps['count_vect'].get_feature_names()

In [None]:

# Pull out the three fitted steps used to walk the topics back to words.
lda, count_vect, stemmer = (
    best_pipeline.named_steps[step_name]
    for step_name in ('lda', 'count_vect', 'stemmer')
)

# Print the top words via the LDA wrapper's helper, given the vectorizer's
# feature names, followed by an empty line.
lda.print_top_words(count_vect.get_feature_names())
print()

print(lda.components_)
# Undo each stage in reverse pipeline order (LDA -> vectorizer -> stemmer),
# printing the intermediate result after every stage.
topics = lda.inverse_transform()
print(topics)
topics_words = count_vect.inverse_transform(topics)
print(topics_words)
final_words = stemmer.inverse_transform(topics_words)
print(final_words[0], final_words[1], sep="\n")


In [None]:

# train, test = sklearn.model_selection.train_test_split(stemmed_comments[-2], test_size=0.2)
# print(len(train), len(test))

# lda_sklearn, feature_names = lda.fit(train)

# print(lda.score(train), lda.perplexity(train))
# print(lda.score(test), lda.perplexity(test))
# -182.51029019702543   27.615270885701467
# -52.05494670051718    41.19061662552155   
# -41.973876576664225  189.9450029464927    
# -32.432059761463556 3320.9791408284505    
# perplexity should go down and score should go down too (more negative). 

In [None]:
# len(stemmed_comments[-2]), len(stemmed_comments[-3]), len(stemmed_comments[-4])

In [None]:
# ([(len(i), a) for a, i in enumerate(stemmed_comments)])

In [None]:
# fr_en_stopwords = StopWordsRemover()
# fr_en_stopwords.remove_from_string("Le chat s'est assis sur le tapis aujour'hui!!! il est très comfortable et n'est pas déçu, tout ronron!")