In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [2]:
with open(r"../data/interim/corpus.pkl", "rb") as input_file:
    corpus = pickle.load(input_file)

In [3]:
with open(r"../data/interim/id2word.pkl", "rb") as input_file:
    id2word = pickle.load(input_file)

In [9]:
with open(r"../data/interim/data_lemmatized.pkl", "rb") as input_file:
    data_lemmatized = pickle.load(input_file)

## Latent Dirichlet Allocation

![Latent Dirichlet Allocation](https://miro.medium.com/max/1400/1*fCc0JT3W-1ViYyw0hJ7rdA.jpeg)

In [4]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [5]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())

[(0,
  '0.036*"write" + 0.030*"line" + 0.028*"would" + 0.021*"know" + 0.020*"be" + '
  '0.020*"go" + 0.018*"organization" + 0.017*"get" + 0.016*"host" + '
  '0.015*"article"'),
 (1,
  '0.042*"people" + 0.036*"law" + 0.033*"government" + 0.028*"child" + '
  '0.027*"kill" + 0.022*"death" + 0.021*"state" + 0.021*"right" + '
  '0.017*"country" + 0.015*"attack"'),
 (2,
  '0.115*"key" + 0.037*"chip" + 0.033*"encryption" + 0.024*"distribution" + '
  '0.023*"security" + 0.023*"engine" + 0.022*"algorithm" + 0.020*"block" + '
  '0.019*"public" + 0.019*"use"'),
 (3,
  '0.187*"space" + 0.059*"launch" + 0.047*"orbit" + 0.039*"mission" + '
  '0.039*"satellite" + 0.038*"vote" + 0.037*"earth" + 0.033*"flight" + '
  '0.028*"schedule" + 0.026*"shuttle"'),
 (4,
  '0.024*"report" + 0.022*"science" + 0.021*"patient" + 0.018*"technology" + '
  '0.017*"study" + 0.017*"research" + 0.014*"product" + 0.013*"datum" + '
  '0.013*"provide" + 0.012*"effect"'),
 (5,
  '0.075*"die" + 0.055*"season" + 0.044*"brain" + 

In [6]:
lda_model.alpha

array([7.2278395 , 0.55834854, 0.3315322 , 0.10451327, 0.73439157,
       0.2437226 , 0.31260777, 0.49960196, 0.42463207, 0.60634536,
       1.3226882 , 3.8760204 , 1.6370218 , 0.04474121, 0.20247199,
       0.28307682, 0.3751149 , 0.21903265, 0.07500331, 0.11380772],
      dtype=float32)

In [7]:
doc_lda = lda_model[corpus]

In [11]:
#Perplejidad
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Score de coherencia
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.27859394052328

Coherence Score:  0.47785807626836246


In [10]:
# Visualizamos los temas
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))
