LDA (LatentDirichletAllocation): [link](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)

In [11]:
import pandas as pd
import re

from sklearn.datasets import fetch_20newsgroups

In [12]:
seed=94487

In [28]:
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [14]:
df = pd.DataFrame({'text':train.data})
df.sample(5, random_state=seed)

Unnamed: 0,text
8110,From: stevela@csulb.edu (Steve La)\nSubject: C...
673,From: donb@netcom.com (Don Baldwin)\nSubject: ...
5054,From: marc@ccvi.ccv.FR (Marc Bassini)\nSubject...
685,From: wally@Auspex.COM (Wally Bass)\nSubject: ...
4139,From: usenet@news.cso.uiuc.edu (Net Noise owne...


In [15]:
# Remove punctuation
df['text_processed'] = \
df['text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
df['text_processed'] = \
df['text_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
df.sample(5, random_state=seed)

  df['text'].map(lambda x: re.sub('[,\.!?]', '', x))


Unnamed: 0,text,text_processed
8110,From: stevela@csulb.edu (Steve La)\nSubject: C...,from: stevela@csulbedu (steve la)\nsubject: cd...
673,From: donb@netcom.com (Don Baldwin)\nSubject: ...,from: donb@netcomcom (don baldwin)\nsubject: r...
5054,From: marc@ccvi.ccv.FR (Marc Bassini)\nSubject...,from: marc@ccviccvfr (marc bassini)\nsubject: ...
685,From: wally@Auspex.COM (Wally Bass)\nSubject: ...,from: wally@auspexcom (wally bass)\nsubject: r...
4139,From: usenet@news.cso.uiuc.edu (Net Noise owne...,from: usenet@newscsouiucedu (net noise owner)\...


In [16]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
data = df.text_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carlosmorote/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['lerxst', 'wamumdedu', 'thing', 'car', 'nntp', 'posting', 'host', 'rac', 'wamumdedu', 'organization', 'university', 'maryland', 'college', 'park', 'lines', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', 'early', 'called', 'bricklin']


In [17]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 5), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


In [20]:
from pprint import pprint

# number of topics
num_topics = 4

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# Print the Keyword in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.006*"organization" + 0.006*"would" + 0.006*"lines" + 0.004*"one" + '
  '0.003*"posting" + 0.003*"article" + 0.003*"writes" + 0.003*"get" + '
  '0.003*"know" + 0.003*"god"'),
 (1,
  '0.006*"organization" + 0.005*"lines" + 0.005*"one" + 0.004*"people" + '
  '0.004*"writes" + 0.004*"like" + 0.004*"would" + 0.004*"article" + '
  '0.004*"posting" + 0.003*"know"'),
 (2,
  '0.013*"ax" + 0.007*"lines" + 0.005*"organization" + 0.005*"one" + '
  '0.005*"would" + 0.004*"article" + 0.004*"writes" + 0.004*"university" + '
  '0.003*"think" + 0.003*"people"'),
 (3,
  '0.103*"ax" + 0.008*"max" + 0.006*"lines" + 0.005*"organization" + '
  '0.004*"one" + 0.004*"writes" + 0.004*"would" + 0.003*"article" + '
  '0.003*"like" + 0.003*"host"')]


In [22]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

  default_term_info = default_term_info.sort_values(


In [50]:
from gensim.test.utils import common_corpus, common_dictionary

interest_themes = [['soccer']]
interest_themes_corpus = [common_dictionary.doc2bow(text) for text in interest_themes]
sorted(lda_model[interest_themes_corpus[0]], key=lambda x: x[1])

[(0, 0.25), (1, 0.25), (2, 0.25), (3, 0.25)]