https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [5]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import nltk

# spacy for lemmatization
import spacy

# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  and should_run_async(code)


In [8]:
import pyLDAvis
# from pyLDAvis import pyLDAvis.gensim  # don't skip this

In [9]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [10]:
# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [11]:
# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [12]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [13]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [16]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_posting', 'host', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring']]


In [17]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)]]


In [19]:
## Check a single word
# id2word[0]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('name', 1),
  ('nntp_posting', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('thank', 1),
  ('thing', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [20]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [21]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.042*"include" + 0.036*"program" + 0.032*"source" + 0.028*"available" + '
  '0.027*"information" + 0.023*"provide" + 0.021*"may" + 0.020*"number" + '
  '0.019*"code" + 0.019*"also"'),
 (1,
  '0.824*"ax" + 0.063*"max" + 0.009*"dual" + 0.005*"film" + 0.004*"correction" '
  '+ 0.004*"brand" + 0.004*"plot" + 0.003*"payment" + 0.001*"bare" + '
  '0.000*"arrow"'),
 (2,
  '0.158*"file" + 0.072*"entry" + 0.048*"graphic" + 0.037*"notice" + '
  '0.029*"sorry" + 0.029*"format" + 0.028*"printer" + 0.025*"convert" + '
  '0.025*"hot" + 0.022*"cool"'),
 (3,
  '0.056*"image" + 0.056*"bit" + 0.050*"color" + 0.046*"chip" + '
  '0.042*"display" + 0.038*"memory" + 0.036*"slow" + 0.035*"monitor" + '
  '0.034*"character" + 0.026*"board"'),
 (4,
  '0.065*"drug" + 0.052*"mount" + 0.046*"wing" + 0.045*"headache" + '
  '0.041*"recommend" + 0.041*"treatment" + 0.039*"external" + 0.036*"doctor" + '
  '0.034*"boy" + 0.031*"should"'),
 (5,
  '0.118*"space" + 0.037*"launch" + 0.033*"science" + 0.031*"scient

In [25]:
from gensim.test.utils import common_corpus, common_dictionary
lda = gensim.models.LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   num_topics=20, 
                   random_state=100,
                   chunksize=100,
                   passes=10,
                   per_word_topics=True)

In [26]:
# Print the Keyword in the 10 topics
pprint(lda.print_topics())
doc_lda = lda[corpus]

[(0,
  '0.023*"gun" + 0.016*"write" + 0.014*"would" + 0.013*"article" + '
  '0.011*"line" + 0.009*"be" + 0.009*"think" + 0.008*"well" + 0.007*"point" + '
  '0.006*"get"'),
 (1,
  '0.017*"say" + 0.014*"go" + 0.011*"write" + 0.009*"see" + 0.009*"people" + '
  '0.008*"come" + 0.008*"would" + 0.008*"article" + 0.008*"line" + '
  '0.007*"know"'),
 (2,
  '0.027*"file" + 0.013*"program" + 0.013*"entry" + 0.008*"use" + 0.008*"font" '
  '+ 0.008*"wire" + 0.007*"include" + 0.007*"line" + 0.007*"output" + '
  '0.006*"character"'),
 (3,
  '0.018*"space" + 0.010*"launch" + 0.008*"system" + 0.007*"orbit" + '
  '0.007*"mission" + 0.006*"write" + 0.006*"earth" + 0.006*"would" + '
  '0.006*"satellite" + 0.006*"science"'),
 (4,
  '0.873*"ax" + 0.065*"max" + 0.004*"di_di" + 0.001*"pl_pl" + 0.001*"ei_ei" + '
  '0.001*"qq" + 0.001*"qax" + 0.001*"bhj_bhj" + 0.001*"giz_giz" + '
  '0.000*"chlorine"'),
 (5,
  '0.011*"israeli" + 0.009*"report" + 0.008*"kill" + 0.007*"attack" + '
  '0.006*"year" + 0.006*"would" 

In [27]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.233887401191655

Coherence Score:  0.5030077312264531


In [29]:
# Visualize the topics
pyLDAvis.enable_notebook()

In [34]:
vis = pyLDAvis.prepare(lda_model, vocab=corpus, term_frequency=id2word)
vis

TypeError: prepare() missing 2 required positional arguments: 'doc_topic_dists' and 'doc_lengths'

In [None]:
corpus

In [38]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())

[(0,
  '0.042*"include" + 0.036*"program" + 0.032*"source" + 0.028*"available" + '
  '0.027*"information" + 0.023*"provide" + 0.021*"may" + 0.020*"number" + '
  '0.019*"code" + 0.019*"also"'),
 (1,
  '0.824*"ax" + 0.063*"max" + 0.009*"dual" + 0.005*"film" + 0.004*"correction" '
  '+ 0.004*"brand" + 0.004*"plot" + 0.003*"payment" + 0.001*"bare" + '
  '0.000*"arrow"'),
 (2,
  '0.158*"file" + 0.072*"entry" + 0.048*"graphic" + 0.037*"notice" + '
  '0.029*"sorry" + 0.029*"format" + 0.028*"printer" + 0.025*"convert" + '
  '0.025*"hot" + 0.022*"cool"'),
 (3,
  '0.056*"image" + 0.056*"bit" + 0.050*"color" + 0.046*"chip" + '
  '0.042*"display" + 0.038*"memory" + 0.036*"slow" + 0.035*"monitor" + '
  '0.034*"character" + 0.026*"board"'),
 (4,
  '0.065*"drug" + 0.052*"mount" + 0.046*"wing" + 0.045*"headache" + '
  '0.041*"recommend" + 0.041*"treatment" + 0.039*"external" + 0.036*"doctor" + '
  '0.034*"boy" + 0.031*"should"'),
 (5,
  '0.118*"space" + 0.037*"launch" + 0.033*"science" + 0.031*"scient

In [41]:
doc_lda = lda_model[corpus]
print(doc_lda)

<gensim.interfaces.TransformedCorpus object at 0x00000292269D3EE0>
