# Topic Modeling using LDA and guidedLDA

## Modules


In [1]:
import gensim

In [2]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Fetch the NLTK stop-word corpus; preprocess() reads it via stopwords.words(language).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Tagger model behind nltk.pos_tag(), which preprocess() uses to get Penn tags.
nltk.download('averaged_perceptron_tagger')
# WordNet data backing WordNetLemmatizer.
nltk.download('wordnet')
# Open Multilingual Wordnet; presumably needed next to 'wordnet' by recent NLTK
# releases -- TODO confirm for the installed NLTK version.
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Inspect the English stop-word list that preprocess() filters tokens against.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Data IO

In [7]:
# Toy corpus: five short sentences; each string is treated as one document
# by the preprocessing cell below.
txt = [
    'I like to eat broccoli and bananas.',
    'I munched a banana and spinach smoothie for breakfast.',
    'Chinchillas and kittens are cute.',
    'My sister adopted a kitten yesterday.',
    'Look at this cute hamster munching on a piece of broccoli.'
]

## Preprocessing

In [8]:
def simplify(penn_tag):
    """Collapse a Penn Treebank tag to a one-letter WordNet part of speech.

    Only the first character of the tag is inspected:
    'J' -> 'a' (adjective), 'R' -> 'r' (adverb), 'V' -> 'v' (verb);
    every other tag falls back to 'n' (noun).

    Raises IndexError when penn_tag is empty.
    """
    wordnet_pos_by_prefix = {'J': 'a', 'R': 'r', 'V': 'v'}
    return wordnet_pos_by_prefix.get(penn_tag[0], 'n')

In [9]:
def preprocess(text, language='english'):
    """Tokenise, POS-tag, stop-word-filter and lemmatise one document.

    Parameters
    ----------
    text : object
        The document; it is passed through str() before tokenising.
    language : str, default 'english'
        Name of the NLTK stop-word list to filter against.

    Returns
    -------
    list of str
        Lemmatised tokens, in document order, with stop words removed.
    """
    # A set gives O(1) membership tests; the NLTK list would be scanned
    # linearly once per token.
    stop_words = set(stopwords.words(language))  # stopwords from nltk
    # gensim tokeniser; deacc=True asks it to strip accent marks.
    toks = gensim.utils.simple_preprocess(str(text), deacc=True)
    wn = WordNetLemmatizer()  # from nltk
    # Tag the full token sequence first, so the tagger sees stop words as
    # context, and only then drop the stop words.
    return [
        wn.lemmatize(tok, simplify(pos))
        for tok, pos in nltk.pos_tag(toks)
        if tok not in stop_words
    ]

In [10]:
# Run preprocess() on every sentence of txt (a plain Python list of strings).
# The result is a list of token lists, one per document.
corp = [preprocess(line, language='english') for line in txt]
corp

[['like', 'eat', 'broccoli', 'banana'],
 ['munch', 'banana', 'spinach', 'smoothie', 'breakfast'],
 ['chinchilla', 'kitten', 'cute'],
 ['sister', 'adopt', 'kitten', 'yesterday'],
 ['look', 'cute', 'hamster', 'munch', 'piece', 'broccoli']]

In [11]:
# Build a gensim Dictionary from the tokenised documents.
dictionary = gensim.corpora.Dictionary(corp) # gensim
# Show how many entries the dictionary holds.
len(dictionary)

17

In [12]:
# Convert each tokenised document into bag-of-words format:
# a list of (token_id, count) pairs, as shown in the output.
bow = [dictionary.doc2bow(line) for line in corp]
bow

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1)],
 [(1, 1), (5, 1), (9, 1), (14, 1), (15, 1), (16, 1)]]

### Exercise: Please run the Topic Model and check the results

Until 14:50h

In [13]:
#del ldaModel

NameError: name 'ldaModel' is not defined

In [14]:
numberOfTopics = 2

In [15]:
# Fit LDA on the toy corpus.
# alpha / eta are small symmetric priors (0.01 per topic / per term).
# random_state pins the random initialisation so Restart & Run All
# reproduces the same topics; without it every run gives different ones.
ldaModel = gensim.models.LdaModel(
    bow,
    num_topics=numberOfTopics,
    id2word=dictionary,
    passes=30,
    alpha=[0.01] * numberOfTopics,
    eta=[0.01] * len(dictionary),
    random_state=42,  # arbitrary fixed seed
)

In [16]:
ldaModel.show_topics(formatted=True, num_topics=numberOfTopics, num_words=20)

[(0,
  '0.219*"cute" + 0.110*"chinchilla" + 0.110*"hamster" + 0.110*"kitten" + 0.110*"look" + 0.110*"piece" + 0.110*"broccoli" + 0.110*"munch" + 0.001*"smoothie" + 0.001*"spinach" + 0.001*"breakfast" + 0.001*"like" + 0.001*"eat" + 0.001*"adopt" + 0.001*"sister" + 0.001*"yesterday" + 0.001*"banana"'),
 (1,
  '0.153*"banana" + 0.077*"broccoli" + 0.077*"eat" + 0.077*"like" + 0.077*"breakfast" + 0.077*"munch" + 0.077*"smoothie" + 0.077*"spinach" + 0.077*"yesterday" + 0.077*"kitten" + 0.077*"adopt" + 0.077*"sister" + 0.001*"hamster" + 0.001*"chinchilla" + 0.001*"look" + 0.001*"cute" + 0.001*"piece"')]

In [17]:
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

  from imp import reload
  _np_version_forbids_neg_powint = LooseVersion(numpy.__version__) >= LooseVersion('1.12.0b1')
  _np_version_forbids_neg_powint = LooseVersion(numpy.__version__) >= LooseVersion('1.12.0b1')


In [18]:
vis = pyLDAvis.gensim_models.prepare(topic_model=ldaModel, corpus=bow, dictionary=dictionary)

  default_term_info = default_term_info.sort_values(


In [19]:
pyLDAvis.enable_notebook()

In [20]:
pyLDAvis.display(vis)

## Next exercise: Please apply to review data

Until 15:15h
Amazon review data

> `<PandasSeries>.tolist()`

> `','.join(<PandasSeries>.tolist())`

In [21]:
URL = "https://raw.githubusercontent.com/microsoft/ML-Server-Python-Samples/master/microsoftml/202/data/sentiment_analysis/amazon_cells_labelled.txt"


In [23]:
import pandas as pd

In [25]:
# Load the file at URL into a DataFrame with the columns "Review" and "Label".
# names=... supplies the column names.
# NOTE(review): presumably the file is tab-separated with no header row and
# Label is a sentiment flag -- confirm against the data source.
data_amazon = pd.read_table(URL, names=["Review","Label"])

In [27]:
data_amazon.Review

0      So there is no way for me to plug it in here i...
1                            Good case, Excellent value.
2                                 Great for the jawbone.
3      Tied to charger for conversations lasting more...
4                                      The mic is great.
                             ...                        
995    The screen does get smudged easily because it ...
996    What a piece of junk.. I lose more calls on th...
997                         Item Does Not Match Picture.
998    The only thing that disappoint me is the infra...
999    You can not answer calls with the unit, never ...
Name: Review, Length: 1000, dtype: object

In [28]:
reviews= data_amazon.Review.tolist()

In [30]:
reviews

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.',
 'I have to jiggle the plug to get it to line up right to get decent volume.',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
 'If you are Razr owner...you must have this!',
 'Needless to say, I wasted my money.',
 'What a waste of money and time!.',
 'And the sound quality is great.',
 'He was very impressed when going from the original battery to the extended battery.',
 'If the two were seperated by a mere 5+ ft I started to notice excessive static and garbled sound from the headset.',
 'Very good quality though',
 'The design is very odd, as the ear "clip" is not very comfortable at all.',
 'Highly recommend for any one who has a blue tooth phone.',
 'I advise EVERYO

In [31]:
reviews[:3]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.']

In [33]:
# Run preprocess() on every review (reviews is a plain Python list of strings).
# NOTE(review): this rebinds `corp`, replacing the toy corpus built from `txt`;
# re-running earlier toy-corpus cells after this one mixes the two datasets.
corp = [preprocess(line, language='english') for line in reviews]
corp

[['way', 'plug', 'u', 'unless', 'go', 'converter'],
 ['good', 'case', 'excellent', 'value'],
 ['great', 'jawbone'],
 ['tie', 'charger', 'conversation', 'last', 'minute', 'major', 'problem'],
 ['mic', 'great'],
 ['jiggle', 'plug', 'get', 'line', 'right', 'get', 'decent', 'volume'],
 ['several',
  'dozen',
  'several',
  'hundred',
  'contact',
  'imagine',
  'fun',
  'send',
  'one',
  'one'],
 ['razr', 'owner', 'must'],
 ['needless', 'say', 'wasted', 'money'],
 ['waste', 'money', 'time'],
 ['sound', 'quality', 'great'],
 ['impressed', 'go', 'original', 'battery', 'extended', 'battery'],
 ['two',
  'seperated',
  'mere',
  'ft',
  'start',
  'notice',
  'excessive',
  'static',
  'garble',
  'sound',
  'headset'],
 ['good', 'quality', 'though'],
 ['design', 'odd', 'ear', 'clip', 'comfortable'],
 ['highly', 'recommend', 'one', 'blue', 'tooth', 'phone'],
 ['advise', 'everyone', 'fool'],
 ['far', 'good'],
 ['work', 'great'],
 ['click',
  'place',
  'way',
  'make',
  'wonder',
  'long',
  

In [34]:
corp[:3]

[['way', 'plug', 'u', 'unless', 'go', 'converter'],
 ['good', 'case', 'excellent', 'value'],
 ['great', 'jawbone']]

In [35]:
# Build a gensim Dictionary from the tokenised reviews.
# NOTE(review): this rebinds `dictionary`, replacing the toy-corpus dictionary.
dictionary = gensim.corpora.Dictionary(corp) # gensim
# Show how many entries the dictionary holds.
len(dictionary)

1430

In [36]:
# Convert each tokenised review into bag-of-words format:
# a list of (token_id, count) pairs.
# NOTE(review): this rebinds `bow`, replacing the toy-corpus bag-of-words.
bow = [dictionary.doc2bow(line) for line in corp]
bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1)],
 [(12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(10, 1), (19, 1)],
 [(2, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2), (32, 1), (33, 2)],
 [(34, 1), (35, 1), (36, 1)],
 [(37, 1), (38, 1), (39, 1), (40, 1)],
 [(37, 1), (41, 1), (42, 1)],
 [(10, 1), (43, 1), (44, 1)],
 [(1, 1), (45, 2), (46, 1), (47, 1), (48, 1)],
 [(44, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1)],
 [(8, 1), (43, 1), (59, 1)],
 [(60, 1), (61, 1), (62, 1), (63, 1), (64, 1)],
 [(31, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1)],
 [(70, 1), (71, 1), (72, 1)],
 [(8, 1), (73, 1)],
 [(10, 1), (74, 1)],
 [(5, 1),
  (14, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1)],
 [(1, 1), (21, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), 

In [44]:
NumberOfTopics= 20


In [45]:
# Fit LDA on the review corpus.
# alpha / eta are small symmetric priors (0.01 per topic / per term).
# random_state pins the random initialisation so Restart & Run All
# reproduces the same topics; without it every run gives different ones.
ldaModelReviews = gensim.models.LdaModel(
    bow,
    num_topics=NumberOfTopics,
    id2word=dictionary,
    passes=30,
    alpha=[0.01] * NumberOfTopics,
    eta=[0.01] * len(dictionary),
    random_state=42,  # arbitrary fixed seed
)

In [46]:
ldaModelReviews.show_topics(formatted=True, num_topics=NumberOfTopics, num_words=20)

[(0,
  '0.063*"phone" + 0.044*"battery" + 0.027*"ear" + 0.027*"also" + 0.027*"life" + 0.024*"cell" + 0.023*"great" + 0.017*"one" + 0.014*"make" + 0.014*"first" + 0.014*"try" + 0.014*"hand" + 0.011*"well" + 0.011*"easy" + 0.010*"many" + 0.010*"disappointing" + 0.010*"charge" + 0.010*"need" + 0.010*"nice" + 0.010*"avoid"'),
 (1,
  '0.096*"product" + 0.044*"great" + 0.026*"go" + 0.021*"first" + 0.017*"happy" + 0.013*"make" + 0.013*"find" + 0.013*"beep" + 0.013*"best" + 0.013*"picture" + 0.013*"mic" + 0.013*"deal" + 0.013*"review" + 0.013*"bar" + 0.011*"put" + 0.010*"quality" + 0.009*"use" + 0.009*"purchase" + 0.009*"one" + 0.009*"plug"'),
 (2,
  '0.092*"love" + 0.040*"headset" + 0.037*"phone" + 0.029*"bluetooth" + 0.025*"charger" + 0.018*"work" + 0.018*"good" + 0.016*"best" + 0.013*"get" + 0.013*"think" + 0.013*"color" + 0.013*"sharp" + 0.013*"reception" + 0.013*"absolutely" + 0.013*"cell" + 0.013*"market" + 0.013*"pleased" + 0.013*"great" + 0.010*"use" + 0.009*"well"'),
 (3,
  '0.099*"ph

In [47]:
vis = pyLDAvis.gensim_models.prepare(topic_model=ldaModelReviews, corpus=bow, dictionary=dictionary)

  default_term_info = default_term_info.sort_values(


In [48]:
pyLDAvis.display(vis)