# Topic Modeling using LDA and guidedLDA

## Modules


In [1]:
import gensim

In [2]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\duygu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Data IO

In [7]:
txt = [
    'I like to eat broccoli and bananas.',
    'I munched a banana and spinach smoothie for breakfast.',
    'Chinchillas and kittens are cute.',
    'My sister adopted a kitten yesterday.',
    'Look at this cute hamster munching on a piece of broccoli.'
]

# Preprocessing

In [8]:
# simplify Penn tags to n (NOUN), v (VERB), a (ADJECTIVE) or r (ADVERB)
def simplify(penn_tag):
    pre = penn_tag[0]
    if (pre == 'J'):
        return 'a'
    elif (pre == 'R'):
        return 'r'
    elif (pre == 'V'):
        return 'v'
    else:
        return 'n'

In [9]:
# define function preprocess(text):
# gensim provides some basic preprocessing steps
def preprocess(text, language='english'):
    stop_words = stopwords.words(language) # stopwords from nltk
    toks = gensim.utils.simple_preprocess(str(text), deacc=True) # gensim
    wn = WordNetLemmatizer() # from nltk
    # return
    return [wn.lemmatize(tok, simplify(pos)) for tok, pos in nltk.pos_tag(toks) if tok not in stop_words]

In [10]:
# Apply function to input text (which is a simple list!)
corp = [preprocess(line, language='english') for line in txt]
corp

[['like', 'eat', 'broccoli', 'banana'],
 ['munch', 'banana', 'spinach', 'smoothie', 'breakfast'],
 ['chinchilla', 'kitten', 'cute'],
 ['sister', 'adopt', 'kitten', 'yesterday'],
 ['look', 'cute', 'hamster', 'munch', 'piece', 'broccoli']]

In [11]:
# build dictionary with gensim
dictionary = gensim.corpora.Dictionary(corp) # gensim
len(dictionary)

17

In [12]:
# Convert into bag-of-words format
bow = [dictionary.doc2bow(line) for line in corp]
bow

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1)],
 [(1, 1), (5, 1), (9, 1), (14, 1), (15, 1), (16, 1)]]

### Exercise: Please run the Topic Model and check the results

Until 14:50h

In [13]:
del ldaModel

NameError: name 'ldaModel' is not defined

In [15]:
numberOfTopics = 2

In [16]:
ldaModel = gensim.models.LdaModel(bow, num_topics=numberOfTopics, \
                                  id2word=dictionary, \
                                  passes=30, alpha=[0.01]*numberOfTopics, \
                                  eta=[0.01]*len(dictionary.keys()))

In [17]:
ldaModel.show_topics(formatted=True, num_topics=numberOfTopics, num_words=20)

[(0,
  '0.105*"banana" + 0.105*"broccoli" + 0.105*"munch" + 0.053*"breakfast" + 0.053*"smoothie" + 0.053*"spinach" + 0.053*"hamster" + 0.053*"yesterday" + 0.053*"sister" + 0.053*"adopt" + 0.053*"piece" + 0.053*"cute" + 0.053*"look" + 0.053*"like" + 0.053*"eat" + 0.053*"kitten" + 0.001*"chinchilla"'),
 (1,
  '0.319*"chinchilla" + 0.319*"kitten" + 0.319*"cute" + 0.003*"hamster" + 0.003*"yesterday" + 0.003*"sister" + 0.003*"adopt" + 0.003*"look" + 0.003*"banana" + 0.003*"smoothie" + 0.003*"munch" + 0.003*"breakfast" + 0.003*"like" + 0.003*"eat" + 0.003*"broccoli" + 0.003*"spinach" + 0.003*"piece"')]

In [18]:
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

  from imp import reload
  _np_version_forbids_neg_powint = LooseVersion(numpy.__version__) >= LooseVersion('1.12.0b1')
  _np_version_forbids_neg_powint = LooseVersion(numpy.__version__) >= LooseVersion('1.12.0b1')


In [19]:
vis = pyLDAvis.gensim_models.prepare(topic_model=ldaModel, corpus=bow, dictionary=dictionary)

  default_term_info = default_term_info.sort_values(


In [20]:
pyLDAvis.enable_notebook()

In [21]:
pyLDAvis.display(vis)

## Next exercise: Please apply to review data

Until 15:15h
amzon review data

> `<PandasDataFrame>.tolist()`

> `','.join(<PandasDataFrame>.tolist())`