In [None]:
#!pip install pyLDAvis

In [1]:
import pandas as pd
import numpy as np

# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns

# Model 1: Food and Animals

In [2]:
# Five short sample documents used as the toy corpus for Model 1.
doc_1 = 'I like to eat broccoli and bananas.'
doc_2 = 'I ate a banana and spinach smoothie for breakfast.'
doc_3 = 'Chinchillas and kittens are cute.'
doc_4 = 'My sister adopted a kitten yesterday.'
doc_5 = 'Look at this cute hamster munching on a piece of broccoli.'

## Step 1: Preprocess our text.

In [3]:
def text_process(text):
    '''
    Cleans a raw string into a list of lower-cased, stemmed tokens.

    Takes in a string of text, then performs the following:
        1. Tokenizes on runs of word characters (which drops punctuation)
        2. Lower-cases the tokens and removes English stopwords
        3. Stems each remaining token with the Porter stemmer
        4. Drops the first stray 'b' token, if one is present
        5. Returns a list of the cleaned tokens

    Parameters
    ----------
    text : str or missing value
        Raw document text. Anything `pd.isnull` flags (None, NaN, ...)
        produces an empty list.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order, as separate words
        (they are not re-joined into a single string).
    '''
    if pd.isnull(text):
        return []

    # tokenizing and removing punctuation
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # removing any stopwords; the stopword list is fetched once per call and
    # stored in a set, rather than being re-fetched and scanned for every token
    english_stopwords = set(stopwords.words('english'))
    lowered = (word.lower() for word in tokens)
    tokens = [word for word in lowered if word not in english_stopwords]

    # stemming
    porter_stemmer = PorterStemmer()
    tokens = [porter_stemmer.stem(word) for word in tokens]

    # Drop a single stray 'b' token (only the first occurrence, as before).
    # NOTE(review): presumably residue of a bytes-literal prefix such as
    # b'...' in the raw text -- confirm whether all occurrences should go.
    if 'b' in tokens:
        tokens.remove('b')

    return tokens ## <-- we're keeping our words distinct

In [5]:
# Run every sample document through the same preprocessing pipeline.
raw_docs = (doc_1, doc_2, doc_3, doc_4, doc_5)
texts = [text_process(doc) for doc in raw_docs]

In [6]:
texts

[['like', 'eat', 'broccoli', 'banana'],
 ['ate', 'banana', 'spinach', 'smoothi', 'breakfast'],
 ['chinchilla', 'kitten', 'cute'],
 ['sister', 'adopt', 'kitten', 'yesterday'],
 ['look', 'cute', 'hamster', 'munch', 'piec', 'broccoli']]

## Step 2: Fit LDA Model.

In [4]:
from gensim import corpora, models

In [7]:
import pyLDAvis.gensim

In [8]:
# Switch pyLDAvis into notebook mode; needed for the visualization to show up in the notebook.
pyLDAvis.enable_notebook()

In [None]:
# np.random.seed(42)

In [9]:
# Assign an integer id to every distinct token across all documents.
dictionary = corpora.Dictionary(texts)

# Encode each document as a bag of words: a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, texts))

In [13]:
# Show the bag-of-words encoding of each document, one per line.
for tokens in texts:
    bow = dictionary.doc2bow(tokens)
    print(bow)

[(0, 1), (1, 1), (2, 1), (3, 1)]
[(0, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(8, 1), (9, 1), (10, 1)]
[(10, 1), (11, 1), (12, 1), (13, 1)]
[(1, 1), (9, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


In [14]:
# Fit a 2-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic, so the seed is pinned to make the topics
# (and the visualization built from them) reproducible across kernel restarts.
ldamodel = models.ldamodel.LdaModel(corpus,
                                    num_topics=2, # number of topics
                                    id2word=dictionary, # connect each word to its dict key
                                    passes=5, #similar to epochs
                                    minimum_probability=0.01, # only include significant results
                                    random_state=42) # fixed seed for reproducible topics

## Step 3: Visualize LDA model.

In [15]:
# Build the pyLDAvis visualization data from the fitted model, the
# bag-of-words corpus and the dictionary; as the last expression in the
# cell, the result is what the notebook displays.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

## Step 4: Update model with new data!

In [None]:
# A new, previously unseen document for the "update model with new data" step.
doc_6 = 'That cat is so cute! It looks good enough to eat.'

# Model 2: Yelp Reviews

In [None]:
# The Yelp academic dataset stores one JSON object per line (JSON Lines),
# so it has to be parsed with lines=True; without it pandas tries to read the
# whole file as a single JSON document and fails with "Trailing data".
review = pd.read_json("./yelp_academic_dataset_review.json", lines=True)