# Topic Modeling Exploration
Topic modeling is a type of unsupervised learning for natural language processing problems that groups observations together by topic. 

## Imports

In [None]:
#!pip install spacy

In [None]:
#!pip install pyLDAvis

In [1]:
import re
import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import models

#nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
df = pd.read_csv('./Data/Lemmatized_df.csv')

In [3]:
df.head()

Unnamed: 0,rest_cost,cuisine_type,rest_name,rest_rating,rest_review,address_only,rest_borough,rest_zip_code,lemmatized
0,2.0,Pizza,Mama’s Too,8.3,A typical NYC slice shop has a few basic eleme...,"2750 Broadway, New York, NY 10025",Manhattan,10025,typical nyc slice shop basic element counter g...
1,4.0,Japanese,Omakase Room By Tatsu,7.7,Tolerance for group trips can vary widely. Som...,"14 Christopher St, New York, NY 10014",Manhattan,10014,tolerance group trip vary widely people intern...
2,4.0,Japanese,Sushi Azabu,8.5,When the apocalypse eventually comes for New Y...,"428 Greenwich St., New York, NY 10013",Manhattan,10013,apocalypse eventually come new york city going...
3,3.0,Seafood,Saint Julivert Fisherie,7.7,"When you’re young, you don’t have to think muc...","264 Clinton St, New York, NY 11201",Brooklyn,11201,young think much decision know attempt deep en...
4,2.0,Russian,Farida,8.0,"There’s a bleak, sweaty place on 34th Street w...","498 9th Ave, New York, NY 10018",Manhattan,10018,bleak sweaty place 34th street adult run 6 yar...


# Text Preprocessing
Topic modeling with gensim and spaCy is different than other other NLP preprocessing and unsupervised learning techniques I've worked with in the past. This is because the libraries offer different capabilities. The process for preparing data is:
1. Create a list of the documents to analyze
2. Tokenize each document
3. Clean text - remove stopwords, punctuation and non-alphabetical characters
4. Create a gensim `Dictionary` object: this is a way of keeping track of words with a numeric id.
5. Create a gensim `Corpus` using gensim's `doc2bow`. This is a list of tuples for each document that refers to the numeric id and how many times it appears in that document.<br>

Below, I have created a function that does all the text preprocesseing and returns the variables I am interested in keeping. I have also shown how to do this step by step.


#### 1. Create a list of the documents to analyze

In [4]:
data = list(df['rest_review'].values)

In [5]:
#create a variable for the length of the data
data_len = len(data)

#### 2. Tokenize each document

In [6]:
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [7]:
STOPWORDS = stopwords.words('english')

#### 3. Clean text - remove stopwords, punctuation and non-alphabetical characters

In [8]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_cleaned = []
for review in data:
    tokenized_cleaned.append(clean_text(review))

#### 4. Create a gensim `Dictionary` object: this is a way of keeping track of words with a numeric id.

In [9]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_cleaned)

#### 5. Create a gensim `Corpus` using gensim's `doc2bow`. This is a list of tuples for each document that refers to the numeric id and how many times it appears in that document.

In [10]:
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_cleaned]

In [11]:
#Example of how the corpus looks
print(corpus[20])

[(1, 1), (36, 2), (46, 2), (51, 1), (53, 1), (54, 1), (55, 2), (65, 5), (66, 2), (75, 1), (78, 1), (84, 2), (85, 1), (87, 1), (90, 11), (98, 1), (102, 1), (104, 1), (110, 2), (115, 3), (122, 1), (128, 3), (129, 1), (140, 1), (154, 1), (157, 1), (160, 1), (167, 1), (176, 2), (177, 1), (178, 1), (181, 1), (186, 1), (188, 1), (189, 1), (191, 1), (195, 1), (199, 1), (202, 1), (214, 1), (218, 3), (244, 1), (246, 3), (251, 2), (255, 1), (256, 1), (266, 1), (279, 2), (283, 3), (294, 2), (296, 4), (304, 3), (318, 5), (319, 1), (321, 1), (328, 3), (337, 2), (338, 1), (347, 1), (351, 1), (353, 1), (354, 1), (356, 1), (360, 2), (367, 1), (378, 1), (389, 2), (401, 1), (408, 1), (409, 1), (413, 1), (420, 2), (424, 1), (427, 1), (434, 1), (436, 1), (447, 1), (448, 1), (450, 1), (456, 3), (465, 3), (477, 1), (483, 1), (484, 2), (494, 1), (502, 1), (515, 1), (527, 1), (529, 1), (530, 1), (538, 1), (552, 1), (564, 1), (569, 1), (573, 1), (583, 1), (590, 1), (593, 1), (604, 1), (616, 1), (624, 1), (633,

#### Function to complete text preprocessing

In [13]:
def topic_model_preprocess(text, STOPWORDS):
    #tokenize text
    tokenized_cleaned = []
    for review in text:
        tokenized_text = word_tokenize(review.lower()) #word_tokenize is an nltk function
    
    #clean text
        cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    
    #List of lists
        tokenized_cleaned.append(cleaned_text)
        
    #Create dictionary
    dictionary = corpora.Dictionary(tokenized_cleaned)
    
    #Create corpus
    corpus = [dictionary.doc2bow(word) for word in tokenized_cleaned]
    
    return dictionary, corpus
    

In [14]:
d_trial, c_trial = topic_model_preprocess(data, STOPWORDS)

### Build the Model
The necessary arguments are:
- `corpus`
- `num_topics`
- `id2word` 

In [16]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, 
                            num_topics=10, #10 Topics
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

In [17]:
# Print the Keywords for each of the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.028*"pizza" + 0.015*"slice" + 0.008*"pie" + 0.008*"line" + 0.006*"game" + 0.006*"oven" + 0.006*"pies" + 0.006*"porchetta" + 0.005*"hype" + 0.005*"expected"'), (1, '0.018*"artichoke" + 0.009*"ribbon" + 0.008*"caracas" + 0.006*"fast" + 0.005*"fajitas" + 0.005*"establishments" + 0.005*"abc" + 0.005*"arepas" + 0.005*"salsa" + 0.005*"box"'), (2, '0.041*"ramen" + 0.010*"broth" + 0.009*"thai" + 0.008*"noodle" + 0.007*"flex" + 0.007*"ippudo" + 0.007*"noodles" + 0.006*"totto" + 0.006*"soup" + 0.005*"dropped"'), (3, '0.014*"like" + 0.012*"food" + 0.012*"restaurant" + 0.011*"good" + 0.011*"one" + 0.010*"place" + 0.008*"get" + 0.007*"new" + 0.007*"people" + 0.007*"also"'), (4, '0.020*"lobster" + 0.013*"chicken" + 0.010*"roll" + 0.007*"blue" + 0.007*"salad" + 0.007*"crazy" + 0.006*"mussels" + 0.006*"burger" + 0.006*"bone" + 0.006*"fried"'), (5, '0.026*"sushi" + 0.013*"fish" + 0.008*"japanese" + 0.006*"per" + 0.005*"decor" + 0.004*"midtown" + 0.004*"pieces" + 0.004*"gari" + 0.004*"tuna" + 0.

In [18]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_cleaned, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.28319857337966

Coherence Score:  0.43562434498342484


In [19]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### It seems like there is a large topic with pretty general words. I will add these to a list of stopwords and remove these in the preprocessing step.

In [20]:
lda_model.get_topic_terms(3)

[(90, 0.014497536),
 (367, 0.012296135),
 (279, 0.011550464),
 (78, 0.011302702),
 (115, 0.01078258),
 (129, 0.010493399),
 (73, 0.00758488),
 (401, 0.007042163),
 (266, 0.006795499),
 (1, 0.0066784373)]

We can extract the top keywords for each topic easily, using `lda_model.top_topics(corpus)` and reference the index for the topic of interest and then the `[0]` index will give a list of tuples with the first value being the percentage and the second being the actual word.

In [21]:
def get_new_stopwords(model, idx):
    new_stops = []
    for i in model.top_topics(corpus)[idx][0]:
        new_stops.append(i[1])
    return new_stops

In [22]:
new_stopwords = get_new_stopwords(lda_model, 0)

### Complicated way to get words using `print_topics`.

In [None]:
topics_new = []
for i in lda_model.print_topics()[3][1].split('*'):
    if '+' in i:
        topics_new.append(i)

In [None]:
topics_to_add = []
for i in topics_new:
    topics_to_add.append(i.split('+')[0].replace('"', '').replace(' ', ''))

In [None]:
topics_to_add = get_new_stopwords(lda_model, 3)

In [None]:
topics_to_add

In [None]:
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [None]:
lsi_model.get_topics()

In [None]:
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))

### Build a second model after adding more stop words

In [57]:
stpwrds = stopwords.words('english')

In [58]:
stpwrds.extend(new_stopwords)

In [40]:
stpwrds.extend(new_stopwords)

In [25]:
topic_model_preprocess(text=data, STOPWORDS=new_stops)

TypeError: argument of type 'NoneType' is not iterable

In [None]:
# Build the LDA model
lda_model_2 = models.LdaModel(corpus=corpus, 
                            num_topics=10, #10 Topics
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

In [None]:
# Print the Keywords for each of the 10 topics
print(lda_model_2.print_topics())

doc_lda = lda_model_2[corpus]

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_2.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_2 = CoherenceModel(model=lda_model_2, texts=tokenized_cleaned, dictionary=dictionary, coherence='c_v')
coherence_lda_2 = coherence_model_lda_2.get_coherence()
print('\nCoherence Score: ', coherence_lda_2)

In [None]:
vis_2 = pyLDAvis.gensim.prepare(lda_model_2, corpus, dictionary)
vis_2

In [None]:
#Try with 5 topics
# Build the LDA model
lda_model_3 = models.LdaModel(corpus=corpus, 
                            num_topics=5, #5 Topics
                            id2word=dictionary,
                            random_state=100,
                            update_every=1,
                            chunksize=100,
                            passes=10,
                            alpha='auto',
                            per_word_topics=True)

# Compute Perplexity
print('\nPerplexity: ', lda_model_3.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_3 = CoherenceModel(model=lda_model_3, texts=tokenized_cleaned, dictionary=dictionary, coherence='c_v')
coherence_lda_3 = coherence_model_lda_3.get_coherence()
print('\nCoherence Score: ', coherence_lda_3)

In [None]:
vis_3 = pyLDAvis.gensim.prepare(lda_model_3, corpus, dictionary)
vis_3

## Use Gensim's build in similarity function to build a recommender

In [None]:
index = 83 #have & meyer

In [None]:
text = '''At Bed Bath & Beyond, you can return a waffle iron that you bought two years prior. And if you get a membership to Costco, you can eat as many free samples as your shame will allow (typically, many). We aren’t being paid by these companies to say any of this - we just think that these are perks that probably shouldn’t exist. Sort of like the wine list at Have & Meyer.
Have & Meyer is an Italian wine bar in Williamsburg where you can choose from over 90 natural wines by the glass. If this sounds ridiculous, that’s because it is. It’s sort of like if a movie theater decided to show every current release along with the entire filmography of Nicolas Cage. It’s great and irrational - and, if all of this wine sounds intimidating, you should know that this place is surprisingly unpretentious. If you mispronounce a word, your server will get the gist, and if your wine knowledge is limited to whatever is currently on sale at Trader Joe’s, you might even learn something here. 
To soak up all this wine, there’s some pasta, charcuterie, and larger things like lamb chops and octopus. None of it is really worth traveling for on its own, but it doesn’t need to be. The wine is the star here, and, much like the non-pig animals in Babe and the anaconda in Anaconda, the food plays a perfectly fine supporting role. Have some meat and cheese and maybe a pasta while you enjoy a few wines that you never knew existed, and you’ll be happy.
If you aren’t, the servers will notice, and they’ll try to do something about it. The people here are weirdly nice (and tend to have Italian accents), and you’ll probably leave thinking that you’re more likable than you actually are. It’s a comforting place that feels like something you’d find behind a rotating fireplace in a mansion on a hill where there’s occasionally lightning. There are vintage-y lamps hanging from the ceiling and little antiques lying around, and the walls are lined with enough wine to calm every nervous flyer currently departing from JFK. 
Add all of this up, and it’s surprising that more people don’t know about Have & Meyer. It’s an ideal place to try some wine that would normally require a transatlantic trip, and you can get a seat here any night. So bring a date. Or stop by after you’ve had a long day and want to pretend that you own a wine cellar. Your server might open a bottle just for you - because that’s just what happens when a place has four tables, 15 barstools, and around 100 wines by the glass. And if your server winds up opening several bottles just so you can try a few things, don’t feel bad. Much like a quality return policy, you should be taking advantage of Have & Meyer'''
bow = dictionary.doc2bow(clean_text(text))

In [None]:
from gensim import similarities
 

In [None]:
for i in lda_model[corpus][:10]:
    print(i)

In [None]:
def gensim_recommender(restaurant, ):
    

In [None]:
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 
# Top most similar documents:
print(similarities[:10])
# [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]
 

In [None]:
index_list = []
for i in similarities[:6]:
    index_list.append(i[0])

In [None]:
index_list

In [None]:
# Let's see what's the most similar document
document_id, similarity = similarities[1]
print(data[document_id][:1000])

In [None]:
df.columns

In [None]:
for i in index_list:
    print(df.loc[i, 'rest_name'])
    print(df.loc[i, 'rest_rating'])
    print(df.loc[i, 'rest_cost'])
    print(df.loc[i, 'cuisine_type'])

In [None]:
from my_functions import recommendations
# create cosine sim

In [None]:
recommendations

In [None]:
df.loc[521,'rest_review']

## Try with LSA model

## Find the dominant topic for each sentence

In [None]:
type(lda_model[corpus])

In [None]:
type(lda_model_3[corpus])

In [None]:
lda_model_3[corpus]

In [None]:
for i, row in enumerate(lda_model[corpus]):
    print(sorted(row, key=lambda x: (x[1]), reverse=True))
    #print(sorted(row, key=lambda x: (x[1]), reverse=True))

In [None]:
for i, row in enumerate(lda_model_3[corpus]):
    print(sorted(row[0], key=lambda x: (x[1]), reverse=True))
    #print(sorted(row, key=lambda x: (x[1]), reverse=True))

In [None]:
def format_topics_sentences(model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(model=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
df_dominant_topic['Dominant_Topic'].value_counts()